mirror of
https://github.com/Marketscrape/marketscrape-web.git
synced 2026-05-24 13:44:29 -04:00
Fixed issues with day/hour calculations.
This commit is contained in:
@@ -9,35 +9,35 @@
|
||||
<div class="row">
|
||||
<div class="col-md-6 mx-auto">
|
||||
<figure class="figure">
|
||||
<img src=" {{ image }}" class="figure-img img-fluid rounded center" style="object-fit: fill;">
|
||||
<figcaption class="figure-caption"><code>{{ title }}</code> was listed <code>{{ days }} days</code> and <code>{{ hours }} hours</code> ago, for <code>${{ list_price}}</code>.</figcaption>
|
||||
</figure>
|
||||
<div class="table-responsive">
|
||||
<table class="table table-striped">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Range:</td>
|
||||
<td>${{ lower_bound }} - ${{ upper_bound }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Median:</td>
|
||||
<td>${{ median }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Description:</td>
|
||||
<td>{{ sentiment_rating }}/5.0</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Price:</td>
|
||||
<td>{{ price_rating }}/5.0</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Overall:</td>
|
||||
<td>{{ average_rating }}/5.0</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<img src=" {{ image }}" class="figure-img img-fluid rounded" style="object-fit: fill;">
|
||||
<figcaption class="figure-caption"><code>{{ title }}</code> was listed <code>{{ days }} days</code> and <code>{{ hours }} hours</code> ago, for <code>${{ list_price}}</code>.</figcaption>
|
||||
</figure>
|
||||
<div class="table-responsive">
|
||||
<table class="table table-striped">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Range:</td>
|
||||
<td>${{ lower_bound }} - ${{ upper_bound }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Median:</td>
|
||||
<td>${{ median }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Description:</td>
|
||||
<td>{{ sentiment_rating }}/5.0</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Price:</td>
|
||||
<td>{{ price_rating }}/5.0</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Overall:</td>
|
||||
<td>{{ average_rating }}/5.0</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -26,33 +26,24 @@ class Index(View):
|
||||
if form.is_valid():
|
||||
url = form.cleaned_data['url']
|
||||
|
||||
# Shorten the URL listing to the title of the listing
|
||||
shortened_url = re.search(r".*[0-9]", url).group(0)
|
||||
# Use the shortened URL and convert it to mobile, to get the price of the listing
|
||||
mobile_url = shortened_url.replace("www", "m")
|
||||
# Find the ID of the product
|
||||
market_id = (re.search(r"\/item\/([0-9]*)", url)).group(1)
|
||||
|
||||
# Get the image of the listing
|
||||
image = self.get_listing_image(self.create_soup(mobile_url, headers=None))
|
||||
|
||||
# Get the number of days and hours the listing has been active
|
||||
days, hours = self.get_listing_date(self.create_soup(mobile_url, headers=None))
|
||||
|
||||
# Get the sentiment rating of the listing
|
||||
sentiment_rating = self.sentiment_analysis(self.get_listing_description(self.create_soup(url, headers=None)))
|
||||
|
||||
# Get the title of the listing
|
||||
title = self.get_listing_title(self.create_soup(url, headers=None))
|
||||
|
||||
# Get the minimum, maximum, and median prices of the viable products found on Google Shopping
|
||||
list_price = self.get_listing_price(self.create_soup(mobile_url, headers=None))
|
||||
list_price = re.sub("[\$,]", "", list_price)
|
||||
initial_price = int(re.sub("[\$,]", "", list_price))
|
||||
|
||||
lower_bound, upper_bound, median = self.find_viable_product(title, ramp_down=0.0)
|
||||
|
||||
# Calculate the price difference between the listing and the median price of the viable products, and generate ratings
|
||||
price_rating = self.price_difference_rating(initial_price, median)
|
||||
average_rating = statistics.mean([sentiment_rating, price_rating])
|
||||
|
||||
@@ -77,11 +68,9 @@ class Index(View):
|
||||
return render(request, 'scraper/result.html', context)
|
||||
|
||||
def price_difference_rating(self, initial, final):
|
||||
# If the listing price is less than or equal to the median price found online, set the rating to 5
|
||||
if initial <= final:
|
||||
rating = 5.0
|
||||
else:
|
||||
# If the listing price is greater than the median price found online, calculate the difference
|
||||
difference = min(initial, final) / max(initial, final)
|
||||
rating = (difference / 20) * 100
|
||||
|
||||
@@ -96,55 +85,44 @@ class Index(View):
|
||||
url = "https://www.google.com/search?q=" + title + "&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc"
|
||||
|
||||
soup = self.create_soup(url, headers)
|
||||
# Set the similarity threshold to an initial value, and decrease it when no products are found
|
||||
similarity_threshold = 0.45
|
||||
similarity_threshold = 0.25
|
||||
|
||||
try:
|
||||
prices = self.listing_product_similarity(soup, title, similarity_threshold)
|
||||
# The length of the list of prices should be greater than 0 if there are viable products
|
||||
filtered_prices_descriptions = self.listing_product_similarity(soup, title, similarity_threshold)
|
||||
prices = list(filtered_prices_descriptions.values())
|
||||
assert len(prices) > 0
|
||||
except AssertionError:
|
||||
print("Error: no viable products found, now searching for more general products...")
|
||||
while len(prices) == 0:
|
||||
# If no viable products are found, the search is further generalized by 5%, until a reasonable number of products are found
|
||||
ramp_down += 0.05
|
||||
prices = self.listing_product_similarity(soup, title, similarity_threshold - ramp_down)
|
||||
|
||||
# Get the median price of the viable products
|
||||
filtered_prices_descriptions = self.listing_product_similarity(soup, title, similarity_threshold - ramp_down)
|
||||
prices = list(filtered_prices_descriptions.values())
|
||||
|
||||
median = statistics.median_grouped(prices)
|
||||
|
||||
return min(prices), max(prices), median
|
||||
|
||||
def clean_title_description(self, title):
    """Normalize a title or description before it is compared.

    Every run of characters that is not an ASCII letter, digit, or
    whitespace is replaced by a space, whitespace runs are collapsed to a
    single space, and leading/trailing whitespace is removed so padding
    left behind by stripped punctuation does not distort comparisons.

    Args:
        title: Raw text to normalize.

    Returns:
        The cleaned text.
    """
    # Replace punctuation and other symbols with a space
    cleaned = re.sub(r"[^A-Za-z0-9\s]+", " ", title)
    # Collapse whitespace runs, then drop the padding left where a symbol
    # started or ended the text
    return re.sub(r"\s+", " ", cleaned).strip()
|
||||
|
||||
def listing_product_similarity(self, soup, title, similarity_threshold):
    """Select the scraped products whose titles resemble the listing title.

    Product prices and descriptions are read from ``soup`` and paired in
    order. Each product title is cleaned and compared with the cleaned
    listing title using ``SequenceMatcher``; products whose ratio reaches
    ``similarity_threshold`` are kept.

    Args:
        soup: Parsed results page handed to ``get_product_price`` and
            ``get_product_description``.
        title: Title of the listing being evaluated.
        similarity_threshold: Minimum similarity ratio for a product to be
            kept.

    Returns:
        A dict mapping each kept product's original title text to its
        price. Products sharing the same title text collapse into one
        entry holding the last price seen.
    """
    normalized = self.get_product_price(soup)
    description = self.get_product_description(soup)

    # The listing title is the same for every product, so clean it once
    listing_title = self.clean_title_description(title.lower())

    filtered_prices_descriptions = {}
    for product, price in zip(description, normalized):
        google_shopping_title = self.clean_title_description(product.text.lower())
        similarity = SequenceMatcher(None, google_shopping_title, listing_title).ratio()
        # Keep only the products that are similar enough to the listing
        if similarity >= similarity_threshold:
            filtered_prices_descriptions[product.text] = price

    return filtered_prices_descriptions
|
||||
|
||||
def get_product_description(self, soup):
|
||||
# Get the description of the product
|
||||
@@ -161,57 +139,44 @@ class Index(View):
|
||||
|
||||
|
||||
def get_product_price(self, soup):
    """Extract the numeric product prices from a results page.

    The text of every ``span`` with class ``HRLxBb`` is searched for its
    first number (digits with optional thousands separators and decimal
    point). Spans that contain no number are skipped instead of raising
    ``ValueError`` on an empty string.

    Args:
        soup: Parsed results page.

    Returns:
        Whatever ``self.reject_outliers`` returns for the NumPy array of
        parsed prices.
    """
    spans = soup.find_all("span", {"class": "HRLxBb"})

    normalized = []
    for span in spans:
        # Locate the number wherever it sits in the text, so currency
        # prefixes such as "CA$" do not produce an empty match
        match = re.search(r"\.?[0-9][0-9,.]*", span.text)
        if match is None:
            continue
        # Drop the thousands separators before converting to a float
        normalized.append(float(match.group(0).replace(",", "")))

    # Remove statistical outliers so they do not skew the median price
    return self.reject_outliers(np.array(normalized))
|
||||
|
||||
def clean_listing_title(self, title):
    """Percent-encode the symbols that would break the search query URL.

    ``#`` would start a URL fragment and ``&`` would start a new query
    parameter, so both are replaced by their percent-encoded forms
    (``%23`` and ``%26``).

    Args:
        title: Listing title to embed in a search URL.

    Returns:
        The title with ``#`` and ``&`` percent-encoded.
    """
    # "#" is 0x23 in ASCII, so its percent-encoding is "%23"
    return title.replace("#", "%23").replace("&", "%26")
|
||||
|
||||
def get_listing_price(self, soup):
    """Return the listing price as a string.

    All ``span`` elements are scanned. If any span's text contains "free"
    (case-insensitive) the listing is treated as free and ``"$0"`` is
    returned, so the result is always a price string. Otherwise the text
    of the first span containing ``$`` is returned.

    Args:
        soup: Parsed listing page.

    Returns:
        The price text, e.g. ``"$1,250"``, or ``"$0"`` for a free listing.

    Raises:
        IndexError: If the listing is not free and no span contains ``$``.
    """
    texts = [str(span.text) for span in soup.find_all("span")]

    # NOTE(review): this is a substring match, so text such as
    # "Free shipping" also marks the listing as free; confirm intent.
    if any("free" in text.lower() for text in texts):
        return "$0"

    # The first span containing a dollar sign holds the listing price
    for text in texts:
        if "$" in text:
            return text

    raise IndexError("no span containing a price was found")
|
||||
|
||||
def get_listing_image(self, soup):
    """Collect the listing image URLs from a page.

    Args:
        soup: Parsed listing page.

    Returns:
        A list with the ``src`` of every ``img`` element whose ``src``
        contains ``https://scontent``. Images without a ``src`` attribute
        are ignored rather than raising ``KeyError``.
    """
    images = soup.find_all("img")
    # .get() tolerates <img> tags that carry no src attribute
    return [
        image["src"]
        for image in images
        if "https://scontent" in (image.get("src") or "")
    ]
|
||||
|
||||
def get_listing_title(self, soup):
    """Return the listing title stored in the page's ``DC.title`` meta tag.

    Args:
        soup: Parsed listing page.

    Returns:
        The ``content`` attribute of the ``meta`` element named
        ``DC.title``.
    """
    meta_tag = soup.find("meta", {"name": "DC.title"})
    return meta_tag["content"]
|
||||
@@ -220,19 +185,26 @@ class Index(View):
|
||||
tag = soup.find('abbr')
|
||||
tag = tag.text.strip()
|
||||
|
||||
month_str = re.search(r"[a-zA-Z]+", tag).group(0)
|
||||
month_num = datetime.datetime.strptime(month_str, '%B').month
|
||||
try:
|
||||
month_str = re.search(r"[a-zA-Z]+", tag).group(0)
|
||||
month_num = datetime.datetime.strptime(month_str, '%B').month
|
||||
except ValueError:
|
||||
hour_str = re.search(r"[0-9]+", tag).group(0)
|
||||
return 0, hour_str
|
||||
|
||||
try:
|
||||
year_str = re.search(r"[0-9]{4}", tag).group(0)
|
||||
except AttributeError:
|
||||
year_str = datetime.datetime.now().year
|
||||
|
||||
date_str = re.search(r"[0-9]+", tag).group(0)
|
||||
year_str = datetime.datetime.now().year
|
||||
|
||||
time_str = re.search(r"[0-9]+:[0-9]+", tag).group(0)
|
||||
am_pm = re.search(r"[A-Z]{2}", tag).group(0)
|
||||
|
||||
formated_time = f'{time_str}:00 {am_pm}'
|
||||
formated_date = f'{year_str}-{month_num}-{date_str}'
|
||||
|
||||
date_str = f'{year_str}-{month_num}-{date_str}'
|
||||
|
||||
dt_str = f'{date_str} {formated_time}'
|
||||
dt_str = f'{formated_date} {formated_time}'
|
||||
dt = datetime.datetime.strptime(dt_str, '%Y-%m-%d %I:%M:%S %p')
|
||||
|
||||
now = datetime.datetime.now()
|
||||
@@ -243,47 +215,37 @@ class Index(View):
|
||||
|
||||
return days, hours
|
||||
|
||||
|
||||
def create_soup(self, url, headers, timeout=30):
    """Fetch a URL and parse the response body as HTML.

    Args:
        url: Address to request.
        headers: HTTP headers passed to ``requests.get`` (may be ``None``).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` parser.
    """
    # Bound the wait so an unresponsive server cannot hang the caller
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
def clean_text(self, text):
    """Tokenize, filter, and lemmatize free text.

    The text is split with a regular-expression tokenizer, lowercased,
    stripped of English stop words and of tokens that are not purely
    alphabetic, and each remaining word is lemmatized.

    Args:
        text: Raw text to clean.

    Returns:
        The lemmatized words joined by single spaces.
    """
    # Raw string: the pattern holds backslash escapes that are not valid
    # escapes in a normal string literal
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
    tokens = [word.lower() for word in tokenizer.tokenize(text)]

    # A set gives constant-time membership tests for the filter below
    stop_words = set(stopwords.words('english'))
    # Drop stop words and any token not made up solely of letters
    filtered = [word for word in tokens if word not in stop_words and word.isalpha()]

    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in filtered)
|
||||
|
||||
def get_listing_description(self, soup):
    """Return the cleaned listing description.

    Args:
        soup: Parsed listing page.

    Returns:
        The ``content`` attribute of the ``meta`` element named
        ``DC.description``, passed through ``self.clean_text``.
    """
    meta_tag = soup.find("meta", {"name": "DC.description"})
    return self.clean_text(meta_tag["content"])
|
||||
|
||||
def sentiment_analysis(self, text):
|
||||
# Create a SentimentIntensityAnalyzer object
|
||||
sia = SentimentIntensityAnalyzer()
|
||||
sentiment = sia.polarity_scores(text)
|
||||
# Get the sentiment scores
|
||||
neg, neu, pos, compound = sentiment["neg"], sentiment["neu"], sentiment["pos"], sentiment["compound"]
|
||||
|
||||
# Assign a rating based on the compound score
|
||||
if compound > 0.0:
|
||||
rating = 5 * max(pos, compound)
|
||||
elif compound < 0.0:
|
||||
|
||||
Reference in New Issue
Block a user