From ee8f081e2ef750230a36b755d57e13e9fe759012 Mon Sep 17 00:00:00 2001 From: Bhavanvir Rai Date: Sat, 18 Mar 2023 02:07:06 -0700 Subject: [PATCH] Fixed issues with day/hour calculations. --- scraper/templates/scraper/result.html | 58 +++++++++--------- scraper/views.py | 88 ++++++++------------------- 2 files changed, 54 insertions(+), 92 deletions(-) diff --git a/scraper/templates/scraper/result.html b/scraper/templates/scraper/result.html index 55cd0cc..38c4339 100644 --- a/scraper/templates/scraper/result.html +++ b/scraper/templates/scraper/result.html @@ -9,35 +9,35 @@
- -
{{ title }} was listed {{ days }} days and {{ hours }} hours ago, for ${{ list_price}}.
-
-
- - - - - - - - - - - - - - - - - - - - - - - -
Range:${{ lower_bound }} - ${{ upper_bound }}
Median:${{ median }}
Description:{{ sentiment_rating }}/5.0
Price:{{ price_rating }}/5.0
Overall:{{ average_rating }}/5.0
-
+ +
{{ title }} was listed {{ days }} days and {{ hours }} hours ago, for ${{ list_price}}.
+ +
+ + + + + + + + + + + + + + + + + + + + + + + +
Range:${{ lower_bound }} - ${{ upper_bound }}
Median:${{ median }}
Description:{{ sentiment_rating }}/5.0
Price:{{ price_rating }}/5.0
Overall:{{ average_rating }}/5.0
+
diff --git a/scraper/views.py b/scraper/views.py index 7e41cd8..f941b63 100644 --- a/scraper/views.py +++ b/scraper/views.py @@ -26,33 +26,24 @@ class Index(View): if form.is_valid(): url = form.cleaned_data['url'] - # Shorten the URL listing to the title of the listing shortened_url = re.search(r".*[0-9]", url).group(0) - # Use the shortened URL and convert it to mobile, to get the price of the listing mobile_url = shortened_url.replace("www", "m") - # Find the ID of the product market_id = (re.search(r"\/item\/([0-9]*)", url)).group(1) - # Get the image of the listing image = self.get_listing_image(self.create_soup(mobile_url, headers=None)) - # Get the number of days and hours the listing has been active days, hours = self.get_listing_date(self.create_soup(mobile_url, headers=None)) - # Get the sentiment rating of the listing sentiment_rating = self.sentiment_analysis(self.get_listing_description(self.create_soup(url, headers=None))) - # Get the title of the listing title = self.get_listing_title(self.create_soup(url, headers=None)) - # Get the minimum, maximum, and median prices of the viable products found on Google Shopping list_price = self.get_listing_price(self.create_soup(mobile_url, headers=None)) list_price = re.sub("[\$,]", "", list_price) initial_price = int(re.sub("[\$,]", "", list_price)) lower_bound, upper_bound, median = self.find_viable_product(title, ramp_down=0.0) - # Calculate the price difference between the listing and the median price of the viable products, and generate ratings price_rating = self.price_difference_rating(initial_price, median) average_rating = statistics.mean([sentiment_rating, price_rating]) @@ -77,11 +68,9 @@ class Index(View): return render(request, 'scraper/result.html', context) def price_difference_rating(self, initial, final): - # If the listing price is less than or equal to the median price found online, set the rating to 5 if initial <= final: rating = 5.0 else: - # If the listing price is greater than the median price found online, calculate the difference difference = min(initial, final) / max(initial, final) rating = (difference / 20) * 100 @@ -96,55 +85,44 @@ class Index(View): url = "https://www.google.com/search?q=" + title + "&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc" soup = self.create_soup(url, headers) - # Set the similarity threshold to a initial value, and decrease it when no products are found - similarity_threshold = 0.45 + similarity_threshold = 0.25 try: - prices = self.listing_product_similarity(soup, title, similarity_threshold) - # The length of the list of prices should be greater than 0 if there are viable products + filtered_prices_descriptions = self.listing_product_similarity(soup, title, similarity_threshold) + prices = list(filtered_prices_descriptions.values()) assert len(prices) > 0 except AssertionError: - print("Error: no viable products found, now searching for more general products...") while len(prices) == 0: - # If no viable products are found, the search is further generalized by 5%, until a reasonable number of products are found ramp_down += 0.05 - prices = self.listing_product_similarity(soup, title, similarity_threshold - ramp_down) - - # Get the median price of the viable products + filtered_prices_descriptions = self.listing_product_similarity(soup, title, similarity_threshold - ramp_down) + prices = list(filtered_prices_descriptions.values()) + median = statistics.median_grouped(prices) return min(prices), max(prices), median def clean_title_description(self, title): - # Remove punctuation cleaned = re.sub(r"[^A-Za-z0-9\s]+", " ", title) - # Remove extra spaces cleaned = re.sub(r"\s+", " ", cleaned) return cleaned def listing_product_similarity(self, soup, title, similarity_threshold): - # Get the median price of the product normalized = self.get_product_price(soup) - # Get the product description description = self.get_product_description(soup) price_description = {} - # Iterate through the product descriptions for key, value in zip(description, normalized): google_shopping_title = self.clean_title_description(key.text.lower()) listing_title = self.clean_title_description(title.lower()) - # Get the similarity between the listing title and the product description on Google Shopping price_description[key.text] = [value, SequenceMatcher(None, google_shopping_title, listing_title).ratio()] - prices = [] - # Iterate through the product descriptions and their similarity scores + filtered_prices_descriptions = {} for key, value in price_description.items(): - # If the similarity score is greater than the similarity threshold, add the price to the list of prices if value[1] >= similarity_threshold: - prices.append(value[0]) - - return prices + filtered_prices_descriptions[key] = value[0] + + return filtered_prices_descriptions def get_product_description(self, soup): # Get the description of the product @@ -161,57 +139,44 @@ class Index(View): def get_product_price(self, soup): - # Get the price of the product prices = soup.find_all("span", {"class": "HRLxBb"}) - # Extract the price from the span values = [] for price in prices: values.append(price.text) - # Remove the dollar sign from the price normalized = [re.sub("\$", "", price) for price in values] - # Convert the price to a float normalized = [re.search(r"[0-9,.]*", price).group(0) for price in normalized] - # Remove the commas from the price normalized = [float(price.replace(",", "")) for price in normalized] - # Remove statistical outliers as to not skew the median price outlierless = self.reject_outliers(np.array(normalized)) return outlierless def clean_listing_title(self, title): - # Certain symbols are not allowed in the search query for Google Shopping, so they must be removed title = re.sub(r"#", "%2", title) title = re.sub(r"&", "%26", title) return title def get_listing_price(self, soup): - # Get the price of the listing spans = soup.find_all("span") - # Check if the listing is free free = [span.text for span in spans if "free" in span.text.lower()] if (free): return free - # Find the span that contains the price of the listing and extract the price price = [str(span.text) for span in spans if "$" in span.text][0] return price def get_listing_image(self, soup): - # Get the image of the listing images = soup.find_all("img") - # Find the image that is the listing image image = [image["src"] for image in images if "https://scontent" in image["src"]] return image def get_listing_title(self, soup): - # Get the title of the listing title = soup.find("meta", {"name": "DC.title"}) title_content = title["content"] return title_content @@ -220,19 +185,26 @@ class Index(View): tag = soup.find('abbr') tag = tag.text.strip() - month_str = re.search(r"[a-zA-Z]+", tag).group(0) - month_num = datetime.datetime.strptime(month_str, '%B').month + try: + month_str = re.search(r"[a-zA-Z]+", tag).group(0) + month_num = datetime.datetime.strptime(month_str, '%B').month + except ValueError: + hour_str = re.search(r"[0-9]+", tag).group(0) + return 0, hour_str + + try: + year_str = re.search(r"[0-9]{4}", tag).group(0) + except AttributeError: + year_str = datetime.datetime.now().year date_str = re.search(r"[0-9]+", tag).group(0) - year_str = datetime.datetime.now().year - time_str = re.search(r"[0-9]+:[0-9]+", tag).group(0) am_pm = re.search(r"[A-Z]{2}", tag).group(0) + formated_time = f'{time_str}:00 {am_pm}' + formated_date = f'{year_str}-{month_num}-{date_str}' - date_str = f'{year_str}-{month_num}-{date_str}' - - dt_str = f'{date_str} {formated_time}' + dt_str = f'{formated_date} {formated_time}' dt = datetime.datetime.strptime(dt_str, '%Y-%m-%d %I:%M:%S %p') now = datetime.datetime.now() @@ -243,47 +215,37 @@ class Index(View): return days, hours + def create_soup(self, url, headers): - # Create a request object response = requests.get(url, headers=headers) - # Create a BeautifulSoup object soup = BeautifulSoup(response.text, 'html.parser') return soup def clean_text(self, text): - # Remove punctuation tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+') tokenized = tokenizer.tokenize(text) - # Lowercase all words tokenized = [word.lower() for word in tokenized] - # Remove stopwords stop_words = stopwords.words('english') - # Filter out any tokens not containing letters filtered = [word for word in tokenized if word not in stop_words and word.isalpha()] - # Lemmatize all words lemmatizer = WordNetLemmatizer() lemmatized = [lemmatizer.lemmatize(word) for word in filtered] return " ".join(lemmatized) def get_listing_description(self, soup): - # Get the description of the listing description = soup.find("meta", {"name": "DC.description"}) description_content = description["content"] return self.clean_text(description_content) def sentiment_analysis(self, text): - # Create a SentimentIntensityAnalyzer object sia = SentimentIntensityAnalyzer() sentiment = sia.polarity_scores(text) - # Get the sentiment scores neg, neu, pos, compound = sentiment["neg"], sentiment["neu"], sentiment["pos"], sentiment["compound"] - # Assign a rating based on the compound score if compound > 0.0: rating = 5 * max(pos, compound) elif compound < 0.0: