From ed0aeba4ec8053446fcd629ab4a647626d58dfd6 Mon Sep 17 00:00:00 2001 From: alexwholland Date: Fri, 13 Jan 2023 21:23:58 -0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Added=20Sentiment=20analysis?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../scraper/templates/scraper/result.html | 7 + marketscrape/scraper/views.py | 200 +++++++++++++++++- 2 files changed, 206 insertions(+), 1 deletion(-) diff --git a/marketscrape/scraper/templates/scraper/result.html b/marketscrape/scraper/templates/scraper/result.html index 27aef61..f17607e 100644 --- a/marketscrape/scraper/templates/scraper/result.html +++ b/marketscrape/scraper/templates/scraper/result.html @@ -7,6 +7,13 @@

The shortened_url is: {{ shortened_url }}

The mobile_url is: {{ mobile_url }}

The market_id is: {{ market_id }}

+

The sentiment_rating is: {{ sentiment_rating }}

+

The title is: {{ title }}

+

The list price is: {{ list_price }}

+

The initial price is: {{ initial_price }}

+

lower bound: {{ lower_bound }}

+

upper bound: {{ upper_bound }}

+

median: {{ median }}

{% endblock content %}
\ No newline at end of file
diff --git a/marketscrape/scraper/views.py b/marketscrape/scraper/views.py
index 0a6f433..1c398fd 100644
--- a/marketscrape/scraper/views.py
+++ b/marketscrape/scraper/views.py
@@ -1,7 +1,19 @@
 from django.shortcuts import render
 from django.views import View
 from .forms import MarketForm
+
 import re
+from nltk.corpus import stopwords
+from nltk.sentiment import SentimentIntensityAnalyzer
+from nltk.tokenize import RegexpTokenizer
+from nltk.stem import WordNetLemmatizer
+import requests
+from bs4 import BeautifulSoup
+from difflib import SequenceMatcher
+
+import statistics
+import numpy as np
+
 
 class Index(View):
     def get(self, request):
@@ -20,11 +32,239 @@ class Index(View):
         mobile_url = shortened_url.replace("www", "m")
         # Find the ID of the product
         market_id = (re.search(r"\/item\/([0-9]*)", input_url)).group(1)
+
+        # Fetch the listing page once; the description and the title come from the same document
+        listing_soup = self.create_soup(input_url, headers=None)
+
+        # Get the sentiment rating of the listing
+        sentiment_rating = self.sentiment_analysis(self.get_listing_description(listing_soup))
+
+        # Get the title of the listing
+        title = self.get_listing_title(listing_soup)
+
+        # Get the listed price; a listing without a numeric price (e.g. a free one) counts as 0
+        list_price = self.get_listing_price(self.create_soup(mobile_url, headers=None))
+        parsed_price = self._parse_price(list_price or "")
+        initial_price = int(parsed_price) if parsed_price is not None else 0
+
+        # Get the minimum, maximum, and median prices of the viable products found on Google Shopping
+        lower_bound, upper_bound, median = self.find_viable_product(title, ramp_down=0.0)
 
         context = {
             'shortened_url': shortened_url,
             'mobile_url': mobile_url,
             'market_id': market_id,
+            'sentiment_rating': sentiment_rating,
+            'title': title,
+            'list_price': list_price,
+            'initial_price': initial_price,
+            'lower_bound': lower_bound,
+            'upper_bound': upper_bound,
+            'median': median,
         }
 
         return render(request, 'scraper/result.html', context)
+
+    def find_viable_product(self, title, ramp_down):
+        """Return the (minimum, maximum, median) price of similar Google Shopping products.
+
+        The listing title is searched on Google Shopping and every result whose
+        title scores a similarity of at least 0.45 counts as viable. When nothing
+        is viable, the threshold is lowered in steps of 0.05, starting from ramp_down.
+
+        Returns (None, None, None) when no priced product is found even with the
+        threshold lowered to zero, instead of lowering the threshold forever.
+        """
+        # "#" and "&" would cut the query string short, so only the URL gets the escaped title
+        query = self.clean_listing_title(title)
+        headers = {
+            "User-Agent":
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
+        }
+        url = "https://www.google.com/search?q=" + query + "&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc"
+
+        soup = self.create_soup(url, headers)
+        # Set the similarity threshold to an initial value, and decrease it when no products are found
+        similarity_threshold = 0.45
+
+        prices = self.listing_product_similarity(soup, title, similarity_threshold)
+        if not prices:
+            print("Error: no viable products found, now searching for more general products...")
+        # A threshold of zero or less accepts every product, so lowering it further cannot help
+        while not prices and similarity_threshold - ramp_down > 0:
+            # If no viable products are found, the search is further generalized by 5%
+            ramp_down += 0.05
+            prices = self.listing_product_similarity(soup, title, similarity_threshold - ramp_down)
+
+        if not prices:
+            return None, None, None
+
+        # Get the median price of the viable products
+        median = statistics.median_grouped(prices)
+
+        return min(prices), max(prices), median
+
+    def clean_title_description(self, title):
+        """Replace punctuation with spaces and collapse each run of whitespace into one space."""
+        # Remove punctuation
+        cleaned = re.sub(r"[^A-Za-z0-9\s]+", " ", title)
+        # Remove extra spaces
+        cleaned = re.sub(r"\s+", " ", cleaned)
+
+        return cleaned
+
+    def listing_product_similarity(self, soup, title, similarity_threshold):
+        """Return the prices of the products whose title is similar enough to the listing title.
+
+        Each product title on the results page is compared with the listing
+        title using SequenceMatcher; the prices of products scoring at least
+        similarity_threshold are returned, excluding statistical outliers among
+        all product prices.
+        """
+        # NOTE(review): assumes the n-th title element belongs to the n-th price element - confirm against the page markup
+        descriptions = self.get_product_description(soup)
+        price_spans = soup.find_all("span", {"class": "HRLxBb"})
+
+        # Parse the prices while they are still paired with their titles, so that
+        # dropping an unparsable or outlying price cannot shift the pairing
+        products = []
+        for description, span in zip(descriptions, price_spans):
+            price = self._parse_price(span.text)
+            if price is not None:
+                products.append((description.text, price))
+        if not products:
+            return []
+
+        # Outliers are judged over all products; the verdict depends only on the price value
+        kept_prices = set(self.reject_outliers(np.array([price for _, price in products])))
+
+        listing_title = self.clean_title_description(title.lower())
+        prices = []
+        for product_title, price in products:
+            if price not in kept_prices:
+                continue
+            google_shopping_title = self.clean_title_description(product_title.lower())
+            # Get the similarity between the listing title and the product title on Google Shopping
+            similarity = SequenceMatcher(None, google_shopping_title, listing_title).ratio()
+            if similarity >= similarity_threshold:
+                prices.append(price)
+
+        return prices
+
+    def get_product_description(self, soup):
+        """Return every div element of class "rgHvZc" found in soup."""
+        description = soup.find_all("div", {"class": "rgHvZc"})
+        return description
+
+    def reject_outliers(self, data, m=1.5):
+        """Return, as a list, the values closer to the median than m times the median absolute deviation."""
+        data = np.asarray(data, dtype=float)
+        if data.size == 0:
+            return []
+        distribution = np.abs(data - np.median(data))
+        m_deviation = np.median(distribution)
+        # When the median deviation is 0, the raw distances are compared with m instead
+        standard = distribution / (m_deviation if m_deviation else 1.)
+        return data[standard < m].tolist()
+
+    def _parse_price(self, text):
+        """Return the first number in a price text such as "$1,299.99" as a float, or None."""
+        match = re.search(r"[0-9][0-9,]*(?:\.[0-9]+)?", text)
+        if match is None:
+            return None
+        return float(match.group(0).replace(",", ""))
+
+    def get_product_price(self, soup):
+        """Return the prices of the span elements of class "HRLxBb" as floats, without statistical outliers."""
+        # Get the price of the product
+        spans = soup.find_all("span", {"class": "HRLxBb"})
+
+        # Convert each price text to a float, skipping texts that contain no number
+        parsed = (self._parse_price(span.text) for span in spans)
+        normalized = [price for price in parsed if price is not None]
+
+        # Remove statistical outliers as to not skew the median price
+        return self.reject_outliers(np.array(normalized))
+
+    def clean_listing_title(self, title):
+        """Percent-encode the characters that would otherwise cut the search query short."""
+        # "#" starts the URL fragment and "&" starts the next query parameter
+        title = title.replace("#", "%23")
+        title = title.replace("&", "%26")
+        return title
+
+    def get_listing_price(self, soup):
+        """Return the text of the span holding the listing price, or None when there is none.
+
+        When a span mentions "free", the text of the first such span is returned instead.
+        """
+        # Get the price of the listing
+        spans = soup.find_all("span")
+
+        # Check if the listing is free
+        # NOTE(review): any span mentioning "free" (e.g. free shipping) marks the listing as free - confirm this is intended
+        free = [span.text for span in spans if "free" in span.text.lower()]
+        if free:
+            return free[0]
+
+        # Find the span that contains the price of the listing and extract the price
+        return next((str(span.text) for span in spans if "$" in span.text), None)
+
+    def get_listing_title(self, soup):
+        """Return the content of the DC.title meta tag, or an empty string when the page has none."""
+        title = soup.find("meta", {"name": "DC.title"})
+        if title is None:
+            return ""
+        return title.get("content", "")
+
+    def create_soup(self, url, headers):
+        """Download url and return its HTML parsed into a BeautifulSoup document."""
+        # A timeout keeps an unresponsive server from blocking the request forever
+        response = requests.get(url, headers=headers, timeout=10)
+        # Create a BeautifulSoup object
+        soup = BeautifulSoup(response.text, 'html.parser')
+        return soup
+
+    def clean_text(self, text):
+        """Return the lowercased, lemmatized alphabetic words of text, without English stopwords."""
+        # Split the text into tokens, which drops the punctuation
+        tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
+        tokenized = tokenizer.tokenize(text)
+        # Lowercase all words
+        tokenized = [word.lower() for word in tokenized]
+
+        # Remove stopwords; a set makes each membership test constant time
+        stop_words = set(stopwords.words('english'))
+        # Keep only the tokens that consist purely of letters
+        filtered = [word for word in tokenized if word not in stop_words and word.isalpha()]
+
+        # Lemmatize all words
+        lemmatizer = WordNetLemmatizer()
+        lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
+
+        return " ".join(lemmatized)
+
+    def get_listing_description(self, soup):
+        """Return the cleaned content of the DC.description meta tag, or an empty string when it is missing."""
+        description = soup.find("meta", {"name": "DC.description"})
+        if description is None:
+            return ""
+        return self.clean_text(description.get("content", ""))
+
+    def sentiment_analysis(self, text):
+        """Return a non-negative rating: five times the polarity score selected by the sign of the compound score."""
+        # Create a SentimentIntensityAnalyzer object
+        sia = SentimentIntensityAnalyzer()
+        sentiment = sia.polarity_scores(text)
+        # Get the sentiment scores
+        neg, neu, pos, compound = sentiment["neg"], sentiment["neu"], sentiment["pos"], sentiment["compound"]
+
+        # Assign a rating based on the compound score
+        if compound > 0.0:
+            rating = 5 * max(pos, compound)
+        elif compound < 0.0:
+            rating = 5 * min(neg, compound)
+        else:
+            rating = 5 * neu
+
+        return abs(rating)