From ed0aeba4ec8053446fcd629ab4a647626d58dfd6 Mon Sep 17 00:00:00 2001
From: alexwholland <alexwholland@gmail.com>
Date: Fri, 13 Jan 2023 21:23:58 -0800
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Added=20Sentiment=20analysis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../scraper/templates/scraper/result.html     |   7 +
 marketscrape/scraper/views.py                 | 200 +++++++++++++++++-
 2 files changed, 206 insertions(+), 1 deletion(-)
diff --git a/marketscrape/scraper/templates/scraper/result.html b/marketscrape/scraper/templates/scraper/result.html
index 27aef61..f17607e 100644
--- a/marketscrape/scraper/templates/scraper/result.html
+++ b/marketscrape/scraper/templates/scraper/result.html
@@ -7,6 +7,13 @@
             <h3>The shortened_url is: {{ shortened_url }}</h3>
             <h3>The mobile_url is: {{ mobile_url }}</h3>
             <h3>The market_id is: {{ market_id }}</h3>
+			<h3>The sentiment_rating is: {{ sentiment_rating }}</h3>
+			<h3>The title is: {{ title }}</h3>
+			<h3>The list price is: {{ list_price }}</h3>
+			<h3>The initial price is: {{ initial_price }}</h3>
+			<h3>lower bound: {{ lower_bound }}</h3>
+			<h3>upper bound: {{ upper_bound }}</h3>
+			<h3>median: {{ median }}</h3>
 		</div>
 	</div>
 {% endblock content %}
\ No newline at end of file
diff --git a/marketscrape/scraper/views.py b/marketscrape/scraper/views.py
index 0a6f433..1c398fd 100644
--- a/marketscrape/scraper/views.py
+++ b/marketscrape/scraper/views.py
@@ -1,7 +1,19 @@
 from django.shortcuts import render
 from django.views import View
 from .forms import MarketForm
+
 import re
+from nltk.corpus import stopwords
+from nltk.sentiment import SentimentIntensityAnalyzer
+from nltk.tokenize import RegexpTokenizer
+from nltk.stem import WordNetLemmatizer
+import requests
+from bs4 import BeautifulSoup
+from difflib import SequenceMatcher
+
+import statistics
+import numpy as np
+
 
 class Index(View):
     def get(self, request):
@@ -20,11 +32,197 @@ class Index(View):
         mobile_url = shortened_url.replace("www", "m")
         # Find the ID of the product
         market_id = (re.search(r"\/item\/([0-9]*)", input_url)).group(1)
+        
+        # Get the sentiment rating of the listing
+        sentiment_rating = self.sentiment_analysis(self.get_listing_description(self.create_soup(input_url, headers=None)))
+
+        # Get the title of the listing
+        title = self.get_listing_title(self.create_soup(input_url, headers=None))
+        
+         # Get the minimum, maximum, and median prices of the viable products found on Google Shopping
+        list_price = self.get_listing_price(self.create_soup(mobile_url, headers=None))
+        initial_price = int(re.sub("[\$,]", "", list_price))
+
+        lower_bound, upper_bound, median = self.find_viable_product(title, ramp_down=0.0)
+
 
         context = {
             'shortened_url': shortened_url,
             'mobile_url': mobile_url,
             'market_id': market_id,
+            'sentiment_rating': sentiment_rating,
+            'title': title,
+            'list_price': list_price,
+            'initial_price': initial_price,
+            'lower_bound': lower_bound,
+            'upper_bound': upper_bound,
+            'median': median,
         }
-        
         return render(request, 'scraper/result.html', context)
+
+    def find_viable_product(self, title, ramp_down):
+        title = self.clean_listing_title(title)
+        headers = { 
+            "User-Agent":
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
+        }
+        url = "https://www.google.com/search?q=" + title + "&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc"
+
+        soup = self.create_soup(url, headers)
+        # Set the similarity threshold to a initial value, and decrease it when no products are found
+        similarity_threshold = 0.45
+
+        try:
+            prices = self.listing_product_similarity(soup, title, similarity_threshold)
+            # The length of the list of prices should be greater than 0 if there are viable products
+            assert len(prices) > 0
+        except AssertionError:
+            print("Error: no viable products found, now searching for more general products...")
+            while len(prices) == 0:
+                # If no viable products are found, the search is further generalized by 5%, until a reasonable number of products are found
+                ramp_down += 0.05
+                prices = self.listing_product_similarity(soup, title, similarity_threshold - ramp_down)
+        
+        # Get the median price of the viable products
+        median = statistics.median_grouped(prices)
+        
+        return min(prices), max(prices), median
+
+    def clean_title_description(self, title):
+        # Remove punctuation
+        cleaned = re.sub(r"[^A-Za-z0-9\s]+", " ", title)
+        # Remove extra spaces
+        cleaned = re.sub(r"\s+", " ", cleaned)
+
+        return cleaned
+
+    def listing_product_similarity(self, soup, title, similarity_threshold):
+        # Get the median price of the product
+        normalized = self.get_product_price(soup)
+        # Get the product description
+        description = self.get_product_description(soup)
+
+        price_description = {}
+        # Iterate through the product descriptions
+        for key, value in zip(description, normalized):
+            google_shopping_title = self.clean_title_description(key.text.lower())
+            listing_title = self.clean_title_description(title.lower())
+            # Get the similarity between the listing title and the product description on Google Shopping
+            price_description[key.text] = [value, SequenceMatcher(None, google_shopping_title, listing_title).ratio()]
+
+        prices = []
+        # Iterate through the product descriptions and their similarity scores
+        for key, value in price_description.items():
+            # If the similarity score is greater than the similarity threshold, add the price to the list of prices
+            if value[1] >= similarity_threshold:
+                prices.append(value[0])
+        
+        return prices
+
+    def get_product_description(self, soup):
+        # Get the description of the product
+        description = soup.find_all("div", {"class": "rgHvZc"})
+        return description
+
+    def reject_outliers(self, data, m=1.5):
+        distribution = np.abs(data - np.median(data))
+        m_deviation = np.median(distribution)
+        standard = distribution / (m_deviation if m_deviation else 1.)
+        return data[standard < m].tolist()
+
+
+    def get_product_price(self, soup):
+        # Get the price of the product
+        prices = soup.find_all("span", {"class": "HRLxBb"})
+
+        # Extract the price from the span
+        values = []
+        for price in prices:
+            values.append(price.text)
+
+        # Remove the dollar sign from the price
+        normalized = [re.sub("\$", "", price) for price in values]
+        # Convert the price to a float
+        normalized = [re.search(r"[0-9,.]*", price).group(0) for price in normalized]
+        # Remove the commas from the price
+        normalized = [float(price.replace(",", "")) for price in normalized]
+        
+        # Remove statistical outliers as to not skew the median price
+        outlierless = self.reject_outliers(np.array(normalized))
+
+        return outlierless
+
+    def clean_listing_title(self, title):
+        # Certain symbols are not allowed in the search query for Google Shopping, so they must be removed
+        title = re.sub(r"#", "%2", title)
+        title = re.sub(r"&", "%26", title)
+        return title
+
+    def get_listing_price(self, soup):
+        # Get the price of the listing
+        spans = soup.find_all("span")
+
+        # Check if the listing is free
+        free = [span.text for span in spans if "free" in span.text.lower()]
+        if (free):
+            return free
+
+        # Find the span that contains the price of the listing and extract the price
+        price = [str(span.text) for span in spans if "$" in span.text][0]
+        return price
+
+    def get_listing_title(self, soup):
+        # Get the title of the listing
+        title = soup.find("meta", {"name": "DC.title"})
+        title_content = title["content"]
+        return title_content
+
+    def create_soup(self, url, headers):
+        # Create a request object 
+        response = requests.get(url, headers=headers)
+        # Create a BeautifulSoup object
+        soup = BeautifulSoup(response.text, 'html.parser')
+        return soup
+
+    def clean_text(self, text):
+        # Remove punctuation
+        tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+')
+        tokenized = tokenizer.tokenize(text)
+        # Lowercase all words
+        tokenized = [word.lower() for word in tokenized]
+
+        # Remove stopwords
+        stop_words = stopwords.words('english')
+        # Filter out any tokens not containing letters
+        filtered = [word for word in tokenized if word not in stop_words and word.isalpha()]
+
+        # Lemmatize all words
+        lemmatizer = WordNetLemmatizer()
+        lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
+        
+        return " ".join(lemmatized)
+
+    def get_listing_description(self, soup):
+        # Get the description of the listing
+        description = soup.find("meta", {"name": "DC.description"})
+        description_content = description["content"]
+        return self.clean_text(description_content)
+
+    def sentiment_analysis(self, text):
+        # Create a SentimentIntensityAnalyzer object
+        sia = SentimentIntensityAnalyzer()
+        sentiment = sia.polarity_scores(text)
+        # Get the sentiment scores
+        neg, neu, pos, compound = sentiment["neg"], sentiment["neu"], sentiment["pos"], sentiment["compound"]
+
+        # Assign a rating based on the compound score
+        if compound > 0.0:
+            rating = 5 * max(pos, compound)
+        elif compound < 0.0:
+            rating = 5 * min(neg, compound)
+        else:
+            rating = 5 * neu
+        return abs(rating)
+
+
+    
\ No newline at end of file