From ee8f081e2ef750230a36b755d57e13e9fe759012 Mon Sep 17 00:00:00 2001
From: Bhavanvir Rai <bhavanvir.r@gmail.com>
Date: Sat, 18 Mar 2023 02:07:06 -0700
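Subject: [PATCH] Fixed issues with day/hour calculations.

get_listing_date now falls back to returning 0 days and the first number in
the timestamp as the hour count when the leading word is not a month name,
and uses a four-digit year from the timestamp when one is present instead of
always assuming the current year.

Also lowers the Google Shopping similarity threshold from 0.45 to 0.25, makes
listing_product_similarity return a description-to-price mapping instead of a
list of prices, removes inline comments in scraper/views.py, and re-indents
the figure/table markup in result.html (dropping the "center" class from the
listing image).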
---
 scraper/templates/scraper/result.html | 58 +++++++++---------
 scraper/views.py                      | 88 ++++++++-------------------
 2 files changed, 54 insertions(+), 92 deletions(-)
diff --git a/scraper/templates/scraper/result.html b/scraper/templates/scraper/result.html
index 55cd0cc..38c4339 100644
--- a/scraper/templates/scraper/result.html
+++ b/scraper/templates/scraper/result.html
@@ -9,35 +9,35 @@
             <div class="row">
                 <div class="col-md-6 mx-auto">
                     <figure class="figure">
-                        <img src=" {{ image }}" class="figure-img img-fluid rounded center" style="object-fit: fill;">
-                        <figcaption class="figure-caption"><code>{{ title }}</code> was listed <code>{{ days }} days</code> and <code>{{ hours }} hours</code> ago, for <code>${{ list_price}}</code>.</figcaption>
-                      </figure>
-                      <div class="table-responsive">
-                        <table class="table table-striped">
-                          <tbody>
-                            <tr>
-                              <td>Range:</td>
-                              <td>${{ lower_bound }} - ${{ upper_bound }}</td>
-                            </tr>
-                            <tr>
-                              <td>Median:</td>
-                              <td>${{ median }}</td>
-                            </tr>
-                            <tr>
-                              <td>Description:</td>
-                              <td>{{ sentiment_rating }}/5.0</td>
-                            </tr>
-                            <tr>
-                              <td>Price:</td>
-                              <td>{{ price_rating }}/5.0</td>
-                            </tr>
-                            <tr>
-                              <td>Overall:</td>
-                              <td>{{ average_rating }}/5.0</td>
-                            </tr>
-                          </tbody>
-                        </table>
-                      </div>
+                      <img src=" {{ image }}" class="figure-img img-fluid rounded" style="object-fit: fill;">
+                      <figcaption class="figure-caption"><code>{{ title }}</code> was listed <code>{{ days }} days</code> and <code>{{ hours }} hours</code> ago, for <code>${{ list_price}}</code>.</figcaption>
+                    </figure>
+                    <div class="table-responsive">
+                      <table class="table table-striped">
+                        <tbody>
+                          <tr>
+                            <td>Range:</td>
+                            <td>${{ lower_bound }} - ${{ upper_bound }}</td>
+                          </tr>
+                          <tr>
+                            <td>Median:</td>
+                            <td>${{ median }}</td>
+                          </tr>
+                          <tr>
+                            <td>Description:</td>
+                            <td>{{ sentiment_rating }}/5.0</td>
+                          </tr>
+                          <tr>
+                            <td>Price:</td>
+                            <td>{{ price_rating }}/5.0</td>
+                          </tr>
+                          <tr>
+                            <td>Overall:</td>
+                            <td>{{ average_rating }}/5.0</td>
+                          </tr>
+                        </tbody>
+                      </table>
+                    </div>
                 </div>
             </div>
         </div>
diff --git a/scraper/views.py b/scraper/views.py
index 7e41cd8..f941b63 100644
--- a/scraper/views.py
+++ b/scraper/views.py
@@ -26,33 +26,24 @@ class Index(View):
         if form.is_valid():
             url = form.cleaned_data['url']
 
-        # Shorten the URL listing to the title of the listing
         shortened_url = re.search(r".*[0-9]", url).group(0)
-        # Use the shortened URL and convert it to mobile, to get the price of the listing
         mobile_url = shortened_url.replace("www", "m")
-        # Find the ID of the product
         market_id = (re.search(r"\/item\/([0-9]*)", url)).group(1)
 
-        # Get the image of the listing
         image = self.get_listing_image(self.create_soup(mobile_url, headers=None))
 
-        # Get the number of days and hours the listing has been active
         days, hours = self.get_listing_date(self.create_soup(mobile_url, headers=None))
         
-        # Get the sentiment rating of the listing
         sentiment_rating = self.sentiment_analysis(self.get_listing_description(self.create_soup(url, headers=None)))
 
-        # Get the title of the listing
         title = self.get_listing_title(self.create_soup(url, headers=None))
         
-        # Get the minimum, maximum, and median prices of the viable products found on Google Shopping
         list_price = self.get_listing_price(self.create_soup(mobile_url, headers=None))
         list_price = re.sub("[\$,]", "", list_price)
         initial_price = int(re.sub("[\$,]", "", list_price))
 
         lower_bound, upper_bound, median = self.find_viable_product(title, ramp_down=0.0)
 
-        # Calculate the price difference between the listing and the median price of the viable products, and generate ratings
         price_rating = self.price_difference_rating(initial_price, median)
         average_rating = statistics.mean([sentiment_rating, price_rating])
 
@@ -77,11 +68,9 @@ class Index(View):
         return render(request, 'scraper/result.html', context)
 
     def price_difference_rating(self, initial, final):
-        # If the listing price is less than or equal to the median price found online, set the rating to 5
         if initial <= final:
             rating = 5.0
         else:
-            # If the listing price is greater than the median price found online, calculate the difference
             difference = min(initial, final) / max(initial, final)
             rating = (difference / 20) * 100
 
@@ -96,55 +85,44 @@ class Index(View):
         url = "https://www.google.com/search?q=" + title + "&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc"
 
         soup = self.create_soup(url, headers)
-        # Set the similarity threshold to a initial value, and decrease it when no products are found
-        similarity_threshold = 0.45
+        similarity_threshold = 0.25
 
         try:
-            prices = self.listing_product_similarity(soup, title, similarity_threshold)
-            # The length of the list of prices should be greater than 0 if there are viable products
+            filtered_prices_descriptions = self.listing_product_similarity(soup, title, similarity_threshold)
+            prices = list(filtered_prices_descriptions.values())
             assert len(prices) > 0
         except AssertionError:
-            print("Error: no viable products found, now searching for more general products...")
             while len(prices) == 0:
-                # If no viable products are found, the search is further generalized by 5%, until a reasonable number of products are found
                 ramp_down += 0.05
-                prices = self.listing_product_similarity(soup, title, similarity_threshold - ramp_down)
-        
-        # Get the median price of the viable products
+                filtered_prices_descriptions = self.listing_product_similarity(soup, title, similarity_threshold - ramp_down)
+                prices = list(filtered_prices_descriptions.values())
+
         median = statistics.median_grouped(prices)
         
         return min(prices), max(prices), median
 
     def clean_title_description(self, title):
-        # Remove punctuation
         cleaned = re.sub(r"[^A-Za-z0-9\s]+", " ", title)
-        # Remove extra spaces
         cleaned = re.sub(r"\s+", " ", cleaned)
 
         return cleaned
 
     def listing_product_similarity(self, soup, title, similarity_threshold):
-        # Get the median price of the product
         normalized = self.get_product_price(soup)
-        # Get the product description
         description = self.get_product_description(soup)
 
         price_description = {}
-        # Iterate through the product descriptions
         for key, value in zip(description, normalized):
             google_shopping_title = self.clean_title_description(key.text.lower())
             listing_title = self.clean_title_description(title.lower())
-            # Get the similarity between the listing title and the product description on Google Shopping
             price_description[key.text] = [value, SequenceMatcher(None, google_shopping_title, listing_title).ratio()]
 
-        prices = []
-        # Iterate through the product descriptions and their similarity scores
+        filtered_prices_descriptions = {}
         for key, value in price_description.items():
-            # If the similarity score is greater than the similarity threshold, add the price to the list of prices
             if value[1] >= similarity_threshold:
-                prices.append(value[0])
-        
-        return prices
+                filtered_prices_descriptions[key] = value[0]
+
+        return filtered_prices_descriptions
 
     def get_product_description(self, soup):
         # Get the description of the product
@@ -161,57 +139,44 @@ class Index(View):
 
 
     def get_product_price(self, soup):
-        # Get the price of the product
         prices = soup.find_all("span", {"class": "HRLxBb"})
 
-        # Extract the price from the span
         values = []
         for price in prices:
             values.append(price.text)
 
-        # Remove the dollar sign from the price
         normalized = [re.sub("\$", "", price) for price in values]
-        # Convert the price to a float
         normalized = [re.search(r"[0-9,.]*", price).group(0) for price in normalized]
-        # Remove the commas from the price
         normalized = [float(price.replace(",", "")) for price in normalized]
         
-        # Remove statistical outliers as to not skew the median price
         outlierless = self.reject_outliers(np.array(normalized))
 
         return outlierless
 
     def clean_listing_title(self, title):
-        # Certain symbols are not allowed in the search query for Google Shopping, so they must be removed
         title = re.sub(r"#", "%2", title)
         title = re.sub(r"&", "%26", title)
 
         return title
 
     def get_listing_price(self, soup):
-        # Get the price of the listing
         spans = soup.find_all("span")
 
-        # Check if the listing is free
         free = [span.text for span in spans if "free" in span.text.lower()]
         if (free):
             return free
 
-        # Find the span that contains the price of the listing and extract the price
         price = [str(span.text) for span in spans if "$" in span.text][0]
 
         return price
     
     def get_listing_image(self, soup):
-        # Get the image of the listing
         images = soup.find_all("img")
-        # Find the image that is the listing image
         image = [image["src"] for image in images if "https://scontent" in image["src"]]
 
         return image
 
     def get_listing_title(self, soup):
-        # Get the title of the listing
         title = soup.find("meta", {"name": "DC.title"})
         title_content = title["content"]
         return title_content
@@ -220,19 +185,26 @@ class Index(View):
         tag = soup.find('abbr')
         tag = tag.text.strip()
 
-        month_str = re.search(r"[a-zA-Z]+", tag).group(0)
-        month_num = datetime.datetime.strptime(month_str, '%B').month
+        try:
+            month_str = re.search(r"[a-zA-Z]+", tag).group(0)
+            month_num = datetime.datetime.strptime(month_str, '%B').month
+        except ValueError:
+            hour_str = re.search(r"[0-9]+", tag).group(0)
+            return 0, hour_str
+
+        try:
+            year_str = re.search(r"[0-9]{4}", tag).group(0)
+        except AttributeError:
+            year_str = datetime.datetime.now().year
 
         date_str = re.search(r"[0-9]+", tag).group(0)
-        year_str = datetime.datetime.now().year
-
         time_str = re.search(r"[0-9]+:[0-9]+", tag).group(0)
         am_pm = re.search(r"[A-Z]{2}", tag).group(0)
+
         formated_time = f'{time_str}:00 {am_pm}'
+        formated_date = f'{year_str}-{month_num}-{date_str}'
 
-        date_str = f'{year_str}-{month_num}-{date_str}'
-
-        dt_str = f'{date_str} {formated_time}'
+        dt_str = f'{formated_date} {formated_time}'
         dt = datetime.datetime.strptime(dt_str, '%Y-%m-%d %I:%M:%S %p')
 
         now = datetime.datetime.now()
@@ -243,47 +215,37 @@ class Index(View):
 
         return days, hours
 
+
     def create_soup(self, url, headers):
-        # Create a request object 
         response = requests.get(url, headers=headers)
-        # Create a BeautifulSoup object
         soup = BeautifulSoup(response.text, 'html.parser')
 
         return soup
 
     def clean_text(self, text):
-        # Remove punctuation
         tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+')
         tokenized = tokenizer.tokenize(text)
-        # Lowercase all words
         tokenized = [word.lower() for word in tokenized]
 
-        # Remove stopwords
         stop_words = stopwords.words('english')
-        # Filter out any tokens not containing letters
         filtered = [word for word in tokenized if word not in stop_words and word.isalpha()]
 
-        # Lemmatize all words
         lemmatizer = WordNetLemmatizer()
         lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
         
         return " ".join(lemmatized)
 
     def get_listing_description(self, soup):
-        # Get the description of the listing
         description = soup.find("meta", {"name": "DC.description"})
         description_content = description["content"]
 
         return self.clean_text(description_content)
 
     def sentiment_analysis(self, text):
-        # Create a SentimentIntensityAnalyzer object
         sia = SentimentIntensityAnalyzer()
         sentiment = sia.polarity_scores(text)
-        # Get the sentiment scores
         neg, neu, pos, compound = sentiment["neg"], sentiment["neu"], sentiment["pos"], sentiment["compound"]
 
-        # Assign a rating based on the compound score
         if compound > 0.0:
             rating = 5 * max(pos, compound)
         elif compound < 0.0:

Range:	${{ lower_bound }} - ${{ upper_bound }}
Median:	${{ median }}
Description:	{{ sentiment_rating }}/5.0
Price:	{{ price_rating }}/5.0
Overall:	{{ average_rating }}/5.0