From 6b7ff3fcd3adb1582b4ff1cca06a39febdca1f83 Mon Sep 17 00:00:00 2001
From: Bhavanvir Rai <bhavanvir.r@gmail.com>
Date: Fri, 5 May 2023 14:48:34 -0700
Subject: [PATCH] Add shipping costs to product results and listing-age price rating

---
 scraper/shopping_class.py             | 46 ++++++++++++++++++++++-----
 scraper/templates/scraper/result.html | 11 ++++---
 scraper/utils.py                      | 43 ++++++++++++++++++-------
 scraper/views.py                      | 24 ++++++--------
 4 files changed, 87 insertions(+), 37 deletions(-)

diff --git a/scraper/shopping_class.py b/scraper/shopping_class.py
index 91e5d1b..ec87841 100644
--- a/scraper/shopping_class.py
+++ b/scraper/shopping_class.py
@@ -40,7 +40,7 @@ class GoogleShoppingScraper:
 
         return description
 
-    def get_product_price(self) -> np.ndarray:
+    def get_product_price(self) -> list[float]:
         """
         Extracts the price of each product from the HTML.
 
@@ -63,6 +63,29 @@ class GoogleShoppingScraper:
         normalized = [float(price.replace(",", "")) for price in normalized]
 
         return normalized
+    
+    def get_product_shipping(self) -> list[float]:
+        """
+        Extracts the shipping cost of each product from the HTML.
+
+        Args:
+            None. The HTML is read from ``self.soup``.
+            
+        Returns:
+            The shipping cost of each product, as a list of floats, with
+            0.0 for products whose shipping is listed as "Free".
+        """
+
+        shipping = self.soup.find_all("span", {"class": "dD8iuc"})
+        
+        values = []
+        for ship in shipping:
+            values.append(ship.text)
+        
+        cleansed = [re.search(r"([0-9]+\.[0-9]+)|(Free)", ship).group(0) for ship in values]
+        cleansed = [float(ship) if ship != "Free" else 0.0 for ship in cleansed]
+
+        return cleansed
 
     def get_product_url(self) -> str:
         """
@@ -113,7 +136,7 @@ class GoogleShoppingScraper:
 
         return similarity
     
-    def remove_outliers(self, titles: list[str], prices: list[float], urls: list[str]) -> tuple[list[str], list[float], list[str]]:
+    def remove_outliers(self, titles: list[str], prices: list[float], shipping: list[float], urls: list[str]) -> tuple[list[str], list[float], list[float], list[str]]:
         """
         Removes outliers from a set of data consisting of titles, prices, and URLs.
 
@@ -130,9 +153,10 @@ class GoogleShoppingScraper:
 
         titles = [title for i, title in enumerate(titles) if i not in outlier_indices]
         prices = [price for i, price in enumerate(prices) if i not in outlier_indices]
+        shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices]
         urls = [url for i, url in enumerate(urls) if i not in outlier_indices]
 
-        return titles, prices, urls
+        return titles, prices, shipping, urls
 
     def get_product_info(self):
         """
@@ -153,15 +177,17 @@ class GoogleShoppingScraper:
 
         titles = self.get_product_title()
         prices = self.get_product_price()
+        shipping = self.get_product_shipping()
         urls = self.get_product_url()
 
-        titles, prices, urls = self.remove_outliers(titles, prices, urls)
+        titles, prices, shipping, urls = self.remove_outliers(titles, prices, shipping, urls)
 
         product_info = []
-        for title, price, url in zip(titles, prices, urls):
+        for title, price, ship, url in zip(titles, prices, shipping, urls):
             product_info.append({
                 'title': clean_text(title.text.lower()),
                 'price': price,
+                'shipping': ship,
                 'url': url
             })
 
@@ -199,7 +225,7 @@ class GoogleShoppingScraper:
 
         return min_price_item
         
-    def construct_candidates(self, descriptions, prices, urls, similarities):
+    def construct_candidates(self, descriptions, prices, shipping, urls, similarities):
         """
         Constructs a list of candidates from the descriptions, prices, and
         urls.
@@ -217,6 +243,7 @@ class GoogleShoppingScraper:
         for i in range(len(descriptions)):
             candidates[descriptions[i]] = {
                 "price": prices[i],
+                "shipping": shipping[i],
                 "url": urls[i],
                 "similarity": similarities[i]
             }
@@ -240,6 +267,7 @@ class GoogleShoppingScraper:
 
         descriptions = []
         prices = []
+        shipping = []
         urls = []
         similarities = []
 
@@ -265,14 +293,15 @@ class GoogleShoppingScraper:
                     if filtered_prices_descriptions:
                         consecutively_empty = 0
                     else:
-                        consecutively_empty +=1
+                        consecutively_empty += 1
 
             descriptions += list(filtered_prices_descriptions.keys())
             prices += [f"{product['price']:,.2f}" for product in filtered_prices_descriptions.values()]
+            shipping += [f"{product['shipping']:,.2f}" for product in filtered_prices_descriptions.values()]
             urls += [product['url'] for product in filtered_prices_descriptions.values()]
             similarities += [product['similarity'] for product in filtered_prices_descriptions.values()]
 
-        return descriptions, prices, urls, similarities
+        return descriptions, prices, shipping, urls, similarities
 
     def filter_products_by_similarity(self, product_info: list, target_title: str, similarity_threshold: float):
         """
@@ -296,6 +325,7 @@ class GoogleShoppingScraper:
                 if similarity >= similarity_threshold:
                     filtered_products[product['title']] = {
                         'price': product['price'],
+                        'shipping': product['shipping'],
                         'url': product['url'],
                         'similarity': similarity
                     }
diff --git a/scraper/templates/scraper/result.html b/scraper/templates/scraper/result.html
index 740ae48..e580d72 100644
--- a/scraper/templates/scraper/result.html
+++ b/scraper/templates/scraper/result.html
@@ -35,8 +35,13 @@
             </h4>
             <p>After running our advanced algorithms and crunching the numbers, we have <b>identified {{ best_title }}</b> as the ultimate bargaining chip for you! 
                 With a jaw-dropping <b>match percentage of {{ best_score }}%</b>, it's practically a match made in heaven with your chosen listing. And the cherry on top? 
-                It's currently listed at a steal of a price - <b>just ${{ best_price }}</b> - under the super cool <b>category of {{ best_category }} </b>. 
-                You won't find a better deal anywhere else!
+                It's currently listed at a steal of a price - <b>just ${{ best_price }}
+                    {% if best_shipping != "0.00" %} 
+                        with ${{ best_shipping }} in shipping.
+                    {% else %}
+                        with free shipping.
+                    {% endif %}
+                </b>You won't find a better deal anywhere else!
             </p>
             <hr>
             <p>{% if best_context.type == 'decrease' %}
@@ -46,8 +51,6 @@
                 {% endif %}
             </p>
         </div>
-
-        <!-- <div class="Stars" style="--rating: {{ price_rating }}" aria-label="Rating of this product is {{ price_rating }} out of 5."></div> -->
           
         <div class="card" style="margin-top: 2.5rem;">
             <div class="card-header">
diff --git a/scraper/utils.py b/scraper/utils.py
index de32c06..71b4fd8 100644
--- a/scraper/utils.py
+++ b/scraper/utils.py
@@ -90,7 +90,7 @@ def reject_outliers(data: list[float], m: float) -> list[int]:
 
     return indices.tolist()
 
-def price_difference_rating(initial: float, final: float) -> float:
+def price_difference_rating(initial: float, final: float, days: int) -> float:
     """
     The rating is based on the difference between the initial and final
     price. The rating is 0 if the final price is greater than the initial
@@ -101,16 +101,31 @@ def price_difference_rating(initial: float, final: float) -> float:
     Args:
         initial: The initial price.
         final: The final price.
+        days: The number of days a listing has been active.
 
     Returns:
         The rating.
     """
+    
+    # Decay constant (a value greater than 0)
+    decay_constant = 0.01
+
+    # Adjust this value to control the rate of increase of the penalty
+    linear_factor = 0.0125
+
+    # Threshold number of days at or after which the penalty is applied
+    threshold_days = 7
+
+    if days >= threshold_days:
+        days_past_threshold = days - threshold_days
+        penalty_amount = initial*np.exp(-decay_constant*days_past_threshold) + linear_factor*days_past_threshold*initial
+        initial += penalty_amount
 
     if initial <= final:
         rating = 5.0
     else:
         price_difference = initial - final
-        rating = 5.0 - (price_difference / initial) * 5.0
+        rating = 5.0 - (price_difference/initial)*5.0
     
     return max(0.0, min(rating, 5.0))
 
@@ -144,7 +159,7 @@ def percentage_difference(list_price: float, best_price: float) -> dict:
 
     return difference
 
-def create_chart(categorized: dict, similar_prices: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object:
+def create_chart(categorized: dict, similar_prices: list[float], similar_shipping: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object:
     """
     Creates a line chart visualization based on the categorized items, their prices, and their descriptions.
 
@@ -157,30 +172,36 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
         A JSON string containing the Plotly figure of the line chart.
     """
 
-    items, prices, descriptions = [], [], []
+    items, prices, shipping, descriptions = [], [], [], []
     unit = 1
 
     for categories, titles in categorized.items():
         items.append(categories)
 
-        sub_prices, sub_descriptions = [], []
+        sub_prices, sub_shipping, sub_descriptions = [], [], []
         for title in titles:
             idx = similar_descriptions.index(title)
-            sub_prices.append(similar_prices[idx])
 
+            sub_prices.append(similar_prices[idx])
+            sub_shipping.append(similar_shipping[idx])
             sub_descriptions.append(title)
         prices.append(sub_prices)
+        shipping.append(sub_shipping)
         descriptions.append(sub_descriptions)
     
     sort_indices = [sorted(range(len(sublist)), key=lambda x: sublist[x]) for sublist in prices]
     sorted_prices = [[sublist[i] for i in indices] for sublist, indices in zip(prices, sort_indices)]
+
+    sorted_shipping = [[sublist[i] for i in indices] for sublist, indices in zip(shipping, sort_indices)]
+    formatted_shipping = [[f"${ship}" if ship != "0.00" else "Free" for ship in row] for row in sorted_shipping]
+
     sorted_descriptions = [[sublist[i] for i in indices] for sublist, indices in zip(descriptions, sort_indices)]
 
     fig = go.Figure()
 
     for i, _ in enumerate(items):
         x = [j*unit + 1 for j in range(len(sorted_prices[i]))]
-        hovertext = [f"Product: {desc.title()}<br>Price: ${price:.2f}" for price, desc in zip(sorted_prices[i], sorted_descriptions[i])]
+        hovertext = [f"Product: {desc.title()}<br>Price: ${price:.2f}<br>Shipping: {ship}" for price, ship, desc in zip(sorted_prices[i], formatted_shipping[i], sorted_descriptions[i])]
         fig.add_trace(go.Scatter(
             x=x, y=sorted_prices[i], 
             mode='markers', 
@@ -221,7 +242,7 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
     
     return fig.to_json()
 
-def create_wordcloud(urls: list[str]) -> tuple[object, dict]:
+def create_wordcloud(urls: list[str]) -> object:
     """
     Creates a word cloud visualization based on a list of website URLs.
 
@@ -251,15 +272,15 @@ def create_wordcloud(urls: list[str]) -> tuple[object, dict]:
     fig = px.imshow(wordcloud)
     fig.update_layout(
         xaxis_title="Website URL",
-        yaxis_title="Citations (Bigger is Better)",
+        yaxis_title="Citations",
         title={
-            'text': "Word Cloud of Websites",
+            'text': "Frequently Cited Websites",
             'xanchor': 'center',
             'yanchor': 'top',
             'y': 0.9,
             'x': 0.5})
 
-    return fig.to_json(), dict(website_counts)
+    return fig.to_json()
 
 def categorize_titles(items: list[str]) -> dict:
     """
diff --git a/scraper/views.py b/scraper/views.py
index 8218aee..9afd416 100644
--- a/scraper/views.py
+++ b/scraper/views.py
@@ -48,34 +48,33 @@ class Index(View):
 
             # Find viable products based on the title
             cleaned_title = remove_illegal_characters(title)
-            similar_descriptions, similar_prices, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0)
-            candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_urls, similar_scores)
+            similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0)
+            candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores)
             
             # Convert prices to float and shorten the descriptions if necessary
             similar_prices = [float(price.replace(',', '')) for price in similar_prices]
 
             # Categorize the titles and create the chart and wordcloud
             categorized = categorize_titles(similar_descriptions)
-            chart = create_chart(categorized, similar_prices, similar_descriptions, currency, title)
-            wordcloud, website_counts = create_wordcloud(similar_urls)   
+            chart = create_chart(categorized, similar_prices, similar_shipping, similar_descriptions, currency, title)
+            wordcloud = create_wordcloud(similar_urls)   
 
             # Based on the best similar product, get the price, description, category, and URL
             best_product = shopping_instance.lowest_price_highest_similarity(candidates)
 
             idx = similar_urls.index(best_product[1]["url"])
             best_price = f"{similar_prices[idx]:,.2f}"
+            best_shipping = similar_shipping[idx]
             best_title = similar_descriptions[idx]
             best_score = best_product[1]["similarity"] * 100
-            best_category = [key for key, value in categorized.items() if [item for item in value if item == best_title]][0]
 
-            # Percetage difference between the listing price and the best found price
-            best_context = percentage_difference(float(price), float(best_price.replace(",", "")))
-            price_rating = price_difference_rating(float(price), float(best_price.replace(",", "")))
+            # Percentage difference between the listing price and the best found price (including shipping)
+            best_total = float(best_price.replace(",", "")) + float(best_shipping.replace(",", ""))
+            best_context = percentage_difference(float(price), best_total,)
+            price_rating = price_difference_rating(float(price), best_total, days)
 
             # Get the total number of items
             total_items = len(similar_descriptions)
-            max_citations = max([value for value in website_counts.values()])
-            max_website = [key for key, value in website_counts.items() if value == max_citations][0]
 
             # Create the context 
             context = {
@@ -97,12 +96,9 @@ class Index(View):
                 'categorized': categorized,
                 'total_items': total_items,
                 'best_price': best_price,
+                'best_shipping': best_shipping,
                 'best_title': best_title.title(),
                 'best_score': round(best_score, 2),
-                'website_counts': website_counts,
-                'max_website': max_website,
-                'max_citations': max_citations,
-                'best_category': best_category,
                 'best_context': best_context
             }