Merge pull request #61 from bhavanvir/main

Bunch of changes.
2026-05-19 11:14:29 -04:00 · 2023-05-05 19:45:45 -07:00
parent ca70dcfd66 3ed1230369
commit 66b030cd89
4 changed files with 118 additions and 57 deletions
--- a/scraper/shopping_class.py
+++ b/scraper/shopping_class.py
@@ -40,7 +40,7 @@ class GoogleShoppingScraper:

        return description

-    def get_product_price(self) -> np.ndarray:
+    def get_product_price(self) -> list[float]:
        """
        Extracts the price of each product from the HTML.

@@ -63,6 +63,29 @@ class GoogleShoppingScraper:
        normalized = [float(price.replace(",", "")) for price in normalized]

        return normalized
+    
+    def get_product_shipping(self) -> list[float]:
+        """
+        Extracts the shipping cost of each product from the HTML.
+
+        Args:
+            soup: The HTML to extract the shipping cost from.
+            
+        Returns:
+            The shipping cost of each product, as a list of floats in which
+            free shipping is represented as 0.0.
+        """
+
+        shipping = self.soup.find_all("span", {"class": "dD8iuc"})
+        
+        values = []
+        for ship in shipping:
+            values.append(ship.text)
+        
+        cleansed = [re.search(r"([0-9]+\.[0-9]+)|(Free)", ship).group(0) for ship in values]
+        cleansed = [float(ship) if ship != "Free" else 0.0 for ship in cleansed]
+
+        return cleansed

    def get_product_url(self) -> str:
        """
@@ -113,13 +136,14 @@ class GoogleShoppingScraper:

        return similarity
    
-    def remove_outliers(self, titles: list[str], prices: list[float], urls: list[str]) -> tuple[list[str], list[float], list[str]]:
+    def remove_outliers(self, titles: list[str], prices: list[float], shipping: list[float], urls: list[str]) -> tuple[list[str], list[float], list[float], list[str]]:
        """
        Removes outliers from a set of data consisting of titles, prices, and URLs.

        Args:
            titles (list[str]): A list of titles of the items.
            prices (list[float]): A list of prices of the items.
+            shipping (list[float]): A list of shipping costs of the items.
            urls (list[str]): A list of URLs of the items.

        Returns:
@@ -130,9 +154,10 @@ class GoogleShoppingScraper:

        titles = [title for i, title in enumerate(titles) if i not in outlier_indices]
        prices = [price for i, price in enumerate(prices) if i not in outlier_indices]
+        shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices]
        urls = [url for i, url in enumerate(urls) if i not in outlier_indices]

-        return titles, prices, urls
+        return titles, prices, shipping, urls

    def get_product_info(self):
        """
@@ -153,15 +178,17 @@ class GoogleShoppingScraper:

        titles = self.get_product_title()
        prices = self.get_product_price()
+        shipping = self.get_product_shipping()
        urls = self.get_product_url()

-        titles, prices, urls = self.remove_outliers(titles, prices, urls)
+        titles, prices, shipping, urls = self.remove_outliers(titles, prices, shipping, urls)

        product_info = []
-        for title, price, url in zip(titles, prices, urls):
+        for title, price, ship, url in zip(titles, prices, shipping, urls):
            product_info.append({
                'title': clean_text(title.text.lower()),
                'price': price,
+                'shipping': ship,
                'url': url
            })

@@ -199,7 +226,7 @@ class GoogleShoppingScraper:

        return min_price_item
        
-    def construct_candidates(self, descriptions, prices, urls, similarities):
+    def construct_candidates(self, descriptions, prices, shipping, urls, similarities):
        """
        Constructs a list of candidates from the descriptions, prices, and
        urls.
@@ -207,6 +234,7 @@ class GoogleShoppingScraper:
        Args:
            descriptions: The descriptions of the products.
            prices: The prices of the products.
+            shipping: The shipping costs of the products.
            urls: The urls of the products.

        Returns:
@@ -217,6 +245,7 @@ class GoogleShoppingScraper:
        for i in range(len(descriptions)):
            candidates[descriptions[i]] = {
                "price": prices[i],
+                "shipping": shipping[i],
                "url": urls[i],
                "similarity": similarities[i]
            }
@@ -240,6 +269,7 @@ class GoogleShoppingScraper:

        descriptions = []
        prices = []
+        shipping = []
        urls = []
        similarities = []

@@ -265,14 +295,15 @@ class GoogleShoppingScraper:
                    if filtered_prices_descriptions:
                        consecutively_empty = 0
                    else:
-                        consecutively_empty +=1
+                        consecutively_empty += 1

            descriptions += list(filtered_prices_descriptions.keys())
            prices += [f"{product['price']:,.2f}" for product in filtered_prices_descriptions.values()]
+            shipping += [f"{product['shipping']:,.2f}" for product in filtered_prices_descriptions.values()]
            urls += [product['url'] for product in filtered_prices_descriptions.values()]
            similarities += [product['similarity'] for product in filtered_prices_descriptions.values()]

-        return descriptions, prices, urls, similarities
+        return descriptions, prices, shipping, urls, similarities

    def filter_products_by_similarity(self, product_info: list, target_title: str, similarity_threshold: float):
        """
@@ -296,6 +327,7 @@ class GoogleShoppingScraper:
                if similarity >= similarity_threshold:
                    filtered_products[product['title']] = {
                        'price': product['price'],
+                        'shipping': product['shipping'],
                        'url': product['url'],
                        'similarity': similarity
                    }
--- a/scraper/templates/scraper/result.html
+++ b/scraper/templates/scraper/result.html
@@ -35,8 +35,13 @@
            </h4>
            <p>After running our advanced algorithms and crunching the numbers, we have <b>identified {{ best_title }}</b> as the ultimate bargaining chip for you! 
                With a jaw-dropping <b>match percentage of {{ best_score }}%</b>, it's practically a match made in heaven with your chosen listing. And the cherry on top? 
-                It's currently listed at a steal of a price - <b>just ${{ best_price }}</b> - under the super cool <b>category of {{ best_category }} </b>. 
-                You won't find a better deal anywhere else!
+                It's currently listed at a steal of a price - <b>just ${{ best_price }}
+                    {% if best_shipping != "0.00" %} 
+                        with ${{ best_shipping }} in shipping.
+                    {% else %}
+                        with free shipping.
+                    {% endif %}
+                </b>You won't find a better deal anywhere else!
            </p>
            <hr>
            <p>{% if best_context.type == 'decrease' %}
@@ -46,8 +51,6 @@
                {% endif %}
            </p>
        </div>
-
-        <!-- <div class="Stars" style="--rating: {{ price_rating }}" aria-label="Rating of this product is {{ price_rating }} out of 5."></div> -->
          
        <div class="card" style="margin-top: 2.5rem;">
            <div class="card-header">
--- a/scraper/utils.py
+++ b/scraper/utils.py
@@ -8,6 +8,7 @@ import plotly.graph_objects as go
 from sklearn.cluster import KMeans
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
 from wordcloud import WordCloud
 from collections import Counter

@@ -90,7 +91,7 @@ def reject_outliers(data: list[float], m: float) -> list[int]:

    return indices.tolist()

-def price_difference_rating(initial: float, final: float) -> float:
+def price_difference_rating(initial: float, final: float, days: int) -> float:
    """
    The rating is based on the difference between the initial and final
    price. The rating is 0 if the final price is greater than the initial
@@ -101,16 +102,31 @@ def price_difference_rating(initial: float, final: float) -> float:
    Args:
        initial: The initial price.
        final: The final price.
+        days: The number of days a listing has been active.

    Returns:
        The rating.
    """
+    
+    # Decay constant (a value greater than 0)
+    decay_constant = 0.01
+
+    # Adjust this value to control the rate of increase of the penalty
+    linear_factor = 0.0125
+
+    # Threshold number of days after which the penalty is applied
+    threshold_days = 7
+
+    if days >= threshold_days:
+        days_past_threshold = days - threshold_days
+        penalty_amount = initial*np.exp(-decay_constant*days_past_threshold) + linear_factor*days_past_threshold*initial
+        initial += penalty_amount

    if initial <= final:
        rating = 5.0
    else:
        price_difference = initial - final
-        rating = 5.0 - (price_difference / initial) * 5.0
+        rating = 5.0 - (price_difference/initial)*5.0
    
    return max(0.0, min(rating, 5.0))

@@ -144,43 +160,50 @@ def percentage_difference(list_price: float, best_price: float) -> dict:

    return difference

-def create_chart(categorized: dict, similar_prices: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object:
+def create_chart(categorized: dict, similar_prices: list[float], similar_shipping: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object:
    """
    Creates a line chart visualization based on the categorized items, their prices, and their descriptions.

    Args:
        categorized (dict): A dictionary where the keys are the names of the clusters and the values are lists of the items in that cluster.
        similar_prices (list[float]): A list of prices of the items.
+        similar_shipping (list[float]): A list of shipping costs of the items.
        similar_descriptions (list[str]): A list of descriptions of the items.

    Returns:
        A JSON string containing the Plotly figure of the line chart.
    """

-    items, prices, descriptions = [], [], []
+    items, prices, shipping, descriptions = [], [], [], []
    unit = 1

    for categories, titles in categorized.items():
        items.append(categories)

-        sub_prices, sub_descriptions = [], []
+        sub_prices, sub_shipping, sub_descriptions = [], [], []
        for title in titles:
            idx = similar_descriptions.index(title)
-            sub_prices.append(similar_prices[idx])

+            sub_prices.append(similar_prices[idx])
+            sub_shipping.append(similar_shipping[idx])
            sub_descriptions.append(title)
        prices.append(sub_prices)
+        shipping.append(sub_shipping)
        descriptions.append(sub_descriptions)
    
    sort_indices = [sorted(range(len(sublist)), key=lambda x: sublist[x]) for sublist in prices]
    sorted_prices = [[sublist[i] for i in indices] for sublist, indices in zip(prices, sort_indices)]
+
+    sorted_shipping = [[sublist[i] for i in indices] for sublist, indices in zip(shipping, sort_indices)]
+    formatted_shipping = [[f"${ship}" if ship != "0.00" else "Free" for ship in row] for row in sorted_shipping]
+
    sorted_descriptions = [[sublist[i] for i in indices] for sublist, indices in zip(descriptions, sort_indices)]

    fig = go.Figure()

    for i, _ in enumerate(items):
        x = [j*unit + 1 for j in range(len(sorted_prices[i]))]
-        hovertext = [f"Product: {desc.title()}<br>Price: ${price:.2f}" for price, desc in zip(sorted_prices[i], sorted_descriptions[i])]
+        hovertext = [f"Product: {desc.title()}<br>Price: ${price:.2f}<br>Shipping: {ship}" for price, ship, desc in zip(sorted_prices[i], formatted_shipping[i], sorted_descriptions[i])]
        fig.add_trace(go.Scatter(
            x=x, y=sorted_prices[i], 
            mode='markers', 
@@ -188,12 +211,17 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
            text=hovertext, 
            name=f"Category {i + 1}"))

-    # Compute the linear regression on all data points
+    # Compute the polynomial regression on all data points
    x = np.concatenate([np.arange(len(prices))*unit + 1 for prices in sorted_prices])
    y = np.concatenate(sorted_prices)
-    reg = LinearRegression().fit(x.reshape(-1, 1), y)
-    x_reg = [np.min(x), np.max(x)*1.5]
-    y_reg = reg.predict(np.array(x_reg).reshape(-1, 1))
+
+    poly_features = PolynomialFeatures(degree=4, include_bias=True)
+    x_poly = poly_features.fit_transform(x.reshape(-1, 1))
+
+    reg = LinearRegression().fit(x_poly, y)
+    x_reg = np.linspace(np.min(x), np.max(x), num=100)
+    x_reg_poly = poly_features.fit_transform(x_reg.reshape(-1, 1))
+    y_reg = reg.predict(x_reg_poly)

    # Add the trend line to the plot
    fig.add_trace(
@@ -202,14 +230,15 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
            mode='lines', 
            name='Trend Line'))

-    # Add prediction annotation for the trend line
-    prediction = reg.predict([[10]])[0]
-    fig.add_annotation(x=10, y=prediction, text=f"Expected Price: ${prediction:.2f} {listing_currency}", showarrow=True)
-        
+    # Add annotations to all x values
+    for x_val in x:
+        y_val = reg.predict(poly_features.transform([[x_val]]))[0]
+        fig.add_annotation(x=x_val, y=y_val, text=f"Prediction: ${y_val:.2f}", showarrow=True)
+            
    fig.update_layout(
        template='plotly_white', 
        hovermode='closest', 
-        xaxis_title="Product", 
+        xaxis_title="Product Number", 
        yaxis_title=f"Price $({listing_currency})", 
        legend_title="Categories", 
        title={
@@ -221,7 +250,7 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
    
    return fig.to_json()

-def create_wordcloud(urls: list[str]) -> tuple[object, dict]:
+def create_wordcloud(urls: list[str]) -> object:
    """
    Creates a word cloud visualization based on a list of website URLs.

@@ -251,15 +280,15 @@ def create_wordcloud(urls: list[str]) -> tuple[object, dict]:
    fig = px.imshow(wordcloud)
    fig.update_layout(
        xaxis_title="Website URL",
-        yaxis_title="Citations (Bigger is Better)",
+        yaxis_title="Citations",
        title={
-            'text': "Word Cloud of Websites",
+            'text': "Frequently Cited Websites",
            'xanchor': 'center',
            'yanchor': 'top',
            'y': 0.9,
            'x': 0.5})

-    return fig.to_json(), dict(website_counts)
+    return fig.to_json()

 def categorize_titles(items: list[str]) -> dict:
    """
@@ -275,20 +304,21 @@ def categorize_titles(items: list[str]) -> dict:
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(items)

-    num_clusters = len(items) // 5
-    kmeans = KMeans(n_clusters=num_clusters, n_init=10)
+    wcss = []
+    for i in range(1, len(items)//2):
+        kmeans = KMeans(n_clusters=i, n_init=10)
+        kmeans.fit(X)
+        wcss.append(kmeans.inertia_)
+
+    # NOTE(review): meant to pick the elbow point, but this uses the smallest WCSS value (truncated to int) as the cluster count
+    optimal_num_clusters = int(min(wcss))
+    kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10)
    kmeans.fit(X)

-    cluster_names = []
-    for i in range(num_clusters):
-        cluster_items = [items[j] for j in range(len(items)) if kmeans.labels_[j] == i]
-        representative_item = f"{i+1}"
-        cluster_names.append(representative_item)
-
    clusters = {}
-    for i in range(num_clusters):
+    for i in range(optimal_num_clusters):
        cluster_items = [items[j] for j in range(len(items)) if kmeans.labels_[j] == i]
-        cluster_name = cluster_names[i]
-        clusters[cluster_name] = cluster_items
+        representative_item = f"{i + 1}"
+        clusters[representative_item] = cluster_items

    return clusters
--- a/scraper/views.py
+++ b/scraper/views.py
@@ -48,34 +48,33 @@ class Index(View):

            # Find viable products based on the title
            cleaned_title = remove_illegal_characters(title)
-            similar_descriptions, similar_prices, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0)
-            candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_urls, similar_scores)
+            similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0)
+            candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores)
            
            # Convert prices to float and shorten the descriptions if necessary
            similar_prices = [float(price.replace(',', '')) for price in similar_prices]

            # Categorize the titles and create the chart and wordcloud
            categorized = categorize_titles(similar_descriptions)
-            chart = create_chart(categorized, similar_prices, similar_descriptions, currency, title)
-            wordcloud, website_counts = create_wordcloud(similar_urls)   
+            chart = create_chart(categorized, similar_prices, similar_shipping, similar_descriptions, currency, title)
+            wordcloud = create_wordcloud(similar_urls)   

            # Based on the best similar product, get the price, description, category, and URL
            best_product = shopping_instance.lowest_price_highest_similarity(candidates)

            idx = similar_urls.index(best_product[1]["url"])
            best_price = f"{similar_prices[idx]:,.2f}"
+            best_shipping = similar_shipping[idx]
            best_title = similar_descriptions[idx]
            best_score = best_product[1]["similarity"] * 100
-            best_category = [key for key, value in categorized.items() if [item for item in value if item == best_title]][0]

-            # Percetage difference between the listing price and the best found price
-            best_context = percentage_difference(float(price), float(best_price.replace(",", "")))
-            price_rating = price_difference_rating(float(price), float(best_price.replace(",", "")))
+            # Percentage difference between the listing price and the best found price (including shipping)
+            best_total = float(best_price.replace(",", "")) + float(best_shipping.replace(",", ""))
+            best_context = percentage_difference(float(price), best_total,)
+            price_rating = price_difference_rating(float(price), best_total, days)

            # Get the total number of items
            total_items = len(similar_descriptions)
-            max_citations = max([value for value in website_counts.values()])
-            max_website = [key for key, value in website_counts.items() if value == max_citations][0]

            # Create the context 
            context = {
@@ -97,12 +96,9 @@ class Index(View):
                'categorized': categorized,
                'total_items': total_items,
                'best_price': best_price,
+                'best_shipping': best_shipping,
                'best_title': best_title.title(),
                'best_score': round(best_score, 2),
-                'website_counts': website_counts,
-                'max_website': max_website,
-                'max_citations': max_citations,
-                'best_category': best_category,
                'best_context': best_context
            }