Merge pull request #42 from bhavanvir/main

Now scraping more pages in Google Shopping + updated Results page, ag…
2026-05-19 11:14:29 -04:00 · 2023-04-22 22:31:10 -07:00
parent d5910797e0 fd7dc8a783
commit 17da2bb14e
3 changed files with 72 additions and 34 deletions
--- a/scraper/templates/scraper/result.html
+++ b/scraper/templates/scraper/result.html
@@ -60,14 +60,13 @@
                </p>
                <div id="chart" data-chart="{{ chart }}"></div>
                <p>
-                    We found that the best deal occured in the <span style="color:#28a745">{{ best_similar_category }}</span> category 
-                    with a price of <span style="color:#28a745">${{ best_similar_price }}</span> for the <a style="color:#28a745" href="{{ best_similar_url }}" target="_blank">{{ best_similar_description }}</a> product, 
+                    Looks like we've found a winner! Our fancy-schmancy algorithm suggests that the <a style="color:#28a745" href="{{ best_similar_url }}" target="_blank">{{ best_similar_description }}</a> product, located in the <span style="color:#28a745">{{ best_similar_category }}</span> category, is the closest match to your list item, with a mind-blowing similarity score of <span style="color:#28a745">{{ best_similar_score }}%</span> and the sweetest price of <span style="color:#28a745">${{ best_similar_price}}</span>.
                    {% if list_best_context.type == "decrease" %}
-                        which is a <span style="color:#28a745">{{ list_best_context.type }}</span> of <span style="color:#28a745">{{ list_best_context.amount }}%</span>
+                        Talk about a bargain! This baby is <span style="color:#28a745">{{ list_best_context.amount }}%</span> cheaper than your original pick!
                    {% else %}
-                        which is an <span style="color:#dc3545">{{ list_best_context.type }}</span> of <span style="color:#dc3545">{{ list_best_context.amount }}%</span>
+                        Oops, looks like this one's <span style="color:#dc3545">{{ list_best_context.amount }}%</span> more expensive than your original choice.
                    {% endif %}
-                    from the listed item!
+                    Anyway, we won't tell your list item that you've found a new love 😉
                </p>
            </div>
        </div>
--- a/scraper/utils.py
+++ b/scraper/utils.py
@@ -152,7 +152,34 @@ def price_difference_rating(initial: float, final: float) -> float:

    return rating

-def find_viable_product(title: str, ramp_down: float) -> tuple[float, float, float]:
+def lowest_price_highest_similarity(filtered_prices_descriptions: dict) -> tuple[float, str, float]:
+    """
+    Picks the entry with the highest similarity, breaking ties between
+    equal similarities by taking the lowest price.
+
+    Args:
+        filtered_prices_descriptions: Maps each item to an info dict that has 'similarity' and 'price' keys.
+
+    Returns:
+        The winning (item, info) pair, or None when no entry is selected
+        (e.g. empty input). NOTE(review): the return annotation disagrees.
+    """
+    max_similarity = 0
+    min_price = float('inf')
+    result = None
+
+    for item, info in filtered_prices_descriptions.items():
+        similarity = info['similarity']
+        price = info['price']
+        if similarity > max_similarity or (similarity == max_similarity and price < min_price):
+            max_similarity = similarity
+            min_price = price
+            result = (item, info)
+
+    return result
+
+
+def find_viable_product(title: str, ramp_down: float) -> tuple[list, list, list]:
    """
    Finds viable products based on the title of the Marketplace listing,
    and utilizes the ramp down of the previous product in the sequence, to 
@@ -171,26 +198,34 @@ def find_viable_product(title: str, ramp_down: float) -> tuple[float, float, flo
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }

-    url = f"https://www.google.com/search?q={cleaned_title}&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc"
-    soup = create_soup(url, headers)
-    similarity_threshold = 0.25
+    descriptions = []
+    prices = []
+    urls = []

-    try:
-        filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold)
-        assert len(filtered_prices_descriptions) > 0
-    except AssertionError:
-        while len(filtered_prices_descriptions) == 0:
-            ramp_down += 0.05
-            filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold - ramp_down)
+    for page_number in range(3):
+        start = page_number * 60
+        url = f"https://www.google.com/search?q={cleaned_title}&tbs=vw:d&tbm=shop&sxsrf=APwXEdeCneQw6hWKHlHMJptjJHcIzqvmvw:1682209446957&ei=pnpEZILiOcmD0PEPifacgAw&start={start}&sa=N&ved=0ahUKEwiCzZfE3r7-AhXJATQIHQk7B8AQ8tMDCLEY&biw=1920&bih=927&dpr=1"
+        soup = create_soup(url, headers)
+        similarity_threshold = 0.25

-    descriptions = list(filtered_prices_descriptions.keys())
+        try:
+            filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold)
+            assert len(filtered_prices_descriptions) > 0
+        except AssertionError:
+            while len(filtered_prices_descriptions) == 0:
+                ramp_down += 0.05
+                filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold - ramp_down)

-    prices = list(filtered_prices_descriptions.values())
-    prices = [f"{price['price']:,.2f}" for price in prices]
+        descriptions += list(filtered_prices_descriptions.keys())
+
+        prices += [f"{price['price']:,.2f}" for price in filtered_prices_descriptions.values()]
+
+        urls += [price['url'] for price in filtered_prices_descriptions.values()]
+
+    best_result = lowest_price_highest_similarity(filtered_prices_descriptions)
+
+    return descriptions, prices, urls, best_result

-    urls = [price['url'] for price in filtered_prices_descriptions.values()]
-    
-    return descriptions, prices, urls

 def listing_product_similarity(soup: BeautifulSoup, title: str, similarity_threshold: float) -> dict:
    """
@@ -217,7 +252,7 @@ def listing_product_similarity(soup: BeautifulSoup, title: str, similarity_thres
    filtered_prices_descriptions = {}
    for key, value in price_description.items():
        if value['similarity'] >= similarity_threshold:
-            filtered_prices_descriptions[key] = {'price': value['price'], 'url': value['url']}
+            filtered_prices_descriptions[key] = {'price': value['price'], 'url': value['url'], 'similarity': value['similarity']}

    return filtered_prices_descriptions

--- a/scraper/views.py
+++ b/scraper/views.py
@@ -35,24 +35,27 @@ class Index(View):
            price = scraper_instance.get_listing_price()
            city = scraper_instance.get_listing_city()

-            similar_descriptions, similar_prices, similar_urls = find_viable_product(title, ramp_down=0.0)
+            similar_descriptions, similar_prices, similar_urls, best_similar_product = find_viable_product(title, ramp_down=0.0)
            similar_prices = [float(price.replace(',', '')) for price in similar_prices]
-            shortened_item_names = [description[:10] + '...' if len(description) > 10 else description for description in similar_descriptions]
+            shortened_item_names = [description[:8] + '...' if len(description) > 10 else description for description in similar_descriptions]
+
+            # Based on the best similar product, get the price, description, category, and URL
+            idx = similar_urls.index(best_similar_product[1]["url"])
+            best_similar_price = f"{similar_prices[idx]:,.2f}"
+            best_similar_description = similar_descriptions[idx]
+            best_similar_category = shortened_item_names[idx]
+            best_similar_url = similar_urls[idx]
+            best_similar_score = best_similar_product[1]["similarity"] * 100

            # Create a DataFrame from the data
            data = {'Product': shortened_item_names, 'Price': similar_prices, 'Description': similar_descriptions, 'URL': similar_urls}
            df = pd.DataFrame(data)

+            # Used to determine colour range bounds
            cmin = min(similar_prices)
            cmax = max(similar_prices)

-            idx = similar_prices.index(cmin)
-            best_similar_price = f"{similar_prices[idx]:,.2f}"
-            best_similar_description = similar_descriptions[idx]
-            best_similar_category = shortened_item_names[idx]
-            best_similar_url = similar_urls[idx]
-
-            # Ratio 
+            # Ratio to limit the total bubble size
            desired_diameter = 150
            sizeref = cmax / desired_diameter

@@ -65,7 +68,7 @@ class Index(View):
            list_best_context = percentage_difference(float(price), float(best_similar_price))

            # Needs to be redone
-            price_rating = price_difference_rating(float(price), float(cmin))
+            price_rating = price_difference_rating(float(price), float(best_similar_price))

            categories = list(set(shortened_item_names))

@@ -88,7 +91,8 @@ class Index(View):
                'best_similar_description': best_similar_description,
                'best_similar_category': best_similar_category,
                'best_similar_url': best_similar_url,
-                'list_best_context': list_best_context,
+                'best_similar_score': f"{best_similar_score:.2f}",
+                'list_best_context': list_best_context
            }

            return render(request, 'scraper/result.html', context)