From fd7dc8a783ef583f78ed4073efd40af90a923872 Mon Sep 17 00:00:00 2001
From: Bhavanvir Rai
- We found that the best deal occured in the {{ best_similar_category }} category - with a price of ${{ best_similar_price }} for the {{ best_similar_description }} product, + Looks like we've found a winner! Our fancy-schmancy algorithm suggests that the {{ best_similar_description }} product, located in the {{ best_similar_category }} category, is the closest match to your list item, with a mind-blowing similarity score of {{ best_similar_score }}% and the sweetest price of ${{ best_similar_price}}. {% if list_best_context.type == "decrease" %} - which is a {{ list_best_context.type }} of {{ list_best_context.amount }}% + Talk about a bargain! This baby is {{ list_best_context.amount }}% cheaper than your original pick! {% else %} - which is an {{ list_best_context.type }} of {{ list_best_context.amount }}% + Oops, looks like this one's {{ list_best_context.amount }}% more expensive than your original choice. {% endif %} - from the listed item! + Anyway, we won't tell your list item that you've found a new love 😉
diff --git a/scraper/utils.py b/scraper/utils.py index 9289301..325ae2a 100644 --- a/scraper/utils.py +++ b/scraper/utils.py @@ -152,7 +152,34 @@ def price_difference_rating(initial: float, final: float) -> float: return rating -def find_viable_product(title: str, ramp_down: float) -> tuple[float, float, float]: +def lowest_price_highest_similarity(filtered_prices_descriptions: dict) -> tuple[float, str, float]: + """ + Finds the entry with the highest similarity (lowest price on ties) among + the filtered prices and descriptions. + + Args: + filtered_prices_descriptions: The filtered prices and descriptions. + + Returns: + A (description, info) tuple for the entry with the highest similarity, + ties broken by the lowest price, or None if no entry qualifies. + """ + max_similarity = 0 + min_price = float('inf') + result = None + + for item, info in filtered_prices_descriptions.items(): + similarity = info['similarity'] + price = info['price'] + if similarity > max_similarity or (similarity == max_similarity and price < min_price): + max_similarity = similarity + min_price = price + result = (item, info) + + return result + + +def find_viable_product(title: str, ramp_down: float) -> tuple[list, list, list]: """ Finds viable products based on the title of the Marketplace listing, and utilizes the ramp down of the previous product in the sequence, to @@ -171,26 +198,34 @@ def find_viable_product(title: str, ramp_down: float) -> tuple[float, float, flo "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582" } - url = 
f"https://www.google.com/search?q={cleaned_title}&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc" - soup = create_soup(url, headers) - similarity_threshold = 0.25 + descriptions = [] + prices = [] + urls = [] - try: - filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold) - assert len(filtered_prices_descriptions) > 0 - except AssertionError: - while len(filtered_prices_descriptions) == 0: - ramp_down += 0.05 - filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold - ramp_down) + for page_number in range(3): + start = page_number * 60 + url = f"https://www.google.com/search?q={cleaned_title}&tbs=vw:d&tbm=shop&sxsrf=APwXEdeCneQw6hWKHlHMJptjJHcIzqvmvw:1682209446957&ei=pnpEZILiOcmD0PEPifacgAw&start={start}&sa=N&ved=0ahUKEwiCzZfE3r7-AhXJATQIHQk7B8AQ8tMDCLEY&biw=1920&bih=927&dpr=1" + soup = create_soup(url, headers) + similarity_threshold = 0.25 - descriptions = list(filtered_prices_descriptions.keys()) + try: + filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold) + assert len(filtered_prices_descriptions) > 0 + except AssertionError: + while len(filtered_prices_descriptions) == 0: + ramp_down += 0.05 + filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold - ramp_down) - prices = list(filtered_prices_descriptions.values()) - prices = [f"{price['price']:,.2f}" for price in prices] + descriptions += list(filtered_prices_descriptions.keys()) + + prices += [f"{price['price']:,.2f}" for price in 
filtered_prices_descriptions.values()] + + urls += [price['url'] for price in filtered_prices_descriptions.values()] + + best_result = lowest_price_highest_similarity(filtered_prices_descriptions) + + return descriptions, prices, urls, best_result - urls = [price['url'] for price in filtered_prices_descriptions.values()] - - return descriptions, prices, urls def listing_product_similarity(soup: BeautifulSoup, title: str, similarity_threshold: float) -> dict: """ @@ -217,7 +252,7 @@ def listing_product_similarity(soup: BeautifulSoup, title: str, similarity_thres filtered_prices_descriptions = {} for key, value in price_description.items(): if value['similarity'] >= similarity_threshold: - filtered_prices_descriptions[key] = {'price': value['price'], 'url': value['url']} + filtered_prices_descriptions[key] = {'price': value['price'], 'url': value['url'], 'similarity': value['similarity']} return filtered_prices_descriptions diff --git a/scraper/views.py b/scraper/views.py index 506b792..803125b 100644 --- a/scraper/views.py +++ b/scraper/views.py @@ -35,24 +35,27 @@ class Index(View): price = scraper_instance.get_listing_price() city = scraper_instance.get_listing_city() - similar_descriptions, similar_prices, similar_urls = find_viable_product(title, ramp_down=0.0) + similar_descriptions, similar_prices, similar_urls, best_similar_product = find_viable_product(title, ramp_down=0.0) similar_prices = [float(price.replace(',', '')) for price in similar_prices] - shortened_item_names = [description[:10] + '...' if len(description) > 10 else description for description in similar_descriptions] + shortened_item_names = [description[:8] + '...' 
if len(description) > 10 else description for description in similar_descriptions] + + # Based on the best similar product, get the price, description, category, and URL + idx = similar_urls.index(best_similar_product[1]["url"]) + best_similar_price = f"{similar_prices[idx]:,.2f}" + best_similar_description = similar_descriptions[idx] + best_similar_category = shortened_item_names[idx] + best_similar_url = similar_urls[idx] + best_similar_score = best_similar_product[1]["similarity"] * 100 # Create a DataFrame from the data data = {'Product': shortened_item_names, 'Price': similar_prices, 'Description': similar_descriptions, 'URL': similar_urls} df = pd.DataFrame(data) + # Used to determine colour range bounds cmin = min(similar_prices) cmax = max(similar_prices) - idx = similar_prices.index(cmin) - best_similar_price = f"{similar_prices[idx]:,.2f}" - best_similar_description = similar_descriptions[idx] - best_similar_category = shortened_item_names[idx] - best_similar_url = similar_urls[idx] - - # Ratio + # Ratio to limit the total bubble size desired_diameter = 150 sizeref = cmax / desired_diameter @@ -65,7 +68,7 @@ class Index(View): list_best_context = percentage_difference(float(price), float(best_similar_price)) # Needs to be redone - price_rating = price_difference_rating(float(price), float(cmin)) + price_rating = price_difference_rating(float(price), float(best_similar_price)) categories = list(set(shortened_item_names)) @@ -88,7 +91,8 @@ class Index(View): 'best_similar_description': best_similar_description, 'best_similar_category': best_similar_category, 'best_similar_url': best_similar_url, - 'list_best_context': list_best_context, + 'best_similar_score': f"{best_similar_score:.2f}", + 'list_best_context': list_best_context } return render(request, 'scraper/result.html', context)