From 6b7ff3fcd3adb1582b4ff1cca06a39febdca1f83 Mon Sep 17 00:00:00 2001 From: Bhavanvir Rai Date: Fri, 5 May 2023 14:48:34 -0700 Subject: [PATCH] Bunch of changes. --- scraper/shopping_class.py | 46 ++++++++++++++++++++++----- scraper/templates/scraper/result.html | 11 ++++--- scraper/utils.py | 43 ++++++++++++++++++------- scraper/views.py | 24 ++++++-------- 4 files changed, 87 insertions(+), 37 deletions(-) diff --git a/scraper/shopping_class.py b/scraper/shopping_class.py index 91e5d1b..ec87841 100644 --- a/scraper/shopping_class.py +++ b/scraper/shopping_class.py @@ -40,7 +40,7 @@ class GoogleShoppingScraper: return description - def get_product_price(self) -> np.ndarray: + def get_product_price(self) -> list[float]: """ Extracts the price of each product from the HTML. @@ -63,6 +63,29 @@ class GoogleShoppingScraper: normalized = [float(price.replace(",", "")) for price in normalized] return normalized + + def get_product_shipping(self) -> list[float]: + """ + Extracts the shipping cost of each product from the HTML. + + Args: + soup: The HTML to extract the shipping cost from. + + Returns: + The shipping cost of each product. The shipping cost is represented as a + list of floats. 
+ """ + + shipping = self.soup.find_all("span", {"class": "dD8iuc"}) + + values = [] + for ship in shipping: + values.append(ship.text) + + cleansed = [re.search(r"([0-9]+\.[0-9]+)|(Free)", ship).group(0) for ship in values] + cleansed = [float(ship) if ship != "Free" else 0.0 for ship in cleansed] + + return cleansed def get_product_url(self) -> str: """ @@ -113,7 +136,7 @@ class GoogleShoppingScraper: return similarity - def remove_outliers(self, titles: list[str], prices: list[float], urls: list[str]) -> tuple[list[str], list[float], list[str]]: + def remove_outliers(self, titles: list[str], prices: list[float], shipping: list[float], urls: list[str]) -> tuple[list[str], list[float], list[float], list[str]]: """ Removes outliers from a set of data consisting of titles, prices, and URLs. @@ -130,9 +153,10 @@ class GoogleShoppingScraper: titles = [title for i, title in enumerate(titles) if i not in outlier_indices] prices = [price for i, price in enumerate(prices) if i not in outlier_indices] + shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices] urls = [url for i, url in enumerate(urls) if i not in outlier_indices] - return titles, prices, urls + return titles, prices, shipping, urls def get_product_info(self): """ @@ -153,15 +177,17 @@ class GoogleShoppingScraper: titles = self.get_product_title() prices = self.get_product_price() + shipping = self.get_product_shipping() urls = self.get_product_url() - titles, prices, urls = self.remove_outliers(titles, prices, urls) + titles, prices, shipping, urls = self.remove_outliers(titles, prices, shipping, urls) product_info = [] - for title, price, url in zip(titles, prices, urls): + for title, price, ship, url in zip(titles, prices, shipping, urls): product_info.append({ 'title': clean_text(title.text.lower()), 'price': price, + 'shipping': ship, 'url': url }) @@ -199,7 +225,7 @@ class GoogleShoppingScraper: return min_price_item - def construct_candidates(self, descriptions, prices, urls, 
similarities): + def construct_candidates(self, descriptions, prices, shipping, urls, similarities): """ Constructs a list of candidates from the descriptions, prices, and urls. @@ -217,6 +243,7 @@ class GoogleShoppingScraper: for i in range(len(descriptions)): candidates[descriptions[i]] = { "price": prices[i], + "shipping": shipping[i], "url": urls[i], "similarity": similarities[i] } @@ -240,6 +267,7 @@ class GoogleShoppingScraper: descriptions = [] prices = [] + shipping = [] urls = [] similarities = [] @@ -265,14 +293,15 @@ class GoogleShoppingScraper: if filtered_prices_descriptions: consecutively_empty = 0 else: - consecutively_empty +=1 + consecutively_empty += 1 descriptions += list(filtered_prices_descriptions.keys()) prices += [f"{product['price']:,.2f}" for product in filtered_prices_descriptions.values()] + shipping += [f"{product['shipping']:,.2f}" for product in filtered_prices_descriptions.values()] urls += [product['url'] for product in filtered_prices_descriptions.values()] similarities += [product['similarity'] for product in filtered_prices_descriptions.values()] - return descriptions, prices, urls, similarities + return descriptions, prices, shipping, urls, similarities def filter_products_by_similarity(self, product_info: list, target_title: str, similarity_threshold: float): """ @@ -296,6 +325,7 @@ class GoogleShoppingScraper: if similarity >= similarity_threshold: filtered_products[product['title']] = { 'price': product['price'], + 'shipping': product['shipping'], 'url': product['url'], 'similarity': similarity } diff --git a/scraper/templates/scraper/result.html b/scraper/templates/scraper/result.html index 740ae48..e580d72 100644 --- a/scraper/templates/scraper/result.html +++ b/scraper/templates/scraper/result.html @@ -35,8 +35,13 @@

After running our advanced algorithms and crunching the numbers, we have identified {{ best_title }} as the ultimate bargaining chip for you! With a jaw-dropping match percentage of {{ best_score }}%, it's practically a match made in heaven with your chosen listing. And the cherry on top? - It's currently listed at a steal of a price - just ${{ best_price }} - under the super cool category of {{ best_category }} . - You won't find a better deal anywhere else! + It's currently listed at a steal of a price - just ${{ best_price }} + {% if best_shipping != "0.00" %} + with ${{ best_shipping }} in shipping. + {% else %} + with free shipping. + {% endif %} + You won't find a better deal anywhere else!


{% if best_context.type == 'decrease' %} @@ -46,8 +51,6 @@ {% endif %}

- -
diff --git a/scraper/utils.py b/scraper/utils.py index de32c06..71b4fd8 100644 --- a/scraper/utils.py +++ b/scraper/utils.py @@ -90,7 +90,7 @@ def reject_outliers(data: list[float], m: float) -> list[int]: return indices.tolist() -def price_difference_rating(initial: float, final: float) -> float: +def price_difference_rating(initial: float, final: float, days: int) -> float: """ The rating is based on the difference between the initial and final price. The rating is 0 if the final price is greater than the initial @@ -101,16 +101,31 @@ def price_difference_rating(initial: float, final: float) -> float: Args: initial: The initial price. final: The final price. + days: The number of days a listing has been active. Returns: The rating. """ + + # Decay constant (a value greater than 0) + decay_constant = 0.01 + + # Adjust this value to control the rate of increase of the penalty + linear_factor = 0.0125 + + # Threshold number of days after which the penalty is applied + threshold_days = 7 + + if days >= threshold_days: + days_past_threshold = days - threshold_days + penalty_amount = initial*np.exp(-decay_constant*days_past_threshold) + linear_factor*days_past_threshold*initial + initial += penalty_amount if initial <= final: rating = 5.0 else: price_difference = initial - final - rating = 5.0 - (price_difference / initial) * 5.0 + rating = 5.0 - (price_difference/initial)*5.0 return max(0.0, min(rating, 5.0)) @@ -144,7 +159,7 @@ def percentage_difference(list_price: float, best_price: float) -> dict: return difference -def create_chart(categorized: dict, similar_prices: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object: +def create_chart(categorized: dict, similar_prices: list[float], similar_shipping: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object: """ Creates a line chart visualization based on the categorized items, their prices, and their descriptions. 
@@ -157,30 +172,36 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip A JSON string containing the Plotly figure of the line chart. """ - items, prices, descriptions = [], [], [] + items, prices, shipping, descriptions = [], [], [], [] unit = 1 for categories, titles in categorized.items(): items.append(categories) - sub_prices, sub_descriptions = [], [] + sub_prices, sub_shipping, sub_descriptions = [], [], [] for title in titles: idx = similar_descriptions.index(title) - sub_prices.append(similar_prices[idx]) + sub_prices.append(similar_prices[idx]) + sub_shipping.append(similar_shipping[idx]) sub_descriptions.append(title) prices.append(sub_prices) + shipping.append(sub_shipping) descriptions.append(sub_descriptions) sort_indices = [sorted(range(len(sublist)), key=lambda x: sublist[x]) for sublist in prices] sorted_prices = [[sublist[i] for i in indices] for sublist, indices in zip(prices, sort_indices)] + + sorted_shipping = [[sublist[i] for i in indices] for sublist, indices in zip(shipping, sort_indices)] + formatted_shipping = [[f"${ship}" if ship != "0.00" else "Free" for ship in row] for row in sorted_shipping] + sorted_descriptions = [[sublist[i] for i in indices] for sublist, indices in zip(descriptions, sort_indices)] fig = go.Figure() for i, _ in enumerate(items): x = [j*unit + 1 for j in range(len(sorted_prices[i]))] - hovertext = [f"Product: {desc.title()}
Price: ${price:.2f}" for price, desc in zip(sorted_prices[i], sorted_descriptions[i])] + hovertext = [f"Product: {desc.title()}
Price: ${price:.2f}
Shipping: {ship}" for price, ship, desc in zip(sorted_prices[i], formatted_shipping[i], sorted_descriptions[i])] fig.add_trace(go.Scatter( x=x, y=sorted_prices[i], mode='markers', @@ -221,7 +242,7 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip return fig.to_json() -def create_wordcloud(urls: list[str]) -> tuple[object, dict]: +def create_wordcloud(urls: list[str]) -> object: """ Creates a word cloud visualization based on a list of website URLs. @@ -251,15 +272,15 @@ def create_wordcloud(urls: list[str]) -> tuple[object, dict]: fig = px.imshow(wordcloud) fig.update_layout( xaxis_title="Website URL", - yaxis_title="Citations (Bigger is Better)", + yaxis_title="Citations", title={ - 'text': "Word Cloud of Websites", + 'text': "Frequently Cited Websites", 'xanchor': 'center', 'yanchor': 'top', 'y': 0.9, 'x': 0.5}) - return fig.to_json(), dict(website_counts) + return fig.to_json() def categorize_titles(items: list[str]) -> dict: """ diff --git a/scraper/views.py b/scraper/views.py index 8218aee..9afd416 100644 --- a/scraper/views.py +++ b/scraper/views.py @@ -48,34 +48,33 @@ class Index(View): # Find viable products based on the title cleaned_title = remove_illegal_characters(title) - similar_descriptions, similar_prices, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0) - candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_urls, similar_scores) + similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0) + candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores) # Convert prices to float and shorten the descriptions if necessary similar_prices = [float(price.replace(',', '')) for price in similar_prices] # Categorize the titles and create the chart and wordcloud 
categorized = categorize_titles(similar_descriptions) - chart = create_chart(categorized, similar_prices, similar_descriptions, currency, title) - wordcloud, website_counts = create_wordcloud(similar_urls) + chart = create_chart(categorized, similar_prices, similar_shipping, similar_descriptions, currency, title) + wordcloud = create_wordcloud(similar_urls) # Based on the best similar product, get the price, description, category, and URL best_product = shopping_instance.lowest_price_highest_similarity(candidates) idx = similar_urls.index(best_product[1]["url"]) best_price = f"{similar_prices[idx]:,.2f}" + best_shipping = similar_shipping[idx] best_title = similar_descriptions[idx] best_score = best_product[1]["similarity"] * 100 - best_category = [key for key, value in categorized.items() if [item for item in value if item == best_title]][0] - # Percetage difference between the listing price and the best found price - best_context = percentage_difference(float(price), float(best_price.replace(",", ""))) - price_rating = price_difference_rating(float(price), float(best_price.replace(",", ""))) + # Percentage difference between the listing price and the best found price (including shipping) + best_total = float(best_price.replace(",", "")) + float(best_shipping.replace(",", "")) + best_context = percentage_difference(float(price), best_total,) + price_rating = price_difference_rating(float(price), best_total, days) # Get the total number of items total_items = len(similar_descriptions) - max_citations = max([value for value in website_counts.values()]) - max_website = [key for key, value in website_counts.items() if value == max_citations][0] # Create the context context = { @@ -97,12 +96,9 @@ class Index(View): 'categorized': categorized, 'total_items': total_items, 'best_price': best_price, + 'best_shipping': best_shipping, 'best_title': best_title.title(), 'best_score': round(best_score, 2), - 'website_counts': website_counts, - 'max_website': max_website, - 
'max_citations': max_citations, - 'best_category': best_category, 'best_context': best_context }