From 85891475a496e1fd342a3993f27d8a96daf2d5b1 Mon Sep 17 00:00:00 2001 From: Bhavanvir Rai Date: Fri, 5 May 2023 23:17:08 -0700 Subject: [PATCH] Algorithmic tweaks + minor changes to Results page. --- .devcontainer/requirements.txt | 1 - scraper/shop_class.py | 14 ++++-- ...otWordCloud.js => plotCountryCitations.js} | 2 +- scraper/templates/scraper/result.html | 6 +-- scraper/utils.py | 49 ++++++++++++------- scraper/views.py | 6 +-- 6 files changed, 48 insertions(+), 30 deletions(-) rename scraper/static/{plotWordCloud.js => plotCountryCitations.js} (77%) diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index 467f333..94cd475 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -9,5 +9,4 @@ plotly==5.14.1 plotly-express==0.4.1 regex==2023.3.23 requests==2.28.2 -wordcloud==1.8.2.2 scikit-learn==1.2.2 \ No newline at end of file diff --git a/scraper/shop_class.py b/scraper/shop_class.py index df652f9..27c2f1a 100644 --- a/scraper/shop_class.py +++ b/scraper/shop_class.py @@ -150,12 +150,16 @@ class EbayScraper: A tuple of three lists: (1) titles with outliers removed, (2) prices with outliers removed, and (3) countries with outliers removed. 
""" - outlier_indices = reject_outliers(np.array(prices), m=1.5) + # Minimum number of items required to start removing outliers + removal_threshold = 100 - titles = [title for i, title in enumerate(titles) if i not in outlier_indices] - prices = [price for i, price in enumerate(prices) if i not in outlier_indices] - shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices] - countries = [country for i, country in enumerate(countries) if i not in outlier_indices] + if len(titles) >= removal_threshold: + outlier_indices = reject_outliers(np.array(prices), m=1.5) + + titles = [title for i, title in enumerate(titles) if i not in outlier_indices] + prices = [price for i, price in enumerate(prices) if i not in outlier_indices] + shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices] + countries = [country for i, country in enumerate(countries) if i not in outlier_indices] return titles, prices, shipping, countries diff --git a/scraper/static/plotWordCloud.js b/scraper/static/plotCountryCitations.js similarity index 77% rename from scraper/static/plotWordCloud.js rename to scraper/static/plotCountryCitations.js index 8cf9331..5f52654 100644 --- a/scraper/static/plotWordCloud.js +++ b/scraper/static/plotCountryCitations.js @@ -1,5 +1,5 @@ document.addEventListener("DOMContentLoaded", function () { - var chart = document.getElementById('render-wordcloud'); + var chart = document.getElementById('render-bargraph'); var chartContent = chart.getAttribute('data-chart'); var chartObject = JSON.parse(chartContent); Plotly.newPlot(chart, chartObject); diff --git a/scraper/templates/scraper/result.html b/scraper/templates/scraper/result.html index 27cf984..a368854 100644 --- a/scraper/templates/scraper/result.html +++ b/scraper/templates/scraper/result.html @@ -110,15 +110,15 @@
-

Country Frequency

+

Country Frequency

-
+
- + {% endblock content %} diff --git a/scraper/utils.py b/scraper/utils.py index c162daa..9e466de 100644 --- a/scraper/utils.py +++ b/scraper/utils.py @@ -3,11 +3,9 @@ from .exceptions import * import numpy as np import requests import re -import plotly.express as px import plotly.graph_objects as go from sklearn.linear_model import LinearRegression from sklearn.preprocessing import PolynomialFeatures -from wordcloud import WordCloud from collections import Counter def remove_illegal_characters(title: str) -> str: @@ -244,27 +242,42 @@ def create_chart(similar_prices: list[float], similar_shipping: list[float], sim return fig.to_json() -def create_wordcloud(urls: list[str]) -> object: +def create_bargraph(countries: list[str]) -> object: """ - Creates a word cloud visualization based on a list of website URLs. + Creates a bar graph visualization of how often each country appears in a list of countries. Args: - urls (list[str]): A list of website URLs to be used to generate the word cloud. + countries (list[str]): A list of countries whose occurrences are counted to generate the bar graph. Returns: - A tuple of the following: - - A JSON string containing the Plotly Express figure of the word cloud. - - A dictionary where the keys are the website names and the values are the frequency count of each website in the URLs list. + A JSON string containing the Plotly figure of the bar graph. 
""" - website_counts = Counter(urls) - wordcloud = WordCloud( - background_color='white', - scale=4, - prefer_horizontal=0.9, - colormap='RdYlGn_r').generate_from_frequencies(website_counts) - - fig = px.imshow(wordcloud) + # Count the occurrences of each country + country_counts = Counter(countries) + + # Get the names and counts of the countries + country_names = list(country_counts.keys()) + country_values = list(country_counts.values()) + + # Create a bar graph with the country names on the x-axis and counts on the y-axis + fig = go.Figure( + go.Bar( + x=country_names, + y=country_values, + hoverinfo='text', + hovertext=[f"Country: {country}
Citations: {count}" for country, count in zip(country_names, country_values)], + marker=dict( + color=country_values, + colorscale='RdYlGn_r', + showscale=True, + colorbar=dict( + title='Citations' + ) + ) + ) + ) + fig.update_layout( xaxis_title="Country of Origin", yaxis_title="Citations", @@ -273,6 +286,8 @@ def create_wordcloud(urls: list[str]) -> object: 'xanchor': 'center', 'yanchor': 'top', 'y': 0.9, - 'x': 0.5}) + 'x': 0.5}, + plot_bgcolor='rgba(0,0,0,0)' + ) return fig.to_json() \ No newline at end of file diff --git a/scraper/views.py b/scraper/views.py index aedfa4f..1ed0565 100644 --- a/scraper/views.py +++ b/scraper/views.py @@ -69,9 +69,9 @@ class Index(View): best_context = percentage_difference(float(price), best_total,) price_rating = price_difference_rating(float(price), best_total, days) - # Categorize the titles and create the chart and wordcloud + # Categorize the titles and create the chart and bargraph chart = create_chart(similar_prices, similar_shipping, similar_descriptions, currency, title, best_title) - wordcloud = create_wordcloud(similar_countries) + bargraph = create_bargraph(similar_countries) # Get the total number of items total_items = len(similar_descriptions) @@ -83,7 +83,7 @@ class Index(View): 'title': title, 'price': f"{float(price):,.2f}", 'chart': chart, - 'wordcloud': wordcloud, + 'bargraph': bargraph, 'price_rating': round(price_rating, 1), 'days': days, 'hours': hours,