From 85891475a496e1fd342a3993f27d8a96daf2d5b1 Mon Sep 17 00:00:00 2001
From: Bhavanvir Rai <bhavanvir.r@gmail.com>
Date: Fri, 5 May 2023 23:17:08 -0700
Subject: [PATCH] Algorithmic tweaks + minor changes to Results page.

---
 .devcontainer/requirements.txt                |  1 -
 scraper/shop_class.py                         | 14 ++++--
 ...otWordCloud.js => plotCountryCitations.js} |  2 +-
 scraper/templates/scraper/result.html         |  6 +--
 scraper/utils.py                              | 49 ++++++++++++-------
 scraper/views.py                              |  6 +--
 6 files changed, 48 insertions(+), 30 deletions(-)
 rename scraper/static/{plotWordCloud.js => plotCountryCitations.js} (77%)
diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt
index 467f333..94cd475 100644
--- a/.devcontainer/requirements.txt
+++ b/.devcontainer/requirements.txt
@@ -9,5 +9,4 @@ plotly==5.14.1
 plotly-express==0.4.1
 regex==2023.3.23
 requests==2.28.2
-wordcloud==1.8.2.2
 scikit-learn==1.2.2
\ No newline at end of file
diff --git a/scraper/shop_class.py b/scraper/shop_class.py
index df652f9..27c2f1a 100644
--- a/scraper/shop_class.py
+++ b/scraper/shop_class.py
@@ -150,12 +150,16 @@ class EbayScraper:
             A tuple of three lists: (1) titles with outliers removed, (2) prices with outliers removed, and (3) countries with outliers removed.
         """
 
-        outlier_indices = reject_outliers(np.array(prices), m=1.5)
+        # Minimum number of items required to start removing outliers
+        removal_threshold = 100
 
-        titles = [title for i, title in enumerate(titles) if i not in outlier_indices]
-        prices = [price for i, price in enumerate(prices) if i not in outlier_indices]
-        shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices]
-        countries = [country for i, country in enumerate(countries) if i not in outlier_indices]
+        if len(titles) >= removal_threshold:
+            outlier_indices = reject_outliers(np.array(prices), m=1.5)
+
+            titles = [title for i, title in enumerate(titles) if i not in outlier_indices]
+            prices = [price for i, price in enumerate(prices) if i not in outlier_indices]
+            shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices]
+            countries = [country for i, country in enumerate(countries) if i not in outlier_indices]
 
         return titles, prices, shipping, countries
 
diff --git a/scraper/static/plotWordCloud.js b/scraper/static/plotCountryCitations.js
similarity index 77%
rename from scraper/static/plotWordCloud.js
rename to scraper/static/plotCountryCitations.js
index 8cf9331..5f52654 100644
--- a/scraper/static/plotWordCloud.js
+++ b/scraper/static/plotCountryCitations.js
@@ -1,5 +1,5 @@
 document.addEventListener("DOMContentLoaded", function () {
-    var chart = document.getElementById('render-wordcloud');
+    var chart = document.getElementById('render-bargraph');
     var chartContent = chart.getAttribute('data-chart');
     var chartObject = JSON.parse(chartContent);
     Plotly.newPlot(chart, chartObject);
diff --git a/scraper/templates/scraper/result.html b/scraper/templates/scraper/result.html
index 27cf984..a368854 100644
--- a/scraper/templates/scraper/result.html
+++ b/scraper/templates/scraper/result.html
@@ -110,15 +110,15 @@
 
         <div class="card" style="margin-top: 2.5rem; margin-bottom: 2.5rem;">
             <div class="card-header">
-                <h4><i class="fas fa-globe"></i> Country Frequency</h4>
+                <h4><i class="fas fa-chart-bar"></i> Country Frequency</h4>
             </div>
             <div class="card-body">
 
-                <div id="render-wordcloud" data-chart="{{ wordcloud }}"></div>
+                <div id="render-bargraph" data-chart="{{ bargraph }}"></div>
             </div>
         </div>
     </div>
 
     <script src="{% static 'plotSimilarResults.js' %}"></script>
-    <script src="{% static 'plotWordCloud.js' %}"></script>
+    <script src="{% static 'plotCountryCitations.js' %}"></script>
 {% endblock content %}
diff --git a/scraper/utils.py b/scraper/utils.py
index c162daa..9e466de 100644
--- a/scraper/utils.py
+++ b/scraper/utils.py
@@ -3,11 +3,9 @@ from .exceptions import *
 import numpy as np
 import requests
 import re
-import plotly.express as px
 import plotly.graph_objects as go
 from sklearn.linear_model import LinearRegression
 from sklearn.preprocessing import PolynomialFeatures
-from wordcloud import WordCloud
 from collections import Counter
 
 def remove_illegal_characters(title: str) -> str:
@@ -244,27 +242,42 @@ def create_chart(similar_prices: list[float], similar_shipping: list[float], sim
         
     return fig.to_json()
 
-def create_wordcloud(urls: list[str]) -> object:
+def create_bargraph(countries: list[str]) -> object:
     """
-    Creates a word cloud visualization based on a list of website URLs.
+    Creates a bar graph visualization of how often each country appears in a list of countries.
 
     Args:
-        urls (list[str]): A list of website URLs to be used to generate the word cloud.
+        countries (list[str]): A list of countries whose occurrences are counted to generate the bar graph.
 
     Returns:
-        A tuple of the following:
-        - A JSON string containing the Plotly Express figure of the word cloud.
-        - A dictionary where the keys are the website names and the values are the frequency count of each website in the URLs list.
+        A JSON string containing the Plotly figure of the bar graph.
     """
 
-    website_counts = Counter(urls)
-    wordcloud = WordCloud(
-        background_color='white',
-        scale=4, 
-        prefer_horizontal=0.9,
-        colormap='RdYlGn_r').generate_from_frequencies(website_counts)
-
-    fig = px.imshow(wordcloud)
+    # Count the occurrences of each country
+    country_counts = Counter(countries)
+    
+    # Get the names and counts of the countries
+    country_names = list(country_counts.keys())
+    country_values = list(country_counts.values())
+    
+    # Create a bar graph with the country names on the x-axis and counts on the y-axis
+    fig = go.Figure(
+        go.Bar(
+            x=country_names,
+            y=country_values,
+            hoverinfo='text',
+            hovertext=[f"Country: {country}<br>Citations: {count}" for country, count in zip(country_names, country_values)],
+            marker=dict(
+                color=country_values,
+                colorscale='RdYlGn_r',
+                showscale=True,
+                colorbar=dict(
+                    title='Citations'
+                )
+            )
+        )
+    )
+    
     fig.update_layout(
         xaxis_title="Country of Origin",
         yaxis_title="Citations",
@@ -273,6 +286,8 @@ def create_wordcloud(urls: list[str]) -> object:
             'xanchor': 'center',
             'yanchor': 'top',
             'y': 0.9,
-            'x': 0.5})
+            'x': 0.5},
+        plot_bgcolor='rgba(0,0,0,0)'
+    )
 
     return fig.to_json()
\ No newline at end of file
diff --git a/scraper/views.py b/scraper/views.py
index aedfa4f..1ed0565 100644
--- a/scraper/views.py
+++ b/scraper/views.py
@@ -69,9 +69,9 @@ class Index(View):
             best_context = percentage_difference(float(price), best_total,)
             price_rating = price_difference_rating(float(price), best_total, days)
 
-            # Categorize the titles and create the chart and wordcloud
+            # Categorize the titles and create the chart and bargraph
             chart = create_chart(similar_prices, similar_shipping, similar_descriptions, currency, title, best_title)
-            wordcloud = create_wordcloud(similar_countries)   
+            bargraph = create_bargraph(similar_countries)   
 
             # Get the total number of items
             total_items = len(similar_descriptions)
@@ -83,7 +83,7 @@ class Index(View):
                 'title': title,
                 'price': f"{float(price):,.2f}",
                 'chart': chart,
-                'wordcloud': wordcloud,
+                'bargraph': bargraph,
                 'price_rating': round(price_rating, 1),
                 'days': days,
                 'hours': hours,