Merge pull request #61 from bhavanvir/main

Bunch of changes.
This commit is contained in:
Bhavanvir Rai
2023-05-05 19:45:45 -07:00
committed by GitHub
4 changed files with 118 additions and 57 deletions

View File

@@ -40,7 +40,7 @@ class GoogleShoppingScraper:
return description
def get_product_price(self) -> np.ndarray:
def get_product_price(self) -> list[float]:
"""
Extracts the price of each product from the HTML.
@@ -63,6 +63,29 @@ class GoogleShoppingScraper:
normalized = [float(price.replace(",", "")) for price in normalized]
return normalized
def get_product_shipping(self) -> list[float]:
    """
    Extracts the shipping cost of each product from the HTML.

    Every ``<span class="dD8iuc">`` element found in ``self.soup`` is
    inspected. The first decimal amount in its text (thousands separators
    allowed, e.g. ``1,234.50``) or the word ``Free`` decides the value.

    Returns:
        The shipping cost of each product as a list of floats, one entry
        per matched span, in the order the spans were found. ``Free`` maps
        to 0.0. A span whose text contains neither a decimal amount nor
        ``Free`` also maps to 0.0, so the scrape does not abort and the
        list keeps one entry per span.
    """
    # Decimal amount with optional thousands separators, or the literal "Free".
    amount_pattern = re.compile(r"[0-9][0-9,]*\.[0-9]+|Free")

    costs = []
    for span in self.soup.find_all("span", {"class": "dD8iuc"}):
        match = amount_pattern.search(span.text)
        if match is None or match.group(0) == "Free":
            # No recognisable charge: treat it as no shipping cost rather
            # than crashing on `None.group(0)`.
            costs.append(0.0)
        else:
            # Strip separators so float() accepts amounts such as "1,234.50".
            costs.append(float(match.group(0).replace(",", "")))

    return costs
def get_product_url(self) -> str:
"""
@@ -113,13 +136,14 @@ class GoogleShoppingScraper:
return similarity
def remove_outliers(self, titles: list[str], prices: list[float], urls: list[str]) -> tuple[list[str], list[float], list[str]]:
def remove_outliers(self, titles: list[str], prices: list[float], shipping: list[float], urls: list[str]) -> tuple[list[str], list[float], list[float], list[str]]:
"""
Removes outliers from a set of data consisting of titles, prices, shipping costs, and URLs.
Args:
titles (list[str]): A list of titles of the items.
prices (list[float]): A list of prices of the items.
shipping (list[float]): A list of shipping costs of the items.
urls (list[str]): A list of URLs of the items.
Returns:
@@ -130,9 +154,10 @@ class GoogleShoppingScraper:
titles = [title for i, title in enumerate(titles) if i not in outlier_indices]
prices = [price for i, price in enumerate(prices) if i not in outlier_indices]
shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices]
urls = [url for i, url in enumerate(urls) if i not in outlier_indices]
return titles, prices, urls
return titles, prices, shipping, urls
def get_product_info(self):
"""
@@ -153,15 +178,17 @@ class GoogleShoppingScraper:
titles = self.get_product_title()
prices = self.get_product_price()
shipping = self.get_product_shipping()
urls = self.get_product_url()
titles, prices, urls = self.remove_outliers(titles, prices, urls)
titles, prices, shipping, urls = self.remove_outliers(titles, prices, shipping, urls)
product_info = []
for title, price, url in zip(titles, prices, urls):
for title, price, ship, url in zip(titles, prices, shipping, urls):
product_info.append({
'title': clean_text(title.text.lower()),
'price': price,
'shipping': ship,
'url': url
})
@@ -199,7 +226,7 @@ class GoogleShoppingScraper:
return min_price_item
def construct_candidates(self, descriptions, prices, urls, similarities):
def construct_candidates(self, descriptions, prices, shipping, urls, similarities):
"""
Constructs a list of candidates from the descriptions, prices, shipping
costs, and urls.
@@ -207,6 +234,7 @@ class GoogleShoppingScraper:
Args:
descriptions: The descriptions of the products.
prices: The prices of the products.
shipping: The shipping costs of the products.
urls: The urls of the products.
Returns:
@@ -217,6 +245,7 @@ class GoogleShoppingScraper:
for i in range(len(descriptions)):
candidates[descriptions[i]] = {
"price": prices[i],
"shipping": shipping[i],
"url": urls[i],
"similarity": similarities[i]
}
@@ -240,6 +269,7 @@ class GoogleShoppingScraper:
descriptions = []
prices = []
shipping = []
urls = []
similarities = []
@@ -265,14 +295,15 @@ class GoogleShoppingScraper:
if filtered_prices_descriptions:
consecutively_empty = 0
else:
consecutively_empty +=1
consecutively_empty += 1
descriptions += list(filtered_prices_descriptions.keys())
prices += [f"{product['price']:,.2f}" for product in filtered_prices_descriptions.values()]
shipping += [f"{product['shipping']:,.2f}" for product in filtered_prices_descriptions.values()]
urls += [product['url'] for product in filtered_prices_descriptions.values()]
similarities += [product['similarity'] for product in filtered_prices_descriptions.values()]
return descriptions, prices, urls, similarities
return descriptions, prices, shipping, urls, similarities
def filter_products_by_similarity(self, product_info: list, target_title: str, similarity_threshold: float):
"""
@@ -296,6 +327,7 @@ class GoogleShoppingScraper:
if similarity >= similarity_threshold:
filtered_products[product['title']] = {
'price': product['price'],
'shipping': product['shipping'],
'url': product['url'],
'similarity': similarity
}

View File

@@ -35,8 +35,13 @@
</h4>
<p>After running our advanced algorithms and crunching the numbers, we have <b>identified {{ best_title }}</b> as the ultimate bargaining chip for you!
With a jaw-dropping <b>match percentage of {{ best_score }}%</b>, it's practically a match made in heaven with your chosen listing. And the cherry on top?
It's currently listed at a steal of a price - <b>just ${{ best_price }}</b> - under the super cool <b>category of {{ best_category }} </b>.
You won't find a better deal anywhere else!
It's currently listed at a steal of a price - <b>just ${{ best_price }}
{% if best_shipping != "0.00" %}
with ${{ best_shipping }} in shipping.
{% else %}
with free shipping.
{% endif %}
</b>You won't find a better deal anywhere else!
</p>
<hr>
<p>{% if best_context.type == 'decrease' %}
@@ -46,8 +51,6 @@
{% endif %}
</p>
</div>
<!-- <div class="Stars" style="--rating: {{ price_rating }}" aria-label="Rating of this product is {{ price_rating }} out of 5."></div> -->
<div class="card" style="margin-top: 2.5rem;">
<div class="card-header">

View File

@@ -8,6 +8,7 @@ import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from wordcloud import WordCloud
from collections import Counter
@@ -90,7 +91,7 @@ def reject_outliers(data: list[float], m: float) -> list[int]:
return indices.tolist()
def price_difference_rating(initial: float, final: float) -> float:
def price_difference_rating(initial: float, final: float, days: int) -> float:
"""
The rating is based on the difference between the initial and final
price. The rating is 0 if the final price is greater than the initial
@@ -101,16 +102,31 @@ def price_difference_rating(initial: float, final: float) -> float:
Args:
initial: The initial price.
final: The final price.
days: The number of days a listing has been active.
Returns:
The rating.
"""
# Decay constant (a value greater than 0)
decay_constant = 0.01
# Adjust this value to control the rate of increase of the penalty
linear_factor = 0.0125
# Threshold number of days after which the penalty is applied
threshold_days = 7
if days >= threshold_days:
days_past_threshold = days - threshold_days
penalty_amount = initial*np.exp(-decay_constant*days_past_threshold) + linear_factor*days_past_threshold*initial
initial += penalty_amount
if initial <= final:
rating = 5.0
else:
price_difference = initial - final
rating = 5.0 - (price_difference / initial) * 5.0
rating = 5.0 - (price_difference/initial)*5.0
return max(0.0, min(rating, 5.0))
@@ -144,43 +160,50 @@ def percentage_difference(list_price: float, best_price: float) -> dict:
return difference
def create_chart(categorized: dict, similar_prices: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object:
def create_chart(categorized: dict, similar_prices: list[float], similar_shipping: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object:
"""
Creates a line chart visualization based on the categorized items, their prices, and their descriptions.
Args:
categorized (dict): A dictionary where the keys are the names of the clusters and the values are lists of the items in that cluster.
similar_prices (list[float]): A list of prices of the items.
similar_shipping (list[float]): A list of shipping costs of the items.
similar_descriptions (list[str]): A list of descriptions of the items.
Returns:
A JSON string containing the Plotly figure of the line chart.
"""
items, prices, descriptions = [], [], []
items, prices, shipping, descriptions = [], [], [], []
unit = 1
for categories, titles in categorized.items():
items.append(categories)
sub_prices, sub_descriptions = [], []
sub_prices, sub_shipping, sub_descriptions = [], [], []
for title in titles:
idx = similar_descriptions.index(title)
sub_prices.append(similar_prices[idx])
sub_prices.append(similar_prices[idx])
sub_shipping.append(similar_shipping[idx])
sub_descriptions.append(title)
prices.append(sub_prices)
shipping.append(sub_shipping)
descriptions.append(sub_descriptions)
sort_indices = [sorted(range(len(sublist)), key=lambda x: sublist[x]) for sublist in prices]
sorted_prices = [[sublist[i] for i in indices] for sublist, indices in zip(prices, sort_indices)]
sorted_shipping = [[sublist[i] for i in indices] for sublist, indices in zip(shipping, sort_indices)]
formatted_shipping = [[f"${ship}" if ship != "0.00" else "Free" for ship in row] for row in sorted_shipping]
sorted_descriptions = [[sublist[i] for i in indices] for sublist, indices in zip(descriptions, sort_indices)]
fig = go.Figure()
for i, _ in enumerate(items):
x = [j*unit + 1 for j in range(len(sorted_prices[i]))]
hovertext = [f"Product: {desc.title()}<br>Price: ${price:.2f}" for price, desc in zip(sorted_prices[i], sorted_descriptions[i])]
hovertext = [f"Product: {desc.title()}<br>Price: ${price:.2f}<br>Shipping: {ship}" for price, ship, desc in zip(sorted_prices[i], formatted_shipping[i], sorted_descriptions[i])]
fig.add_trace(go.Scatter(
x=x, y=sorted_prices[i],
mode='markers',
@@ -188,12 +211,17 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
text=hovertext,
name=f"Category {i + 1}"))
# Compute the linear regression on all data points
# Compute the polynomial regression on all data points
x = np.concatenate([np.arange(len(prices))*unit + 1 for prices in sorted_prices])
y = np.concatenate(sorted_prices)
reg = LinearRegression().fit(x.reshape(-1, 1), y)
x_reg = [np.min(x), np.max(x)*1.5]
y_reg = reg.predict(np.array(x_reg).reshape(-1, 1))
poly_features = PolynomialFeatures(degree=4, include_bias=True)
x_poly = poly_features.fit_transform(x.reshape(-1, 1))
reg = LinearRegression().fit(x_poly, y)
x_reg = np.linspace(np.min(x), np.max(x), num=100)
x_reg_poly = poly_features.fit_transform(x_reg.reshape(-1, 1))
y_reg = reg.predict(x_reg_poly)
# Add the trend line to the plot
fig.add_trace(
@@ -202,14 +230,15 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
mode='lines',
name='Trend Line'))
# Add prediction annotation for the trend line
prediction = reg.predict([[10]])[0]
fig.add_annotation(x=10, y=prediction, text=f"Expected Price: ${prediction:.2f} {listing_currency}", showarrow=True)
# Add annotations to all x values
for x_val in x:
y_val = reg.predict(poly_features.transform([[x_val]]))[0]
fig.add_annotation(x=x_val, y=y_val, text=f"Prediction: ${y_val:.2f}", showarrow=True)
fig.update_layout(
template='plotly_white',
hovermode='closest',
xaxis_title="Product",
xaxis_title="Product Number",
yaxis_title=f"Price $({listing_currency})",
legend_title="Categories",
title={
@@ -221,7 +250,7 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
return fig.to_json()
def create_wordcloud(urls: list[str]) -> tuple[object, dict]:
def create_wordcloud(urls: list[str]) -> object:
"""
Creates a word cloud visualization based on a list of website URLs.
@@ -251,15 +280,15 @@ def create_wordcloud(urls: list[str]) -> tuple[object, dict]:
fig = px.imshow(wordcloud)
fig.update_layout(
xaxis_title="Website URL",
yaxis_title="Citations (Bigger is Better)",
yaxis_title="Citations",
title={
'text': "Word Cloud of Websites",
'text': "Frequently Cited Websites",
'xanchor': 'center',
'yanchor': 'top',
'y': 0.9,
'x': 0.5})
return fig.to_json(), dict(website_counts)
return fig.to_json()
def categorize_titles(items: list[str]) -> dict:
"""
@@ -275,20 +304,21 @@ def categorize_titles(items: list[str]) -> dict:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(items)
num_clusters = len(items) // 5
kmeans = KMeans(n_clusters=num_clusters, n_init=10)
wcss = []
for i in range(1, len(items)//2):
kmeans = KMeans(n_clusters=i, n_init=10)
kmeans.fit(X)
wcss.append(kmeans.inertia_)
# Select the optimal number of clusters based on the elbow point
optimal_num_clusters = int(min(wcss))
kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10)
kmeans.fit(X)
cluster_names = []
for i in range(num_clusters):
cluster_items = [items[j] for j in range(len(items)) if kmeans.labels_[j] == i]
representative_item = f"{i+1}"
cluster_names.append(representative_item)
clusters = {}
for i in range(num_clusters):
for i in range(optimal_num_clusters):
cluster_items = [items[j] for j in range(len(items)) if kmeans.labels_[j] == i]
cluster_name = cluster_names[i]
clusters[cluster_name] = cluster_items
representative_item = f"{i + 1}"
clusters[representative_item] = cluster_items
return clusters

View File

@@ -48,34 +48,33 @@ class Index(View):
# Find viable products based on the title
cleaned_title = remove_illegal_characters(title)
similar_descriptions, similar_prices, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0)
candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_urls, similar_scores)
similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0)
candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores)
# Convert prices to float and shorten the descriptions if necessary
similar_prices = [float(price.replace(',', '')) for price in similar_prices]
# Categorize the titles and create the chart and wordcloud
categorized = categorize_titles(similar_descriptions)
chart = create_chart(categorized, similar_prices, similar_descriptions, currency, title)
wordcloud, website_counts = create_wordcloud(similar_urls)
chart = create_chart(categorized, similar_prices, similar_shipping, similar_descriptions, currency, title)
wordcloud = create_wordcloud(similar_urls)
# Based on the best similar product, get the price, description, category, and URL
best_product = shopping_instance.lowest_price_highest_similarity(candidates)
idx = similar_urls.index(best_product[1]["url"])
best_price = f"{similar_prices[idx]:,.2f}"
best_shipping = similar_shipping[idx]
best_title = similar_descriptions[idx]
best_score = best_product[1]["similarity"] * 100
best_category = [key for key, value in categorized.items() if [item for item in value if item == best_title]][0]
# Percentage difference between the listing price and the best found price
best_context = percentage_difference(float(price), float(best_price.replace(",", "")))
price_rating = price_difference_rating(float(price), float(best_price.replace(",", "")))
# Percentage difference between the listing price and the best found price (including shipping)
best_total = float(best_price.replace(",", "")) + float(best_shipping.replace(",", ""))
best_context = percentage_difference(float(price), best_total,)
price_rating = price_difference_rating(float(price), best_total, days)
# Get the total number of items
total_items = len(similar_descriptions)
max_citations = max([value for value in website_counts.values()])
max_website = [key for key, value in website_counts.items() if value == max_citations][0]
# Create the context
context = {
@@ -97,12 +96,9 @@ class Index(View):
'categorized': categorized,
'total_items': total_items,
'best_price': best_price,
'best_shipping': best_shipping,
'best_title': best_title.title(),
'best_score': round(best_score, 2),
'website_counts': website_counts,
'max_website': max_website,
'max_citations': max_citations,
'best_category': best_category,
'best_context': best_context
}