mirror of
https://github.com/Marketscrape/marketscrape-web.git
synced 2026-05-19 11:14:29 -04:00
@@ -40,7 +40,7 @@ class GoogleShoppingScraper:
|
||||
|
||||
return description
|
||||
|
||||
def get_product_price(self) -> np.ndarray:
|
||||
def get_product_price(self) -> list[float]:
|
||||
"""
|
||||
Extracts the price of each product from the HTML.
|
||||
|
||||
@@ -63,6 +63,29 @@ class GoogleShoppingScraper:
|
||||
normalized = [float(price.replace(",", "")) for price in normalized]
|
||||
|
||||
return normalized
|
||||
|
||||
def get_product_shipping(self) -> list[float]:
    """
    Extracts the shipping cost of each product from the HTML.

    Every ``span`` element with class ``dD8iuc`` found in ``self.soup``
    yields exactly one entry, so the result stays index-aligned with those
    spans.

    Returns:
        The shipping cost of each product as a list of floats. A span whose
        first recognised token is "Free" maps to 0.0. A span containing
        neither a decimal amount nor "Free" also maps to 0.0 instead of
        raising, so one unexpected label cannot abort the whole scrape.
    """

    # A decimal amount (thousands separators allowed, e.g. "1,234.56") or
    # the literal "Free"; whichever occurs first in the text wins.
    pattern = re.compile(r"[0-9][0-9,]*\.[0-9]+|Free")

    costs = []
    for span in self.soup.find_all("span", {"class": "dD8iuc"}):
        match = pattern.search(span.text)
        if match is None or match.group(0) == "Free":
            # Unrecognised text falls back to 0.0 to keep one entry per span.
            costs.append(0.0)
        else:
            # Strip thousands separators before converting, as float() rejects them.
            costs.append(float(match.group(0).replace(",", "")))

    return costs
|
||||
|
||||
def get_product_url(self) -> str:
|
||||
"""
|
||||
@@ -113,13 +136,14 @@ class GoogleShoppingScraper:
|
||||
|
||||
return similarity
|
||||
|
||||
def remove_outliers(self, titles: list[str], prices: list[float], urls: list[str]) -> tuple[list[str], list[float], list[str]]:
|
||||
def remove_outliers(self, titles: list[str], prices: list[float], shipping: list[float], urls: list[str]) -> tuple[list[str], list[float], list[float], list[str]]:
|
||||
"""
|
||||
Removes outliers from a set of data consisting of titles, prices, shipping costs, and URLs.
|
||||
|
||||
Args:
|
||||
titles (list[str]): A list of titles of the items.
|
||||
prices (list[float]): A list of prices of the items.
|
||||
shipping (list[float]): A list of shipping costs of the items.
|
||||
urls (list[str]): A list of URLs of the items.
|
||||
|
||||
Returns:
|
||||
@@ -130,9 +154,10 @@ class GoogleShoppingScraper:
|
||||
|
||||
titles = [title for i, title in enumerate(titles) if i not in outlier_indices]
|
||||
prices = [price for i, price in enumerate(prices) if i not in outlier_indices]
|
||||
shipping = [ship for i, ship in enumerate(shipping) if i not in outlier_indices]
|
||||
urls = [url for i, url in enumerate(urls) if i not in outlier_indices]
|
||||
|
||||
return titles, prices, urls
|
||||
return titles, prices, shipping, urls
|
||||
|
||||
def get_product_info(self):
|
||||
"""
|
||||
@@ -153,15 +178,17 @@ class GoogleShoppingScraper:
|
||||
|
||||
titles = self.get_product_title()
|
||||
prices = self.get_product_price()
|
||||
shipping = self.get_product_shipping()
|
||||
urls = self.get_product_url()
|
||||
|
||||
titles, prices, urls = self.remove_outliers(titles, prices, urls)
|
||||
titles, prices, shipping, urls = self.remove_outliers(titles, prices, shipping, urls)
|
||||
|
||||
product_info = []
|
||||
for title, price, url in zip(titles, prices, urls):
|
||||
for title, price, ship, url in zip(titles, prices, shipping, urls):
|
||||
product_info.append({
|
||||
'title': clean_text(title.text.lower()),
|
||||
'price': price,
|
||||
'shipping': ship,
|
||||
'url': url
|
||||
})
|
||||
|
||||
@@ -199,7 +226,7 @@ class GoogleShoppingScraper:
|
||||
|
||||
return min_price_item
|
||||
|
||||
def construct_candidates(self, descriptions, prices, urls, similarities):
|
||||
def construct_candidates(self, descriptions, prices, shipping, urls, similarities):
|
||||
"""
|
||||
Constructs a list of candidates from the descriptions, prices, shipping costs, and
|
||||
urls.
|
||||
@@ -207,6 +234,7 @@ class GoogleShoppingScraper:
|
||||
Args:
|
||||
descriptions: The descriptions of the products.
|
||||
prices: The prices of the products.
|
||||
shipping: The shipping costs of the products.
|
||||
urls: The urls of the products.
|
||||
|
||||
Returns:
|
||||
@@ -217,6 +245,7 @@ class GoogleShoppingScraper:
|
||||
for i in range(len(descriptions)):
|
||||
candidates[descriptions[i]] = {
|
||||
"price": prices[i],
|
||||
"shipping": shipping[i],
|
||||
"url": urls[i],
|
||||
"similarity": similarities[i]
|
||||
}
|
||||
@@ -240,6 +269,7 @@ class GoogleShoppingScraper:
|
||||
|
||||
descriptions = []
|
||||
prices = []
|
||||
shipping = []
|
||||
urls = []
|
||||
similarities = []
|
||||
|
||||
@@ -265,14 +295,15 @@ class GoogleShoppingScraper:
|
||||
if filtered_prices_descriptions:
|
||||
consecutively_empty = 0
|
||||
else:
|
||||
consecutively_empty +=1
|
||||
consecutively_empty += 1
|
||||
|
||||
descriptions += list(filtered_prices_descriptions.keys())
|
||||
prices += [f"{product['price']:,.2f}" for product in filtered_prices_descriptions.values()]
|
||||
shipping += [f"{product['shipping']:,.2f}" for product in filtered_prices_descriptions.values()]
|
||||
urls += [product['url'] for product in filtered_prices_descriptions.values()]
|
||||
similarities += [product['similarity'] for product in filtered_prices_descriptions.values()]
|
||||
|
||||
return descriptions, prices, urls, similarities
|
||||
return descriptions, prices, shipping, urls, similarities
|
||||
|
||||
def filter_products_by_similarity(self, product_info: list, target_title: str, similarity_threshold: float):
|
||||
"""
|
||||
@@ -296,6 +327,7 @@ class GoogleShoppingScraper:
|
||||
if similarity >= similarity_threshold:
|
||||
filtered_products[product['title']] = {
|
||||
'price': product['price'],
|
||||
'shipping': product['shipping'],
|
||||
'url': product['url'],
|
||||
'similarity': similarity
|
||||
}
|
||||
|
||||
@@ -35,8 +35,13 @@
|
||||
</h4>
|
||||
<p>After running our advanced algorithms and crunching the numbers, we have <b>identified {{ best_title }}</b> as the ultimate bargaining chip for you!
|
||||
With a jaw-dropping <b>match percentage of {{ best_score }}%</b>, it's practically a match made in heaven with your chosen listing. And the cherry on top?
|
||||
It's currently listed at a steal of a price - <b>just ${{ best_price }}</b> - under the super cool <b>category of {{ best_category }} </b>.
|
||||
You won't find a better deal anywhere else!
|
||||
It's currently listed at a steal of a price - <b>just ${{ best_price }}
|
||||
{% if best_shipping != "0.00" %}
|
||||
with ${{ best_shipping }} in shipping.
|
||||
{% else %}
|
||||
with free shipping.
|
||||
{% endif %}
|
||||
</b>You won't find a better deal anywhere else!
|
||||
</p>
|
||||
<hr>
|
||||
<p>{% if best_context.type == 'decrease' %}
|
||||
@@ -46,8 +51,6 @@
|
||||
{% endif %}
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<!-- <div class="Stars" style="--rating: {{ price_rating }}" aria-label="Rating of this product is {{ price_rating }} out of 5."></div> -->
|
||||
|
||||
<div class="card" style="margin-top: 2.5rem;">
|
||||
<div class="card-header">
|
||||
|
||||
@@ -8,6 +8,7 @@ import plotly.graph_objects as go
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from wordcloud import WordCloud
|
||||
from collections import Counter
|
||||
|
||||
@@ -90,7 +91,7 @@ def reject_outliers(data: list[float], m: float) -> list[int]:
|
||||
|
||||
return indices.tolist()
|
||||
|
||||
def price_difference_rating(initial: float, final: float) -> float:
|
||||
def price_difference_rating(initial: float, final: float, days: int) -> float:
|
||||
"""
|
||||
The rating is based on the difference between the initial and final
|
||||
price. The rating is 0 if the final price is greater than the initial
|
||||
@@ -101,16 +102,31 @@ def price_difference_rating(initial: float, final: float) -> float:
|
||||
Args:
|
||||
initial: The initial price.
|
||||
final: The final price.
|
||||
days: The number of days a listing has been active.
|
||||
|
||||
Returns:
|
||||
The rating.
|
||||
"""
|
||||
|
||||
# Decay constant (a value greater than 0)
|
||||
decay_constant = 0.01
|
||||
|
||||
# Adjust this value to control the rate of increase of the penalty
|
||||
linear_factor = 0.0125
|
||||
|
||||
# Threshold number of days after which the penalty is applied
|
||||
threshold_days = 7
|
||||
|
||||
if days >= threshold_days:
|
||||
days_past_threshold = days - threshold_days
|
||||
penalty_amount = initial*np.exp(-decay_constant*days_past_threshold) + linear_factor*days_past_threshold*initial
|
||||
initial += penalty_amount
|
||||
|
||||
if initial <= final:
|
||||
rating = 5.0
|
||||
else:
|
||||
price_difference = initial - final
|
||||
rating = 5.0 - (price_difference / initial) * 5.0
|
||||
rating = 5.0 - (price_difference/initial)*5.0
|
||||
|
||||
return max(0.0, min(rating, 5.0))
|
||||
|
||||
@@ -144,43 +160,50 @@ def percentage_difference(list_price: float, best_price: float) -> dict:
|
||||
|
||||
return difference
|
||||
|
||||
def create_chart(categorized: dict, similar_prices: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object:
|
||||
def create_chart(categorized: dict, similar_prices: list[float], similar_shipping: list[float], similar_descriptions: list[str], listing_currency: str, listing_title: str) -> object:
|
||||
"""
|
||||
Creates a line chart visualization based on the categorized items, their prices, and their descriptions.
|
||||
|
||||
Args:
|
||||
categorized (dict): A dictionary where the keys are the names of the clusters and the values are lists of the items in that cluster.
|
||||
similar_prices (list[float]): A list of prices of the items.
|
||||
similar_shipping (list[float]): A list of shipping costs of the items.
|
||||
similar_descriptions (list[str]): A list of descriptions of the items.
|
||||
|
||||
Returns:
|
||||
A JSON string containing the Plotly figure of the line chart.
|
||||
"""
|
||||
|
||||
items, prices, descriptions = [], [], []
|
||||
items, prices, shipping, descriptions = [], [], [], []
|
||||
unit = 1
|
||||
|
||||
for categories, titles in categorized.items():
|
||||
items.append(categories)
|
||||
|
||||
sub_prices, sub_descriptions = [], []
|
||||
sub_prices, sub_shipping, sub_descriptions = [], [], []
|
||||
for title in titles:
|
||||
idx = similar_descriptions.index(title)
|
||||
sub_prices.append(similar_prices[idx])
|
||||
|
||||
sub_prices.append(similar_prices[idx])
|
||||
sub_shipping.append(similar_shipping[idx])
|
||||
sub_descriptions.append(title)
|
||||
prices.append(sub_prices)
|
||||
shipping.append(sub_shipping)
|
||||
descriptions.append(sub_descriptions)
|
||||
|
||||
sort_indices = [sorted(range(len(sublist)), key=lambda x: sublist[x]) for sublist in prices]
|
||||
sorted_prices = [[sublist[i] for i in indices] for sublist, indices in zip(prices, sort_indices)]
|
||||
|
||||
sorted_shipping = [[sublist[i] for i in indices] for sublist, indices in zip(shipping, sort_indices)]
|
||||
formatted_shipping = [[f"${ship}" if ship != "0.00" else "Free" for ship in row] for row in sorted_shipping]
|
||||
|
||||
sorted_descriptions = [[sublist[i] for i in indices] for sublist, indices in zip(descriptions, sort_indices)]
|
||||
|
||||
fig = go.Figure()
|
||||
|
||||
for i, _ in enumerate(items):
|
||||
x = [j*unit + 1 for j in range(len(sorted_prices[i]))]
|
||||
hovertext = [f"Product: {desc.title()}<br>Price: ${price:.2f}" for price, desc in zip(sorted_prices[i], sorted_descriptions[i])]
|
||||
hovertext = [f"Product: {desc.title()}<br>Price: ${price:.2f}<br>Shipping: {ship}" for price, ship, desc in zip(sorted_prices[i], formatted_shipping[i], sorted_descriptions[i])]
|
||||
fig.add_trace(go.Scatter(
|
||||
x=x, y=sorted_prices[i],
|
||||
mode='markers',
|
||||
@@ -188,12 +211,17 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
|
||||
text=hovertext,
|
||||
name=f"Category {i + 1}"))
|
||||
|
||||
# Compute the linear regression on all data points
|
||||
# Compute the polynomial regression on all data points
|
||||
x = np.concatenate([np.arange(len(prices))*unit + 1 for prices in sorted_prices])
|
||||
y = np.concatenate(sorted_prices)
|
||||
reg = LinearRegression().fit(x.reshape(-1, 1), y)
|
||||
x_reg = [np.min(x), np.max(x)*1.5]
|
||||
y_reg = reg.predict(np.array(x_reg).reshape(-1, 1))
|
||||
|
||||
poly_features = PolynomialFeatures(degree=4, include_bias=True)
|
||||
x_poly = poly_features.fit_transform(x.reshape(-1, 1))
|
||||
|
||||
reg = LinearRegression().fit(x_poly, y)
|
||||
x_reg = np.linspace(np.min(x), np.max(x), num=100)
|
||||
x_reg_poly = poly_features.fit_transform(x_reg.reshape(-1, 1))
|
||||
y_reg = reg.predict(x_reg_poly)
|
||||
|
||||
# Add the trend line to the plot
|
||||
fig.add_trace(
|
||||
@@ -202,14 +230,15 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
|
||||
mode='lines',
|
||||
name='Trend Line'))
|
||||
|
||||
# Add prediction annotation for the trend line
|
||||
prediction = reg.predict([[10]])[0]
|
||||
fig.add_annotation(x=10, y=prediction, text=f"Expected Price: ${prediction:.2f} {listing_currency}", showarrow=True)
|
||||
|
||||
# Add annotations to all x values
|
||||
for x_val in x:
|
||||
y_val = reg.predict(poly_features.transform([[x_val]]))[0]
|
||||
fig.add_annotation(x=x_val, y=y_val, text=f"Prediction: ${y_val:.2f}", showarrow=True)
|
||||
|
||||
fig.update_layout(
|
||||
template='plotly_white',
|
||||
hovermode='closest',
|
||||
xaxis_title="Product",
|
||||
xaxis_title="Product Number",
|
||||
yaxis_title=f"Price $({listing_currency})",
|
||||
legend_title="Categories",
|
||||
title={
|
||||
@@ -221,7 +250,7 @@ def create_chart(categorized: dict, similar_prices: list[float], similar_descrip
|
||||
|
||||
return fig.to_json()
|
||||
|
||||
def create_wordcloud(urls: list[str]) -> tuple[object, dict]:
|
||||
def create_wordcloud(urls: list[str]) -> object:
|
||||
"""
|
||||
Creates a word cloud visualization based on a list of website URLs.
|
||||
|
||||
@@ -251,15 +280,15 @@ def create_wordcloud(urls: list[str]) -> tuple[object, dict]:
|
||||
fig = px.imshow(wordcloud)
|
||||
fig.update_layout(
|
||||
xaxis_title="Website URL",
|
||||
yaxis_title="Citations (Bigger is Better)",
|
||||
yaxis_title="Citations",
|
||||
title={
|
||||
'text': "Word Cloud of Websites",
|
||||
'text': "Frequently Cited Websites",
|
||||
'xanchor': 'center',
|
||||
'yanchor': 'top',
|
||||
'y': 0.9,
|
||||
'x': 0.5})
|
||||
|
||||
return fig.to_json(), dict(website_counts)
|
||||
return fig.to_json()
|
||||
|
||||
def categorize_titles(items: list[str]) -> dict:
|
||||
"""
|
||||
@@ -275,20 +304,21 @@ def categorize_titles(items: list[str]) -> dict:
|
||||
vectorizer = TfidfVectorizer()
|
||||
X = vectorizer.fit_transform(items)
|
||||
|
||||
num_clusters = len(items) // 5
|
||||
kmeans = KMeans(n_clusters=num_clusters, n_init=10)
|
||||
wcss = []
|
||||
for i in range(1, len(items)//2):
|
||||
kmeans = KMeans(n_clusters=i, n_init=10)
|
||||
kmeans.fit(X)
|
||||
wcss.append(kmeans.inertia_)
|
||||
|
||||
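# Select the number of clusters for the final fit.
# NOTE(review): int(min(wcss)) is the truncated smallest inertia, not the cluster count at the elbow point — confirm the intended selection rule.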
|
||||
optimal_num_clusters = int(min(wcss))
|
||||
kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10)
|
||||
kmeans.fit(X)
|
||||
|
||||
cluster_names = []
|
||||
for i in range(num_clusters):
|
||||
cluster_items = [items[j] for j in range(len(items)) if kmeans.labels_[j] == i]
|
||||
representative_item = f"{i+1}"
|
||||
cluster_names.append(representative_item)
|
||||
|
||||
clusters = {}
|
||||
for i in range(num_clusters):
|
||||
for i in range(optimal_num_clusters):
|
||||
cluster_items = [items[j] for j in range(len(items)) if kmeans.labels_[j] == i]
|
||||
cluster_name = cluster_names[i]
|
||||
clusters[cluster_name] = cluster_items
|
||||
representative_item = f"{i + 1}"
|
||||
clusters[representative_item] = cluster_items
|
||||
|
||||
return clusters
|
||||
@@ -48,34 +48,33 @@ class Index(View):
|
||||
|
||||
# Find viable products based on the title
|
||||
cleaned_title = remove_illegal_characters(title)
|
||||
similar_descriptions, similar_prices, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0)
|
||||
candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_urls, similar_scores)
|
||||
similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores = shopping_instance.find_viable_product(cleaned_title, ramp_down=0.0)
|
||||
candidates = shopping_instance.construct_candidates(similar_descriptions, similar_prices, similar_shipping, similar_urls, similar_scores)
|
||||
|
||||
# Convert prices to float and shorten the descriptions if necessary
|
||||
similar_prices = [float(price.replace(',', '')) for price in similar_prices]
|
||||
|
||||
# Categorize the titles and create the chart and wordcloud
|
||||
categorized = categorize_titles(similar_descriptions)
|
||||
chart = create_chart(categorized, similar_prices, similar_descriptions, currency, title)
|
||||
wordcloud, website_counts = create_wordcloud(similar_urls)
|
||||
chart = create_chart(categorized, similar_prices, similar_shipping, similar_descriptions, currency, title)
|
||||
wordcloud = create_wordcloud(similar_urls)
|
||||
|
||||
# Based on the best similar product, get the price, description, category, and URL
|
||||
best_product = shopping_instance.lowest_price_highest_similarity(candidates)
|
||||
|
||||
idx = similar_urls.index(best_product[1]["url"])
|
||||
best_price = f"{similar_prices[idx]:,.2f}"
|
||||
best_shipping = similar_shipping[idx]
|
||||
best_title = similar_descriptions[idx]
|
||||
best_score = best_product[1]["similarity"] * 100
|
||||
best_category = [key for key, value in categorized.items() if [item for item in value if item == best_title]][0]
|
||||
|
||||
# Percentage difference between the listing price and the best found price
|
||||
best_context = percentage_difference(float(price), float(best_price.replace(",", "")))
|
||||
price_rating = price_difference_rating(float(price), float(best_price.replace(",", "")))
|
||||
# Percentage difference between the listing price and the best found price (including shipping)
|
||||
best_total = float(best_price.replace(",", "")) + float(best_shipping.replace(",", ""))
|
||||
best_context = percentage_difference(float(price), best_total,)
|
||||
price_rating = price_difference_rating(float(price), best_total, days)
|
||||
|
||||
# Get the total number of items
|
||||
total_items = len(similar_descriptions)
|
||||
max_citations = max([value for value in website_counts.values()])
|
||||
max_website = [key for key, value in website_counts.items() if value == max_citations][0]
|
||||
|
||||
# Create the context
|
||||
context = {
|
||||
@@ -97,12 +96,9 @@ class Index(View):
|
||||
'categorized': categorized,
|
||||
'total_items': total_items,
|
||||
'best_price': best_price,
|
||||
'best_shipping': best_shipping,
|
||||
'best_title': best_title.title(),
|
||||
'best_score': round(best_score, 2),
|
||||
'website_counts': website_counts,
|
||||
'max_website': max_website,
|
||||
'max_citations': max_citations,
|
||||
'best_category': best_category,
|
||||
'best_context': best_context
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user