From 194d6e748e886d56e208709dd4f6b5a8325c76b7 Mon Sep 17 00:00:00 2001 From: Bhavanvir Rai Date: Sun, 7 May 2023 00:50:22 -0700 Subject: [PATCH] Added 95% confidence interval to price trends, and fixed regex error. --- scraper/shop_class.py | 20 ++++++++------ scraper/utils.py | 61 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/scraper/shop_class.py b/scraper/shop_class.py index 84e6a38..13e3567 100644 --- a/scraper/shop_class.py +++ b/scraper/shop_class.py @@ -100,8 +100,17 @@ class EbayScraper: for ship in shipping: values.append(ship.text) - cleansed = [re.search(r"([0-9]+\.[0-9]+)|(Free)|(not specified)", ship).group(0) for ship in values] - cleansed = [float(ship) if ship not in ["Free", "not specified"] else 0.0 for ship in cleansed] + cleansed = [] + for ship in values: + match = re.search(r"([0-9]+.*[0-9])|(Free)|(not specified)", ship) + if match: + if match.group(1): + price = match.group(1).replace(",", "") + cleansed.append(float(price)) + else: + cleansed.append(0.0) + else: + cleansed.append(0.0) return cleansed @@ -292,12 +301,7 @@ class EbayScraper: The descriptions, prices and countries the viable products. """ - descriptions = [] - prices = [] - shipping = [] - countries = [] - conditions = [] - similarities = [] + descriptions, prices, shipping, countries, conditions, similarities = [], [], [], [], [], [] for page_number in range(5): similarity_threshold = 0.35 diff --git a/scraper/utils.py b/scraper/utils.py index d74180a..09d423e 100644 --- a/scraper/utils.py +++ b/scraper/utils.py @@ -6,6 +6,7 @@ import re import plotly.graph_objects as go from sklearn.linear_model import LinearRegression from sklearn.preprocessing import PolynomialFeatures +from sklearn.metrics import mean_squared_error from collections import Counter def remove_illegal_characters(title: str) -> str: @@ -182,14 +183,22 @@ def create_chart(similar_prices: list[float], similar_shipping: list[float], sim x=sorted_similar_prices[:, 0], y=sorted_similar_shipping, mode='markers', - marker=dict(color=sorted_similar_prices[:, 0], - colorscale='RdYlGn_r', - colorbar=dict(title="Price")), + marker=dict( + color=sorted_similar_prices[:, 0] + sorted_similar_shipping, + colorscale='RdYlGn_r', + colorbar=dict(title="Total Price"), + size=8 + ), hovertemplate="%{text}", - text=[f"Product: {desc.title()}
Price: ${price:.2f}
Shipping: ${ship:.2f}
Condition: {cond}" - for desc, price, ship, cond in zip(sorted_similar_descriptions, sorted_similar_prices[:, 0], sorted_similar_shipping, sorted_similar_conditions)], + text=[ + f"Product: {desc.title()}
Price: ${price:,.2f}
Shipping: ${ship:,.2f}
Condition: {cond}" + for desc, price, ship, cond in zip(sorted_similar_descriptions, sorted_similar_prices[:, 0], sorted_similar_shipping, sorted_similar_conditions) + ], showlegend=False, - name="Products")) + name="Products" + ) + ) + fig.update_layout( template='plotly_white', hovermode='closest', @@ -201,7 +210,9 @@ def create_chart(similar_prices: list[float], similar_shipping: list[float], sim 'xanchor': 'center', 'yanchor': 'top', 'y': 0.9, - 'x': 0.5}) + 'x': 0.5 + } + ) # Add best match annotation best_price = similar_prices[similar_descriptions.index(best_title)] @@ -212,7 +223,7 @@ def create_chart(similar_prices: list[float], similar_shipping: list[float], sim y=[best_shipping], mode='markers', marker=dict( - color='black', + color='#fc0', symbol='star', size=12 ), @@ -222,7 +233,7 @@ def create_chart(similar_prices: list[float], similar_shipping: list[float], sim ) # Perform polynomial regression to obtain polynomial coefficients - poly_features = PolynomialFeatures(degree=3, include_bias=True) + poly_features = PolynomialFeatures(degree=4, include_bias=True) X_poly = poly_features.fit_transform(sorted_similar_prices) poly_model = LinearRegression() poly_model.fit(X_poly, sorted_similar_shipping) @@ -231,15 +242,43 @@ def create_chart(similar_prices: list[float], similar_shipping: list[float], sim X_range_poly = poly_features.fit_transform(X_range.reshape(-1, 1)) Y_range = poly_model.predict(X_range_poly) + # Calculate confidence interval + y_pred = poly_model.predict(X_poly) + mse = mean_squared_error(sorted_similar_shipping, y_pred) + # 95% confidence interval + ci = 1.96 * np.sqrt(mse) + + upper_bound = Y_range + ci + lower_bound = Y_range - ci + fig.add_trace( go.Scatter( x=X_range, - y=poly_model.predict(X_range_poly), + y=Y_range, mode='lines', hovertemplate="%{text}", text=[f"Predicted Price: ${price:.2f}
Predicted Shipping: ${ship:.2f}" for price, ship in zip(X_range, Y_range)], showlegend=False, - name="Polynomial Regression")) + name="Trend Line", + line_color='rgb(128, 128, 128)', + line=dict( + dash='longdash', + ) + ) + ) + + fig.add_trace( + go.Scatter( + x=np.concatenate([X_range, X_range[::-1]]), + y=np.concatenate([upper_bound, lower_bound[::-1]]), + fill='toself', + fillcolor='rgba(128, 128, 128, 0.15)', + line_color='rgba(255, 255, 255, 0)', + hoverinfo="skip", + showlegend=False + ) + ) + return fig.to_json()