Merge pull request #42 from bhavanvir/main

Now scraping more pages in Google Shopping + updated Results page, ag…
This commit is contained in:
Alex Holland
2023-04-22 22:31:10 -07:00
committed by GitHub
3 changed files with 72 additions and 34 deletions

View File

@@ -60,14 +60,13 @@
</p>
<div id="chart" data-chart="{{ chart }}"></div>
<p>
We found that the best deal occured in the <span style="color:#28a745">{{ best_similar_category }}</span> category
with a price of <span style="color:#28a745">${{ best_similar_price }}</span> for the <a style="color:#28a745" href="{{ best_similar_url }}" target="_blank">{{ best_similar_description }}</a> product,
Looks like we've found a winner! Our fancy-schmancy algorithm suggests that the <a style="color:#28a745" href="{{ best_similar_url }}" target="_blank">{{ best_similar_description }}</a> product, located in the <span style="color:#28a745">{{ best_similar_category }}</span> category, is the closest match to your list item, with a mind-blowing similarity score of <span style="color:#28a745">{{ best_similar_score }}%</span> and the sweetest price of <span style="color:#28a745">${{ best_similar_price}}</span>.
{% if list_best_context.type == "decrease" %}
which is a <span style="color:#28a745">{{ list_best_context.type }}</span> of <span style="color:#28a745">{{ list_best_context.amount }}%</span>
Talk about a bargain! This baby is <span style="color:#28a745">{{ list_best_context.amount }}%</span> cheaper than your original pick!
{% else %}
which is an <span style="color:#dc3545">{{ list_best_context.type }}</span> of <span style="color:#dc3545">{{ list_best_context.amount }}%</span>
Oops, looks like this one's <span style="color:#dc3545">{{ list_best_context.amount }}%</span> more expensive than your original choice.
{% endif %}
from the listed item!
Anyway, we won't tell your list item that you've found a new love 😉
</p>
</div>
</div>

View File

@@ -152,7 +152,34 @@ def price_difference_rating(initial: float, final: float) -> float:
return rating
def find_viable_product(title: str, ramp_down: float) -> tuple[float, float, float]:
def lowest_price_highest_similarity(filtered_prices_descriptions: dict) -> "tuple[str, dict] | None":
    """
    Picks the best candidate from the filtered prices and descriptions.

    The best candidate is the entry with the highest similarity; when several
    entries share that similarity, the one with the lowest price wins. If
    entries tie on both, the one that appears first in the mapping is kept.

    Args:
        filtered_prices_descriptions: A mapping from an item key to an info
            dict that has at least the 'similarity' and 'price' keys.

    Returns:
        An (item, info) pair for the best candidate, where info is the
        original dict stored in the mapping, or None if the mapping is empty.
    """
    if not filtered_prices_descriptions:
        return None

    # Rank by descending similarity, then ascending price. No fixed starting
    # value is used for the similarity, so candidates whose similarity is zero
    # or negative are still ranked instead of being silently dropped.
    # min() returns the first minimal element, which keeps insertion order as
    # the final tie-breaker.
    return min(
        filtered_prices_descriptions.items(),
        key=lambda entry: (-entry[1]['similarity'], entry[1]['price']),
    )
def find_viable_product(title: str, ramp_down: float) -> tuple[list, list, list]:
"""
Finds viable products based on the title of the Marketplace listing,
and utilizes the ramp down of the previous product in the sequence, to
@@ -171,26 +198,34 @@ def find_viable_product(title: str, ramp_down: float) -> tuple[float, float, flo
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
url = f"https://www.google.com/search?q={cleaned_title}&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc"
soup = create_soup(url, headers)
similarity_threshold = 0.25
descriptions = []
prices = []
urls = []
try:
filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold)
assert len(filtered_prices_descriptions) > 0
except AssertionError:
while len(filtered_prices_descriptions) == 0:
ramp_down += 0.05
filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold - ramp_down)
for page_number in range(3):
start = page_number * 60
url = f"https://www.google.com/search?q={cleaned_title}&tbs=vw:d&tbm=shop&sxsrf=APwXEdeCneQw6hWKHlHMJptjJHcIzqvmvw:1682209446957&ei=pnpEZILiOcmD0PEPifacgAw&start={start}&sa=N&ved=0ahUKEwiCzZfE3r7-AhXJATQIHQk7B8AQ8tMDCLEY&biw=1920&bih=927&dpr=1"
soup = create_soup(url, headers)
similarity_threshold = 0.25
descriptions = list(filtered_prices_descriptions.keys())
try:
filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold)
assert len(filtered_prices_descriptions) > 0
except AssertionError:
while len(filtered_prices_descriptions) == 0:
ramp_down += 0.05
filtered_prices_descriptions = listing_product_similarity(soup, cleaned_title, similarity_threshold - ramp_down)
prices = list(filtered_prices_descriptions.values())
prices = [f"{price['price']:,.2f}" for price in prices]
descriptions += list(filtered_prices_descriptions.keys())
prices += [f"{price['price']:,.2f}" for price in filtered_prices_descriptions.values()]
urls += [price['url'] for price in filtered_prices_descriptions.values()]
best_result = lowest_price_highest_similarity(filtered_prices_descriptions)
return descriptions, prices, urls, best_result
urls = [price['url'] for price in filtered_prices_descriptions.values()]
return descriptions, prices, urls
def listing_product_similarity(soup: BeautifulSoup, title: str, similarity_threshold: float) -> dict:
"""
@@ -217,7 +252,7 @@ def listing_product_similarity(soup: BeautifulSoup, title: str, similarity_thres
filtered_prices_descriptions = {}
for key, value in price_description.items():
if value['similarity'] >= similarity_threshold:
filtered_prices_descriptions[key] = {'price': value['price'], 'url': value['url']}
filtered_prices_descriptions[key] = {'price': value['price'], 'url': value['url'], 'similarity': value['similarity']}
return filtered_prices_descriptions

View File

@@ -35,24 +35,27 @@ class Index(View):
price = scraper_instance.get_listing_price()
city = scraper_instance.get_listing_city()
similar_descriptions, similar_prices, similar_urls = find_viable_product(title, ramp_down=0.0)
similar_descriptions, similar_prices, similar_urls, best_similar_product = find_viable_product(title, ramp_down=0.0)
similar_prices = [float(price.replace(',', '')) for price in similar_prices]
shortened_item_names = [description[:10] + '...' if len(description) > 10 else description for description in similar_descriptions]
shortened_item_names = [description[:8] + '...' if len(description) > 10 else description for description in similar_descriptions]
# Based on the best similar product, get the price, description, category, and URL
idx = similar_urls.index(best_similar_product[1]["url"])
best_similar_price = f"{similar_prices[idx]:,.2f}"
best_similar_description = similar_descriptions[idx]
best_similar_category = shortened_item_names[idx]
best_similar_url = similar_urls[idx]
best_similar_score = best_similar_product[1]["similarity"] * 100
# Create a DataFrame from the data
data = {'Product': shortened_item_names, 'Price': similar_prices, 'Description': similar_descriptions, 'URL': similar_urls}
df = pd.DataFrame(data)
# Used to determine colour range bounds
cmin = min(similar_prices)
cmax = max(similar_prices)
idx = similar_prices.index(cmin)
best_similar_price = f"{similar_prices[idx]:,.2f}"
best_similar_description = similar_descriptions[idx]
best_similar_category = shortened_item_names[idx]
best_similar_url = similar_urls[idx]
# Ratio
# Ratio to limit the total bubble size
desired_diameter = 150
sizeref = cmax / desired_diameter
@@ -65,7 +68,7 @@ class Index(View):
list_best_context = percentage_difference(float(price), float(best_similar_price))
# Needs to be redone
price_rating = price_difference_rating(float(price), float(cmin))
price_rating = price_difference_rating(float(price), float(best_similar_price))
categories = list(set(shortened_item_names))
@@ -88,7 +91,8 @@ class Index(View):
'best_similar_description': best_similar_description,
'best_similar_category': best_similar_category,
'best_similar_url': best_similar_url,
'list_best_context': list_best_context,
'best_similar_score': f"{best_similar_score:.2f}",
'list_best_context': list_best_context
}
return render(request, 'scraper/result.html', context)