mirror of
https://github.com/Marketscrape/marketscrape-web.git
synced 2026-04-19 13:26:54 -04:00
158 lines
4.7 KiB
Python
158 lines
4.7 KiB
Python
# Web Scraping
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Math
|
|
import statistics
|
|
|
|
# Currency Conversion
|
|
from currency_converter import CurrencyConverter
|
|
|
|
# Sentiment Analysis
|
|
import nltk
|
|
#nltk.download()
|
|
import nltk.corpus
|
|
#nltk.download('stopwords')
|
|
from nltk.corpus import stopwords
|
|
from nltk.sentiment import SentimentIntensityAnalyzer
|
|
from nltk.tokenize import RegexpTokenizer
|
|
from nltk.stem import WordNetLemmatizer
|
|
|
|
# Pattern Matching
|
|
import re
|
|
|
|
def sentiment_analysis(text):
    """Map the overall sentiment of *text* to an emoji.

    The text is scored with NLTK's ``SentimentIntensityAnalyzer`` and only
    the ``"compound"`` entry of the resulting scores is inspected.

    Args:
        text: The string to score.

    Returns:
        "🙂" when the compound score is at least 0.05, "🙁" when it is at
        most -0.05, and "😐" for anything in between.
    """
    compound = SentimentIntensityAnalyzer().polarity_scores(text)["compound"]

    # The two thresholds cannot both hold, so the order of the guards
    # does not matter.
    if compound <= -0.05:
        return "🙁"
    if compound >= 0.05:
        return "🙂"
    return "😐"
|
|
|
|
def max_sentiment(sentiment):
    """Return the largest score in *sentiment*, ignoring ``"compound"``.

    Unlike a ``del``-based implementation, the caller's mapping is left
    untouched, so the same scores dict can be reused afterwards.

    Args:
        sentiment: Mapping of sentiment label to numeric score. A
            ``"compound"`` key, if present, is skipped.

    Returns:
        The highest score among the remaining entries.

    Raises:
        ValueError: If no entries remain once ``"compound"`` is excluded.
    """
    return max(
        score for label, score in sentiment.items() if label != "compound"
    )
|
|
|
|
def min_sentiment(sentiment):
    """Return the smallest score in *sentiment*, ignoring ``"compound"``.

    Unlike a ``del``-based implementation, the caller's mapping is left
    untouched, so the same scores dict can be reused afterwards.

    Args:
        sentiment: Mapping of sentiment label to numeric score. A
            ``"compound"`` key, if present, is skipped.

    Returns:
        The lowest score among the remaining entries.

    Raises:
        ValueError: If no entries remain once ``"compound"`` is excluded.
    """
    return min(
        score for label, score in sentiment.items() if label != "compound"
    )
|
|
|
|
def clean_text(text):
    """Normalize free text into a space-joined string of lemmatized words.

    Steps: tokenize with a regular expression, lowercase every token, drop
    English stop words and any token that is not purely alphabetic, then
    lemmatize what is left with ``WordNetLemmatizer``.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving lemmatized words joined by single spaces.
    """
    # Raw string: the pattern contains regex escapes (\w, \$, \d, \S) that
    # are invalid *string* escapes and trigger warnings in a plain literal.
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
    lowered = (token.lower() for token in tokenizer.tokenize(text))

    # A set gives constant-time membership tests for each token.
    stop_words = set(stopwords.words('english'))
    filtered = [
        word for word in lowered
        if word not in stop_words and word.isalpha()
    ]

    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in filtered)
|
|
|
|
def get_title(soup):
    """Return the ``content`` of the page's ``DC.title`` meta tag.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The value of the ``content`` attribute of the first
        ``<meta name="DC.title">`` element.
    """
    meta_tag = soup.find("meta", {"name": "DC.title"})
    return meta_tag["content"]
|
|
|
|
def get_description(soup):
    """Return the cleaned ``DC.description`` meta content of the page.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The ``content`` attribute of the first
        ``<meta name="DC.description">`` element after it has been passed
        through ``clean_text``.
    """
    meta_tag = soup.find("meta", {"name": "DC.description"})
    return clean_text(meta_tag["content"])
|
|
|
|
def get_price(soup):
    """Return the text of the first ``<span>`` that contains a ``$``.

    Stops at the first match instead of collecting every matching span.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find_all``.

    Returns:
        The text of the first matching span, as a ``str``.

    Raises:
        IndexError: If no span on the page contains ``$``. The exception
            type matches what indexing an empty result list would raise.
    """
    for span in soup.find_all("span"):
        if "$" in span.text:
            return str(span.text)

    raise IndexError("no <span> containing '$' was found on the page")
|
|
|
|
def percentage_difference(intial, final):
    """Rate the change from *intial* to *final* as a thumbs up or down.

    Args:
        intial: The starting value (spelling kept for keyword callers).
        final: The value to compare against.

    Returns:
        "👎" when the relative change ``(final - intial) / intial`` is
        negative, otherwise "👍".
    """
    difference = final - intial

    # Only the sign of the result is used. When the starting value is zero
    # the ratio is undefined, so fall back to the sign of the raw
    # difference instead of raising ZeroDivisionError.
    value = difference / intial if intial else difference

    if value < 0.0:
        return "👎"
    return "👍"
|
|
|
|
def create_soup(url, headers, timeout=10):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP headers to send, or ``None``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
|
|
|
|
def convert_currency(price, base_currency, target_currency):
    """Convert *price* between two currencies.

    Args:
        price: Amount expressed in *base_currency*.
        base_currency: Currency code of the input amount.
        target_currency: Currency code to convert into.

    Returns:
        Whatever ``CurrencyConverter.convert`` returns for the given
        amount and currency pair.
    """
    converter = CurrencyConverter()
    return converter.convert(price, base_currency, target_currency)
|
|
|
|
def find_product_prices(title):
    """Search Google Shopping for *title* and summarize the prices found.

    Args:
        title: Product name used as the search query.

    Returns:
        A ``(median, deviation)`` tuple: the grouped median of the scraped
        prices and their sample standard deviation. The deviation is
        ``0.0`` when only one price was found.

    Raises:
        statistics.StatisticsError: If no price could be extracted from
            the results page.
    """
    # Imported here so this function carries its own dependency.
    from urllib.parse import quote_plus

    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }
    # Encode the title so spaces, '&', '#', etc. cannot break the query
    # string or leak into other parameters.
    url = "https://www.google.com/search?q=" + quote_plus(title) + "&sa=X&biw=1920&bih=927&tbm=shop&sxsrf=ALiCzsbtwkWiDOQEcm_9X1UBlEG1iaqXtg%3A1663739640147&ei=-KYqY6CsCLez0PEP0Ias2AI&ved=0ahUKEwigiP-RmaX6AhW3GTQIHVADCysQ4dUDCAU&uact=5&oq=REPLACE&gs_lcp=Cgtwcm9kdWN0cy1jYxADMgUIABCABDIFCAAQgAQyBQgAEIAEMgsIABCABBCxAxCDATIECAAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOgsIABAeEA8QsAMQGDoNCAAQHhAPELADEAUQGDoGCAAQChADSgQIQRgBUM4MWO4TYJoVaAFwAHgAgAFDiAGNA5IBATeYAQCgAQHIAQPAAQE&sclient=products-cc"

    soup = create_soup(url, headers)
    # NOTE(review): "HRLxBb" is presumably the CSS class of the price
    # spans on the results page — confirm it is still current.
    price_spans = soup.find_all("span", {"class": "HRLxBb"})

    # Take the first numeric run anywhere in the span text, so prefixed
    # prices such as "CA$1,234.56" parse and spans with no number are
    # skipped instead of crashing on float("").
    number = re.compile(r"\d[\d,]*(?:\.\d+)?|\.\d+")
    normalized = []
    for span in price_spans:
        match = number.search(span.text)
        if match:
            normalized.append(float(match.group(0).replace(",", "")))
    normalized.sort()

    if not normalized:
        raise statistics.StatisticsError(
            "no prices found for {!r}".format(title)
        )

    median = statistics.median_grouped(normalized)
    # stdev() needs at least two data points; a single price has no spread.
    deviation = statistics.stdev(normalized) if len(normalized) > 1 else 0.0

    return median, deviation
|
|
|
|
def valid_url(url):
    """Report whether *url* starts with ``https://www.facebook.com/``.

    Args:
        url: The URL string to check.

    Returns:
        ``True`` if the URL begins with the exact Facebook HTTPS prefix,
        ``False`` otherwise.
    """
    # The dots are escaped so they match literal '.' only; unescaped they
    # would accept look-alike hosts such as "wwwxfacebook.com".
    return re.match(r"https://www\.facebook\.com/", url) is not None
|
|
|
|
def main():
    """Prompt for a listing URL and print a sentiment and price report.

    Reads a URL from standard input, rejects it unless ``valid_url``
    accepts it, scrapes the title, description and price of the listing,
    compares the price with similar products found by
    ``find_product_prices`` and prints the result.

    Raises:
        SystemExit: With status 1 when the URL is invalid or no price can
            be read from the listing page.
    """
    url = input("Enter URL: ")

    if not valid_url(url):
        print("Invalid URL")
        # SystemExit is raised directly: the exit() builtin is injected by
        # the site module and is not guaranteed to exist in every run mode.
        raise SystemExit(1)

    # Keep everything up to the last digit in the URL (presumably the
    # listing ID, dropping trailing query noise); fall back to the full
    # URL when it contains no digit at all.
    id_match = re.search(r".*[0-9]", url)
    shortened_url = id_match.group(0) if id_match else url
    # Only the host prefix should change, so replace the first "www" only.
    mobile_url = shortened_url.replace("www", "m", 1)

    # Fetch the listing page once and reuse it for both lookups.
    listing_soup = create_soup(url, headers=None)
    sentiment = sentiment_analysis(get_description(listing_soup))
    title = get_title(listing_soup)

    # Use the first numeric run in the price text so values such as
    # "CA$1,200" or "$12.50" parse instead of crashing int().
    price_text = get_price(create_soup(mobile_url, headers=None))
    price_match = re.search(r"\d[\d,]*(?:\.\d+)?", price_text)
    if price_match is None:
        print("Could not read a price from the listing")
        raise SystemExit(1)
    initial_price = float(price_match.group(0).replace(",", ""))

    median, deviation = find_product_prices(title)

    print("\nProduct: {}".format(title))
    print("How we feel about the description: {}".format(sentiment))
    print("How we feel about the price: {}".format(percentage_difference(initial_price, median)))
    print("Price range of similar products we found: ${:,.2f} - ${:,.2f}".format(abs(median - deviation), abs(median + deviation)))
|
|
|
|
# Run the interactive report only when executed as a script, not on import.
if __name__ == "__main__":
    main()