Files
marketscrape-web/scraper.py
2022-09-20 07:57:17 -07:00

81 lines
2.3 KiB
Python

# Web Scraping
import requests
from bs4 import BeautifulSoup
# Sentiment Analysis
import nltk
#nltk.download()
import nltk.corpus
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
# Pattern Matching
import re
def sentiment_analysis(text):
    """Classify the sentiment of ``text`` and summarise it with an emoji.

    The text is scored with ``SentimentIntensityAnalyzer.polarity_scores``.
    The ``compound`` score selects the label (``>= 0.05`` positive,
    ``<= -0.05`` negative, anything in between neutral) and the matching
    ``pos`` / ``neg`` / ``neu`` score is reported as a percentage.

    Args:
        text: The string to analyse.

    Returns:
        A string of the form ``"<emoji> with NN.NN% confidence"``.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    compound = scores["compound"]

    # Guard clauses: positive first, then negative, neutral is the fallback.
    if compound >= 0.05:
        return "🙂 with {:.2f}% confidence".format(scores["pos"] * 100)
    if compound <= -0.05:
        return "🙁 with {:.2f}% confidence".format(scores["neg"] * 100)
    return "😐 with {:.2f}% confidence".format(scores["neu"] * 100)
def html_debug(soup):
    """Dump the prettified markup of ``soup`` to ``index.html`` for debugging.

    The file is created in the current working directory and overwritten if
    it already exists.

    Args:
        soup: Any object exposing ``prettify()`` that returns the markup as a
            string (e.g. a ``BeautifulSoup`` document).
    """
    # The context manager closes the file even if prettify()/write() raises;
    # an explicit UTF-8 encoding keeps non-ASCII page content (emoji,
    # accented characters) from failing on platforms whose default text
    # encoding is not UTF-8.
    with open("index.html", "w", encoding="utf-8") as handle:
        handle.write(soup.prettify())
def clean_text(text):
    """Normalise free text into a space-separated string of lemmas.

    Steps: tokenize with a regular expression, lower-case every token, drop
    English stop words and any token that is not purely alphabetic, then
    lemmatize what remains with ``WordNetLemmatizer``.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives the filters).
    """
    # Raw string: "\w", "\$", "\d" and "\S" are regex escapes, not Python
    # string escapes. Without the r-prefix newer Python versions emit a
    # warning for each invalid escape sequence.
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
    tokens = [token.lower() for token in tokenizer.tokenize(text)]

    # A set makes each membership test O(1) instead of scanning the whole
    # stop-word list for every token.
    stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and token.isalpha()
    )
def get_title(soup):
    """Return the listing title stored in the page's ``DC.title`` meta tag.

    Args:
        soup: Parsed page exposing ``find`` (e.g. a ``BeautifulSoup``
            document).

    Returns:
        The value of the tag's ``content`` attribute.

    Raises:
        ValueError: If the page has no ``<meta name="DC.title">`` tag, or the
            tag carries no ``content`` attribute.
    """
    tag = soup.find("meta", {"name": "DC.title"})
    # find() yields None when nothing matches; report that explicitly instead
    # of failing later with "'NoneType' object is not subscriptable".
    content = tag.get("content") if tag is not None else None
    if content is None:
        raise ValueError('page has no <meta name="DC.title"> content')
    return content
def get_description(soup):
    """Return the cleaned listing description from the ``DC.description`` tag.

    The tag's ``content`` attribute is passed through ``clean_text`` before
    being returned.

    Args:
        soup: Parsed page exposing ``find`` (e.g. a ``BeautifulSoup``
            document).

    Returns:
        The result of ``clean_text`` applied to the description text.

    Raises:
        ValueError: If the page has no ``<meta name="DC.description">`` tag,
            or the tag carries no ``content`` attribute.
    """
    tag = soup.find("meta", {"name": "DC.description"})
    # find() yields None when nothing matches; report that explicitly instead
    # of failing later with "'NoneType' object is not subscriptable".
    content = tag.get("content") if tag is not None else None
    if content is None:
        raise ValueError('page has no <meta name="DC.description"> content')
    return clean_text(content)
def get_price(soup):
    """Extract the price text from the second ``<span>`` of the page.

    The markup of the second ``<span>`` is searched for a ``$`` and
    everything from that ``$`` up to (not including) the next ``<`` is
    returned.

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a ``BeautifulSoup``
            document).
            NOTE(review): presumably the mobile page layout puts the price in
            the second span - confirm against a live page.

    Returns:
        The matched price string, starting with ``$``.

    Raises:
        ValueError: If the page has fewer than two ``<span>`` elements, or
            the second one contains no ``$``.
    """
    spans = soup.find_all("span")
    if len(spans) < 2:
        raise ValueError(
            "page has fewer than two <span> elements; cannot locate the price"
        )

    match = re.search(r"\$[0-9]*[^<]*", str(spans[1]))
    if match is None:
        # re.search returns None on no match; fail with a clear message
        # rather than an AttributeError on ".group".
        raise ValueError("no '$' price found in the second <span> element")
    return match.group(0)
def create_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        A ``BeautifulSoup`` document built from the response text with the
        ``html.parser`` backend.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
def main():
    """Prompt for a listing URL and print its sentiment, title and price.

    The description and title are read from the page at the URL as entered;
    the price is read from the mobile variant of the same page (query string
    removed, ``www.`` host prefix replaced by ``m.``).
    """
    url = input("Enter URL: ")

    # Drop the query string. str.partition also copes with URLs that contain
    # no "?", where the previous regex search returned None and crashed.
    base_url = url.partition("?")[0]
    # Rewrite only the host prefix so a "www" appearing later in the path is
    # left untouched.
    mobile_url = base_url.replace("://www.", "://m.", 1)

    # Fetch the listing page once and reuse it for both description and title.
    listing_soup = create_soup(url)
    print("\nHow we feel about this listing: {}".format(
        sentiment_analysis(get_description(listing_soup))))
    print("Title: {}".format(get_title(listing_soup)))
    print("Price: {}".format(get_price(create_soup(mobile_url))))
# Run the interactive scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()