From 075e51d8d439a950fcd745c30afbd4253541cf78 Mon Sep 17 00:00:00 2001 From: Bhavanvir Rai Date: Tue, 20 Sep 2022 07:57:17 -0700 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Sentiment=20analysis=20is?= =?UTF-8?q?=20now=20more=20conclusive.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scraper.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/scraper.py b/scraper.py index 841d3e1..daec027 100644 --- a/scraper.py +++ b/scraper.py @@ -4,11 +4,13 @@ from bs4 import BeautifulSoup # Sentiment Analysis import nltk -nltk.download() +#nltk.download() import nltk.corpus -nltk.download('stopwords') +#nltk.download('stopwords') from nltk.corpus import stopwords from nltk.sentiment import SentimentIntensityAnalyzer +from nltk.tokenize import RegexpTokenizer +from nltk.stem import WordNetLemmatizer # Pattern Matching import re @@ -16,14 +18,13 @@ import re def sentiment_analysis(text): sia = SentimentIntensityAnalyzer() sentiment = sia.polarity_scores(text) - negative, neutral, positive, compound = sentiment['neg'], sentiment['neu'], sentiment['pos'], sentiment['compound'] - - if negative > positive and negative > neutral: - return("🙁 with {:.2f}% confidence".format(negative * 100)) - elif positive > negative: - return("🙂 with {:.2f}% confidence".format((compound - positive) * 100)) + + if sentiment["compound"] >= 0.05: + return("🙂 with {:.2f}% confidence".format(sentiment["pos"] * 100)) + elif sentiment["compound"] <= -0.05: + return("🙁 with {:.2f}% confidence".format(sentiment["neg"] * 100)) else: - return("😐 with {:.2f}% confidence".format(neutral * 100)) + return("😐 with {:.2f}% confidence".format(sentiment["neu"] * 100)) def html_debug(soup): f = open("index.html", "w") @@ -31,12 +32,17 @@ def html_debug(soup): f.close() def clean_text(text): - text = text.lower() - text = re.sub(r"[^A-Za-z0-9]+", " ", text) - text = ' '.join(text.splitlines()) - text = ' '.join([word for word in text.split() if word not in (stopwords.words('english'))]) + tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+') + tokenized = tokenizer.tokenize(text) + tokenized = [word.lower() for word in tokenized] + + stop_words = stopwords.words('english') + filtered = [word for word in tokenized if word not in stop_words and word.isalpha()] + + lemmatizer = WordNetLemmatizer() + lemmatized = [lemmatizer.lemmatize(word) for word in filtered] - return text + return " ".join(lemmatized) def get_title(soup): title = soup.find("meta", {"name": "DC.title"}) @@ -68,7 +74,7 @@ def main(): mobile_url = shortened_url.replace("www", "m") print("\nHow we feel about this listing: {}".format(sentiment_analysis(get_description(create_soup(url))))) - print("Vehicle: {}".format(get_title(create_soup(url)))) + print("Title: {}".format(get_title(create_soup(url)))) print("Price: {}".format(get_price(create_soup(mobile_url)))) if __name__ == "__main__":