From 075e51d8d439a950fcd745c30afbd4253541cf78 Mon Sep 17 00:00:00 2001
From: Bhavanvir Rai <bhavanvir.r@gmail.com>
Date: Tue, 20 Sep 2022 07:57:17 -0700
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Sentiment=20analysis=20is?=
 =?UTF-8?q?=20now=20more=20conclusive.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scraper.py | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/scraper.py b/scraper.py
index 841d3e1..daec027 100644
--- a/scraper.py
+++ b/scraper.py
@@ -4,11 +4,13 @@ from bs4 import BeautifulSoup
 
 # Sentiment Analysis
 import nltk
-nltk.download()
+# nltk.download()  # opens NLTK's interactive downloader; run manually once if NLTK data is missing
 import nltk.corpus
-nltk.download('stopwords')
+# nltk.download('stopwords')  # run manually once; the stopwords corpus is needed by clean_text
 from nltk.corpus import stopwords
 from nltk.sentiment import SentimentIntensityAnalyzer
+from nltk.tokenize import RegexpTokenizer
+from nltk.stem import WordNetLemmatizer
 
 # Pattern Matching
 import re
@@ -16,14 +18,13 @@ import re
 def sentiment_analysis(text):
     sia = SentimentIntensityAnalyzer()
     sentiment = sia.polarity_scores(text)
-    negative, neutral, positive, compound = sentiment['neg'], sentiment['neu'], sentiment['pos'], sentiment['compound']
-  
-    if negative > positive and negative > neutral:
-        return("🙁 with {:.2f}% confidence".format(negative * 100))
-    elif positive > negative:
-        return("🙂 with {:.2f}% confidence".format((compound - positive) * 100))
+    # Classify on the compound score; +/-0.05 are the cut-offs recommended in the VADER documentation.
+    if sentiment["compound"] >= 0.05:
+        return("🙂 with {:.2f}% confidence".format(sentiment["pos"] * 100))
+    elif sentiment["compound"] <= -0.05:
+        return("🙁 with {:.2f}% confidence".format(sentiment["neg"] * 100))
     else:
-        return("😐 with {:.2f}% confidence".format(neutral * 100))
+        return("😐 with {:.2f}% confidence".format(sentiment["neu"] * 100))
 
 def html_debug(soup):
     f = open("index.html", "w")
@@ -31,12 +32,17 @@ def html_debug(soup):
     f.close()
 
 def clean_text(text):
-    text = text.lower()
-    text = re.sub(r"[^A-Za-z0-9]+", " ", text)
-    text = ' '.join(text.splitlines())
-    text = ' '.join([word for word in text.split() if word not in (stopwords.words('english'))])
+    # Raw string keeps the regex escapes literal; URLs come first so "\w+" cannot split them.
+    tokenizer = RegexpTokenizer(r'http\S+|\$[\d.]+|\w+')
+    tokenized = [word.lower() for word in tokenizer.tokenize(text)]
+
+    # A set gives O(1) stop-word lookups; isalpha() drops URL, price and numeric tokens.
+    stop_words = set(stopwords.words('english'))
+    filtered = [word for word in tokenized if word.isalpha() and word not in stop_words]
+    lemmatizer = WordNetLemmatizer()
+    lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
     
-    return text
+    return " ".join(lemmatized)
 
 def get_title(soup):
     title = soup.find("meta", {"name": "DC.title"})
@@ -68,7 +74,7 @@ def main():
     mobile_url = shortened_url.replace("www", "m")
 
     print("\nHow we feel about this listing: {}".format(sentiment_analysis(get_description(create_soup(url)))))
-    print("Vehicle: {}".format(get_title(create_soup(url))))
+    print("Title: {}".format(get_title(create_soup(url))))
     print("Price: {}".format(get_price(create_soup(mobile_url))))
 
 if __name__ == "__main__":