Mirror of https://github.com/Marketscrape/marketscrape-web.git (synced 2026-04-29 02:02:38 -04:00)
⚡️ Sentiment analysis is now more conclusive.
This commit is contained in:

 scraper.py | 36
@@ -4,11 +4,13 @@ from bs4 import BeautifulSoup
 # Sentiment Analysis
 import nltk
-nltk.download()
+#nltk.download()
 import nltk.corpus
-nltk.download('stopwords')
+#nltk.download('stopwords')
 from nltk.corpus import stopwords
 from nltk.sentiment import SentimentIntensityAnalyzer
+from nltk.tokenize import RegexpTokenizer
+from nltk.stem import WordNetLemmatizer
 
 # Pattern Matching
 import re
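
With the download calls commented out, the NLTK data this module relies on has to be fetched once ahead of time. A minimal one-time setup sketch; the exact resource names (vader_lexicon, stopwords, wordnet) are assumptions inferred from the imports above, not something this commit pins down:

# One-time NLTK data setup (assumed resource names, inferred from the imports above).
import nltk

for resource in ("vader_lexicon", "stopwords", "wordnet"):
    # quiet=True suppresses the download log; already-installed resources are not re-fetched
    nltk.download(resource, quiet=True)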
@@ -16,14 +18,13 @@ import re
 def sentiment_analysis(text):
     sia = SentimentIntensityAnalyzer()
     sentiment = sia.polarity_scores(text)
-    negative, neutral, positive, compound = sentiment['neg'], sentiment['neu'], sentiment['pos'], sentiment['compound']
 
-    if negative > positive and negative > neutral:
-        return("🙁 with {:.2f}% confidence".format(negative * 100))
-    elif positive > negative:
-        return("🙂 with {:.2f}% confidence".format((compound - positive) * 100))
+    if sentiment["compound"] >= 0.05:
+        return("🙂 with {:.2f}% confidence".format(sentiment["pos"] * 100))
+    elif sentiment["compound"] <= -0.05:
+        return("🙁 with {:.2f}% confidence".format(sentiment["neg"] * 100))
     else:
-        return("😐 with {:.2f}% confidence".format(neutral * 100))
+        return("😐 with {:.2f}% confidence".format(sentiment["neu"] * 100))
 
 def html_debug(soup):
     f = open("index.html", "w")
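
The rewritten sentiment_analysis keys off VADER's compound score with the conventional ±0.05 cutoffs instead of comparing the raw neg/neu/pos components against each other. A self-contained sketch of that decision rule; the sample strings are made up for illustration:

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
for text in ("Runs great, brand new tires!", "Engine is seized and the frame is rusted."):
    scores = sia.polarity_scores(text)  # dict with 'neg', 'neu', 'pos', 'compound'
    if scores["compound"] >= 0.05:
        label = "positive"
    elif scores["compound"] <= -0.05:
        label = "negative"
    else:
        label = "neutral"
    print("{} -> {} (compound {:+.2f})".format(text, label, scores["compound"]))

One caveat worth noting: the percentages the function prints are VADER's neg/pos/neu component scores, which measure the proportion of the text falling into each category rather than classifier confidence.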
@@ -31,12 +32,17 @@ def html_debug(soup):
     f.close()
 
 def clean_text(text):
-    text = text.lower()
-    text = re.sub(r"[^A-Za-z0-9]+", " ", text)
-    text = ' '.join(text.splitlines())
-    text = ' '.join([word for word in text.split() if word not in (stopwords.words('english'))])
+    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+')
+    tokenized = tokenizer.tokenize(text)
+    tokenized = [word.lower() for word in tokenized]
+
+    stop_words = stopwords.words('english')
+    filtered = [word for word in tokenized if word not in stop_words and word.isalpha()]
+
+    lemmatizer = WordNetLemmatizer()
+    lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
 
-    return text
+    return " ".join(lemmatized)
 
 def get_title(soup):
     title = soup.find("meta", {"name": "DC.title"})
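
clean_text is now a tokenize, stopword-filter, lemmatize pipeline rather than plain regex substitution. A short trace of what each stage produces; the listing text is invented for illustration, and the pattern is written as a raw string here to avoid escape warnings:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

text = "Selling my old bikes for $450.00, tires were replaced last week"

tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
tokenized = [word.lower() for word in tokenizer.tokenize(text)]
# ['selling', 'my', 'old', 'bikes', 'for', '$450.00', 'tires', 'were', 'replaced', 'last', 'week']

stop_words = stopwords.words('english')
filtered = [word for word in tokenized if word not in stop_words and word.isalpha()]
# stopword filtering drops 'my', 'for', 'were'; isalpha() drops the '$450.00' token

lemmatizer = WordNetLemmatizer()
print(" ".join(lemmatizer.lemmatize(word) for word in filtered))
# selling old bike tire replaced last week

Note that word.isalpha() discards the very dollar-amount and URL tokens the tokenizer pattern goes out of its way to keep, so prices and links never reach the lemmatizer.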
@@ -68,7 +74,7 @@ def main():
     mobile_url = shortened_url.replace("www", "m")
 
     print("\nHow we feel about this listing: {}".format(sentiment_analysis(get_description(create_soup(url)))))
-    print("Vehicle: {}".format(get_title(create_soup(url))))
+    print("Title: {}".format(get_title(create_soup(url))))
     print("Price: {}".format(get_price(create_soup(mobile_url))))
 
 if __name__ == "__main__":