Mirror of https://github.com/Marketscrape/marketscrape-web.git (synced 2026-04-29 02:02:38 -04:00)
⚡️ Sentiment analysis is now more conclusive.
This commit is contained in:

 scraper.py | 36
@@ -4,11 +4,13 @@ from bs4 import BeautifulSoup
 # Sentiment Analysis
 import nltk
-nltk.download()
+#nltk.download()
 import nltk.corpus
-nltk.download('stopwords')
+#nltk.download('stopwords')
 from nltk.corpus import stopwords
 from nltk.sentiment import SentimentIntensityAnalyzer
+from nltk.tokenize import RegexpTokenizer
+from nltk.stem import WordNetLemmatizer
 
 # Pattern Matching
 import re
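
With the download calls commented out, the NLTK data this module relies on has to be fetched once ahead of time. A minimal one-time setup sketch; the exact resource names (vader_lexicon, stopwords, wordnet) are assumptions inferred from the imports above, not something this commit pins down:

# One-time NLTK data setup (assumed resource names, inferred from the imports above).
import nltk

for resource in ("vader_lexicon", "stopwords", "wordnet"):
    # quiet=True suppresses the download log; already-installed resources are not re-fetched
    nltk.download(resource, quiet=True)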
@@ -16,14 +18,13 @@ import re
 def sentiment_analysis(text):
     sia = SentimentIntensityAnalyzer()
     sentiment = sia.polarity_scores(text)
-    negative, neutral, positive, compound = sentiment['neg'], sentiment['neu'], sentiment['pos'], sentiment['compound']
 
-    if negative > positive and negative > neutral:
-        return("🙁 with {:.2f}% confidence".format(negative * 100))
-    elif positive > negative:
-        return("🙂 with {:.2f}% confidence".format((compound - positive) * 100))
+    if sentiment["compound"] >= 0.05:
+        return("🙂 with {:.2f}% confidence".format(sentiment["pos"] * 100))
+    elif sentiment["compound"] <= -0.05:
+        return("🙁 with {:.2f}% confidence".format(sentiment["neg"] * 100))
     else:
-        return("😐 with {:.2f}% confidence".format(neutral * 100))
+        return("😐 with {:.2f}% confidence".format(sentiment["neu"] * 100))
 
 def html_debug(soup):
     f = open("index.html", "w")
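
The rewritten sentiment_analysis keys off VADER's compound score with the conventional ±0.05 cutoffs instead of comparing the raw neg/neu/pos components against each other. A self-contained sketch of that decision rule; the sample strings are made up for illustration:

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
for text in ("Runs great, brand new tires!", "Engine is seized and the frame is rusted."):
    scores = sia.polarity_scores(text)  # dict with 'neg', 'neu', 'pos', 'compound'
    if scores["compound"] >= 0.05:
        label = "positive"
    elif scores["compound"] <= -0.05:
        label = "negative"
    else:
        label = "neutral"
    print("{} -> {} (compound {:+.2f})".format(text, label, scores["compound"]))

One caveat worth noting: the percentages the function prints are VADER's neg/pos/neu component scores, which measure the proportion of the text falling into each category rather than classifier confidence.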
@@ -31,12 +32,17 @@ def html_debug(soup):
     f.close()
 
 def clean_text(text):
-    text = text.lower()
-    text = re.sub(r"[^A-Za-z0-9]+", " ", text)
-    text = ' '.join(text.splitlines())
-    text = ' '.join([word for word in text.split() if word not in (stopwords.words('english'))])
+    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+')
+    tokenized = tokenizer.tokenize(text)
+    tokenized = [word.lower() for word in tokenized]
+
+    stop_words = stopwords.words('english')
+    filtered = [word for word in tokenized if word not in stop_words and word.isalpha()]
+
+    lemmatizer = WordNetLemmatizer()
+    lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
 
-    return text
+    return " ".join(lemmatized)
 
 def get_title(soup):
     title = soup.find("meta", {"name": "DC.title"})
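
clean_text is now a tokenize, stopword-filter, lemmatize pipeline rather than plain regex substitution. A short trace of what each stage produces; the listing text is invented for illustration, and the pattern is written as a raw string here to avoid escape warnings:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

text = "Selling my old bikes for $450.00, tires were replaced last week"

tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
tokenized = [word.lower() for word in tokenizer.tokenize(text)]
# ['selling', 'my', 'old', 'bikes', 'for', '$450.00', 'tires', 'were', 'replaced', 'last', 'week']

stop_words = stopwords.words('english')
filtered = [word for word in tokenized if word not in stop_words and word.isalpha()]
# stopword filtering drops 'my', 'for', 'were'; isalpha() drops the '$450.00' token

lemmatizer = WordNetLemmatizer()
print(" ".join(lemmatizer.lemmatize(word) for word in filtered))
# selling old bike tire replaced last week

Note that word.isalpha() discards the very dollar-amount and URL tokens the tokenizer pattern goes out of its way to keep, so prices and links never reach the lemmatizer.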
@@ -68,7 +74,7 @@ def main():
     mobile_url = shortened_url.replace("www", "m")
 
     print("\nHow we feel about this listing: {}".format(sentiment_analysis(get_description(create_soup(url)))))
-    print("Vehicle: {}".format(get_title(create_soup(url))))
+    print("Title: {}".format(get_title(create_soup(url))))
     print("Price: {}".format(get_price(create_soup(mobile_url))))
 
 if __name__ == "__main__":