Merge pull request #40 from bhavanvir/main

Removed usage/reference to nltk + added Python and GitHub Copilot as …
This commit is contained in:
Alex Holland
2023-04-22 13:34:40 -07:00
committed by GitHub
6 changed files with 11 additions and 56 deletions

View File

@@ -9,7 +9,6 @@ COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
RUN python3 -c "import nltk; nltk.download('all')"
COPY . .

View File

@@ -7,11 +7,19 @@
"context": ".",
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerfile": "./Dockerfile"
},
"customizations": {
"vscode": {
"extensions": [
"GitHub.copilot",
"ms-python.python"
]
}
}
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

View File

@@ -21,5 +21,4 @@ tzdata==2023.3
urllib3==1.26.15
fontawesomefree==5.15.4
plotly-express==0.4.1
pandas==2.0.0
django-bootstrap-v5==1.0.11
pandas==2.0.0

View File

@@ -68,7 +68,7 @@ class FacebookScraper:
description = self.base_soup.find("meta", {"name": "DC.description"})
description_content = description["content"]
return clean_text(description_content)
return description_content
def is_listing_missing(self) -> bool:
title_element = self.mobile_soup.find("title")

View File

@@ -1,35 +1,9 @@
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from difflib import SequenceMatcher
import numpy as np
import requests
import re
def clean_text(text: str) -> str:
    """
    Normalizes free text into a space-separated string of lemmatized words.

    The text is tokenized, lowercased, filtered to drop English stop words and
    every token that is not purely alphabetic (e.g. prices, URLs, numbers),
    and each surviving word is lemmatized.

    Args:
        text: The string of text to clean.

    Returns:
        The surviving words joined by single spaces. Empty if no word survives.
    """
    # Raw string: "\w", "\$", "\d", "\." and "\S" are regex escapes, not Python
    # string escapes. The non-raw form triggers invalid-escape warnings.
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
    # A set makes the per-token stop-word check O(1) instead of a list scan.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    lowered = (token.lower() for token in tokenizer.tokenize(text))
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in lowered
        if word not in stop_words and word.isalpha()
    )
def clean_listing_title(title: str) -> str:
"""
Clean a listing title by removing punctuation and converting to lowercase.
@@ -119,28 +93,6 @@ def get_product_price(soup: BeautifulSoup) -> np.ndarray:
return outlierless
def sentiment_analysis(text: str) -> float:
    """
    Scores the sentiment of a piece of text as a non-negative rating.

    The rating is five times one of the analyzer's polarity scores, chosen by
    the sign of the compound score, and is returned as an absolute value.

    Args:
        text (str): The text to analyze.

    Returns:
        float: The absolute value of the scaled sentiment score.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    compound = scores["compound"]

    if compound > 0.0:
        magnitude = max(scores["pos"], compound)
    elif compound < 0.0:
        # NOTE(review): if "neg" is never negative this always selects
        # `compound` -- confirm whether max(neg, abs(compound)) was intended.
        magnitude = min(scores["neg"], compound)
    else:
        magnitude = scores["neu"]

    return abs(5 * magnitude)
def create_soup(url: str, headers: dict) -> BeautifulSoup:
"""
Create a BeautifulSoup object from a URL.

View File

@@ -34,8 +34,6 @@ class Index(View):
title = scraper_instance.get_listing_title()
list_price = scraper_instance.get_listing_price()
sentiment_rating = sentiment_analysis(listing_description)
list_price = re.sub("[\$,]", "", list_price)
initial_price = int(re.sub("[\$,]", "", list_price))
@@ -77,7 +75,6 @@ class Index(View):
'shortened_url': shortened_url,
'mobile_url': mobile_url,
'market_id': market_id,
'sentiment_rating': round(sentiment_rating, 1),
'title': title,
'list_price': f"{float(list_price):,.2f}",
'initial_price': initial_price,