From d49e3e7251709e5e201d6af11b06452175e344ff Mon Sep 17 00:00:00 2001 From: Bhavanvir Rai Date: Sat, 22 Apr 2023 13:09:11 -0700 Subject: [PATCH] Removed usage/reference to nltk + added Python and Github Copilot as default extensions. --- .devcontainer/Dockerfile | 1 - .devcontainer/devcontainer.json | 10 ++++++- .devcontainer/requirements.txt | 3 +-- scraper/scraper_class.py | 2 +- scraper/utils.py | 48 --------------------------------- scraper/views.py | 3 --- 6 files changed, 11 insertions(+), 56 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 1057e02..1f0cc60 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -9,7 +9,6 @@ COPY requirements.txt . RUN pip install --upgrade pip RUN pip install -r requirements.txt -RUN python3 -c "import nltk; nltk.download('all')" COPY . . diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 36d88d7..ed96220 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,11 +7,19 @@ "context": ".", // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. "dockerfile": "./Dockerfile" + }, + "customizations": { + "vscode": { + "extensions": [ + "GitHub.copilot", + "ms-python.python" + ] + } } // Features to add to the dev container. More info: https://containers.dev/features. // "features": {}, - + // Use 'forwardPorts' to make a list of ports inside the container available locally. // "forwardPorts": [], diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index b94ffd1..c812234 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -21,5 +21,4 @@ tzdata==2023.3 urllib3==1.26.15 fontawesomefree==5.15.4 plotly-express==0.4.1 -pandas==2.0.0 -django-bootstrap-v5==1.0.11 \ No newline at end of file +pandas==2.0.0 \ No newline at end of file diff --git a/scraper/scraper_class.py b/scraper/scraper_class.py index 2e11d2a..b261d15 100644 --- a/scraper/scraper_class.py +++ b/scraper/scraper_class.py @@ -68,7 +68,7 @@ class FacebookScraper: description = self.base_soup.find("meta", {"name": "DC.description"}) description_content = description["content"] - return clean_text(description_content) + return description_content def is_listing_missing(self) -> bool: title_element = self.mobile_soup.find("title") diff --git a/scraper/utils.py b/scraper/utils.py index a35b753..9289301 100644 --- a/scraper/utils.py +++ b/scraper/utils.py @@ -1,35 +1,9 @@ -from nltk.corpus import stopwords -from nltk.sentiment import SentimentIntensityAnalyzer -from nltk.tokenize import RegexpTokenizer -from nltk.stem import WordNetLemmatizer from bs4 import BeautifulSoup from difflib import SequenceMatcher import numpy as np import requests import re -def clean_text(text: str) -> str: - """ - Cleans a string of text by removing punctuation and extra whitespace. - - Args: - text: The string of text to clean. - - Returns: - The cleaned string of text. - """ - tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+') - tokenized = tokenizer.tokenize(text) - tokenized = [word.lower() for word in tokenized] - - stop_words = stopwords.words('english') - filtered = [word for word in tokenized if word not in stop_words and word.isalpha()] - - lemmatizer = WordNetLemmatizer() - lemmatized = [lemmatizer.lemmatize(word) for word in filtered] - - return " ".join(lemmatized) - def clean_listing_title(title: str) -> str: """ Clean a listing title by removing punctuation and converting to lowercase. @@ -119,28 +93,6 @@ def get_product_price(soup: BeautifulSoup) -> np.ndarray: return outlierless -def sentiment_analysis(text: str) -> float: - """ - Returns the sentiment score of the text, with higher values indicating a more positive sentiment. - - Args: - text (str): The text to analyze. - Returns: - float: The sentiment score, with higher values indicating a more positive sentiment. - """ - sia = SentimentIntensityAnalyzer() - sentiment = sia.polarity_scores(text) - neg, neu, pos, compound = sentiment["neg"], sentiment["neu"], sentiment["pos"], sentiment["compound"] - - if compound > 0.0: - rating = 5 * max(pos, compound) - elif compound < 0.0: - rating = 5 * min(neg, compound) - else: - rating = 5 * neu - - return abs(rating) - def create_soup(url: str, headers: dict) -> BeautifulSoup: """ Create a BeautifulSoup object from a URL. diff --git a/scraper/views.py b/scraper/views.py index b5a1fb5..0eb8473 100644 --- a/scraper/views.py +++ b/scraper/views.py @@ -34,8 +34,6 @@ class Index(View): title = scraper_instance.get_listing_title() list_price = scraper_instance.get_listing_price() - sentiment_rating = sentiment_analysis(listing_description) - list_price = re.sub("[\$,]", "", list_price) initial_price = int(re.sub("[\$,]", "", list_price)) @@ -77,7 +75,6 @@ class Index(View): 'shortened_url': shortened_url, 'mobile_url': mobile_url, 'market_id': market_id, - 'sentiment_rating': round(sentiment_rating, 1), 'title': title, 'list_price': f"{float(list_price):,.2f}", 'initial_price': initial_price,