Merge pull request #40 from bhavanvir/main

Removed usage/reference to nltk + added Python and GitHub Copilot as …
2026-05-19 11:14:29 -04:00 · 2023-04-22 13:34:40 -07:00
parent 239986d4aa d49e3e7251
commit 9fd077e49b
6 changed files with 11 additions and 56 deletions
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -9,7 +9,6 @@ COPY requirements.txt .

 RUN pip install --upgrade pip
 RUN pip install -r requirements.txt
-RUN python3 -c "import nltk; nltk.download('all')"

 COPY . .

--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -7,11 +7,19 @@
 		"context": ".",
 		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
 		"dockerfile": "./Dockerfile"
+	},
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"GitHub.copilot",
+				"ms-python.python"
+			]
+		}
 	}

 	// Features to add to the dev container. More info: https://containers.dev/features.
 	// "features": {},
-
+	
 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
 	// "forwardPorts": [],

--- a/.devcontainer/requirements.txt
+++ b/.devcontainer/requirements.txt
@@ -21,5 +21,4 @@ tzdata==2023.3
 urllib3==1.26.15
 fontawesomefree==5.15.4
 plotly-express==0.4.1
-pandas==2.0.0
-django-bootstrap-v5==1.0.11
+pandas==2.0.0
--- a/scraper/scraper_class.py
+++ b/scraper/scraper_class.py
@@ -68,7 +68,7 @@ class FacebookScraper:
        description = self.base_soup.find("meta", {"name": "DC.description"})
        description_content = description["content"]

-        return clean_text(description_content)
+        return description_content

    def is_listing_missing(self) -> bool:
        title_element = self.mobile_soup.find("title")
--- a/scraper/utils.py
+++ b/scraper/utils.py
@@ -1,35 +1,9 @@
-from nltk.corpus import stopwords
-from nltk.sentiment import SentimentIntensityAnalyzer
-from nltk.tokenize import RegexpTokenizer
-from nltk.stem import WordNetLemmatizer
 from bs4 import BeautifulSoup
 from difflib import SequenceMatcher
 import numpy as np
 import requests
 import re

-def clean_text(text: str) -> str:
-    """
-    Cleans a string of text by removing punctuation and extra whitespace.
-
-    Args:
-        text: The string of text to clean.
-
-    Returns:
-        The cleaned string of text.
-    """
-    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+')
-    tokenized = tokenizer.tokenize(text)
-    tokenized = [word.lower() for word in tokenized]
-
-    stop_words = stopwords.words('english')
-    filtered = [word for word in tokenized if word not in stop_words and word.isalpha()]
-
-    lemmatizer = WordNetLemmatizer()
-    lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
-    
-    return " ".join(lemmatized)
-
 def clean_listing_title(title: str) -> str:
    """
    Clean a listing title by removing punctuation and converting to lowercase.
@@ -119,28 +93,6 @@ def get_product_price(soup: BeautifulSoup) -> np.ndarray:

    return outlierless

-def sentiment_analysis(text: str) -> float:
-    """
-    Returns the sentiment score of the text, with higher values indicating a more positive sentiment.
-
-    Args:
-        text (str): The text to analyze.
-    Returns:
-        float: The sentiment score, with higher values indicating a more positive sentiment.
-    """
-    sia = SentimentIntensityAnalyzer()
-    sentiment = sia.polarity_scores(text)
-    neg, neu, pos, compound = sentiment["neg"], sentiment["neu"], sentiment["pos"], sentiment["compound"]
-
-    if compound > 0.0:
-        rating = 5 * max(pos, compound)
-    elif compound < 0.0:
-        rating = 5 * min(neg, compound)
-    else:
-        rating = 5 * neu
-
-    return abs(rating)
-
 def create_soup(url: str, headers: dict) -> BeautifulSoup:
    """
    Create a BeautifulSoup object from a URL.
--- a/scraper/views.py
+++ b/scraper/views.py
@@ -34,8 +34,6 @@ class Index(View):
            title = scraper_instance.get_listing_title()
            list_price = scraper_instance.get_listing_price()

-            sentiment_rating = sentiment_analysis(listing_description)
-
            list_price = re.sub("[\$,]", "", list_price)
            initial_price = int(re.sub("[\$,]", "", list_price))

@@ -77,7 +75,6 @@ class Index(View):
                'shortened_url': shortened_url,
                'mobile_url': mobile_url,
                'market_id': market_id,
-                'sentiment_rating': round(sentiment_rating, 1),
                'title': title,
                'list_price': f"{float(list_price):,.2f}",
                'initial_price': initial_price,