mirror of
https://github.com/Marketscrape/marketscrape-web.git
synced 2026-05-19 11:14:29 -04:00
Merge pull request #40 from bhavanvir/main
Removed usage/reference to nltk + added Python and Github Copilot as …
This commit is contained in:
@@ -9,7 +9,6 @@ COPY requirements.txt .
|
||||
|
||||
RUN pip install --upgrade pip
|
||||
RUN pip install -r requirements.txt
|
||||
RUN python3 -c "import nltk; nltk.download('all')"
|
||||
|
||||
COPY . .
|
||||
|
||||
|
||||
@@ -7,11 +7,19 @@
|
||||
"context": ".",
|
||||
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
|
||||
"dockerfile": "./Dockerfile"
|
||||
},
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
"extensions": [
|
||||
"GitHub.copilot",
|
||||
"ms-python.python"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
||||
// "features": {},
|
||||
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
// "forwardPorts": [],
|
||||
|
||||
|
||||
@@ -21,5 +21,4 @@ tzdata==2023.3
|
||||
urllib3==1.26.15
|
||||
fontawesomefree==5.15.4
|
||||
plotly-express==0.4.1
|
||||
pandas==2.0.0
|
||||
django-bootstrap-v5==1.0.11
|
||||
pandas==2.0.0
|
||||
@@ -68,7 +68,7 @@ class FacebookScraper:
|
||||
description = self.base_soup.find("meta", {"name": "DC.description"})
|
||||
description_content = description["content"]
|
||||
|
||||
return clean_text(description_content)
|
||||
return description_content
|
||||
|
||||
def is_listing_missing(self) -> bool:
|
||||
title_element = self.mobile_soup.find("title")
|
||||
|
||||
@@ -1,35 +1,9 @@
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.sentiment import SentimentIntensityAnalyzer
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from bs4 import BeautifulSoup
|
||||
from difflib import SequenceMatcher
|
||||
import numpy as np
|
||||
import requests
|
||||
import re
|
||||
|
||||
def clean_text(text: str) -> str:
    """
    Normalize free-form text into a space-separated string of lemmatized words.

    The text is tokenized, lowercased, stripped of English stop words and of
    every token that is not purely alphabetic (which discards the price and
    URL tokens the tokenizer pattern can produce), and each remaining word is
    lemmatized.

    Args:
        text: The string of text to clean.

    Returns:
        The cleaned words joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Raw string: "\w", "\$", "\d", "\." and "\S" are regex escapes, not Python
    # string escapes. The non-raw form triggers invalid-escape warnings.
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
    tokenized = [word.lower() for word in tokenizer.tokenize(text)]

    # Build the stop-word set once so each membership test is O(1) instead of
    # a linear scan of the list for every token.
    stop_words = set(stopwords.words('english'))
    filtered = [
        word for word in tokenized
        if word.isalpha() and word not in stop_words
    ]

    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in filtered)
|
||||
|
||||
def clean_listing_title(title: str) -> str:
|
||||
"""
|
||||
Clean a listing title by removing punctuation and converting to lowercase.
|
||||
@@ -119,28 +93,6 @@ def get_product_price(soup: BeautifulSoup) -> np.ndarray:
|
||||
|
||||
return outlierless
|
||||
|
||||
def sentiment_analysis(text: str) -> float:
    """
    Derive a non-negative rating for the text from its polarity scores.

    The analyzer's "compound" score selects which component is used: the
    larger of "pos" and "compound" when it is positive, the smaller of "neg"
    and "compound" when it is negative, and "neu" otherwise. The selected
    component is scaled by 5.

    Args:
        text (str): The text to analyze.

    Returns:
        float: The absolute value of the scaled component. Because the sign
            is dropped, the result reflects how strong the sentiment is
            rather than its direction.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    negative, neutral, positive, compound = (
        scores[key] for key in ("neg", "neu", "pos", "compound")
    )

    if compound > 0.0:
        component = max(positive, compound)
    elif compound < 0.0:
        component = min(negative, compound)
    else:
        component = neutral

    return abs(5 * component)
|
||||
|
||||
def create_soup(url: str, headers: dict) -> BeautifulSoup:
|
||||
"""
|
||||
Create a BeautifulSoup object from a URL.
|
||||
|
||||
@@ -34,8 +34,6 @@ class Index(View):
|
||||
title = scraper_instance.get_listing_title()
|
||||
list_price = scraper_instance.get_listing_price()
|
||||
|
||||
sentiment_rating = sentiment_analysis(listing_description)
|
||||
|
||||
list_price = re.sub("[\$,]", "", list_price)
|
||||
initial_price = int(re.sub("[\$,]", "", list_price))
|
||||
|
||||
@@ -77,7 +75,6 @@ class Index(View):
|
||||
'shortened_url': shortened_url,
|
||||
'mobile_url': mobile_url,
|
||||
'market_id': market_id,
|
||||
'sentiment_rating': round(sentiment_rating, 1),
|
||||
'title': title,
|
||||
'list_price': f"{float(list_price):,.2f}",
|
||||
'initial_price': initial_price,
|
||||
|
||||
Reference in New Issue
Block a user