mirror of
https://github.com/Marketscrape/marketscrape-web.git
synced 2026-05-19 03:04:31 -04:00
114 lines
3.5 KiB
Python
114 lines
3.5 KiB
Python
import datetime
|
|
import re
|
|
import json
|
|
from .utils import *
|
|
|
|
class FacebookScraper:
    """Extract listing details from a Facebook Marketplace page.

    Structured fields (title, price, description, city, category, condition)
    are read from the ``application/ld+json`` script tags of ``base_soup``.
    The image, the posting date and the "listing missing" check are read
    from ``mobile_soup``.
    """

    # Item condition distribution (in percent) based off of Mercari's dataset.
    _CONDITION_WEIGHTS = {
        "New": 43.21,
        "Used - Like New": 29.15,
        "Used - Good": 25.33,
        "Used - Fair": 2.16,
        "Refurbished": 0.15,
    }

    # Text whose presence in the mobile page marks the listing as missing.
    _MISSING_LISTING_TEXT = "Buy and sell things locally on Facebook Marketplace."

    def __init__(self, mobile_soup, base_soup):
        """Store both parsed pages and merge all JSON-LD objects of the base page.

        Args:
            mobile_soup: Parsed mobile page; must offer ``find`` and ``find_all``.
            base_soup: Parsed base page; must offer ``find_all``.

        Every ``<script type="application/ld+json">`` tag is decoded and its
        keys are merged into ``self.json_content`` (later tags override
        earlier ones). Tags that hold no text or invalid JSON are skipped.
        """
        self.mobile_soup = mobile_soup
        self.base_soup = base_soup

        json_content = {}
        script_tags = self.base_soup.find_all(
            "script", {"type": "application/ld+json"}
        )
        for script in script_tags:
            raw = script.string
            if not isinstance(raw, str):
                # Beautiful Soup yields None when the tag has no single
                # string child; json.loads(None) would raise TypeError.
                continue

            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                continue

            # A JSON-LD document may be one object or an array of objects.
            # Only objects can be merged; scalars and other items are ignored.
            documents = parsed if isinstance(parsed, list) else [parsed]
            for document in documents:
                if isinstance(document, dict):
                    json_content.update(document)

        self.json_content = json_content

    def get_listing_price(self) -> float:
        """Return ``offers.price`` exactly as stored in the JSON-LD data.

        NOTE(review): no conversion is applied, so the value may be a string
        if the page encodes the price that way - confirm against callers.

        Raises:
            KeyError: If the JSON-LD data has no ``offers`` or ``price`` key.
        """
        return self.json_content["offers"]["price"]

    def get_listing_title(self) -> str:
        """Return the ``name`` field of the JSON-LD data.

        Raises:
            KeyError: If the JSON-LD data has no ``name`` key.
        """
        return self.json_content["name"]

    def get_listing_description(self) -> str:
        """Return the ``description`` field of the JSON-LD data.

        Raises:
            KeyError: If the JSON-LD data has no ``description`` key.
        """
        return self.json_content["description"]

    def get_listing_city(self) -> str:
        """Return the name of the second ``itemListElement`` entry.

        Raises:
            KeyError: If the JSON-LD data has no ``itemListElement`` key.
            IndexError: If the list has fewer than two entries.
        """
        return self.json_content["itemListElement"][1]["name"]

    def get_listing_condition(self) -> str:
        """Return the listing condition, estimating it when it is not "new".

        Returns ``"New"`` when ``itemCondition`` is the schema.org
        ``NewCondition`` value. For any other value, or when the field is
        absent, a non-new condition is drawn at random following the
        distribution in ``_CONDITION_WEIGHTS``.
        """
        schema = self.json_content.get("itemCondition")
        if (
            isinstance(schema, str)
            and schema.replace("https://schema.org/", "") == "NewCondition"
        ):
            return "New"

        # Drop "New" and renormalise the remaining weights. This yields the
        # same distribution as redrawing until a non-new label comes up,
        # without an open-ended loop.
        used = {
            label: weight
            for label, weight in self._CONDITION_WEIGHTS.items()
            if label != "New"
        }
        total = sum(used.values())
        labels = list(used)
        probabilities = [weight / total for weight in used.values()]

        # np.random.choice returns a NumPy string; hand back a plain str.
        return str(np.random.choice(labels, p=probabilities))

    def get_listing_category(self) -> str:
        """Return the name of the third ``itemListElement`` entry.

        Raises:
            KeyError: If the JSON-LD data has no ``itemListElement`` key.
            IndexError: If the list has fewer than three entries.
        """
        return self.json_content["itemListElement"][2]["name"]

    def get_listing_image(self) -> str:
        """Return the ``src`` of the first image whose URL contains ``https://scontent``.

        ``<img>`` tags without a ``src`` attribute are skipped.

        Raises:
            IndexError: If the mobile page contains no such image.
        """
        for img in self.mobile_soup.find_all("img"):
            src = img.get("src") or ""
            if "https://scontent" in src:
                return src

        raise IndexError("no image with a https://scontent source was found")

    def get_listing_date(self) -> tuple[int, int]:
        """Return how long ago the listing was posted as ``(days, hours)``.

        The text of the first ``<abbr>`` tag of the mobile page is parsed.

        * If its first word is a full month name, the text is read as an
          absolute timestamp (e.g. ``"January 5, 2020 at 3:45 PM"``) and the
          elapsed time up to now is returned.
        * Otherwise the first number in the text is returned as the hour
          count with zero days (presumably relative stamps such as
          ``"5 hours ago"`` - confirm against real pages).

        Raises:
            AttributeError: If the tag is missing, or an expected part
                (letters, digits, ``H:MM`` time, AM/PM marker) is absent.
            ValueError: If the extracted parts do not form a valid date.
        """
        text = self.mobile_soup.find("abbr").text.strip()

        try:
            month_str = re.search(r"[a-zA-Z]+", text).group(0)
            month_num = datetime.datetime.strptime(month_str, "%B").month
        except ValueError:
            # Not a month name: fall back to the first number in the text.
            hour_str = re.search(r"[0-9]+", text).group(0)
            return (0, int(hour_str))

        now = datetime.datetime.now()

        # The year is only present in some timestamps; default to this year.
        year_match = re.search(r"[0-9]{4}", text)
        year = year_match.group(0) if year_match else now.year

        date_str = re.search(r"[0-9]+", text).group(0)
        time_str = re.search(r"[0-9]+:[0-9]+", text).group(0)
        am_pm_str = re.search(r"[A-Z]{2}", text).group(0)

        posted = datetime.datetime.strptime(
            f"{year}-{month_num}-{date_str} {time_str}:00 {am_pm_str}",
            "%Y-%m-%d %I:%M:%S %p",
        )

        # Without an explicit year, a date well in the future must belong to
        # the previous year (e.g. a December listing viewed in January).
        # One day of slack tolerates small clock or time-zone differences.
        if year_match is None and posted - now > datetime.timedelta(days=1):
            try:
                posted = posted.replace(year=posted.year - 1)
            except ValueError:
                # February 29 does not exist in the previous year.
                posted = posted.replace(year=posted.year - 1, day=28)

        age = now - posted
        return (age.days, age.seconds // 3600)

    def is_listing_missing(self) -> bool:
        """Return True when the mobile page indicates the listing is missing.

        The listing counts as missing when the page title is
        "page not found" (case-insensitive) or when the page contains the
        text in ``_MISSING_LISTING_TEXT``. A page without a ``<title>`` tag
        is judged on that text alone.
        """
        title_element = self.mobile_soup.find("title")
        title = title_element.get_text() if title_element is not None else ""

        found = self.mobile_soup.find(string=self._MISSING_LISTING_TEXT)

        return title.lower() == "page not found" or bool(found)