Files
marketscrape-web/scraper/scraper_class.py

114 lines
3.5 KiB
Python

import datetime
import re
import json
from .utils import *
class FacebookScraper:
    """Extract listing details from a Facebook Marketplace listing page.

    Structured fields (title, price, description, ...) are read from the
    JSON-LD ``<script type="application/ld+json">`` blocks of ``base_soup``.
    The image, the posting time and the "listing missing" check are read
    from ``mobile_soup``.
    """

    # Item condition distribution (in percent) based off of Mercari's dataset.
    _CONDITION_DISTRIBUTION = {
        "New": 43.21,
        "Used - Like New": 29.15,
        "Used - Good": 25.33,
        "Used - Fair": 2.16,
        "Refurbished": 0.15,
    }

    # A year-less timestamp further in the future than this is taken to be
    # from the previous year; anything closer is treated as clock/timezone
    # skew. NOTE(review): the 2-day margin is a heuristic -- confirm.
    _MAX_CLOCK_SKEW = datetime.timedelta(days=2)

    def __init__(self, mobile_soup, base_soup):
        """Store both parsed pages and merge every JSON-LD block found.

        Args:
            mobile_soup: Parsed mobile page; must offer ``find``/``find_all``
                (presumably a BeautifulSoup object).
            base_soup: Parsed base page; must offer ``find_all``.
        """
        self.mobile_soup = mobile_soup
        self.base_soup = base_soup

        json_content = {}
        script_tags = self.base_soup.find_all(
            "script", {"type": "application/ld+json"}
        )
        for script in script_tags:
            script_content = script.string
            if script_content is None:
                # Empty tag (or one with several children): nothing to parse.
                continue
            try:
                parsed_content = json.loads(script_content)
            except json.JSONDecodeError:
                # Best effort: a malformed block must not hide the others.
                continue
            # A JSON-LD block may hold one object or an array of objects;
            # only objects can be merged, anything else is ignored.
            if isinstance(parsed_content, list):
                documents = parsed_content
            else:
                documents = [parsed_content]
            for document in documents:
                if isinstance(document, dict):
                    json_content.update(document)
        self.json_content = json_content

    def get_listing_price(self) -> float:
        """Return ``offers.price`` from the JSON-LD data.

        The value is returned exactly as stored in the JSON (no conversion is
        applied). Raises ``KeyError`` if the field is absent.
        """
        return self.json_content["offers"]["price"]

    def get_listing_title(self) -> str:
        """Return the JSON-LD ``name`` field; ``KeyError`` if absent."""
        return self.json_content["name"]

    def get_listing_description(self) -> str:
        """Return the JSON-LD ``description`` field; ``KeyError`` if absent."""
        return self.json_content["description"]

    def get_listing_city(self) -> str:
        """Return the ``name`` of the second ``itemListElement`` entry.

        Raises ``KeyError``/``IndexError`` if that entry is absent.
        """
        return self.json_content["itemListElement"][1]["name"]

    def get_listing_condition(self) -> str:
        """Return the listing's condition label.

        ``"New"`` is returned when ``itemCondition`` is schema.org's
        ``NewCondition``. Otherwise a used condition is drawn at random,
        weighted by the Mercari-based distribution restricted to the
        non-"New" labels.
        """
        schema = self.json_content.get("itemCondition")
        if (
            isinstance(schema, str)
            and schema.replace("https://schema.org/", "") == "NewCondition"
        ):
            return "New"

        # Sampling once from the renormalised non-"New" weights yields the
        # same distribution as redrawing until the result is not "New".
        used_conditions = {
            label: weight
            for label, weight in self._CONDITION_DISTRIBUTION.items()
            if label != "New"
        }
        total_weight = sum(used_conditions.values())
        condition = np.random.choice(
            list(used_conditions),
            p=[weight / total_weight for weight in used_conditions.values()],
        )
        # numpy hands back numpy.str_; give callers a plain str.
        return str(condition)

    def get_listing_category(self) -> str:
        """Return the ``name`` of the third ``itemListElement`` entry.

        Raises ``KeyError``/``IndexError`` if that entry is absent.
        """
        return self.json_content["itemListElement"][2]["name"]

    def get_listing_image(self) -> str:
        """Return the first ``<img>`` source containing ``https://scontent``.

        Images without a ``src`` attribute are skipped. Raises ``IndexError``
        when no image matches.
        """
        images = self.mobile_soup.find_all("img")
        sources = [image.get("src") or "" for image in images]
        matches = [src for src in sources if "https://scontent" in src]
        return matches[0]

    def get_listing_date(self) -> tuple[int, int]:
        """Return the listing's age as ``(days, hours)``.

        The age is derived from the text of the first ``<abbr>`` element of
        the mobile page, measured against the current local time.
        """
        tag = self.mobile_soup.find("abbr")
        return self._age_from_timestamp(
            tag.text.strip(), datetime.datetime.now()
        )

    @classmethod
    def _age_from_timestamp(
        cls, text: str, now: datetime.datetime
    ) -> tuple[int, int]:
        """Convert a timestamp label into ``(days, hours)`` elapsed at *now*."""
        try:
            month_str = re.search(r"[a-zA-Z]+", text).group(0)
            month_num = datetime.datetime.strptime(month_str, "%B").month
        except ValueError:
            # First word is not a month name: the label is treated as
            # relative and its first number is reported as hours.
            hour_str = re.search(r"[0-9]+", text).group(0)
            return (0, int(hour_str))

        year_match = re.search(r"[0-9]{4}", text)
        year = int(year_match.group(0)) if year_match else now.year
        day_str = re.search(r"[0-9]+", text).group(0)
        time_str = re.search(r"[0-9]+:[0-9]+", text).group(0)
        # Match a standalone AM/PM marker only, so capitals elsewhere in the
        # label (e.g. an upper-case month name) are not mistaken for it.
        meridiem = re.search(
            r"(?<![A-Za-z])[AP]M(?![A-Za-z])", text, re.IGNORECASE
        ).group(0)

        posted = datetime.datetime.strptime(
            f"{year}-{month_num}-{day_str} {time_str} {meridiem}",
            "%Y-%m-%d %I:%M %p",
        )

        if posted > now:
            if year_match is None and posted - now > cls._MAX_CLOCK_SKEW:
                # No year in the label and the date is still ahead in the
                # current year, so the listing dates from the previous year.
                try:
                    posted = posted.replace(year=posted.year - 1)
                except ValueError:
                    # Feb 29 has no counterpart in the previous year.
                    posted = now
            else:
                # Slightly in the future: report a zero age, not a negative.
                posted = now

        diff = now - posted
        return (diff.days, diff.seconds // 3600)

    def is_listing_missing(self) -> bool:
        """Return True when the mobile page shows no listing.

        That is the case when the page title is "page not found" (compared
        case-insensitively) or the generic Marketplace landing text is
        present in the page.
        """
        title_element = self.mobile_soup.find("title")
        # A page without a <title> can still be identified by its body text.
        title = title_element.get_text() if title_element is not None else ""
        text_to_find = "Buy and sell things locally on Facebook Marketplace."
        found = self.mobile_soup.find(string=text_to_find)
        return title.lower() == "page not found" or bool(found)