mirror of
https://github.com/BoPeng/ai-marketplace-monitor.git
synced 2025-12-23 22:28:18 -05:00
* Fix premature keyword filtering in Facebook marketplace scraping Addresses a critical bug where listings were incorrectly rejected during keyword filtering before their descriptions had been extracted from Facebook's detail pages. ## Problem The dual-check workflow was failing: 1. First check_listing() call: On search results with empty descriptions 2. Second check_listing() call: After get_listing_details() populates descriptions Listings with keywords only in descriptions were being rejected prematurely, causing false negatives for valid matches. ## Solution - Add description_available parameter to check_listing() to control filtering - Skip keyword filtering when descriptions haven't been fetched yet - Maintain antikeyword filtering on available text (title-only or title+description) - Add warning logs when description extraction fails ## Changes - facebook.py: Add description_available parameter and conditional filtering logic - monitor.py: Add defensive tuple unpacking for get_listing_details() return value - Add comprehensive test suite for keyword filtering edge cases - Fix minor formatting inconsistencies throughout codebase ## Testing - New test suite demonstrates the bug and validates the fix - Tests cover keyword filtering with/without descriptions - Antikeyword filtering verified to work correctly in all cases - All existing tests continue to pass Fixes: #247 * Apply minor code improvements from PR feedback - Remove redundant description_available=True parameter (uses default) - Move self.logger check to outer if condition to avoid nested checks These changes improve code clarity and efficiency.
80 lines
3.1 KiB
Python
80 lines
3.1 KiB
Python
import time
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from pytest_playwright.pytest_playwright import CreateContextCallback # type: ignore
|
|
|
|
from ai_marketplace_monitor.facebook import FacebookSearchResultPage, parse_listing
|
|
|
|
|
|
def test_search_page(
    new_context: CreateContextCallback, filename: str = "search_result_1.html"
) -> None:
    """Parse a saved search-result page and validate every extracted listing.

    The HTML file is loaded from the directory containing this test module,
    in a browser context with JavaScript disabled.

    Args:
        new_context: pytest-playwright fixture used to create the browser context.
        filename: Name of the saved search-result HTML file to load.
    """
    # Path.as_uri() percent-encodes characters such as spaces or "#", which a
    # hand-built "file://..." string would pass through unescaped. It needs an
    # absolute path, hence resolve().
    local_file_path = (Path(__file__).parent / filename).resolve()
    page = new_context(java_script_enabled=False).new_page()
    page.goto(local_file_path.as_uri())

    # Poll up to 10 times, one second apart, until listings are found.
    for _ in range(10):
        p = FacebookSearchResultPage(page)
        page.wait_for_load_state("domcontentloaded")
        listings = p.get_listings()
        if listings:
            break
        time.sleep(1)

    # Zero-based index of the one listing that is expected to have no location.
    index_without_location = 10

    for idx, listing in enumerate(listings):
        assert listing.marketplace == "facebook"
        assert listing.id.isnumeric(), f"wrong id for listing {idx + 1} with title {listing.title}"
        assert listing.title, f"No title is found for listing {idx + 1}"
        assert listing.image, f"wrong image for listing {idx + 1} with title {listing.title}"
        assert listing.post_url, f"wrong post_url for listing {idx + 1} with title {listing.title}"
        assert listing.price, f"wrong price for listing {idx + 1} with title {listing.title}"
        if idx == index_without_location:
            assert (
                listing.location == ""
            ), f"listing {idx + 1} with title {listing.title} should have an empty location"
        else:
            assert (
                listing.location
            ), f"wrong location for listing {idx + 1} with title {listing.title}"
        assert listing.seller == "", "Seller should be empty"

    assert len(listings) == 21, f"expected 21 listings, found {len(listings)}"
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename,price,seller,location",
    [
        ("regular_listing.html", "$10", "Austin Ewing", "MS"),
        ("rental_listing.html", "$150", "Perry Burton", "Houston, TX"),
        (
            "auto_with_about_and_description_listing.html",
            "**unspecified**",
            "Lily Ortiz",
            "Houston, TX",
        ),
        ("auto_with_description_listing.html", "€6,695", "Abdel Abdel", "Bergen op Zoom, NB"),
    ],
)
def test_listing_page(
    new_context: CreateContextCallback,
    filename: str,
    price: str,
    seller: str,
    location: str,
) -> None:
    """Parse a saved listing detail page and compare it with expected values.

    The HTML file is loaded from the directory containing this test module,
    in a browser context with JavaScript disabled.

    Args:
        new_context: pytest-playwright fixture used to create the browser context.
        filename: Name of the saved listing HTML file to load.
        price: Expected value of the parsed listing's ``price``.
        seller: Expected value of the parsed listing's ``seller``.
        location: Expected value of the parsed listing's ``location``.
    """
    # Path.as_uri() percent-encodes characters such as spaces or "#", which a
    # hand-built "file://..." string would pass through unescaped. It needs an
    # absolute path, hence resolve().
    local_file_path = (Path(__file__).parent / filename).resolve()

    page = new_context(java_script_enabled=False).new_page()
    page.goto(local_file_path.as_uri())
    page.wait_for_load_state("domcontentloaded")
    listing = parse_listing(page, "post_url", None)

    assert listing is not None, f"Should be able to parse {filename}"
    assert listing.title, f"Title of {filename} should not be empty"
    # Report both expected and actual values so a failure is self-explanatory.
    assert (
        listing.price == price
    ), f"Price of {filename} should be {price!r}, got {listing.price!r}"
    assert (
        listing.location == location
    ), f"Location of {filename} should be {location!r}, got {listing.location!r}"
    assert (
        listing.seller == seller
    ), f"Seller of {filename} should be {seller!r}, got {listing.seller!r}"
    assert listing.image, f"Image of {filename} should not be empty"
    assert listing.post_url, f"post_url of {filename} should not be empty"
|