Files
ai-marketplace-monitor/tests/test_facebook.py
adawalli eb8763226f Fix premature keyword filtering in Facebook marketplace scraping (#248)
* Fix premature keyword filtering in Facebook marketplace scraping

Addresses a critical bug where listings were incorrectly rejected during
keyword filtering before their descriptions had been extracted from
Facebook's detail pages.

## Problem

The dual-check workflow was failing:
1. First check_listing() call: On search results with empty descriptions
2. Second check_listing() call: After get_listing_details() populates descriptions

Listings with keywords only in descriptions were being rejected prematurely,
causing false negatives for valid matches.

## Solution

- Add description_available parameter to check_listing() to control filtering
- Skip keyword filtering when descriptions haven't been fetched yet
- Maintain antikeyword filtering on available text (title-only or title+description)
- Add warning logs when description extraction fails

## Changes

- facebook.py: Add description_available parameter and conditional filtering logic
- monitor.py: Add defensive tuple unpacking for get_listing_details() return value
- Add comprehensive test suite for keyword filtering edge cases
- Fix minor formatting inconsistencies throughout codebase

## Testing

- New test suite demonstrates the bug and validates the fix
- Tests cover keyword filtering with/without descriptions
- Antikeyword filtering verified to work correctly in all cases
- All existing tests continue to pass

Fixes: #247

* Apply minor code improvements from PR feedback

- Remove redundant description_available=True parameter (uses default)
- Move self.logger check to outer if condition to avoid nested checks

These changes improve code clarity and efficiency.
2025-08-09 20:28:14 -05:00

80 lines
3.1 KiB
Python

import time
from pathlib import Path
import pytest
from pytest_playwright.pytest_playwright import CreateContextCallback # type: ignore
from ai_marketplace_monitor.facebook import FacebookSearchResultPage, parse_listing
def test_search_page(
new_context: CreateContextCallback, filename: str = "search_result_1.html"
) -> None:
local_file_path = Path(__file__).parent / filename
page = new_context(java_script_enabled=False).new_page()
page.goto(f"file://{local_file_path}")
for _ in range(10):
p = FacebookSearchResultPage(page)
page.wait_for_load_state("domcontentloaded")
listings = p.get_listings()
if len(listings) != 0:
break
time.sleep(1)
for idx, listing in enumerate(listings):
assert listing.marketplace == "facebook"
assert listing.id.isnumeric(), f"wrong id for listing {idx + 1} with title {listing.title}"
assert listing.title, f"No title is found {idx + 1} with title "
assert listing.image, f"wrong image for listing {idx + 1} with title {listing.title}"
assert listing.post_url, f"wrong post_url for listing {idx + 1} with title {listing.title}"
assert listing.price, f"wrong price for listing {idx + 1} with title {listing.title}"
if idx == 10:
assert (
listing.location == ""
), f"listing {idx + 1} with title {listing.title} has empty location"
else:
assert (
listing.location
), f"wrong location for listing {idx + 1} with title {listing.title}"
assert listing.seller == "", "Seller should be empty"
assert len(listings) == 21
@pytest.mark.parametrize(
"filename,price,seller,location",
[
("regular_listing.html", "$10", "Austin Ewing", "MS"),
("rental_listing.html", "$150", "Perry Burton", "Houston, TX"),
(
"auto_with_about_and_description_listing.html",
"**unspecified**",
"Lily Ortiz",
"Houston, TX",
),
("auto_with_description_listing.html", "€6,695", "Abdel Abdel", "Bergen op Zoom, NB"),
],
)
def test_listing_page(
new_context: CreateContextCallback,
filename: str,
price: str,
seller: str,
location: str,
) -> None:
local_file_path = Path(__file__).parent / filename
page = new_context(java_script_enabled=False).new_page()
page.goto(f"file://{local_file_path}")
page.wait_for_load_state("domcontentloaded")
listing = parse_listing(page, "post_url", None)
assert listing is not None, f"Should be able to parse {filename}"
assert listing.title, f"Title of {filename} should be {listing.title}"
assert listing.price == price, f"Price of {filename} should be {listing.price}"
assert listing.location == location, f"Location of {filename} should be {listing.location}"
assert listing.seller == seller, f"Seller of {filename} should be {listing.seller}"
assert listing.image, f"Image of {filename} should not be empty"
assert listing.post_url, f"post_url of {filename} should not be empty"