diff --git a/cwa_book_downloader/bypass/__init__.py b/cwa_book_downloader/bypass/__init__.py
index f3a5339..2c01256 100644
--- a/cwa_book_downloader/bypass/__init__.py
+++ b/cwa_book_downloader/bypass/__init__.py
@@ -1 +1,5 @@
 """Cloudflare bypass utilities."""
+
+
+class BypassCancelledException(Exception):
+    """Raised when a bypass operation is cancelled."""
diff --git a/cwa_book_downloader/bypass/external_bypasser.py b/cwa_book_downloader/bypass/external_bypasser.py
index c8aaa08..14d321e 100644
--- a/cwa_book_downloader/bypass/external_bypasser.py
+++ b/cwa_book_downloader/bypass/external_bypasser.py
@@ -1,31 +1,27 @@
 """External Cloudflare bypasser using FlareSolverr."""
 
-from threading import Event
-from typing import Optional, TYPE_CHECKING
-import requests
-import time
 import random
+import time
+from threading import Event
+from typing import TYPE_CHECKING, Optional
+
+import requests
+
+from cwa_book_downloader.bypass import BypassCancelledException
 from cwa_book_downloader.core.config import config
 from cwa_book_downloader.core.logger import setup_logger
 
 if TYPE_CHECKING:
     from cwa_book_downloader.download import network
 
-
-class BypassCancelledException(Exception):
-    """Raised when a bypass operation is cancelled."""
-    pass
-
 logger = setup_logger(__name__)
 
-# Connection timeout (seconds) - how long to wait for external bypasser to accept connection
+# Timeout constants (seconds)
 CONNECT_TIMEOUT = 10
-# Maximum read timeout cap (seconds) - hard limit regardless of EXT_BYPASSER_TIMEOUT
 MAX_READ_TIMEOUT = 120
-# Buffer added to bypasser's configured timeout (seconds) - accounts for processing overhead
 READ_TIMEOUT_BUFFER = 15
-# Retry settings for bypasser failures
+
+# Retry settings
 MAX_RETRY = 5
 BACKOFF_BASE = 1.0
 BACKOFF_CAP = 10.0
@@ -48,22 +44,13 @@ def _fetch_via_bypasser(target_url: str) -> Optional[str]:
         logger.error("External bypasser not configured. Check EXT_BYPASSER_URL and EXT_BYPASSER_PATH.")
         return None
 
-    bypasser_endpoint = f"{bypasser_url}{bypasser_path}"
-    headers = {"Content-Type": "application/json"}
-    payload = {
-        "cmd": "request.get",
-        "url": target_url,
-        "maxTimeout": bypasser_timeout
-    }
-
-    # Calculate read timeout: bypasser timeout (ms -> s) + buffer, capped at max
     read_timeout = min((bypasser_timeout / 1000) + READ_TIMEOUT_BUFFER, MAX_READ_TIMEOUT)
 
     try:
         response = requests.post(
-            bypasser_endpoint,
-            headers=headers,
-            json=payload,
+            f"{bypasser_url}{bypasser_path}",
+            headers={"Content-Type": "application/json"},
+            json={"cmd": "request.get", "url": target_url, "maxTimeout": bypasser_timeout},
             timeout=(CONNECT_TIMEOUT, read_timeout)
         )
         response.raise_for_status()
@@ -73,17 +60,13 @@ def _fetch_via_bypasser(target_url: str) -> Optional[str]:
         message = result.get('message', '')
         logger.debug(f"External bypasser response for '{target_url}': {status} - {message}")
 
-        # Check for error status (bypasser returns status="error" with solution=null on failure)
         if status != 'ok':
             logger.warning(f"External bypasser failed for '{target_url}': {status} - {message}")
             return None
 
         solution = result.get('solution')
-        if not solution:
-            logger.warning(f"External bypasser returned empty solution for '{target_url}'")
-            return None
+        html = solution.get('response', '') if solution else ''
 
-        html = solution.get('response', '')
         if not html:
             logger.warning(f"External bypasser returned empty response for '{target_url}'")
             return None
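For reference, the `_fetch_via_bypasser` hunks above follow FlareSolverr's `/v1` protocol: a POST carrying a `request.get` command with `maxTimeout` in milliseconds, answered by a JSON envelope whose `solution.response` holds the rendered HTML. A minimal standalone sketch of that exchange (the endpoint URL and timeout values here are illustrative, not read from the app's config):

import requests

# Hypothetical endpoint; the module builds it from EXT_BYPASSER_URL + EXT_BYPASSER_PATH.
endpoint = "http://flaresolverr:8191/v1"
max_timeout_ms = 60_000  # FlareSolverr expects milliseconds

resp = requests.post(
    endpoint,
    json={"cmd": "request.get", "url": "https://example.com", "maxTimeout": max_timeout_ms},
    # Mirrors the module's read-timeout formula: (ms -> s) + buffer, capped at 120s.
    timeout=(10, min(max_timeout_ms / 1000 + 15, 120)),
)
resp.raise_for_status()
result = resp.json()
if result.get("status") == "ok":
    html = (result.get("solution") or {}).get("response", "")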
@@ -92,16 +75,36 @@ def _fetch_via_bypasser(target_url: str) -> Optional[str]:
 
     except requests.exceptions.Timeout:
         logger.warning(f"External bypasser timed out for '{target_url}' (connect: {CONNECT_TIMEOUT}s, read: {read_timeout:.0f}s)")
-        return None
     except requests.exceptions.RequestException as e:
         logger.warning(f"External bypasser request failed for '{target_url}': {e}")
-        return None
     except (KeyError, TypeError, ValueError) as e:
         logger.warning(f"External bypasser returned malformed response for '{target_url}': {e}")
-        return None
+
+    return None
 
 
-def get_bypassed_page(url: str, selector: Optional["network.AAMirrorSelector"] = None, cancel_flag: Optional[Event] = None) -> Optional[str]:
+def _check_cancelled(cancel_flag: Optional[Event], context: str) -> None:
+    """Check if operation was cancelled and raise exception if so."""
+    if cancel_flag and cancel_flag.is_set():
+        logger.info(f"External bypasser cancelled {context}")
+        raise BypassCancelledException("Bypass cancelled")
+
+
+def _sleep_with_cancellation(seconds: float, cancel_flag: Optional[Event]) -> None:
+    """Sleep for the specified duration, checking for cancellation each second."""
+    for _ in range(int(seconds)):
+        _check_cancelled(cancel_flag, "during backoff")
+        time.sleep(1)
+    remaining = seconds - int(seconds)
+    if remaining > 0:
+        time.sleep(remaining)
+
+
+def get_bypassed_page(
+    url: str,
+    selector: Optional["network.AAMirrorSelector"] = None,
+    cancel_flag: Optional[Event] = None
+) -> Optional[str]:
     """Fetch HTML content from a URL using an external Cloudflare bypasser service.
 
     Retries with exponential backoff and mirror/DNS rotation on failure.
@@ -118,13 +121,11 @@ def get_bypassed_page(url: str, selector: Optional["network.AAMirrorSelector"] =
         BypassCancelledException: If cancel_flag is set during operation
     """
    from cwa_book_downloader.download import network as network_module
+
     sel = selector or network_module.AAMirrorSelector()
 
     for attempt in range(1, MAX_RETRY + 1):
-        # Check for cancellation before each attempt
-        if cancel_flag and cancel_flag.is_set():
-            logger.info("External bypasser cancelled by user")
-            raise BypassCancelledException("Bypass cancelled")
+        _check_cancelled(cancel_flag, "by user")
 
         attempt_url = sel.rewrite(url)
         result = _fetch_via_bypasser(attempt_url)
@@ -134,27 +135,11 @@ def get_bypassed_page(url: str, selector: Optional["network.AAMirrorSelector"] =
         if attempt == MAX_RETRY:
             break
 
-        # Check for cancellation before backoff wait
-        if cancel_flag and cancel_flag.is_set():
-            logger.info("External bypasser cancelled during retry")
-            raise BypassCancelledException("Bypass cancelled")
-
-        # Backoff with jitter before retry, checking cancellation during wait
         delay = min(BACKOFF_CAP, BACKOFF_BASE * (2 ** (attempt - 1))) + random.random()
         logger.info(f"External bypasser attempt {attempt}/{MAX_RETRY} failed, retrying in {delay:.1f}s")
-        # Check cancellation during delay (check every second)
-        for _ in range(int(delay)):
-            if cancel_flag and cancel_flag.is_set():
-                logger.info("External bypasser cancelled during backoff")
-                raise BypassCancelledException("Bypass cancelled")
-            time.sleep(1)
-        # Sleep remaining fraction
-        remaining = delay - int(delay)
-        if remaining > 0:
-            time.sleep(remaining)
+        _sleep_with_cancellation(delay, cancel_flag)
 
-        # Rotate mirror/DNS for next attempt
         new_base, action = sel.next_mirror_or_rotate_dns()
         if action in ("mirror", "dns") and new_base:
             logger.info(f"Rotated {action} for retry")
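For intuition, the delay formula `min(BACKOFF_CAP, BACKOFF_BASE * 2 ** (attempt - 1)) + random.random()` yields roughly 1s, 2s, 4s, 8s, then a capped 10s, each with up to one second of jitter; in the module the loop breaks after the final attempt before sleeping. A quick sketch of the schedule, assuming the constants defined above:

import random

BACKOFF_BASE, BACKOFF_CAP, MAX_RETRY = 1.0, 10.0, 5

for attempt in range(1, MAX_RETRY + 1):
    delay = min(BACKOFF_CAP, BACKOFF_BASE * (2 ** (attempt - 1))) + random.random()
    print(f"attempt {attempt}: retry after ~{delay:.1f}s")
# attempt 1: ~1s, 2: ~2s, 3: ~4s, 4: ~8s, 5: ~10s (cap reached)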
diff --git a/cwa_book_downloader/bypass/fingerprint.py b/cwa_book_downloader/bypass/fingerprint.py
new file mode 100644
index 0000000..0af4081
--- /dev/null
+++ b/cwa_book_downloader/bypass/fingerprint.py
@@ -0,0 +1,57 @@
+"""Browser fingerprint profile management for bypass stealth."""
+
+import random
+from typing import Optional
+
+from cwa_book_downloader.core.logger import setup_logger
+
+logger = setup_logger(__name__)
+
+COMMON_RESOLUTIONS = [
+    (1920, 1080, 0.35),
+    (1366, 768, 0.18),
+    (1536, 864, 0.10),
+    (1440, 900, 0.08),
+    (1280, 720, 0.07),
+    (1600, 900, 0.06),
+    (1280, 800, 0.05),
+    (2560, 1440, 0.04),
+    (1680, 1050, 0.04),
+    (1920, 1200, 0.03),
+]
+
+# Current screen size (module-level singleton)
+_current_screen_size: Optional[tuple[int, int]] = None
+
+
+def get_screen_size() -> tuple[int, int]:
+    global _current_screen_size
+    if _current_screen_size is None:
+        _current_screen_size = _generate_screen_size()
+        logger.debug(f"Generated initial screen size: {_current_screen_size[0]}x{_current_screen_size[1]}")
+    return _current_screen_size
+
+
+def rotate_screen_size() -> tuple[int, int]:
+    global _current_screen_size
+    old_size = _current_screen_size
+    _current_screen_size = _generate_screen_size()
+    width, height = _current_screen_size
+
+    if old_size:
+        logger.info(f"Rotated screen size: {old_size[0]}x{old_size[1]} -> {width}x{height}")
+    else:
+        logger.info(f"Generated screen size: {width}x{height}")
+
+    return _current_screen_size
+
+
+def clear_screen_size() -> None:
+    global _current_screen_size
+    _current_screen_size = None
+
+
+def _generate_screen_size() -> tuple[int, int]:
+    resolutions = [(w, h) for w, h, _ in COMMON_RESOLUTIONS]
+    weights = [weight for _, _, weight in COMMON_RESOLUTIONS]
+    return random.choices(resolutions, weights=weights)[0]
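`_generate_screen_size` leans on `random.choices`, which treats the third tuple element as a relative weight (the weights need not sum to 1), so 1920x1080 is drawn about 35% of the time against the full list. A quick sanity check of the distribution (a sketch, not part of the module):

import random
from collections import Counter

COMMON_RESOLUTIONS = [(1920, 1080, 0.35), (1366, 768, 0.18), (1536, 864, 0.10)]  # truncated list

resolutions = [(w, h) for w, h, _ in COMMON_RESOLUTIONS]
weights = [weight for _, _, weight in COMMON_RESOLUTIONS]
counts = Counter(random.choices(resolutions, weights=weights, k=10_000))
# With this truncated list, (1920, 1080) lands around 0.35 / 0.63, i.e. ~56% of draws.
print(counts.most_common())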
diff --git a/cwa_book_downloader/bypass/internal_bypasser.py b/cwa_book_downloader/bypass/internal_bypasser.py
index c03320b..d074748 100644
--- a/cwa_book_downloader/bypass/internal_bypasser.py
+++ b/cwa_book_downloader/bypass/internal_bypasser.py
@@ -11,44 +11,18 @@
 from threading import Event
 from typing import Optional
 from urllib.parse import urlparse
 
-
-class BypassCancelledException(Exception):
-    """Raised when a bypass operation is cancelled."""
-    pass
-
 import requests
 from seleniumbase import Driver
 
+from cwa_book_downloader.bypass import BypassCancelledException
+from cwa_book_downloader.bypass.fingerprint import clear_screen_size, get_screen_size
 from cwa_book_downloader.config import env
-from cwa_book_downloader.download import network
-from cwa_book_downloader.config.settings import RECORDING_DIR, VIRTUAL_SCREEN_SIZE
 from cwa_book_downloader.config.env import LOG_DIR
+from cwa_book_downloader.config.settings import RECORDING_DIR
 from cwa_book_downloader.core.config import config as app_config
 from cwa_book_downloader.core.logger import setup_logger
-
-
-def _get_proxies() -> dict:
-    """Get current proxy configuration from config singleton."""
-    proxy_mode = app_config.get("PROXY_MODE", "none")
-
-    if proxy_mode == "socks5":
-        socks_proxy = app_config.get("SOCKS5_PROXY", "")
-        if socks_proxy:
-            return {"http": socks_proxy, "https": socks_proxy}
-    elif proxy_mode == "http":
-        proxies = {}
-        http_proxy = app_config.get("HTTP_PROXY", "")
-        https_proxy = app_config.get("HTTPS_PROXY", "")
-        if http_proxy:
-            proxies["http"] = http_proxy
-        if https_proxy:
-            proxies["https"] = https_proxy
-        elif http_proxy:
-            # Fallback: use HTTP proxy for HTTPS if HTTPS proxy not specified
-            proxies["https"] = http_proxy
-        return proxies
-
-    return {}
+from cwa_book_downloader.download import network
+from cwa_book_downloader.download.network import get_proxies
 
 logger = setup_logger(__name__)
@@ -97,6 +71,20 @@ DDG_COOKIE_NAMES = {'__ddg1_', '__ddg2_', '__ddg5_', '__ddg8_', '__ddg9_', '__dd
 FULL_COOKIE_DOMAINS = {'z-lib.fm', 'z-lib.gs', 'z-lib.id', 'z-library.sk', 'zlibrary-global.se'}
 
 
+def _get_base_domain(domain: str) -> str:
+    """Extract base domain from hostname (e.g., 'www.example.com' -> 'example.com')."""
+    return '.'.join(domain.split('.')[-2:]) if '.' in domain else domain
+
+
+def _should_extract_cookie(name: str, extract_all: bool) -> bool:
+    """Determine if a cookie should be extracted based on its name."""
+    if extract_all:
+        return True
+    is_cf = name in CF_COOKIE_NAMES or name.startswith('cf_')
+    is_ddg = name in DDG_COOKIE_NAMES or name.startswith('__ddg')
+    return is_cf or is_ddg
+
+
 def _extract_cookies_from_driver(driver, url: str) -> None:
     """Extract cookies from Chrome after successful bypass."""
     try:
@@ -105,24 +93,13 @@ def _extract_cookies_from_driver(driver, url: str) -> None:
         if not domain:
             return
 
-        # Get base domain for storage and full-cookie check
-        base_domain = '.'.join(domain.split('.')[-2:]) if '.' in domain else domain
+        base_domain = _get_base_domain(domain)
         extract_all = base_domain in FULL_COOKIE_DOMAINS
 
-        cookies = driver.get_cookies()
         cookies_found = {}
-
-        for cookie in cookies:
+        for cookie in driver.get_cookies():
             name = cookie.get('name', '')
-
-            if extract_all:
-                should_extract = True
-            else:
-                is_cf = name in CF_COOKIE_NAMES or name.startswith('cf_')
-                is_ddg = name in DDG_COOKIE_NAMES or name.startswith('__ddg')
-                should_extract = is_cf or is_ddg
-
-            if should_extract:
+            if _should_extract_cookie(name, extract_all):
                 cookies_found[name] = {
                     'value': cookie.get('value', ''),
                     'domain': cookie.get('domain', domain),
@@ -132,23 +109,24 @@
                     'httpOnly': cookie.get('httpOnly', True),
                 }
 
-        if cookies_found:
-            # Extract User-Agent - Cloudflare ties cf_clearance to the UA
-            try:
-                user_agent = driver.execute_script("return navigator.userAgent")
-            except Exception:
-                user_agent = None
+        if not cookies_found:
+            return
 
-            with _cf_cookies_lock:
-                _cf_cookies[base_domain] = cookies_found
-                if user_agent:
-                    _cf_user_agents[base_domain] = user_agent
-                    logger.debug(f"Stored UA for {base_domain}: {user_agent[:60]}...")
-                else:
-                    logger.debug(f"No UA captured for {base_domain}")
+        try:
+            user_agent = driver.execute_script("return navigator.userAgent")
+        except Exception:
+            user_agent = None
 
-            cookie_type = "all" if extract_all else "protection"
-            logger.debug(f"Extracted {len(cookies_found)} {cookie_type} cookies for {base_domain}")
+        with _cf_cookies_lock:
+            _cf_cookies[base_domain] = cookies_found
+            if user_agent:
+                _cf_user_agents[base_domain] = user_agent
+                logger.debug(f"Stored UA for {base_domain}: {user_agent[:60]}...")
+            else:
+                logger.debug(f"No UA captured for {base_domain}")
+
+        cookie_type = "all" if extract_all else "protection"
+        logger.debug(f"Extracted {len(cookies_found)} {cookie_type} cookies for {base_domain}")
 
     except Exception as e:
         logger.debug(f"Failed to extract cookies: {e}")
@@ -159,15 +137,13 @@ def get_cf_cookies_for_domain(domain: str) -> dict[str, str]:
     if not domain:
         return {}
 
-    # Get base domain
-    base_domain = '.'.join(domain.split('.')[-2:]) if '.' in domain else domain
+    base_domain = _get_base_domain(domain)
 
     with _cf_cookies_lock:
         cookies = _cf_cookies.get(base_domain, {})
         if not cookies:
             return {}
 
-        # Check if cf_clearance exists and hasn't expired
         cf_clearance = cookies.get('cf_clearance', {})
         if cf_clearance:
             expiry = cf_clearance.get('expiry')
@@ -176,7 +152,6 @@
                 _cf_cookies.pop(base_domain, None)
                 return {}
 
-        # Return simple name->value dict for requests
         return {name: c['value'] for name, c in cookies.items()}
@@ -189,16 +164,15 @@
     """Get the User-Agent that was used during bypass for a domain."""
     if not domain:
         return None
-    base_domain = '.'.join(domain.split('.')[-2:]) if '.' in domain else domain
     with _cf_cookies_lock:
-        return _cf_user_agents.get(base_domain)
+        return _cf_user_agents.get(_get_base_domain(domain))
 
 
 def clear_cf_cookies(domain: str = None) -> None:
     """Clear stored Cloudflare cookies and User-Agent. If domain is None, clear all."""
     with _cf_cookies_lock:
         if domain:
-            base_domain = '.'.join(domain.split('.')[-2:]) if '.' in domain else domain
+            base_domain = _get_base_domain(domain)
             _cf_cookies.pop(base_domain, None)
             _cf_user_agents.pop(base_domain, None)
         else:
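The reason the cache stores a User-Agent alongside the cookies is that Cloudflare binds `cf_clearance` to the UA that solved the challenge; replaying the cookie under a different UA fails. A sketch of the intended consumer pattern, pairing the two accessors above (this mirrors `_try_with_cached_cookies`, which appears later in this diff):

import requests
from urllib.parse import urlparse

def fetch_with_cached_clearance(url: str):
    # Sketch: assumes the accessors defined in internal_bypasser above.
    host = urlparse(url).hostname or ""
    cookies = get_cf_cookies_for_domain(host)
    if not cookies:
        return None  # no prior bypass recorded for this base domain
    headers = {}
    stored_ua = get_cf_user_agent_for_domain(host)
    if stored_ua:
        headers["User-Agent"] = stored_ua  # cf_clearance is only honored with this UA
    resp = requests.get(url, cookies=cookies, headers=headers, timeout=(5, 10))
    return resp.text if resp.status_code == 200 else None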
logger.debug(f"Mouse jiggle failed: {e}") except Exception as e: - logger.debug(f"Human simulation failed (non-critical): {e}") + logger.debug(f"Human simulation failed: {e}") def _bypass_method_handle_captcha(sb) -> bool: @@ -435,26 +395,22 @@ def _bypass_method_click_captcha(sb) -> bool: def _bypass_method_humanlike(sb) -> bool: - """Method 4: Human-like behavior with scroll, wait, and reload.""" + """Human-like behavior with scroll, wait, and reload.""" try: logger.debug("Attempting bypass: human-like interaction") - - # Extended human-like wait time.sleep(random.uniform(6, 10)) - # Scroll behavior try: sb.scroll_to_bottom() time.sleep(random.uniform(1, 2)) sb.scroll_to_top() time.sleep(random.uniform(2, 3)) except Exception as e: - logger.debug(f"Scroll behavior failed (non-critical): {e}") + logger.debug(f"Scroll behavior failed: {e}") if _is_bypassed(sb): return True - # Try refresh logger.debug("Trying page refresh...") sb.refresh() time.sleep(random.uniform(5, 8)) @@ -462,12 +418,11 @@ def _bypass_method_humanlike(sb) -> bool: if _is_bypassed(sb): return True - # Final captcha click attempt try: sb.uc_gui_click_captcha() time.sleep(random.uniform(3, 5)) except Exception as e: - logger.debug(f"Final captcha click failed (non-critical): {e}") + logger.debug(f"Final captcha click failed: {e}") return _is_bypassed(sb) except Exception as e: @@ -475,26 +430,28 @@ def _bypass_method_humanlike(sb) -> bool: return False -def _bypass_method_cdp_solve(sb) -> bool: - """Method 5: CDP Mode with solve_captcha() - WebDriver disconnected, no PyAutoGUI. +def _safe_reconnect(sb) -> None: + """Safely attempt to reconnect WebDriver after CDP mode.""" + try: + sb.reconnect() + except Exception as e: + logger.debug(f"Reconnect failed: {e}") - CDP Mode completely disconnects WebDriver during interaction, making detection - much harder. The solve_captcha() method auto-detects challenge type. + +def _bypass_method_cdp_solve(sb) -> bool: + """CDP Mode with solve_captcha() - WebDriver disconnected, no PyAutoGUI. + + CDP Mode disconnects WebDriver during interaction, making detection harder. + The solve_captcha() method auto-detects challenge type. 
""" try: logger.debug("Attempting bypass: CDP Mode solve_captcha") - current_url = sb.get_current_url() - - # Activate CDP mode - this disconnects WebDriver - sb.activate_cdp_mode(current_url) + sb.activate_cdp_mode(sb.get_current_url()) time.sleep(random.uniform(1, 2)) - # Try CDP solve_captcha (auto-detects challenge type) try: sb.cdp.solve_captcha() time.sleep(random.uniform(3, 5)) - - # Reconnect WebDriver to check result sb.reconnect() time.sleep(random.uniform(1, 2)) @@ -502,101 +459,82 @@ def _bypass_method_cdp_solve(sb) -> bool: return True except Exception as e: logger.debug(f"CDP solve_captcha failed: {e}") - # Make sure we reconnect on failure - try: - sb.reconnect() - except Exception as reconnect_e: - logger.debug(f"Reconnect after CDP solve failure failed: {reconnect_e}") + _safe_reconnect(sb) return False except Exception as e: logger.debug(f"CDP Mode solve failed: {e}") - # Ensure WebDriver is reconnected - try: - sb.reconnect() - except Exception as reconnect_e: - logger.debug(f"Reconnect after CDP Mode solve failed: {reconnect_e}") + _safe_reconnect(sb) return False +CDP_CLICK_SELECTORS = [ + "#turnstile-widget div", # Cloudflare Turnstile + "#cf-turnstile div", # Alternative CF Turnstile + "iframe[src*='challenges']", # CF challenge iframe + "input[type='checkbox']", # Generic checkbox (DDOS-Guard) + "[class*='checkbox']", # Class-based checkbox + "#challenge-running", # CF challenge indicator +] + + def _bypass_method_cdp_click(sb) -> bool: """CDP Mode with native clicking - no PyAutoGUI dependency. - Uses sb.cdp.click() which is native CDP clicking added in SeleniumBase 4.45.6. - This doesn't require PyAutoGUI at all. + Uses sb.cdp.click() which is native CDP clicking (SeleniumBase 4.45.6+). """ try: logger.debug("Attempting bypass: CDP Mode native click") - current_url = sb.get_current_url() - - # Activate CDP mode - sb.activate_cdp_mode(current_url) + sb.activate_cdp_mode(sb.get_current_url()) time.sleep(random.uniform(1, 2)) - # Common captcha/challenge selectors to try - selectors = [ - "#turnstile-widget div", # Cloudflare Turnstile (parent above shadow-root) - "#cf-turnstile div", # Alternative CF Turnstile - "iframe[src*='challenges']", # CF challenge iframe - "input[type='checkbox']", # Generic checkbox (DDOS-Guard) - "[class*='checkbox']", # Class-based checkbox - "#challenge-running", # CF challenge indicator - ] - - for selector in selectors: + for selector in CDP_CLICK_SELECTORS: try: - # Check if element exists and is visible - if sb.cdp.is_element_visible(selector): - logger.debug(f"CDP clicking: {selector}") - sb.cdp.click(selector) - time.sleep(random.uniform(2, 4)) + if not sb.cdp.is_element_visible(selector): + continue - # Reconnect and check - sb.reconnect() - time.sleep(random.uniform(1, 2)) + logger.debug(f"CDP clicking: {selector}") + sb.cdp.click(selector) + time.sleep(random.uniform(2, 4)) - if _is_bypassed(sb): - return True + sb.reconnect() + time.sleep(random.uniform(1, 2)) - # Re-enter CDP mode for next attempt - sb.activate_cdp_mode(sb.get_current_url()) - time.sleep(random.uniform(0.5, 1)) + if _is_bypassed(sb): + return True + + sb.activate_cdp_mode(sb.get_current_url()) + time.sleep(random.uniform(0.5, 1)) except Exception as e: logger.debug(f"CDP click on '{selector}' failed: {e}") - continue - - # Final reconnect - try: - sb.reconnect() - except Exception as e: - logger.debug(f"Final reconnect in CDP click failed: {e}") + _safe_reconnect(sb) return _is_bypassed(sb) except Exception as e: logger.debug(f"CDP Mode click failed: {e}") - 
try: - sb.reconnect() - except Exception as reconnect_e: - logger.debug(f"Reconnect after CDP Mode click failed: {reconnect_e}") + _safe_reconnect(sb) return False +CDP_GUI_CLICK_SELECTORS = [ + "#turnstile-widget div", # Cloudflare Turnstile + "#cf-turnstile div", # Alternative CF Turnstile + "#challenge-stage div", # CF challenge stage + "input[type='checkbox']", # Generic checkbox + "[class*='cb-i']", # DDOS-Guard checkbox +] + + def _bypass_method_cdp_gui_click(sb) -> bool: """CDP Mode with PyAutoGUI-based clicking - uses actual mouse movement. - For advanced protections (Kasada, DataDome, Akamai), the docs recommend - using gui_* methods with actual mouse movements instead of CDP clicks. - This is the most human-like approach in CDP mode. + Most human-like approach for advanced protections (Kasada, DataDome, Akamai). """ try: logger.debug("Attempting bypass: CDP Mode gui_click (mouse-based)") - current_url = sb.get_current_url() - - # Activate CDP mode - sb.activate_cdp_mode(current_url) + sb.activate_cdp_mode(sb.get_current_url()) time.sleep(random.uniform(1, 2)) - # Try the dedicated CDP captcha method first try: logger.debug("Trying cdp.gui_click_captcha()") sb.cdp.gui_click_captcha() @@ -613,107 +551,108 @@ def _bypass_method_cdp_gui_click(sb) -> bool: except Exception as e: logger.debug(f"cdp.gui_click_captcha() failed: {e}") - # Turnstile selectors - use parent above shadow-root as per docs - selectors = [ - "#turnstile-widget div", # Cloudflare Turnstile - "#cf-turnstile div", # Alternative CF Turnstile - "#challenge-stage div", # CF challenge stage - "input[type='checkbox']", # Generic checkbox - "[class*='cb-i']", # DDOS-Guard checkbox - ] - - for selector in selectors: + for selector in CDP_GUI_CLICK_SELECTORS: try: - if sb.cdp.is_element_visible(selector): - logger.debug(f"CDP gui_click_element: {selector}") - sb.cdp.gui_click_element(selector) - time.sleep(random.uniform(3, 5)) + if not sb.cdp.is_element_visible(selector): + continue - sb.reconnect() - time.sleep(random.uniform(1, 2)) + logger.debug(f"CDP gui_click_element: {selector}") + sb.cdp.gui_click_element(selector) + time.sleep(random.uniform(3, 5)) - if _is_bypassed(sb): - return True + sb.reconnect() + time.sleep(random.uniform(1, 2)) - sb.activate_cdp_mode(sb.get_current_url()) - time.sleep(random.uniform(0.5, 1)) + if _is_bypassed(sb): + return True + + sb.activate_cdp_mode(sb.get_current_url()) + time.sleep(random.uniform(0.5, 1)) except Exception as e: logger.debug(f"CDP gui_click on '{selector}' failed: {e}") - continue - - # Final reconnect - try: - sb.reconnect() - except Exception as e: - logger.debug(f"Final reconnect in CDP gui_click failed: {e}") + _safe_reconnect(sb) return _is_bypassed(sb) except Exception as e: logger.debug(f"CDP Mode gui_click failed: {e}") - try: - sb.reconnect() - except Exception as reconnect_e: - logger.debug(f"Reconnect after CDP Mode gui_click failed: {reconnect_e}") + _safe_reconnect(sb) return False +BYPASS_METHODS = [ + _bypass_method_cdp_solve, + _bypass_method_cdp_click, + _bypass_method_cdp_gui_click, + _bypass_method_handle_captcha, + _bypass_method_click_captcha, + _bypass_method_humanlike, +] + +MAX_CONSECUTIVE_SAME_CHALLENGE = 3 + + +def _check_cancellation(cancel_flag: Optional[Event], message: str) -> None: + """Check if cancellation was requested and raise if so.""" + if cancel_flag and cancel_flag.is_set(): + logger.info(message) + raise BypassCancelledException("Bypass cancelled") + + def _bypass(sb, max_retries: Optional[int] = None, cancel_flag: 
Optional[Event] = None) -> bool: - """Bypass function with unified strategies for Cloudflare and DDOS-Guard protection. - - Uses a unified method order that works for both protection types. - Prioritizes CDP Mode (stealthier) with UC Mode PyAutoGUI fallbacks: - - 1. CDP Mode solve - WebDriver disconnected, uses cdp.solve_captcha() - 2. CDP Mode click - Native CDP clicking, no PyAutoGUI (4.45.6+) - 3. CDP Mode gui_click - CDP with PyAutoGUI mouse movement (most human-like) - 4. uc_gui_handle_captcha() - TAB+SPACEBAR via PyAutoGUI (UC Mode fallback) - 5. uc_gui_click_captcha() - Direct click via PyAutoGUI (UC Mode fallback) - 6. Human-like interaction - Scroll, wait, reload, retry - - Returns True if bypass succeeded, False otherwise. - """ + """Attempt to bypass Cloudflare/DDOS-Guard protection using multiple methods.""" max_retries = max_retries if max_retries is not None else app_config.MAX_RETRY - # Unified method order - works for both Cloudflare and DDOS-Guard - # Prioritizes CDP Mode (stealthier), falls back to UC Mode PyAutoGUI methods - methods = [ - _bypass_method_cdp_solve, # CDP Mode, WebDriver disconnected - _bypass_method_cdp_click, # CDP native click, no PyAutoGUI - _bypass_method_cdp_gui_click, # CDP with PyAutoGUI (most human-like) - _bypass_method_handle_captcha, # TAB+SPACEBAR via PyAutoGUI (UC Mode) - _bypass_method_click_captcha, # Direct click via PyAutoGUI (UC Mode) - _bypass_method_humanlike, # Last resort with scroll/refresh - ] + last_challenge_type = None + consecutive_same_challenge = 0 for try_count in range(max_retries): - # Check for cancellation before each attempt - if cancel_flag and cancel_flag.is_set(): - logger.info("Bypass cancelled by user") - raise BypassCancelledException("Bypass cancelled") + _check_cancellation(cancel_flag, "Bypass cancelled by user") if _is_bypassed(sb): if try_count == 0: - logger.info("Page already bypassed (no challenge or auto-solved by uc_open_with_reconnect)") + logger.info("Page already bypassed") return True - # Log challenge type for debugging (but don't branch on it) challenge_type = _detect_challenge_type(sb) - logger.info(f"Challenge detected: {challenge_type}") + logger.debug(f"Challenge detected: {challenge_type}") - method = methods[try_count % len(methods)] + # No challenge detected but page doesn't look bypassed - wait and retry + if challenge_type == "none": + logger.info("No challenge detected, waiting for page to settle...") + time.sleep(random.uniform(2, 3)) + if _is_bypassed(sb): + return True + # Try a simple reconnect instead of captcha methods + try: + sb.reconnect() + time.sleep(random.uniform(1, 2)) + if _is_bypassed(sb): + logger.info("Bypass successful after reconnect") + return True + except Exception as e: + logger.debug(f"Reconnect during no-challenge wait failed: {e}") + continue + + if challenge_type == last_challenge_type: + consecutive_same_challenge += 1 + if consecutive_same_challenge >= MAX_CONSECUTIVE_SAME_CHALLENGE: + logger.warning( + f"Same challenge ({challenge_type}) detected {consecutive_same_challenge} times - aborting" + ) + return False + else: + consecutive_same_challenge = 1 + last_challenge_type = challenge_type + + method = BYPASS_METHODS[try_count % len(BYPASS_METHODS)] logger.info(f"Bypass attempt {try_count + 1}/{max_retries} using {method.__name__}") - # Progressive backoff with cancellation checks (randomized) if try_count > 0: wait_time = min(random.uniform(2, 4) * try_count, 12) logger.info(f"Waiting {wait_time:.1f}s before trying...") - # Check cancellation during wait 
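Two counters drive the loop in `_bypass`: `try_count` walks `BYPASS_METHODS` round-robin, while `consecutive_same_challenge` tracks repeated sightings of one challenge type and aborts after three in a row even if retries remain. The rotation in isolation (method names are stand-ins for the callables above):

BYPASS_METHODS = ["cdp_solve", "cdp_click", "cdp_gui_click",
                  "handle_captcha", "click_captcha", "humanlike"]

for try_count in range(8):  # e.g. MAX_RETRY = 8
    print(try_count + 1, BYPASS_METHODS[try_count % len(BYPASS_METHODS)])
# Attempts 7 and 8 wrap back around to cdp_solve and cdp_click.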
@@ -730,180 +669,174 @@ def _bypass(sb, max_retries: Optional[int] = None, cancel_flag: Optional[Event]
     logger.warning("Exceeded maximum retries. Bypass failed.")
     return False
 
-def _get_chromium_args():
-    """Build Chrome arguments dynamically, pre-resolving hostnames via Python's DNS.
+def _get_chromium_args() -> list[str]:
+    """Build Chrome arguments, pre-resolving hostnames via Python's patched DNS.
 
-    Instead of trying to configure Chrome's DNS (which is unreliable), we pre-resolve
-    AA hostnames using Python's patched socket (which uses DoH/custom DNS) and pass
-    the resolved IPs directly to Chrome via --host-resolver-rules. This bypasses
-    Chrome's DNS entirely for those hosts.
+    Pre-resolves AA hostnames and passes IPs to Chrome via --host-resolver-rules,
+    bypassing Chrome's DNS entirely for those hosts.
     """
     arguments = [
-        # Ignore certificate and SSL errors (similar to curl's --insecure)
         "--ignore-certificate-errors",
         "--ignore-ssl-errors",
         "--allow-running-insecure-content",
         "--ignore-certificate-errors-spki-list",
         "--ignore-certificate-errors-skip-list"
     ]
-
-    # Conditionally add verbose logging arguments
+
     if app_config.get("DEBUG", False):
         arguments.extend([
-            "--enable-logging",  # Enable Chrome browser logging
-            "--v=1",             # Set verbosity level for Chrome logs
+            "--enable-logging",
+            "--v=1",
             "--log-file=" + str(LOG_DIR / "chrome_browser.log")
         ])
 
-    # Add proxy settings if configured
-    proxies = _get_proxies()
+    proxies = get_proxies()
     if proxies:
         proxy_url = proxies.get('https') or proxies.get('http')
         if proxy_url:
             arguments.append(f'--proxy-server={proxy_url}')
 
-    # --- Pre-resolve AA hostnames and map them directly in Chrome ---
-    # This bypasses Chrome's DNS entirely - we resolve via Python's patched socket.getaddrinfo
-    # (which uses DoH/Cloudflare when system DNS fails) and tell Chrome to use those IPs
-    host_rules = []
-
-    try:
-        aa_urls = network.get_available_aa_urls()
-        for url in aa_urls:
-            hostname = urlparse(url).hostname
-            if hostname:
-                try:
-                    # Use socket.getaddrinfo which IS patched by our network module
-                    # (DoH/Cloudflare if system DNS failed)
-                    # getaddrinfo returns: [(family, type, proto, canonname, sockaddr), ...]
-                    # sockaddr for IPv4 is (ip, port)
-                    results = socket.getaddrinfo(hostname, 443, socket.AF_INET)
-                    if results:
-                        ip = results[0][4][0]  # First result, sockaddr tuple, IP address
-                        host_rules.append(f"MAP {hostname} {ip}")
-                        logger.debug(f"Chrome: Pre-resolved {hostname} -> {ip}")
-                    else:
-                        logger.warning(f"Chrome: No addresses returned for {hostname}")
-                except socket.gaierror as e:
-                    logger.warning(f"Chrome: Could not pre-resolve {hostname}: {e}")
-
-        if host_rules:
-            # Join all rules with comma, e.g. "MAP host1 ip1, MAP host2 ip2"
-            rules_str = ", ".join(host_rules)
-            arguments.append(f'--host-resolver-rules={rules_str}')
-            logger.info(f"Chrome: Using host resolver rules for {len(host_rules)} hosts")
-        else:
-            logger.warning("Chrome: No hosts could be pre-resolved, Chrome will use its own DNS")
-
-    except Exception as e:
-        logger.error_trace(f"Error pre-resolving hostnames for Chrome: {e}")
-
+    host_rules = _build_host_resolver_rules()
+    if host_rules:
+        arguments.append(f'--host-resolver-rules={", ".join(host_rules)}')
+        logger.debug(f"Chrome: Using host resolver rules for {len(host_rules)} hosts")
+    else:
+        logger.warning("Chrome: No hosts could be pre-resolved")
+
     return arguments
 
-def _get(url, retry: Optional[int] = None, cancel_flag: Optional[Event] = None):
-    retry = retry if retry is not None else app_config.MAX_RETRY
-    # Check for cancellation before starting
-    if cancel_flag and cancel_flag.is_set():
-        logger.info("Bypass cancelled before starting")
-        raise BypassCancelledException("Bypass cancelled")
+
+def _build_host_resolver_rules() -> list[str]:
+    """Pre-resolve AA hostnames and build Chrome host resolver rules."""
+    host_rules = []
     try:
-        logger.info(f"SB_GET: {url}")
+        for url in network.get_available_aa_urls():
+            hostname = urlparse(url).hostname
+            if not hostname:
+                continue
+
+            try:
+                results = socket.getaddrinfo(hostname, 443, socket.AF_INET)
+                if results:
+                    ip = results[0][4][0]
+                    host_rules.append(f"MAP {hostname} {ip}")
+                    logger.debug(f"Chrome: Pre-resolved {hostname} -> {ip}")
+                else:
+                    logger.warning(f"Chrome: No addresses returned for {hostname}")
+            except socket.gaierror as e:
+                logger.warning(f"Chrome: Could not pre-resolve {hostname}: {e}")
+    except Exception as e:
+        logger.error_trace(f"Error pre-resolving hostnames for Chrome: {e}")
+
+    return host_rules
+
+
+DRIVER_RESET_ERRORS = {"WebDriverException", "SessionNotCreatedException", "TimeoutException", "MaxRetryError"}
+
+
+def _get(url: str, retry: Optional[int] = None, cancel_flag: Optional[Event] = None) -> str:
+    """Fetch URL with Cloudflare bypass. Retries on failure."""
+    retry = retry if retry is not None else app_config.MAX_RETRY
+    _check_cancellation(cancel_flag, "Bypass cancelled before starting")
+
+    try:
+        logger.debug(f"SB_GET: {url}")
         sb = _get_driver()
 
-        # Enhanced page loading with better error handling
+        hostname = urlparse(url).hostname or ""
+        if has_valid_cf_cookies(hostname):
+            reconnect_time = 1.0
+            logger.debug(f"Using fast reconnect ({reconnect_time}s) - valid cookies exist")
+        else:
+            reconnect_time = app_config.DEFAULT_SLEEP
+            logger.debug(f"Using standard reconnect ({reconnect_time}s) - no cached cookies")
+
         logger.debug("Opening URL with SeleniumBase...")
-        sb.uc_open_with_reconnect(url, app_config.DEFAULT_SLEEP)
-        time.sleep(app_config.DEFAULT_SLEEP)
+        sb.uc_open_with_reconnect(url, reconnect_time)
 
-        # Check for cancellation after page load
-        if cancel_flag and cancel_flag.is_set():
-            logger.info("Bypass cancelled after page load")
-            raise BypassCancelledException("Bypass cancelled")
+        _check_cancellation(cancel_flag, "Bypass cancelled after page load")
 
-        # Log current page title and URL for debugging
         try:
-            current_url = sb.get_current_url()
-            current_title = sb.get_title()
-            logger.debug(f"Page loaded - URL: {current_url}, Title: {current_title}")
-        except Exception as debug_e:
-            logger.debug(f"Could not get page info: {debug_e}")
+            logger.debug(f"Page loaded - URL: {sb.get_current_url()}, Title: {sb.get_title()}")
+        except Exception as e:
+            logger.debug(f"Could not get page info: {e}")
 
-        # Attempt bypass with cancellation support
         logger.debug("Starting bypass process...")
         if _bypass(sb, cancel_flag=cancel_flag):
-            logger.info("Bypass successful.")
-            # Extract cookies for sharing with requests library
             _extract_cookies_from_driver(sb, url)
             return sb.page_source
-        else:
-            logger.warning("Bypass completed but page still shows Cloudflare protection")
-            # Log page content for debugging (truncated)
-            try:
-                page_text = sb.get_text("body")[:500] + "..." if len(sb.get_text("body")) > 500 else sb.get_text("body")
-                logger.debug(f"Page content: {page_text}")
-            except Exception:
-                pass
+
+        logger.warning("Bypass completed but page still shows protection")
+        try:
+            body = sb.get_text("body")
+            logger.debug(f"Page content: {body[:500]}..." if len(body) > 500 else body)
+        except Exception:
+            pass
 
     except BypassCancelledException:
         raise
     except Exception as e:
-        error_details = f"Exception type: {type(e).__name__}, Message: {str(e)}"
-        stack_trace = traceback.format_exc()
+        error_details = f"{type(e).__name__}: {e}"
 
         if retry == 0:
-            logger.error(f"Failed to initialize browser after all retries: {error_details}")
-            logger.debug(f"Full stack trace: {stack_trace}")
+            logger.error(f"Failed after all retries: {error_details}")
+            logger.debug(f"Stack trace: {traceback.format_exc()}")
             _reset_driver()
-            raise e
+            raise
 
-        logger.warning(f"Failed to bypass Cloudflare (retry {app_config.MAX_RETRY - retry + 1}/{app_config.MAX_RETRY}): {error_details}")
-        logger.debug(f"Stack trace: {stack_trace}")
+        logger.warning(f"Bypass failed (retry {app_config.MAX_RETRY - retry + 1}/{app_config.MAX_RETRY}): {error_details}")
+        logger.debug(f"Stack trace: {traceback.format_exc()}")
 
-        # Reset driver on certain errors
-        error_type = type(e).__name__
-        if error_type in ("WebDriverException", "SessionNotCreatedException", "TimeoutException", "MaxRetryError"):
+        if type(e).__name__ in DRIVER_RESET_ERRORS:
            logger.info("Restarting bypasser due to browser error...")
            _reset_driver()
 
-        # Check for cancellation before retry
-        if cancel_flag and cancel_flag.is_set():
-            logger.info("Bypass cancelled before retry")
-            raise BypassCancelledException("Bypass cancelled")
-
+        _check_cancellation(cancel_flag, "Bypass cancelled before retry")
         return _get(url, retry - 1, cancel_flag)
 
 
-def get(url, retry: Optional[int] = None, cancel_flag: Optional[Event] = None):
+def get(url: str, retry: Optional[int] = None, cancel_flag: Optional[Event] = None) -> str:
     """Fetch a URL with protection bypass."""
     retry = retry if retry is not None else app_config.MAX_RETRY
     global LAST_USED
+
     with LOCKED:
-        # Check for cookies AFTER acquiring lock - another request may have
-        # completed bypass while we were waiting, making Chrome unnecessary
-        parsed = urlparse(url)
-        cookies = get_cf_cookies_for_domain(parsed.hostname or "")
+        # Try cookies first - another request may have completed bypass while waiting
+        cookies = get_cf_cookies_for_domain(urlparse(url).hostname or "")
         if cookies:
             try:
-                response = requests.get(url, cookies=cookies, proxies=_get_proxies(), timeout=(5, 10))
+                response = requests.get(url, cookies=cookies, proxies=get_proxies(), timeout=(5, 10))
                 if response.status_code == 200:
-                    logger.debug(f"Cookies available after lock wait - skipped Chrome")
+                    logger.debug("Cookies available after lock wait - skipped Chrome")
                    LAST_USED = time.time()
                     return response.text
             except Exception:
-                pass  # Fall through to Chrome bypass
+                pass
 
-        ret = _get(url, retry, cancel_flag)
+        result = _get(url, retry, cancel_flag)
         LAST_USED = time.time()
-        return ret
+        return result
 
 
-def _init_driver():
+def _init_driver() -> Driver:
+    """Initialize the Chrome driver with undetected-chromedriver settings."""
     global DRIVER
     if DRIVER:
         _reset_driver()
 
-    # Build Chrome args dynamically to pick up current DNS settings from network module
     chromium_args = _get_chromium_args()
+    screen_width, screen_height = get_screen_size()
+
     logger.debug(f"Initializing Chrome driver with args: {chromium_args}")
-    driver = Driver(uc=True, headless=False, incognito=True, size=f"{VIRTUAL_SCREEN_SIZE[0]},{VIRTUAL_SCREEN_SIZE[1]}", chromium_arg=chromium_args)
+    logger.debug(f"Browser screen size: {screen_width}x{screen_height}")
+
+    driver = Driver(
+        uc=True,
+        headless=False,
+        incognito=True,
+        locale="en",
+        ad_block=True,
+        size=f"{screen_width},{screen_height}",
+        chromium_arg=chromium_args,
+    )
     driver.set_page_load_timeout(60)
     DRIVER = driver
     time.sleep(app_config.DEFAULT_SLEEP)
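Chrome's `--host-resolver-rules` switch accepts a comma-separated list of `MAP <host> <ip>` entries, which is why `_build_host_resolver_rules` returns one `MAP` string per hostname for `_get_chromium_args` to join. The resolve-and-join step in isolation (hostnames are illustrative):

import socket

def build_rules(hostnames):
    rules = []
    for hostname in hostnames:
        # getaddrinfo entries are (family, type, proto, canonname, sockaddr);
        # for IPv4, sockaddr is (ip, port).
        results = socket.getaddrinfo(hostname, 443, socket.AF_INET)
        if results:
            rules.append(f"MAP {hostname} {results[0][4][0]}")
    return ", ".join(rules)

flag = f'--host-resolver-rules={build_rules(["example.org", "example.net"])}'
# -> --host-resolver-rules=MAP example.org <ip>, MAP example.net <ip>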
if len(body) > 500 else body) + except Exception: + pass except BypassCancelledException: raise except Exception as e: - error_details = f"Exception type: {type(e).__name__}, Message: {str(e)}" - stack_trace = traceback.format_exc() + error_details = f"{type(e).__name__}: {e}" if retry == 0: - logger.error(f"Failed to initialize browser after all retries: {error_details}") - logger.debug(f"Full stack trace: {stack_trace}") + logger.error(f"Failed after all retries: {error_details}") + logger.debug(f"Stack trace: {traceback.format_exc()}") _reset_driver() - raise e + raise - logger.warning(f"Failed to bypass Cloudflare (retry {app_config.MAX_RETRY - retry + 1}/{app_config.MAX_RETRY}): {error_details}") - logger.debug(f"Stack trace: {stack_trace}") + logger.warning(f"Bypass failed (retry {app_config.MAX_RETRY - retry + 1}/{app_config.MAX_RETRY}): {error_details}") + logger.debug(f"Stack trace: {traceback.format_exc()}") - # Reset driver on certain errors - error_type = type(e).__name__ - if error_type in ("WebDriverException", "SessionNotCreatedException", "TimeoutException", "MaxRetryError"): + if type(e).__name__ in DRIVER_RESET_ERRORS: logger.info("Restarting bypasser due to browser error...") _reset_driver() - # Check for cancellation before retry - if cancel_flag and cancel_flag.is_set(): - logger.info("Bypass cancelled before retry") - raise BypassCancelledException("Bypass cancelled") - + _check_cancellation(cancel_flag, "Bypass cancelled before retry") return _get(url, retry - 1, cancel_flag) -def get(url, retry: Optional[int] = None, cancel_flag: Optional[Event] = None): +def get(url: str, retry: Optional[int] = None, cancel_flag: Optional[Event] = None) -> str: """Fetch a URL with protection bypass.""" retry = retry if retry is not None else app_config.MAX_RETRY global LAST_USED + with LOCKED: - # Check for cookies AFTER acquiring lock - another request may have - # completed bypass while we were waiting, making Chrome unnecessary - parsed = urlparse(url) - cookies = get_cf_cookies_for_domain(parsed.hostname or "") + # Try cookies first - another request may have completed bypass while waiting + cookies = get_cf_cookies_for_domain(urlparse(url).hostname or "") if cookies: try: - response = requests.get(url, cookies=cookies, proxies=_get_proxies(), timeout=(5, 10)) + response = requests.get(url, cookies=cookies, proxies=get_proxies(), timeout=(5, 10)) if response.status_code == 200: - logger.debug(f"Cookies available after lock wait - skipped Chrome") + logger.debug("Cookies available after lock wait - skipped Chrome") LAST_USED = time.time() return response.text except Exception: - pass # Fall through to Chrome bypass + pass - ret = _get(url, retry, cancel_flag) + result = _get(url, retry, cancel_flag) LAST_USED = time.time() - return ret + return result -def _init_driver(): +def _init_driver() -> Driver: + """Initialize the Chrome driver with undetected-chromedriver settings.""" global DRIVER if DRIVER: _reset_driver() - # Build Chrome args dynamically to pick up current DNS settings from network module + chromium_args = _get_chromium_args() + screen_width, screen_height = get_screen_size() + logger.debug(f"Initializing Chrome driver with args: {chromium_args}") - driver = Driver(uc=True, headless=False, incognito=True, size=f"{VIRTUAL_SCREEN_SIZE[0]},{VIRTUAL_SCREEN_SIZE[1]}", chromium_arg=chromium_args) + logger.debug(f"Browser screen size: {screen_width}x{screen_height}") + + driver = Driver( + uc=True, + headless=False, + incognito=True, + locale="en", + ad_block=True, + 
size=f"{screen_width},{screen_height}", + chromium_arg=chromium_args, + ) driver.set_page_load_timeout(60) DRIVER = driver time.sleep(app_config.DEFAULT_SLEEP) @@ -916,19 +849,24 @@ def _ensure_display_initialized(): return if not (env.DOCKERMODE and app_config.get("USE_CF_BYPASS", True)): return - + from pyvirtualdisplay import Display - display = Display(visible=False, size=VIRTUAL_SCREEN_SIZE) + # Get the screen size (generates a random one if not already set) + screen_width, screen_height = get_screen_size() + # Add padding for browser chrome (title bar, borders, taskbar space) + display_width = screen_width + 100 + display_height = screen_height + 150 + display = Display(visible=False, size=(display_width, display_height)) display.start() DISPLAY["xvfb"] = display - logger.info("Virtual display started") + logger.info(f"Virtual display started: {display_width}x{display_height}") time.sleep(app_config.DEFAULT_SLEEP) _reset_pyautogui_display_state() def _get_driver(): global DRIVER, DISPLAY, LAST_USED - logger.info("Getting driver...") + logger.debug("Getting driver...") LAST_USED = time.time() _ensure_display_initialized() @@ -941,11 +879,16 @@ def _get_driver(): timestamp = datetime.now().strftime("%y%m%d-%H%M%S") output_file = RECORDING_DIR / f"screen_recording_{timestamp}.mp4" + # Get the display size (screen size + padding) + screen_width, screen_height = get_screen_size() + display_width = screen_width + 100 + display_height = screen_height + 150 + ffmpeg_cmd = [ "ffmpeg", "-y", "-f", "x11grab", - "-video_size", f"{VIRTUAL_SCREEN_SIZE[0]}x{VIRTUAL_SCREEN_SIZE[1]}", + "-video_size", f"{display_width}x{display_height}", "-i", f":{display.display}", "-c:v", "libx264", "-preset", "ultrafast", # or "veryfast" (trade speed for slightly better compression) @@ -960,7 +903,7 @@ def _get_driver(): output_file.as_posix(), "-nostats", "-loglevel", "0" ] - logger.info("Starting FFmpeg recording to %s", output_file) + logger.debug("Starting FFmpeg recording to %s", output_file) logger.debug_trace(f"FFmpeg command: {' '.join(ffmpeg_cmd)}") DISPLAY["ffmpeg"] = subprocess.Popen(ffmpeg_cmd) @@ -977,68 +920,62 @@ def _get_driver(): logger.log_resource_usage() return DRIVER -def _reset_driver(): +def _reset_driver() -> None: """Reset the browser driver and cleanup all associated processes.""" logger.log_resource_usage() logger.info("Shutting down Cloudflare bypasser...") global DRIVER, DISPLAY - - # Quit driver + + clear_screen_size() + if DRIVER: try: DRIVER.quit() except Exception as e: logger.warning(f"Error quitting driver: {e}") DRIVER = None - - # Stop virtual display + if DISPLAY["xvfb"]: try: DISPLAY["xvfb"].stop() except Exception as e: logger.warning(f"Error stopping display: {e}") DISPLAY["xvfb"] = None - - # Stop ffmpeg recording + if DISPLAY["ffmpeg"]: try: DISPLAY["ffmpeg"].send_signal(signal.SIGINT) except Exception as e: logger.debug(f"Error stopping ffmpeg: {e}") DISPLAY["ffmpeg"] = None - - # Kill any lingering processes + time.sleep(0.5) for process in ["Xvfb", "ffmpeg", "chrom"]: try: os.system(f"pkill -f {process}") except Exception as e: logger.debug(f"Error killing {process}: {e}") - + time.sleep(0.5) - logger.info("Cloudflare bypasser shut down (browser and display stopped)") + logger.info("Cloudflare bypasser shut down") logger.log_resource_usage() -def _restart_chrome_only(): - """Restart just Chrome (not the display) to pick up new DNS settings. +def _restart_chrome_only() -> None: + """Restart Chrome (not the display) to pick up new DNS settings. 
- Called when DNS provider rotates in auto mode. The display is kept running - to avoid the slower full restart. Chrome will be re-initialized with fresh - pre-resolved IPs from the new DNS provider. + Called when DNS provider rotates. Display is kept running to avoid slower full restart. """ global DRIVER, LAST_USED logger.debug("Restarting Chrome to apply new DNS settings...") - # Quit existing driver if DRIVER: try: DRIVER.quit() except Exception as e: - logger.debug(f"Error quitting driver during DNS rotation restart: {e}") + logger.debug(f"Error quitting driver during DNS rotation: {e}") DRIVER = None - # Kill any lingering Chrome processes (same pattern as _reset_driver) try: os.system("pkill -f chrom") except Exception as e: @@ -1046,58 +983,47 @@ def _restart_chrome_only(): time.sleep(0.5) - # Re-initialize driver with new DNS settings - # _get_chromium_args() will re-resolve hostnames using the new DNS try: _init_driver() LAST_USED = time.time() logger.debug("Chrome restarted with updated DNS settings") except Exception as e: logger.warning(f"Failed to restart Chrome after DNS rotation: {e}") - # Don't raise - the bypasser can try again on next request def _on_dns_rotation(provider_name: str, servers: list, doh_url: str) -> None: """Callback invoked when network.py rotates DNS provider. - If Chrome is currently running, schedule a restart in the background. - This is async to avoid blocking the request that triggered DNS rotation. + Schedules an async Chrome restart to avoid blocking the current request. """ - global DRIVER, _dns_rotation_pending + global _dns_rotation_pending if DRIVER is None: return - # Always set pending flag - the restart will happen asynchronously - # This avoids blocking the current request (which triggered DNS rotation) with _dns_rotation_lock: if _dns_rotation_pending: - return # Already scheduled + return _dns_rotation_pending = True def _async_restart(): global _dns_rotation_pending logger.debug(f"DNS rotated to {provider_name} - restarting Chrome in background") with LOCKED: - # Clear flag before restart (under lock to be safe) with _dns_rotation_lock: _dns_rotation_pending = False _restart_chrome_only() - restart_thread = threading.Thread(target=_async_restart, daemon=True) - restart_thread.start() + threading.Thread(target=_async_restart, daemon=True).start() -def _cleanup_driver(): +def _cleanup_driver() -> None: """Reset driver after inactivity timeout. - Uses a longer timeout (4x) when UI clients are connected to avoid - resetting while users are actively browsing. After all clients disconnect, - the standard timeout applies as a grace period before shutdown. + Uses 4x longer timeout when UI clients are connected. 
""" global LAST_USED - # Check for active UI connections try: from cwa_book_downloader.api.websocket import ws_manager has_active_clients = ws_manager.has_active_connections() @@ -1105,117 +1031,96 @@ def _cleanup_driver(): ws_manager = None has_active_clients = False - # Use longer timeout when UI is connected (user might be browsing) timeout_minutes = app_config.BYPASS_RELEASE_INACTIVE_MIN if has_active_clients: - timeout_minutes *= 4 # 20 min default when UI open vs 5 min after disconnect + timeout_minutes *= 4 with LOCKED: - if LAST_USED and time.time() - LAST_USED >= timeout_minutes * 60: - logger.info(f"Cloudflare bypasser idle for {timeout_minutes} min - shutting down to free resources") - _reset_driver() - LAST_USED = None + if not LAST_USED or time.time() - LAST_USED < timeout_minutes * 60: + return - # If clients are still connected, request warmup on next connect so the - # bypasser restarts when the user becomes active again - if has_active_clients and ws_manager: - ws_manager.request_warmup_on_next_connect() - logger.debug("Requested warmup on next client connect (clients still connected)") + logger.info(f"Bypasser idle for {timeout_minutes} min - shutting down") + _reset_driver() + LAST_USED = None + logger.log_resource_usage() -def _cleanup_loop(): + if has_active_clients and ws_manager: + ws_manager.request_warmup_on_next_connect() + logger.debug("Requested warmup on next client connect") + +def _cleanup_loop() -> None: + """Background loop that periodically checks for idle timeout.""" while True: _cleanup_driver() time.sleep(max(app_config.BYPASS_RELEASE_INACTIVE_MIN / 2, 1)) -def _init_cleanup_thread(): - cleanup_thread = threading.Thread(target=_cleanup_loop) - cleanup_thread.daemon = True - cleanup_thread.start() -def warmup(): - """Pre-initialize the virtual display and Chrome browser to eliminate cold start time. - - This function can be called when a user connects to the web UI to - warm up the bypasser environment before it's actually needed. - Both the display and Chrome driver are initialized so the first - bypass request is nearly instant. - - Warmup is skipped in the following scenarios: - - BYPASS_WARMUP_ON_CONNECT is false (explicit disable) - - Not running in Docker mode - - USE_CF_BYPASS is disabled - - AA_DONATOR_KEY is set (user has fast downloads, bypass rarely needed) - - Note: Even when warmup is skipped, the bypasser can still start on-demand - when actually needed for a download. - """ - global DRIVER, LAST_USED - +def _init_cleanup_thread() -> None: + """Start the background cleanup thread.""" + threading.Thread(target=_cleanup_loop, daemon=True).start() + +def _should_warmup() -> bool: + """Check if warmup should proceed based on configuration.""" if not app_config.get("BYPASS_WARMUP_ON_CONNECT", True): - logger.debug("Bypasser warmup disabled via BYPASS_WARMUP_ON_CONNECT") - return - + logger.debug("Bypasser warmup disabled via config") + return False if not env.DOCKERMODE: logger.debug("Bypasser warmup skipped - not in Docker mode") - return - + return False if not app_config.get("USE_CF_BYPASS", True): logger.debug("Bypasser warmup skipped - CF bypass disabled") - return - + return False if app_config.get("AA_DONATOR_KEY", ""): - logger.debug("Bypasser warmup skipped - AA donator key set (fast downloads available)") + logger.debug("Bypasser warmup skipped - AA donator key set") + return False + return True + + +def warmup() -> None: + """Pre-initialize the virtual display and Chrome browser. + + Called when a user connects to the web UI. 
Skipped if warmup is disabled, + not in Docker mode, CF bypass is disabled, or AA donator key is set. + """ + global LAST_USED + + if not _should_warmup(): return with LOCKED: if is_warmed_up(): - logger.debug("Bypasser already fully warmed up") + logger.debug("Bypasser already warmed up") return - # Clean up any orphan processes from previous crashes before starting fresh. - # This must be AFTER the is_warmed_up() check to avoid killing a healthy - # Chrome browser that was started by a previous warmup. _cleanup_orphan_processes() - # If we get here, either nothing is initialized OR the driver is unhealthy. - # Reset any stale state before reinitializing to avoid the warmup thinking - # things are already set up when the underlying processes are dead. if DRIVER is not None or DISPLAY["xvfb"] is not None: logger.info("Resetting stale bypasser state before warmup...") _reset_driver() - logger.info("Warming up Cloudflare bypasser (pre-initializing display and browser)...") - + logger.info("Warming up Cloudflare bypasser...") + try: - # Initialize virtual display (FFmpeg recording starts later on first actual request) _ensure_display_initialized() - - # Initialize Chrome driver + if DRIVER is None: logger.info("Pre-initializing Chrome browser...") _init_driver() LAST_USED = time.time() logger.info("Chrome browser ready") - - logger.info("Bypasser warmup complete - ready for instant bypass") + + logger.info("Bypasser warmup complete") logger.log_resource_usage() - + except Exception as e: logger.warning(f"Failed to warm up bypasser: {e}") def _is_driver_healthy() -> bool: - """Check if the Chrome driver is actually responsive (not just non-None). - - The DRIVER variable can be non-None but the underlying Chrome process may have - crashed silently. This function pings the driver to verify it's actually alive. - """ - global DRIVER + """Check if the Chrome driver is responsive (not just non-None).""" if DRIVER is None: return False try: - # Try a simple operation that requires the driver to be responsive - # get_current_url() is lightweight and doesn't change state DRIVER.get_current_url() return True except Exception as e: @@ -1224,89 +1129,78 @@ def _is_driver_healthy() -> bool: def is_warmed_up() -> bool: - """Check if the bypasser is fully warmed up (display and browser initialized and healthy).""" + """Check if the bypasser is fully warmed up (display and browser initialized).""" if DISPLAY["xvfb"] is None or DRIVER is None: return False return _is_driver_healthy() -def shutdown_if_idle(): + +def shutdown_if_idle() -> None: """Start the inactivity countdown when all WebSocket clients disconnect. - - Instead of immediately shutting down, this sets LAST_USED to start the - inactivity timer. The cleanup loop will then shut down the bypasser after - BYPASS_RELEASE_INACTIVE_MIN minutes, giving users a grace period to return - (e.g., if they refresh the page or briefly navigate away). - - If there are active downloads, the timer naturally won't trigger until - they complete since LAST_USED gets updated on each bypass operation. + + Sets LAST_USED to start the timer. The cleanup loop shuts down after + BYPASS_RELEASE_INACTIVE_MIN minutes of inactivity. 
""" global LAST_USED - + with LOCKED: if not is_warmed_up(): logger.debug("Bypasser already shut down") return - - # Start the inactivity countdown + LAST_USED = time.time() - logger.info(f"All clients disconnected - bypasser will shut down after {app_config.BYPASS_RELEASE_INACTIVE_MIN} min of inactivity") + logger.info(f"All clients disconnected - shutdown after {app_config.BYPASS_RELEASE_INACTIVE_MIN} min of inactivity") _init_cleanup_thread() -# Register for DNS rotation notifications so Chrome can restart with new DNS settings -# Only register if using the internal Chrome bypasser (not external FlareSolverr) -# Note: This module is only imported when internal bypasser is selected, so this check -# is redundant but kept for safety. Use app_config for consistency with other modules. +# Register for DNS rotation notifications (Chrome restarts with new DNS settings) if app_config.get("USE_CF_BYPASS", True) and not app_config.get("USING_EXTERNAL_BYPASSER", False): network.register_dns_rotation_callback(_on_dns_rotation) -def get_bypassed_page(url: str, selector: Optional[network.AAMirrorSelector] = None, cancel_flag: Optional[Event] = None) -> Optional[str]: - """Fetch HTML content from a URL using the internal Cloudflare Bypasser. +def _try_with_cached_cookies(url: str, hostname: str) -> Optional[str]: + """Attempt request with cached cookies before using Chrome.""" + cookies = get_cf_cookies_for_domain(hostname) + if not cookies: + return None - Args: - url: Target URL - selector: Optional mirror selector for AA URL rewriting - cancel_flag: Optional threading Event to signal cancellation + try: + headers = {} + stored_ua = get_cf_user_agent_for_domain(hostname) + if stored_ua: + headers['User-Agent'] = stored_ua - Returns: - str: HTML content if successful, None otherwise + logger.debug(f"Trying request with cached cookies: {url}") + response = requests.get(url, cookies=cookies, headers=headers, proxies=get_proxies(), timeout=(5, 10)) + if response.status_code == 200: + logger.debug("Cached cookies worked, skipped Chrome bypass") + return response.text + except Exception: + pass - Raises: - BypassCancelledException: If cancel_flag is set during operation - """ + return None + + +def get_bypassed_page( + url: str, + selector: Optional[network.AAMirrorSelector] = None, + cancel_flag: Optional[Event] = None +) -> Optional[str]: + """Fetch HTML content from a URL using the internal Cloudflare Bypasser.""" sel = selector or network.AAMirrorSelector() attempt_url = sel.rewrite(url) + hostname = urlparse(attempt_url).hostname or "" - # Before using Chrome, check if cookies are available (from a previous bypass) - # This helps concurrent downloads avoid unnecessary Chrome usage - parsed = urlparse(attempt_url) - hostname = parsed.hostname or "" - cookies = get_cf_cookies_for_domain(hostname) - if cookies: - try: - # Use stored UA - Cloudflare ties cf_clearance to the UA that solved the challenge - headers = {} - stored_ua = get_cf_user_agent_for_domain(hostname) - if stored_ua: - headers['User-Agent'] = stored_ua - logger.debug(f"Trying request with cached cookies before Chrome: {attempt_url}") - response = requests.get(attempt_url, cookies=cookies, headers=headers, proxies=_get_proxies(), timeout=(5, 10)) - if response.status_code == 200: - logger.debug(f"Cached cookies worked, skipped Chrome bypass") - return response.text - except Exception: - pass # Fall through to Chrome bypass + cached_result = _try_with_cached_cookies(attempt_url, hostname) + if cached_result: + return cached_result try: 
        response_html = get(attempt_url, cancel_flag=cancel_flag)
    except BypassCancelledException:
        raise
    except Exception:
-        # Check for cancellation before retry
-        if cancel_flag and cancel_flag.is_set():
-            raise BypassCancelledException("Bypass cancelled")
-        # On failure, try mirror/DNS rotation for AA-like URLs
+        _check_cancellation(cancel_flag, "Bypass cancelled")
        new_base, action = sel.next_mirror_or_rotate_dns()
        if action in ("mirror", "dns") and new_base:
            attempt_url = sel.rewrite(url)
@@ -1314,8 +1208,7 @@ def get_bypassed_page(url: str, selector: Optional[network.AAMirrorSelector] = N
        else:
            raise

-    logger.debug(f"Cloudflare Bypasser response length: {len(response_html)}")
-    if response_html.strip() == "":
+    if not response_html.strip():
        raise requests.exceptions.RequestException("Failed to bypass Cloudflare")

    return response_html
diff --git a/cwa_book_downloader/config/env.py b/cwa_book_downloader/config/env.py
index c2f36d2..b4dbc59 100644
--- a/cwa_book_downloader/config/env.py
+++ b/cwa_book_downloader/config/env.py
@@ -41,33 +41,47 @@ def _read_debug_from_config() -> bool:
 # Authentication and session settings
 SESSION_COOKIE_SECURE_ENV = os.getenv("SESSION_COOKIE_SECURE", "false")

-CWA_DB = os.getenv("CWA_DB_PATH")
-CWA_DB_PATH = Path(CWA_DB) if CWA_DB else None
+def _resolve_cwa_db_path() -> Path | None:
+    """
+    Resolve the Calibre-Web database path.
+
+    Priority:
+    1. CWA_DB_PATH env var (backwards compatibility)
+    2. Default path /auth/app.db if it exists and is a valid SQLite file
+
+    Returns None if no valid database is found.
+    """
+    # Check env var first (backwards compatibility)
+    env_path = os.getenv("CWA_DB_PATH")
+    if env_path:
+        path = Path(env_path)
+        if path.exists() and path.is_file() and _is_sqlite_file(path):
+            return path
+
+    # Check default mount path
+    default_path = Path("/auth/app.db")
+    if default_path.exists() and default_path.is_file() and _is_sqlite_file(default_path):
+        return default_path
+
+    return None
+
+
+def _is_sqlite_file(path: Path) -> bool:
+    """Check if a file is a valid SQLite database by reading magic bytes."""
+    try:
+        with open(path, "rb") as f:
+            header = f.read(16)
+            return header == b"SQLite format 3\x00"
+    except OSError:
+        return False
+
+
+CWA_DB_PATH = _resolve_cwa_db_path()

 CONFIG_DIR = Path(os.getenv("CONFIG_DIR", "/config"))
 LOG_ROOT = Path(os.getenv("LOG_ROOT", "/var/log/"))
 LOG_DIR = LOG_ROOT / "cwa-book-downloader"
 TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader"))
 INGEST_DIR = Path(os.getenv("INGEST_DIR", "/cwa-book-ingest"))
-INGEST_DIR_BOOK_FICTION = os.getenv("INGEST_DIR_BOOK_FICTION", "")
-INGEST_DIR_BOOK_NON_FICTION = os.getenv("INGEST_DIR_BOOK_NON_FICTION", "")
-INGEST_DIR_BOOK_UNKNOWN = os.getenv("INGEST_DIR_BOOK_UNKNOWN", "")
-INGEST_DIR_MAGAZINE = os.getenv("INGEST_DIR_MAGAZINE", "")
-INGEST_DIR_COMIC_BOOK = os.getenv("INGEST_DIR_COMIC_BOOK", "")
-INGEST_DIR_AUDIOBOOK = os.getenv("INGEST_DIR_AUDIOBOOK", "")
-INGEST_DIR_STANDARDS_DOCUMENT = os.getenv("INGEST_DIR_STANDARDS_DOCUMENT", "")
-INGEST_DIR_MUSICAL_SCORE = os.getenv("INGEST_DIR_MUSICAL_SCORE", "")
-INGEST_DIR_OTHER = os.getenv("INGEST_DIR_OTHER", "")
-DOWNLOAD_PATHS = {
-    "book (fiction)": Path(INGEST_DIR_BOOK_FICTION) if INGEST_DIR_BOOK_FICTION else INGEST_DIR,
-    "book (non-fiction)": Path(INGEST_DIR_BOOK_NON_FICTION) if INGEST_DIR_BOOK_NON_FICTION else INGEST_DIR,
-    "book (unknown)": Path(INGEST_DIR_BOOK_UNKNOWN) if INGEST_DIR_BOOK_UNKNOWN else INGEST_DIR,
-    "magazine": Path(INGEST_DIR_MAGAZINE) if INGEST_DIR_MAGAZINE else
INGEST_DIR, - "comic book": Path(INGEST_DIR_COMIC_BOOK) if INGEST_DIR_COMIC_BOOK else INGEST_DIR, - "audiobook": Path(INGEST_DIR_AUDIOBOOK) if INGEST_DIR_AUDIOBOOK else INGEST_DIR, - "standards document": Path(INGEST_DIR_STANDARDS_DOCUMENT) if INGEST_DIR_STANDARDS_DOCUMENT else INGEST_DIR, - "musical score": Path(INGEST_DIR_MUSICAL_SCORE) if INGEST_DIR_MUSICAL_SCORE else INGEST_DIR, - "other": Path(INGEST_DIR_OTHER) if INGEST_DIR_OTHER else INGEST_DIR, -} STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", "3600")) USE_BOOK_TITLE = string_to_bool(os.getenv("USE_BOOK_TITLE", "false")) diff --git a/cwa_book_downloader/config/security.py b/cwa_book_downloader/config/security.py index c785004..045f571 100644 --- a/cwa_book_downloader/config/security.py +++ b/cwa_book_downloader/config/security.py @@ -142,14 +142,12 @@ def security_settings(): key="USE_CWA_AUTH", label="Use Calibre-Web Database", description=( - "Authenticate using your existing Calibre-Web users instead of the credentials above." - if cwa_db_available - else "Authenticate using your existing Calibre-Web users. Set the CWA_DB_PATH environment variable to your Calibre-Web app.db file to enable this option." + "Use your existing Calibre-Web user credentials for authentication." ), default=False, env_supported=False, disabled=not cwa_db_available, - disabled_reason="Set the CWA_DB_PATH environment variable to your Calibre-Web app.db file path to enable this option.", + disabled_reason="Mount your Calibre-Web app.db to /auth/app.db in docker compose to enable.", ), ] diff --git a/cwa_book_downloader/config/settings.py b/cwa_book_downloader/config/settings.py index 1018f01..d8a082c 100644 --- a/cwa_book_downloader/config/settings.py +++ b/cwa_book_downloader/config/settings.py @@ -61,14 +61,6 @@ if env.USING_EXTERNAL_BYPASSER and env.USE_CF_BYPASS: "or consider using the internal bypasser which integrates with the app's DNS system." 
) -# Proxy settings -PROXIES = {} -if env.HTTP_PROXY: - PROXIES["http"] = env.HTTP_PROXY -if env.HTTPS_PROXY: - PROXIES["https"] = env.HTTPS_PROXY -logger.debug(f"PROXIES: {PROXIES}") - # Anna's Archive settings AA_BASE_URL = env._AA_BASE_URL AA_AVAILABLE_URLS = ["https://annas-archive.org", "https://annas-archive.se", "https://annas-archive.li"] @@ -97,8 +89,7 @@ if CUSTOM_SCRIPT: # Debugging settings if not env.USING_EXTERNAL_BYPASSER: - # Virtual display settings for debugging internal cloudflare bypasser - VIRTUAL_SCREEN_SIZE = (1024, 768) + # Recording directory for debugging internal cloudflare bypasser RECORDING_DIR = env.LOG_DIR / "recording" diff --git a/cwa_book_downloader/core/image_cache.py b/cwa_book_downloader/core/image_cache.py index f4bcc2a..e6e5f78 100644 --- a/cwa_book_downloader/core/image_cache.py +++ b/cwa_book_downloader/core/image_cache.py @@ -563,7 +563,7 @@ def get_image_cache() -> ImageCacheService: max_size_mb=max_size_mb, ttl_seconds=ttl_seconds, ) - logger.info(f"Initialized image cache: {cache_dir} (max {max_size_mb}MB, TTL {ttl_days} days)") + logger.debug(f"Initialized image cache: {cache_dir} (max {max_size_mb}MB, TTL {ttl_days} days)") return _instance diff --git a/cwa_book_downloader/core/logger.py b/cwa_book_downloader/core/logger.py index 8300637..fadd9ef 100644 --- a/cwa_book_downloader/core/logger.py +++ b/cwa_book_downloader/core/logger.py @@ -40,11 +40,21 @@ class CustomLogger(logging.Logger): def log_resource_usage(self): import psutil + + # Sum RSS of all processes for actual app memory + app_memory_mb = 0 + for proc in psutil.process_iter(['memory_info']): + try: + if proc.info['memory_info']: + app_memory_mb += proc.info['memory_info'].rss / (1024 * 1024) + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + memory = psutil.virtual_memory() + system_used_mb = memory.used / (1024 * 1024) available_mb = memory.available / (1024 * 1024) - memory_used_mb = memory.used / (1024 * 1024) cpu_percent = psutil.cpu_percent() - self.debug(f"Container Memory: Available={available_mb:.2f} MB, Used={memory_used_mb:.2f} MB, CPU: {cpu_percent:.2f}%") + self.debug(f"Container Memory: App={app_memory_mb:.2f} MB, System={system_used_mb:.2f} MB, Available={available_mb:.2f} MB, CPU: {cpu_percent:.2f}%") def setup_logger(name: str, log_file: Path = LOG_FILE) -> CustomLogger: diff --git a/cwa_book_downloader/core/utils.py b/cwa_book_downloader/core/utils.py index 6bb99dd..2670fad 100644 --- a/cwa_book_downloader/core/utils.py +++ b/cwa_book_downloader/core/utils.py @@ -5,9 +5,60 @@ Provides common helper functions used across the application. 
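+Also defines the canonical CONTENT_TYPES list and the per-content-type ingest
+directory lookup used to route downloads.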
""" import base64 +from pathlib import Path from typing import Optional +CONTENT_TYPES = [ + "book (fiction)", + "book (non-fiction)", + "book (unknown)", + "magazine", + "comic book", + "audiobook", + "standards document", + "musical score", + "other", +] + +_CONTENT_TYPE_TO_CONFIG_KEY = { + "book (fiction)": "INGEST_DIR_BOOK_FICTION", + "book (non-fiction)": "INGEST_DIR_BOOK_NON_FICTION", + "book (unknown)": "INGEST_DIR_BOOK_UNKNOWN", + "magazine": "INGEST_DIR_MAGAZINE", + "comic book": "INGEST_DIR_COMIC_BOOK", + "audiobook": "INGEST_DIR_AUDIOBOOK", + "standards document": "INGEST_DIR_STANDARDS_DOCUMENT", + "musical score": "INGEST_DIR_MUSICAL_SCORE", + "other": "INGEST_DIR_OTHER", +} + + +def get_ingest_dir(content_type: Optional[str] = None) -> Path: + """Get the ingest directory for a content type, falling back to default.""" + from cwa_book_downloader.core.config import config + + default_ingest_dir = Path(config.get("INGEST_DIR", "/cwa-book-ingest")) + + if not content_type: + return default_ingest_dir + + # Normalize content type for lookup + content_type_lower = content_type.lower().strip() + + # Look up the config key for this content type + config_key = _CONTENT_TYPE_TO_CONFIG_KEY.get(content_type_lower) + if not config_key: + return default_ingest_dir + + # Get the custom directory from config (empty string means use default) + custom_dir = config.get(config_key, "") + if custom_dir: + return Path(custom_dir) + + return default_ingest_dir + + def transform_cover_url(cover_url: Optional[str], cache_id: str) -> Optional[str]: """ Transform an external cover URL to a local proxy URL when caching is enabled. diff --git a/cwa_book_downloader/download/http.py b/cwa_book_downloader/download/http.py index ee30c87..7641dca 100644 --- a/cwa_book_downloader/download/http.py +++ b/cwa_book_downloader/download/http.py @@ -11,6 +11,7 @@ import requests from tqdm import tqdm from cwa_book_downloader.download import network +from cwa_book_downloader.download.network import get_proxies from cwa_book_downloader.core.config import config as app_config from cwa_book_downloader.core.logger import setup_logger @@ -90,31 +91,6 @@ REQUEST_TIMEOUT = (5, 10) # (connect, read) MAX_DOWNLOAD_RETRIES = 2 MAX_RESUME_ATTEMPTS = 3 - -def _get_proxies() -> dict: - """Get current proxy configuration from config singleton.""" - proxy_mode = app_config.get("PROXY_MODE", "none") - - if proxy_mode == "socks5": - socks_proxy = app_config.get("SOCKS5_PROXY", "") - if socks_proxy: - return {"http": socks_proxy, "https": socks_proxy} - elif proxy_mode == "http": - proxies = {} - http_proxy = app_config.get("HTTP_PROXY", "") - https_proxy = app_config.get("HTTPS_PROXY", "") - if http_proxy: - proxies["http"] = http_proxy - if https_proxy: - proxies["https"] = https_proxy - elif http_proxy: - # Fallback: use HTTP proxy for HTTPS if HTTPS proxy not specified - proxies["https"] = http_proxy - return proxies - - return {} - - RETRYABLE_CODES = (429, 500, 502, 503, 504) CONNECTION_ERRORS = (requests.exceptions.ConnectionError, requests.exceptions.Timeout, requests.exceptions.SSLError, requests.exceptions.ChunkedEncodingError) @@ -213,7 +189,7 @@ def html_get_page( stored_ua = get_cf_user_agent_for_domain(hostname) if stored_ua: headers['User-Agent'] = stored_ua - response = requests.get(current_url, proxies=_get_proxies(), timeout=REQUEST_TIMEOUT, cookies=cookies, headers=headers) + response = requests.get(current_url, proxies=get_proxies(), timeout=REQUEST_TIMEOUT, cookies=cookies, headers=headers) 
response.raise_for_status() time.sleep(1) return response.text @@ -309,7 +285,7 @@ def download_url( logger.debug(f"No stored UA available for {hostname}") if cookies: logger.debug(f"Using {len(cookies)} cookies for {hostname}: {list(cookies.keys())}") - response = requests.get(current_url, stream=True, proxies=_get_proxies(), timeout=REQUEST_TIMEOUT, cookies=cookies, headers=headers) + response = requests.get(current_url, stream=True, proxies=get_proxies(), timeout=REQUEST_TIMEOUT, cookies=cookies, headers=headers) response.raise_for_status() if status_callback: @@ -426,7 +402,7 @@ def _try_resume( if stored_ua: resume_headers['User-Agent'] = stored_ua response = requests.get( - url, stream=True, proxies=_get_proxies(), timeout=REQUEST_TIMEOUT, + url, stream=True, proxies=get_proxies(), timeout=REQUEST_TIMEOUT, headers=resume_headers, cookies=cookies ) diff --git a/cwa_book_downloader/download/network.py b/cwa_book_downloader/download/network.py index 6fd2723..511e70b 100644 --- a/cwa_book_downloader/download/network.py +++ b/cwa_book_downloader/download/network.py @@ -18,7 +18,7 @@ from cwa_book_downloader.core.config import config as app_config from datetime import datetime, timedelta -def _get_proxies() -> dict: +def get_proxies() -> dict: """Get current proxy configuration from config singleton.""" proxy_mode = app_config.get("PROXY_MODE", "none") @@ -367,7 +367,7 @@ class DoHResolver: response = self.session.get( self.base_url, params=params, - proxies=_get_proxies(), + proxies=get_proxies(), timeout=10 # Increased from 5s to handle slow network conditions ) response.raise_for_status() @@ -789,7 +789,7 @@ def init_dns_resolvers(): DOH_SERVER = "" config.CUSTOM_DNS = [] config.DOH_SERVER = "" - logger.info("Using system DNS (auto mode - will switch on failure)") + logger.debug("Using system DNS (auto mode - will switch on failure)") socket.getaddrinfo = cast(Any, create_system_failover_getaddrinfo()) return @@ -845,10 +845,10 @@ def _initialize_aa_state() -> None: _current_aa_url_index = _aa_urls.index(state['aa_base_url']) AA_BASE_URL = state['aa_base_url'] else: - logger.info(f"AA_BASE_URL: auto, checking available urls {_aa_urls}") + logger.debug(f"AA_BASE_URL: auto, checking available urls {_aa_urls}") for i, url in enumerate(_aa_urls): try: - response = requests.get(url, proxies=_get_proxies(), timeout=3) + response = requests.get(url, proxies=get_proxies(), timeout=3) if response.status_code == 200: _current_aa_url_index = i AA_BASE_URL = url diff --git a/cwa_book_downloader/download/orchestrator.py b/cwa_book_downloader/download/orchestrator.py index 67ad4ef..ca3f224 100644 --- a/cwa_book_downloader/download/orchestrator.py +++ b/cwa_book_downloader/download/orchestrator.py @@ -35,7 +35,8 @@ from typing import Any, Dict, List, Optional, Tuple from cwa_book_downloader.release_sources import direct_download from cwa_book_downloader.release_sources.direct_download import SearchUnavailable from cwa_book_downloader.core.config import config -from cwa_book_downloader.config.env import TMP_DIR, DOWNLOAD_PATHS, INGEST_DIR +from cwa_book_downloader.config.env import TMP_DIR +from cwa_book_downloader.core.utils import get_ingest_dir from cwa_book_downloader.download.archive import is_archive, process_archive from cwa_book_downloader.release_sources import get_handler, get_source_display_name from cwa_book_downloader.core.logger import setup_logger @@ -332,8 +333,7 @@ def queue_book(book_id: str, priority: int = 0, source: str = "direct_download") bool: True if book was successfully 
queued """ try: - # Fetch book info for display purposes - book_info = direct_download.get_book_info(book_id) + book_info = direct_download.get_book_info(book_id, fetch_download_count=False) if not book_info: logger.warning(f"Could not fetch book info for {book_id}") return False @@ -618,8 +618,9 @@ def _post_process_download( """ # Route to content-type-specific ingest directory if configured content_type = task.content_type.lower() if task.content_type else None - ingest_dir = DOWNLOAD_PATHS.get(content_type, INGEST_DIR) - if content_type and ingest_dir != INGEST_DIR: + default_ingest_dir = get_ingest_dir() + ingest_dir = get_ingest_dir(content_type) + if content_type and ingest_dir != default_ingest_dir: logger.debug(f"Routing content type '{content_type}' to {ingest_dir}") os.makedirs(ingest_dir, exist_ok=True) diff --git a/cwa_book_downloader/main.py b/cwa_book_downloader/main.py index a042ba1..3739cdb 100644 --- a/cwa_book_downloader/main.py +++ b/cwa_book_downloader/main.py @@ -147,8 +147,8 @@ def get_auth_mode() -> str: try: security_config = load_config_file("security") - # 1. Check for explicit CWA auth - if security_config.get("USE_CWA_AUTH") and CWA_DB_PATH and os.path.isfile(CWA_DB_PATH): + # 1. Check for explicit CWA auth (CWA_DB_PATH is pre-validated at startup) + if security_config.get("USE_CWA_AUTH") and CWA_DB_PATH: return "cwa" # 2. Check for built-in credentials if security_config.get("BUILTIN_USERNAME") and security_config.get("BUILTIN_PASSWORD_HASH"): @@ -249,9 +249,9 @@ def login_required(f): if auth_mode == "none": return f(*args, **kwargs) - # If CWA mode and database path is invalid, return error - if auth_mode == "cwa" and CWA_DB_PATH and not os.path.isfile(CWA_DB_PATH): - logger.error(f"CWA_DB_PATH is set to {CWA_DB_PATH} but this is not a valid path") + # If CWA mode and database disappeared after startup, return error + if auth_mode == "cwa" and CWA_DB_PATH and not CWA_DB_PATH.exists(): + logger.error(f"CWA database at {CWA_DB_PATH} is no longer accessible") return jsonify({"error": "Internal Server Error"}), 500 # Check if user has a valid session @@ -932,9 +932,9 @@ def api_login() -> Union[Response, Tuple[Response, int]]: # CWA database authentication mode if auth_mode == "cwa": - # Validate CWA database path - if not os.path.isfile(CWA_DB_PATH): - logger.error(f"CWA_DB_PATH is set to {CWA_DB_PATH} but this is not a valid path") + # Verify database still exists (it was validated at startup) + if not CWA_DB_PATH or not CWA_DB_PATH.exists(): + logger.error(f"CWA database at {CWA_DB_PATH} is no longer accessible") return jsonify({"error": "Database configuration error"}), 500 try: @@ -1333,13 +1333,11 @@ def api_releases() -> Union[Response, Tuple[Response, int]]: cache_id = f"{provider}_{book_id}" book_dict['cover_url'] = transform_cover_url(book_dict['cover_url'], cache_id) - # Get search info from direct_download source (if it was searched) search_info = {} - if "direct_download" in source_instances: - dd_source = source_instances["direct_download"] - if hasattr(dd_source, 'last_search_type'): - search_info["direct_download"] = { - "search_type": dd_source.last_search_type + for source_name, source_instance in source_instances.items(): + if hasattr(source_instance, 'last_search_type') and source_instance.last_search_type: + search_info[source_name] = { + "search_type": source_instance.last_search_type } response = { diff --git a/cwa_book_downloader/metadata_providers/hardcover.py b/cwa_book_downloader/metadata_providers/hardcover.py index 2e1a6f4..8e8ddff 
100644 --- a/cwa_book_downloader/metadata_providers/hardcover.py +++ b/cwa_book_downloader/metadata_providers/hardcover.py @@ -1,6 +1,7 @@ """Hardcover.app metadata provider. Requires API key.""" import requests +from datetime import datetime from typing import Any, Dict, List, Optional from cwa_book_downloader.core.cache import cacheable @@ -186,7 +187,8 @@ class HardcoverProvider(MetadataProvider): # Build cache key from options (include fields and settings for cache differentiation) fields_key = ":".join(f"{k}={v}" for k, v in sorted(options.fields.items())) exclude_compilations = app_config.get("HARDCOVER_EXCLUDE_COMPILATIONS", False) - cache_key = f"{options.query}:{options.search_type.value}:{options.sort.value}:{options.limit}:{options.page}:{fields_key}:excl_comp={exclude_compilations}" + exclude_unreleased = app_config.get("HARDCOVER_EXCLUDE_UNRELEASED", False) + cache_key = f"{options.query}:{options.search_type.value}:{options.sort.value}:{options.limit}:{options.page}:{fields_key}:excl_comp={exclude_compilations}:excl_unrel={exclude_unreleased}" return self._search_cached(cache_key, options) @cacheable(ttl_key="METADATA_CACHE_SEARCH_TTL", ttl_default=300, key_prefix="hardcover:search") @@ -270,8 +272,10 @@ class HardcoverProvider(MetadataProvider): hits = results_obj if isinstance(results_obj, list) else [] found_count = 0 - # Parse hits, filtering compilations if enabled + # Parse hits, filtering compilations and unreleased books if enabled exclude_compilations = app_config.get("HARDCOVER_EXCLUDE_COMPILATIONS", False) + exclude_unreleased = app_config.get("HARDCOVER_EXCLUDE_UNRELEASED", False) + current_year = datetime.now().year books = [] for hit in hits: item = hit.get("document", hit) if isinstance(hit, dict) else hit @@ -279,6 +283,10 @@ class HardcoverProvider(MetadataProvider): continue if exclude_compilations and item.get("compilation"): continue + if exclude_unreleased: + release_year = item.get("release_year") + if release_year is not None and release_year > current_year: + continue book = self._parse_search_result(item) if book: books.append(book) @@ -871,4 +879,10 @@ def hardcover_settings(): description="Filter out compilations, anthologies, and omnibus editions from search results", default=False, ), + CheckboxField( + key="HARDCOVER_EXCLUDE_UNRELEASED", + label="Exclude Unreleased Books", + description="Filter out books with a release year in the future", + default=False, + ), ] diff --git a/cwa_book_downloader/release_sources/__init__.py b/cwa_book_downloader/release_sources/__init__.py index f434441..1aa52c6 100644 --- a/cwa_book_downloader/release_sources/__init__.py +++ b/cwa_book_downloader/release_sources/__init__.py @@ -117,6 +117,7 @@ class ReleaseColumnConfig: leading_cell: Optional[LeadingCellConfig] = None # Defaults to thumbnail mode if None online_servers: Optional[List[str]] = None # For IRC: list of currently online server nicks cache_ttl_seconds: Optional[int] = None # How long to cache results (default: 5 min) + supported_filters: Optional[List[str]] = None # Which filters this source supports: ["format", "language"] def serialize_column_config(config: ReleaseColumnConfig) -> Dict[str, Any]: @@ -162,6 +163,10 @@ def serialize_column_config(config: ReleaseColumnConfig) -> Dict[str, Any]: if config.cache_ttl_seconds is not None: result["cache_ttl_seconds"] = config.cache_ttl_seconds + # Include supported filters (sources declare which filters they support) + if config.supported_filters is not None: + result["supported_filters"] = 
config.supported_filters + return result @@ -198,7 +203,8 @@ def _default_column_config() -> ReleaseColumnConfig: hide_mobile=False, # Size shown on mobile ), ], - grid_template="minmax(0,2fr) 60px 80px 80px" + grid_template="minmax(0,2fr) 60px 80px 80px", + supported_filters=["format", "language"], # Default: both filters available ) diff --git a/cwa_book_downloader/release_sources/direct_download.py b/cwa_book_downloader/release_sources/direct_download.py index 3c33135..acdb493 100644 --- a/cwa_book_downloader/release_sources/direct_download.py +++ b/cwa_book_downloader/release_sources/direct_download.py @@ -14,8 +14,9 @@ from bs4 import BeautifulSoup, NavigableString, Tag from cwa_book_downloader.download import http as downloader from cwa_book_downloader.download import network -from cwa_book_downloader.config.env import DEBUG_SKIP_SOURCES, DOWNLOAD_PATHS, TMP_DIR +from cwa_book_downloader.config.env import DEBUG_SKIP_SOURCES, TMP_DIR from cwa_book_downloader.core.config import config +from cwa_book_downloader.core.utils import CONTENT_TYPES from cwa_book_downloader.core.logger import setup_logger from cwa_book_downloader.core.models import BookInfo, SearchFilters, DownloadTask from cwa_book_downloader.metadata_providers import BookMetadata @@ -202,11 +203,13 @@ def search_books(query: str, filters: SearchFilters) -> List[BookInfo]: return books -def get_book_info(book_id: str) -> BookInfo: +def get_book_info(book_id: str, fetch_download_count: bool = True) -> BookInfo: """Get detailed information for a specific book. Args: book_id: Book identifier (MD5 hash) + fetch_download_count: Whether to fetch download count from summary API. + Only needed for display in DetailsModal, not for downloads. Returns: BookInfo: Detailed book information including download URLs @@ -220,7 +223,7 @@ def get_book_info(book_id: str) -> BookInfo: soup = BeautifulSoup(html, "html.parser") - return _parse_book_info_page(soup, book_id) + return _parse_book_info_page(soup, book_id, fetch_download_count) def _parse_search_result_row(row: Tag) -> Optional[BookInfo]: @@ -249,7 +252,7 @@ def _parse_search_result_row(row: Tag) -> Optional[BookInfo]: return None -def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: +def _parse_book_info_page(soup: BeautifulSoup, book_id: str, fetch_download_count: bool = True) -> BookInfo: """Parse the book info page HTML into a BookInfo object.""" data = soup.select_one("body > main > div:nth-of-type(1)") @@ -332,7 +335,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: # Preserve original case but uppercase the unit (e.g., "5.2 mb" -> "5.2 MB") size = re.sub(r'(kb|mb|gb|tb)', lambda m: m.group(1).upper(), f.strip(), flags=re.IGNORECASE) if content == "": - for ct in DOWNLOAD_PATHS.keys(): + for ct in CONTENT_TYPES: if ct in f.strip().lower(): content = ct break @@ -366,16 +369,16 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: # Extract additional metadata info = _extract_book_metadata(original_divs[-6]) - # Fetch download count from the summary API (loaded async on the page) - try: - summary_url = f"{network.get_aa_base_url()}/dyn/md5/summary/{book_id}" - summary_response = downloader.html_get_page(summary_url, selector=network.AAMirrorSelector()) - if summary_response: - summary_data = json.loads(summary_response) - if "downloads_total" in summary_data: - info["Downloads"] = [str(summary_data["downloads_total"])] - except Exception as e: - logger.debug(f"Failed to fetch download count for {book_id}: 
{e}") + if fetch_download_count: + try: + summary_url = f"{network.get_aa_base_url()}/dyn/md5/summary/{book_id}" + summary_response = downloader.html_get_page(summary_url, selector=network.AAMirrorSelector()) + if summary_response: + summary_data = json.loads(summary_response) + if "downloads_total" in summary_data: + info["Downloads"] = [str(summary_data["downloads_total"])] + except Exception as e: + logger.debug(f"Failed to fetch download count for {book_id}: {e}") book_info.info = info @@ -539,7 +542,7 @@ def _fetch_aa_page_urls(book_info: BookInfo, urls_by_source: Dict[str, List[str] # Otherwise fetch the page fresh try: - fresh_book_info = get_book_info(book_info.id) + fresh_book_info = get_book_info(book_info.id, fetch_download_count=False) for url in fresh_book_info.download_urls: source_type = _url_source_types.get(url) if source_type: @@ -869,7 +872,7 @@ def _extract_slow_download_url( sleep_time = min(raw_countdown, MAX_COUNTDOWN_SECONDS) if raw_countdown > MAX_COUNTDOWN_SECONDS: logger.warning(f"Countdown {raw_countdown}s exceeds max, capping at {MAX_COUNTDOWN_SECONDS}s") - logger.info(f"Waiting {sleep_time}s for {title}") + logger.info(f"AA waitlist: {sleep_time}s for {title}") # Live countdown with status updates remaining = sleep_time @@ -991,7 +994,8 @@ class DirectDownloadSource(ReleaseSource): hide_mobile=False, # Size shown on mobile ), ], - grid_template="minmax(0,2fr) 60px 80px 80px" + grid_template="minmax(0,2fr) 60px 80px 80px", + supported_filters=["format", "language"], # AA has reliable language metadata ) def search( diff --git a/cwa_book_downloader/release_sources/irc/source.py b/cwa_book_downloader/release_sources/irc/source.py index 4a186f9..42369b8 100644 --- a/cwa_book_downloader/release_sources/irc/source.py +++ b/cwa_book_downloader/release_sources/irc/source.py @@ -109,6 +109,7 @@ class IRCReleaseSource(ReleaseSource): leading_cell=LeadingCellConfig(type=LeadingCellType.NONE), online_servers=list(self._online_servers) if self._online_servers else None, cache_ttl_seconds=1800, # 30 minutes - IRC searches are slow, cache longer + supported_filters=["format"], # IRC has no language metadata ) def search( @@ -211,9 +212,37 @@ class IRCReleaseSource(ReleaseSource): return ' '.join(parts) + # Format priority for sorting (lower = higher priority) + FORMAT_PRIORITY = { + 'epub': 0, + 'mobi': 1, + 'azw3': 2, + 'azw': 3, + 'fb2': 4, + 'djvu': 5, + 'pdf': 6, + 'cbr': 7, + 'cbz': 8, + 'doc': 9, + 'docx': 10, + 'rtf': 11, + 'txt': 12, + 'html': 13, + 'htm': 14, + 'rar': 15, + 'zip': 16, + } + def _convert_to_releases(self, results: List[SearchResult]) -> List[Release]: - """Convert parsed results to Release objects.""" + """Convert parsed results to Release objects. + + Results are sorted by: + 1. Online status (online servers first) + 2. Format priority (epub > mobi > azw3 > ...) + 3. 
Server name (alphabetically) + """ releases = [] + online_servers = self._online_servers or set() for result in results: release = Release( @@ -233,6 +262,20 @@ class IRCReleaseSource(ReleaseSource): ) releases.append(release) + # Tiered sort: online first, then by format priority, then by server name + def sort_key(release: Release) -> tuple: + server = release.extra.get("server", "") + is_online = server in online_servers + fmt = release.format.lower() if release.format else "" + format_priority = self.FORMAT_PRIORITY.get(fmt, 99) + return ( + 0 if is_online else 1, # Online first + format_priority, # Then by format + server.lower(), # Then alphabetically by server + ) + + releases.sort(key=sort_key) + return releases @staticmethod diff --git a/cwa_book_downloader/release_sources/prowlarr/clients/qbittorrent.py b/cwa_book_downloader/release_sources/prowlarr/clients/qbittorrent.py index eb32f3c..44d6742 100644 --- a/cwa_book_downloader/release_sources/prowlarr/clients/qbittorrent.py +++ b/cwa_book_downloader/release_sources/prowlarr/clients/qbittorrent.py @@ -55,8 +55,8 @@ class QBittorrentClient(DownloadClient): """Test connection to qBittorrent.""" try: self._client.auth_log_in() - version = self._client.app.version - return True, f"Connected to qBittorrent {version}" + api_version = self._client.app.web_api_version + return True, f"Connected to qBittorrent (API v{api_version})" except Exception as e: return False, f"Connection failed: {str(e)}" diff --git a/cwa_book_downloader/release_sources/prowlarr/settings.py b/cwa_book_downloader/release_sources/prowlarr/settings.py index 0f68f67..3eed864 100644 --- a/cwa_book_downloader/release_sources/prowlarr/settings.py +++ b/cwa_book_downloader/release_sources/prowlarr/settings.py @@ -115,8 +115,8 @@ def _test_qbittorrent_connection(current_values: Dict[str, Any] = None) -> Dict[ client = Client(host=url, username=username, password=password) client.auth_log_in() - version = client.app.version - return {"success": True, "message": f"Connected to qBittorrent {version}"} + api_version = client.app.web_api_version + return {"success": True, "message": f"Connected to qBittorrent (API v{api_version})"} except ImportError: return {"success": False, "message": "qbittorrent-api package not installed"} except Exception as e: diff --git a/cwa_book_downloader/release_sources/prowlarr/source.py b/cwa_book_downloader/release_sources/prowlarr/source.py index 01766b3..ed657fa 100644 --- a/cwa_book_downloader/release_sources/prowlarr/source.py +++ b/cwa_book_downloader/release_sources/prowlarr/source.py @@ -197,6 +197,10 @@ class ProwlarrSource(ReleaseSource): name = "prowlarr" display_name = "Prowlarr" + def __init__(self): + self.last_search_type: Optional[str] = None + self._category_filtered_indexers: List[int] = [] + @classmethod def get_column_config(cls) -> ReleaseColumnConfig: """Column configuration for Prowlarr releases.""" @@ -250,6 +254,7 @@ class ProwlarrSource(ReleaseSource): ], grid_template="minmax(0,2fr) minmax(80px,1fr) 60px 70px 70px 80px", leading_cell=LeadingCellConfig(type=LeadingCellType.NONE), # No leading cell for Prowlarr + supported_filters=["format"], # Prowlarr has unreliable language metadata ) def _get_client(self) -> Optional[ProwlarrClient]: @@ -300,7 +305,7 @@ class ProwlarrSource(ReleaseSource): Args: book: Book metadata to search for - expand_search: Ignored - Prowlarr always uses title+author search + expand_search: If True, skip category filtering (broader search) languages: Ignored - Prowlarr doesn't support language 
filtering Returns: @@ -340,31 +345,46 @@ class ProwlarrSource(ReleaseSource): logger.warning("No indexers selected - configure indexers in Prowlarr settings") return [] - # Book categories: 7000 (Books parent), 7020 (EBook), 7030 (Comics), etc. - # We search the parent category which includes all subcategories - book_categories = [7000] + if expand_search: + if not self._category_filtered_indexers: + logger.debug("No category-filtered indexers to expand") + return [] + indexers_to_search = self._category_filtered_indexers + categories = None + self.last_search_type = "expanded" + else: + indexers_to_search = indexer_ids + categories = [7000] + self._category_filtered_indexers = [] + self.last_search_type = "categories" - logger.debug(f"Searching Prowlarr: query='{query}', indexers={indexer_ids}") + logger.debug(f"Searching Prowlarr: query='{query}', indexers={indexers_to_search}, categories={categories}") all_results = [] try: - # Make separate API call for each indexer - for indexer_id in indexer_ids: + for indexer_id in indexers_to_search: try: - raw_results = client.search(query=query, indexer_ids=[indexer_id], categories=book_categories) + raw_results = client.search(query=query, indexer_ids=[indexer_id], categories=categories) + + if raw_results and categories: + self._category_filtered_indexers.append(indexer_id) + elif not raw_results and categories: + logger.debug(f"Indexer {indexer_id}: retrying without category filter") + raw_results = client.search(query=query, indexer_ids=[indexer_id], categories=None) + if raw_results: all_results.extend(raw_results) except Exception as e: logger.warning(f"Search failed for indexer {indexer_id}: {e}") - continue + + if not expand_search and not self._category_filtered_indexers: + self.last_search_type = "expanded" results = [_prowlarr_result_to_release(r) for r in all_results] - # Log consolidated summary if results: torrent_count = sum(1 for r in results if r.protocol == "torrent") nzb_count = sum(1 for r in results if r.protocol == "nzb") - # Get unique indexer names indexers = sorted(set(r.indexer for r in results if r.indexer)) indexer_str = ", ".join(indexers) if indexers else "unknown" logger.info(f"Prowlarr: {len(results)} results ({torrent_count} torrent, {nzb_count} nzb) from {indexer_str}") diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index bd74a41..3d120d6 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -15,9 +15,6 @@ services: - ./.local/ingest:/cwa-book-ingest - ./.local/log:/var/log/cwa-book-downloader - ./.local/tmp:/tmp/cwa-book-downloader - # Mount source code for development (no rebuild needed for Python code changes) - ./cwa_book_downloader:/app/cwa_book_downloader:ro - # Download client volume - required for Prowlarr/torrent/usenet integration - # IMPORTANT: Both sides of this mount must match your download client's volume mount exactly. 
- # Example: if qBittorrent has "/mnt/storage/downloads:/data/torrents", use the same here: - # - /mnt/storage/downloads:/data/torrents + # Download client mount (must match your torrent/usenet client's volume) + # - /path/to/downloads:/downloads diff --git a/docker-compose.extbp.dev.yml b/docker-compose.extbp.dev.yml index 9c943bd..a2cad3c 100644 --- a/docker-compose.extbp.dev.yml +++ b/docker-compose.extbp.dev.yml @@ -18,10 +18,8 @@ services: - ./.local/ingest:/cwa-book-ingest - ./.local/log:/var/log/cwa-book-downloader - ./.local/tmp:/tmp/cwa-book-downloader - # Download client volume - required for Prowlarr/torrent/usenet integration - # IMPORTANT: Both sides of this mount must match your download client's volume mount exactly. - # Example: if qBittorrent has "/mnt/storage/downloads:/data/torrents", use the same here: - # - /mnt/storage/downloads:/data/torrents + # Download client mount (must match your torrent/usenet client's volume) + # - /path/to/downloads:/downloads flaresolverr: image: ghcr.io/flaresolverr/flaresolverr:latest diff --git a/docker-compose.extbp.yml b/docker-compose.extbp.yml index f7a66a5..7da38d7 100644 --- a/docker-compose.extbp.yml +++ b/docker-compose.extbp.yml @@ -3,22 +3,18 @@ services: calibre-web-automated-book-downloader-extbp: image: ghcr.io/calibrain/calibre-web-automated-book-downloader-extbp:latest environment: - TZ: America/New_York + # TZ: America/New_York EXT_BYPASSER_URL: http://flaresolverr:8191 # PUID: 1000 # PGID: 1000 - # CWA_DB_PATH: /auth/app.db ports: - 8084:8084 restart: unless-stopped volumes: - - /tmp/data/calibre-web/ingest:/cwa-book-ingest - - /path/to/config:/config - # - /cwa/config/path/app.db:/auth/app.db:ro - # Download client volume - required for Prowlarr/torrent/usenet integration - # IMPORTANT: Both sides of this mount must match your download client's volume mount exactly. - # Example: if qBittorrent has "/mnt/storage/downloads:/data/torrents", use the same here: - # - /mnt/storage/downloads:/data/torrents + - /path/to/ingest:/cwa-book-ingest # Book ingest directory + - /path/to/config:/config # App configuration + # Download client mount (must match your torrent/usenet client's volume) + # - /path/to/downloads:/downloads flaresolverr: image: ghcr.io/flaresolverr/flaresolverr:latest diff --git a/docker-compose.tor.dev.yml b/docker-compose.tor.dev.yml index a495b96..02b91a6 100644 --- a/docker-compose.tor.dev.yml +++ b/docker-compose.tor.dev.yml @@ -15,7 +15,5 @@ services: - ./.local/ingest:/cwa-book-ingest - ./.local/log:/var/log/cwa-book-downloader - ./.local/tmp:/tmp/cwa-book-downloader - # Download client volume - required for Prowlarr/torrent/usenet integration - # IMPORTANT: Both sides of this mount must match your download client's volume mount exactly. 
- # Example: if qBittorrent has "/mnt/storage/downloads:/data/torrents", use the same here: - # - /mnt/storage/downloads:/data/torrents + # Download client mount (must match your torrent/usenet client's volume) + # - /path/to/downloads:/downloads diff --git a/docker-compose.tor.yml b/docker-compose.tor.yml index a4d91c1..1b8871c 100644 --- a/docker-compose.tor.yml +++ b/docker-compose.tor.yml @@ -4,11 +4,10 @@ services: image: ghcr.io/calibrain/calibre-web-automated-book-downloader-tor:latest environment: FLASK_PORT: 8084 - TZ: America/New_York + # TZ: America/New_York USING_TOR: true # PUID: 1000 # PGID: 1000 - # CWA_DB_PATH: /auth/app.db cap_add: - NET_ADMIN - NET_RAW @@ -16,10 +15,7 @@ services: - 8084:8084 restart: unless-stopped volumes: - - /tmp/data/calibre-web/ingest:/cwa-book-ingest - - /path/to/config:/config - # - /cwa/config/path/app.db:/auth/app.db:ro - # Download client volume - required for Prowlarr/torrent/usenet integration - # IMPORTANT: Both sides of this mount must match your download client's volume mount exactly. - # Example: if qBittorrent has "/mnt/storage/downloads:/data/torrents", use the same here: - # - /mnt/storage/downloads:/data/torrents + - /path/to/ingest:/cwa-book-ingest # Book ingest directory + - /path/to/config:/config # App configuration + # Download client mount (must match your torrent/usenet client's volume) + # - /path/to/downloads:/downloads diff --git a/docker-compose.yml b/docker-compose.yml index 17a7432..fee005b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,17 +3,14 @@ services: image: ghcr.io/calibrain/calibre-web-automated-book-downloader:latest container_name: calibre-web-automated-book-downloader environment: - TZ: America/New_York + # TZ: America/New_York # PUID: 1000 # PGID: 1000 - # CWA_DB_PATH: /auth/app.db ports: - 8084:8084 restart: unless-stopped volumes: - - /tmp/data/calibre-web/ingest:/cwa-book-ingest # This is where the books will be downloaded and ingested by your book management application - - /path/to/config:/config # Configuration files and database - # Download client volume - required for Prowlarr/torrent/usenet integration - # IMPORTANT: Both sides of this mount must match your download client's volume mount exactly. 
- # Example: if qBittorrent has "/mnt/storage/downloads:/data/torrents", use the same here: - # - /mnt/storage/downloads:/data/torrents + - /path/to/ingest:/cwa-book-ingest # Book ingest directory + - /path/to/config:/config # App configuration + # Download client mount (must match your torrent/usenet client's volume) + # - /path/to/downloads:/downloads diff --git a/src/frontend/src/components/ReleaseModal.tsx b/src/frontend/src/components/ReleaseModal.tsx index f7eef2d..98d400f 100644 --- a/src/frontend/src/components/ReleaseModal.tsx +++ b/src/frontend/src/components/ReleaseModal.tsx @@ -9,7 +9,7 @@ import { ReleaseCell } from './ReleaseCell'; import { getColorStyleFromHint } from '../utils/colorMaps'; import { getNestedValue } from '../utils/objectHelpers'; import { LanguageMultiSelect } from './LanguageMultiSelect'; -import { LANGUAGE_OPTION_ALL, LANGUAGE_OPTION_DEFAULT, getLanguageFilterValues } from '../utils/languageFilters'; +import { LANGUAGE_OPTION_ALL, LANGUAGE_OPTION_DEFAULT, getLanguageFilterValues, releaseLanguageMatchesFilter } from '../utils/languageFilters'; // Module-level cache for release search results // Key format: `${provider}:${provider_id}:${source}` @@ -92,6 +92,7 @@ const DEFAULT_COLUMN_CONFIG: ReleaseColumnConfig = { }, ], grid_template: 'minmax(0,2fr) 60px 80px 80px', + supported_filters: ['format', 'language'], // Default: both filters available }; interface ReleaseModalProps { @@ -916,32 +917,27 @@ export const ReleaseModal = ({ const supportedLower = supportedFormats.map((f) => f.toLowerCase()); return releases.filter((r) => { - // Format filtering: always filter by supported formats - if (r.format) { - const fmt = r.format.toLowerCase(); - // If user selected a specific format, filter to that - if (formatFilter) { - if (fmt !== formatFilter.toLowerCase()) return false; - } else { - // Otherwise, only show supported formats - if (!supportedLower.includes(fmt)) return false; - } - } + // Format filtering + const fmt = r.format?.toLowerCase(); - // Language filtering using resolved language codes - // null or includes 'all' means show all languages - // Otherwise filter to the specific language codes + if (formatFilter) { + // User selected a specific format - must match exactly + if (!fmt || fmt !== formatFilter.toLowerCase()) return false; + } else if (fmt) { + // No specific filter - show only supported formats + if (!supportedLower.includes(fmt)) return false; + } + // Releases with no format pass through when no filter is set (show all) + + // Language filtering const releaseLang = r.extra?.language as string | undefined; - if (releaseLang && resolvedLanguageCodes && !resolvedLanguageCodes.includes(LANGUAGE_OPTION_ALL)) { - const releaseLangLower = releaseLang.toLowerCase(); - if (!resolvedLanguageCodes.some(code => code.toLowerCase() === releaseLangLower)) { - return false; - } + if (!releaseLanguageMatchesFilter(releaseLang, resolvedLanguageCodes ?? 
defaultLanguages)) { + return false; } return true; }); - }, [releasesBySource, activeTab, formatFilter, resolvedLanguageCodes, supportedFormats]); + }, [releasesBySource, activeTab, formatFilter, resolvedLanguageCodes, supportedFormats, defaultLanguages]); // Get column config from response or use default const columnConfig = useMemo((): ReleaseColumnConfig => { @@ -1254,7 +1250,9 @@ export const ReleaseModal = ({ {/* Filter funnel button - stays fixed */} - {(availableFormats.length > 0 || bookLanguages.length > 0) && ( + {/* Only show filter button if source supports at least one filter type */} + {((columnConfig.supported_filters?.includes('format') && availableFormats.length > 0) || + (columnConfig.supported_filters?.includes('language') && bookLanguages.length > 0)) && ( {({ close }) => (
- {availableFormats.length > 0 && ( + {columnConfig.supported_filters?.includes('format') && availableFormats.length > 0 && ( )} - + {columnConfig.supported_filters?.includes('language') && ( + + )} {/* Apply button - for AA, re-fetches with language filter; for others, just closes */} {activeTab === 'direct_download' && (
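
Note on the ingest-routing refactor: DOWNLOAD_PATHS was built once from
environment variables at import time in config/env.py; get_ingest_dir() now
resolves the directory per call through the config singleton, so per-type
overrides are picked up at call time rather than frozen at import. A minimal
standalone sketch of the lookup behaviour (the _CONFIG dict below is an
illustrative stand-in for the real config singleton, not part of this diff):

    from pathlib import Path

    # Illustrative stand-in for cwa_book_downloader.core.config.config
    _CONFIG = {
        "INGEST_DIR": "/cwa-book-ingest",
        "INGEST_DIR_COMIC_BOOK": "/cwa-book-ingest/comics",  # only this type customized
    }

    # Subset of the mapping defined in core/utils.py
    _CONTENT_TYPE_TO_CONFIG_KEY = {
        "comic book": "INGEST_DIR_COMIC_BOOK",
        "magazine": "INGEST_DIR_MAGAZINE",
    }

    def get_ingest_dir(content_type=None):
        """Resolve the ingest directory for a content type, else the default."""
        default = Path(_CONFIG.get("INGEST_DIR", "/cwa-book-ingest"))
        if not content_type:
            return default
        # Normalize, then map the content type to its config key
        key = _CONTENT_TYPE_TO_CONFIG_KEY.get(content_type.lower().strip())
        custom = _CONFIG.get(key, "") if key else ""
        # Empty string (or unknown type) falls back to the default directory
        return Path(custom) if custom else default

    assert get_ingest_dir("Comic Book") == Path("/cwa-book-ingest/comics")
    assert get_ingest_dir("magazine") == Path("/cwa-book-ingest")  # key unset -> default
    assert get_ingest_dir() == Path("/cwa-book-ingest")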