From b10458a48bbe339840f228dc65b59ecdf60c829c Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 2 Feb 2026 20:32:19 +0000 Subject: [PATCH] Patch: Migrate bypasser to pure CDP + Misc fixes (#575) Bypasser: - Refactored internal bypasser logic to use SeleniumBase Pure CDP mode, removed chromedriver dependencies and UC code. - Added dedicated threading for internal bypasser functions, fixes any potential asyncio CPU spike behavior - Fixed WebGL issue with Chromium 144. Reverted 1.0.3 hotfix and updated to latest Chromium Misc: - Added M4A color mapping - Fix frontend language filtering with multi-language releases - Added "days" age for usenet/torrent releases - Improved entrypoint chown efficiency - Added `ONBOARDING` env variable, default true --- Dockerfile | 11 +- docs/environment-variables.md | 28 +- entrypoint.sh | 30 +- scripts/generate_env_docs.py | 6 + shelfmark/bypass/internal_bypasser.py | 801 ++++++++----------- shelfmark/config/env.py | 8 + shelfmark/config/settings.py | 2 +- shelfmark/core/onboarding.py | 6 + shelfmark/release_sources/direct_download.py | 1 + src/frontend/src/components/ReleaseCell.tsx | 12 +- src/frontend/src/components/ReleaseModal.tsx | 11 +- src/frontend/src/utils/colorMaps.ts | 1 + src/frontend/src/utils/languageFilters.ts | 34 +- tests/bypass/test_internal_bypasser.py | 67 ++ 14 files changed, 518 insertions(+), 500 deletions(-) diff --git a/Dockerfile b/Dockerfile index b8bbdac..00bf0d6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -130,11 +130,11 @@ RUN apt-get update && \ xvfb \ # For screen recording ffmpeg \ - # --- Chromium --- - chromium=143.0.7499.169-1~deb13u1 \ - chromium-common=143.0.7499.169-1~deb13u1 \ - # --- ChromeDriver --- - chromium-driver=143.0.7499.169-1~deb13u1 \ + # --- Chromium (unpinned - uses latest from Debian repos) --- + # Chrome 144+ requires --enable-unsafe-swiftshader for WebGL in Docker. + # This flag is set in internal_bypasser.py _get_browser_args() + chromium \ + chromium-common \ # For tkinter (pyautogui) python3-tk \ # For RAR extraction @@ -152,7 +152,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # Grant read/execute permissions to others RUN chmod -R o+rx /usr/bin/chromium && \ - chmod -R o+rx /usr/bin/chromedriver && \ chmod -R o+rwx /usr/local/lib/python3.10/site-packages/seleniumbase/drivers/ # Default command to run the application entrypoint script diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 8dee363..31a0d13 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -36,12 +36,13 @@ These environment variables are used at startup before the settings system loads | `CONFIG_DIR` | Directory for storing configuration files and plugin settings. | string (path) | `/config` | | `LOG_ROOT` | Root directory for log files. | string (path) | `/var/log/` | | `TMP_DIR` | Staging directory for downloads before moving to destination. | string (path) | `/tmp/shelfmark` | -| `ENABLE_LOGGING` | Enable file logging to LOG_ROOT/shelfmark/shelfmark.log. | boolean | `true` | +| `ENABLE_LOGGING` | Enable file logging under LOG_ROOT/shelfmark/ (including shelfmark.log and startup logs). | boolean | `true` | | `FLASK_HOST` | Host address for the Flask web server. | string | `0.0.0.0` | | `FLASK_PORT` | Port number for the Flask web server. | number | `8084` | | `SESSION_COOKIE_SECURE` | Enable secure cookies (requires HTTPS). | boolean | `false` | | `CWA_DB_PATH` | Path to the Calibre-Web database for authentication integration. | string (path) | `/auth/app.db` | | `DOCKERMODE` | Indicates the application is running inside a Docker container. | boolean | `false` | +| `ONBOARDING` | Show the onboarding wizard on first run. Set to false to skip (useful for ephemeral storage). | boolean | `true` |
Detailed descriptions @@ -69,7 +70,7 @@ Staging directory for downloads before moving to destination. #### `ENABLE_LOGGING` -Enable file logging to LOG_ROOT/shelfmark/shelfmark.log. +Enable file logging under LOG_ROOT/shelfmark/ (including shelfmark.log and startup logs). - **Type:** boolean - **Default:** `true` @@ -109,6 +110,13 @@ Indicates the application is running inside a Docker container. - **Type:** boolean - **Default:** `false` +#### `ONBOARDING` + +Show the onboarding wizard on first run. Set to false to skip (useful for ephemeral storage). + +- **Type:** boolean +- **Default:** `true` +
## General @@ -243,8 +251,8 @@ The release source tab to open by default in the release modal. | `BOOKS_OUTPUT_MODE` | Choose where completed book files are sent. | string (choice) | `folder` | | `INGEST_DIR` | Directory where downloaded files are saved. | string | `/books` | | `FILE_ORGANIZATION` | Choose how downloaded book files are named and organized. | string (choice) | `rename` | -| `TEMPLATE_RENAME` | Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition}, {Subtitle}. Rename templates are filename-only (no '/' or '\'); use Organize for folders. | string | `{Author} - {Title} ({Year})` | -| `TEMPLATE_ORGANIZE` | Use / to create folders. Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition}, {Subtitle} | string | `{Author}/{Title} ({Year})` | +| `TEMPLATE_RENAME` | Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition}, {Subtitle}. Use arbitrary prefix/suffix: {Vol. SeriesPosition - } outputs 'Vol. 2 - ' when set, nothing when empty. Rename templates are filename-only (no '/' or '\'); use Organize for folders. | string | `{Author} - {Title} ({Year})` | +| `TEMPLATE_ORGANIZE` | Use / to create folders. Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition}, {Subtitle}. Use arbitrary prefix/suffix: {Vol. SeriesPosition - } outputs 'Vol. 2 - ' when set, nothing when empty. | string | `{Author}/{Title} ({Year})` | | `HARDLINK_TORRENTS` | Create hardlinks instead of copying. Preserves seeding but archives won't be extracted. Don't use if destination is a library ingest folder. | boolean | `false` | | `BOOKLORE_HOST` | Base URL of your Booklore instance | string | _none_ | | `BOOKLORE_USERNAME` | Booklore account username | string | _none_ | @@ -253,8 +261,8 @@ The release source tab to open by default in the release modal. | `BOOKLORE_PATH_ID` | Booklore library path for uploads. | string (choice) | _none_ | | `DESTINATION_AUDIOBOOK` | Leave empty to use Books destination. | string | _none_ | | `FILE_ORGANIZATION_AUDIOBOOK` | Choose how downloaded audiobook files are named and organized. | string (choice) | `rename` | -| `TEMPLATE_AUDIOBOOK_RENAME` | Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {PartNumber}. Rename templates are filename-only (no '/' or '\'); use Organize for folders. | string | `{Author} - {Title}` | -| `TEMPLATE_AUDIOBOOK_ORGANIZE` | Use / to create folders. Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {PartNumber} | string | `{Author}/{Title}` | +| `TEMPLATE_AUDIOBOOK_RENAME` | Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {PartNumber}. Use arbitrary prefix/suffix: {Vol. SeriesPosition - } outputs 'Vol. 2 - ' when set, nothing when empty. Rename templates are filename-only (no '/' or '\'); use Organize for folders. | string | `{Author} - {Title}` | +| `TEMPLATE_AUDIOBOOK_ORGANIZE` | Use / to create folders. Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {PartNumber}. Use arbitrary prefix/suffix: {Vol. SeriesPosition - } outputs 'Vol. 2 - ' when set, nothing when empty. | string | `{Author}/{Title}` | | `HARDLINK_TORRENTS_AUDIOBOOK` | Create hardlinks instead of copying. Preserves seeding but archives won't be extracted. Don't use if destination is a library ingest folder. | boolean | `true` | | `AUTO_OPEN_DOWNLOADS_SIDEBAR` | Automatically open the downloads sidebar when a new download is queued. | boolean | `false` | | `DOWNLOAD_TO_BROWSER` | Automatically download completed files to your browser. | boolean | `false` | @@ -298,7 +306,7 @@ Choose how downloaded book files are named and organized. **Naming Template** -Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition}, {Subtitle}. Rename templates are filename-only (no '/' or '\'); use Organize for folders. +Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition}, {Subtitle}. Use arbitrary prefix/suffix: {Vol. SeriesPosition - } outputs 'Vol. 2 - ' when set, nothing when empty. Rename templates are filename-only (no '/' or '\'); use Organize for folders. - **Type:** string - **Default:** `{Author} - {Title} ({Year})` @@ -307,7 +315,7 @@ Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition} **Path Template** -Use / to create folders. Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition}, {Subtitle} +Use / to create folders. Variables: {Author}, {Title}, {Year}. Universal adds: {Series}, {SeriesPosition}, {Subtitle}. Use arbitrary prefix/suffix: {Vol. SeriesPosition - } outputs 'Vol. 2 - ' when set, nothing when empty. - **Type:** string - **Default:** `{Author}/{Title} ({Year})` @@ -394,7 +402,7 @@ Choose how downloaded audiobook files are named and organized. **Naming Template** -Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {PartNumber}. Rename templates are filename-only (no '/' or '\'); use Organize for folders. +Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {PartNumber}. Use arbitrary prefix/suffix: {Vol. SeriesPosition - } outputs 'Vol. 2 - ' when set, nothing when empty. Rename templates are filename-only (no '/' or '\'); use Organize for folders. - **Type:** string - **Default:** `{Author} - {Title}` @@ -403,7 +411,7 @@ Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {P **Path Template** -Use / to create folders. Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {PartNumber} +Use / to create folders. Variables: {Author}, {Title}, {Year}, {Series}, {SeriesPosition}, {Subtitle}, {PartNumber}. Use arbitrary prefix/suffix: {Vol. SeriesPosition - } outputs 'Vol. 2 - ' when set, nothing when empty. - **Type:** string - **Default:** `{Author}/{Title}` diff --git a/entrypoint.sh b/entrypoint.sh index 7a2fa03..d722f20 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -148,6 +148,7 @@ test_write() { make_writable() { folder=$1 + did_full_chown=0 set +e test_write $folder is_writable=$? @@ -158,31 +159,36 @@ make_writable() { echo "Folder $folder is not writable, changing ownership" change_ownership $folder chmod -R g+r,g+w $folder || echo "Failed to change group permissions for ${folder}, continuing..." + did_full_chown=1 fi # Fix any misowned subdirectories/files (e.g., from previous runs as root) - if [ -d "$folder" ]; then - misowned_count=$(find "$folder" -mindepth 1 \( ! -user "$RUN_UID" -o ! -group "$RUN_GID" \) 2>/dev/null | wc -l) - if [ "$misowned_count" -gt 0 ]; then - echo "Fixing ownership of $misowned_count files/directories in $folder" - find "$folder" -mindepth 1 \( ! -user "$RUN_UID" -o ! -group "$RUN_GID" \) \ - -exec chown "$RUN_UID:$RUN_GID" {} \; 2>/dev/null || true - fi + if [ "$did_full_chown" -eq 0 ] && [ -d "$folder" ]; then + echo "Checking for misowned files/directories in $folder" + find "$folder" -mindepth 1 \( ! -user "$RUN_UID" -o ! -group "$RUN_GID" \) \ + -exec chown "$RUN_UID:$RUN_GID" {} + 2>/dev/null || true fi test_write $folder || echo "Failed to test write to ${folder}, continuing..." } +fix_misowned() { + folder=$1 + mkdir -p $folder + echo "Checking for misowned files/directories in $folder" + find "$folder" \( ! -user "$RUN_UID" -o ! -group "$RUN_GID" \) \ + -exec chown "$RUN_UID:$RUN_GID" {} + 2>/dev/null || true +} + # Ensure proper ownership of application directories change_ownership() { folder=$1 mkdir -p $folder echo "Changing ownership of $folder to $USERNAME:$RUN_GID" - chown -R "${RUN_UID}" "${folder}" || echo "Failed to change user ownership for ${folder}, continuing..." - chown -R ":${RUN_GID}" "${folder}" || echo "Failed to change group ownership for ${folder}, continuing..." + chown -R "${RUN_UID}:${RUN_GID}" "${folder}" || echo "Failed to change ownership for ${folder}, continuing..." } -change_ownership /app -change_ownership /var/log/shelfmark -change_ownership /tmp/shelfmark +fix_misowned /app +fix_misowned /var/log/shelfmark +fix_misowned /tmp/shelfmark # SeleniumBase (internal bypasser) writes a patched chromedriver binary (uc_driver) # into its own drivers directory. Some NAS/docker setups can apply restrictive ACLs diff --git a/scripts/generate_env_docs.py b/scripts/generate_env_docs.py index 5094712..1f1c196 100755 --- a/scripts/generate_env_docs.py +++ b/scripts/generate_env_docs.py @@ -178,6 +178,12 @@ def _generate_bootstrap_env_docs() -> List[str]: "type": "boolean", "default": "false", }, + { + "name": "ONBOARDING", + "description": "Show the onboarding wizard on first run. Set to false to skip (useful for ephemeral storage).", + "type": "boolean", + "default": "true", + }, ] lines = [ diff --git a/shelfmark/bypass/internal_bypasser.py b/shelfmark/bypass/internal_bypasser.py index 5412360..0c5fced 100644 --- a/shelfmark/bypass/internal_bypasser.py +++ b/shelfmark/bypass/internal_bypasser.py @@ -1,5 +1,7 @@ +import asyncio import os import random +import signal import socket import subprocess import threading @@ -7,11 +9,11 @@ import time import traceback from datetime import datetime from threading import Event -from typing import Optional +from typing import Any, Optional from urllib.parse import urlparse import requests -from seleniumbase import Driver +from seleniumbase import cdp_driver from shelfmark.bypass import BypassCancelledException from shelfmark.bypass.fingerprint import get_screen_size @@ -42,11 +44,60 @@ DDOS_GUARD_INDICATORS = [ ] DISPLAY = { - "xvfb": None, "ffmpeg": None, + "ffmpeg_output": None, } LOCKED = threading.Lock() + +class _CdpWorker: + def __init__(self) -> None: + self._thread: Optional[threading.Thread] = None + self._loop: Optional[asyncio.AbstractEventLoop] = None + self._ready = threading.Event() + self._lock = threading.Lock() + + def _run(self) -> None: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + self._loop = loop + self._ready.set() + loop.run_forever() + try: + pending = asyncio.all_tasks(loop) + for task in pending: + task.cancel() + if pending: + loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True)) + except Exception: + pass + finally: + loop.close() + + def start(self) -> None: + with self._lock: + if self._thread and self._thread.is_alive(): + return + self._ready.clear() + self._thread = threading.Thread( + target=self._run, + name="cdp-worker", + daemon=True, + ) + self._thread.start() + if not self._ready.wait(timeout=10): + raise RuntimeError("CDP worker loop failed to start") + + def run(self, coro: Any, timeout: Optional[float] = None) -> Any: + self.start() + if not self._loop or self._loop.is_closed(): + raise RuntimeError("CDP worker loop not available") + future = asyncio.run_coroutine_threadsafe(coro, self._loop) + return future.result(timeout=timeout) + + +_CDP_WORKER = _CdpWorker() + # Cookie storage - shared with requests library for Cloudflare bypass # Structure: {domain: {cookie_name: {value, expiry, ...}}} _cf_cookies: dict[str, dict] = {} @@ -77,8 +128,8 @@ def _should_extract_cookie(name: str, extract_all: bool) -> bool: return is_cf or is_ddg -def _extract_cookies_from_driver(driver, url: str) -> None: - """Extract cookies from Chrome after successful bypass.""" +async def _extract_cookies_from_cdp(driver, page, url: str) -> None: + """Extract cookies from a CDP browser after successful bypass.""" try: parsed = urlparse(url) domain = parsed.hostname or "" @@ -88,24 +139,34 @@ def _extract_cookies_from_driver(driver, url: str) -> None: base_domain = _get_base_domain(domain) extract_all = base_domain in FULL_COOKIE_DOMAINS + try: + all_cookies = await driver.cookies.get_all(requests_cookie_format=True) + except Exception as e: + logger.debug(f"Failed to get cookies via CDP: {e}") + return + cookies_found = {} - for cookie in driver.get_cookies(): - name = cookie.get('name', '') - if _should_extract_cookie(name, extract_all): - cookies_found[name] = { - 'value': cookie.get('value', ''), - 'domain': cookie.get('domain', domain), - 'path': cookie.get('path', '/'), - 'expiry': cookie.get('expiry'), - 'secure': cookie.get('secure', True), - 'httpOnly': cookie.get('httpOnly', True), - } + for cookie in all_cookies: + name = getattr(cookie, "name", "") or "" + if not _should_extract_cookie(name, extract_all): + continue + expires = getattr(cookie, "expires", None) + if expires is not None and expires <= 0: + expires = None + cookies_found[name] = { + "value": getattr(cookie, "value", ""), + "domain": getattr(cookie, "domain", None) or domain, + "path": getattr(cookie, "path", None) or "/", + "expiry": expires, + "secure": bool(getattr(cookie, "secure", True)), + "httpOnly": True, + } if not cookies_found: return try: - user_agent = driver.execute_script("return navigator.userAgent") + user_agent = await page.evaluate("navigator.userAgent") except Exception: user_agent = None @@ -113,7 +174,7 @@ def _extract_cookies_from_driver(driver, url: str) -> None: _cf_cookies[base_domain] = cookies_found if user_agent: _cf_user_agents[base_domain] = user_agent - logger.debug(f"Stored UA for {base_domain}: {user_agent[:60]}...") + logger.debug(f"Stored UA for {base_domain}: {str(user_agent)[:60]}...") else: logger.debug(f"No UA captured for {base_domain}") @@ -139,7 +200,9 @@ def get_cf_cookies_for_domain(domain: str) -> dict[str, str]: cf_clearance = cookies.get('cf_clearance', {}) if cf_clearance: expiry = cf_clearance.get('expiry') - if expiry and time.time() > expiry: + if expiry is None: + expiry = cf_clearance.get('expires') + if expiry and expiry > 0 and time.time() > expiry: logger.debug(f"CF cookies expired for {base_domain}") _cf_cookies.pop(base_domain, None) return {} @@ -172,21 +235,14 @@ def clear_cf_cookies(domain: str = None) -> None: _cf_user_agents.clear() -def _reset_pyautogui_display_state(): - try: - import pyautogui - import Xlib.display - pyautogui._pyautogui_x11._display = Xlib.display.Display(os.environ['DISPLAY']) - except Exception as e: - logger.warning(f"Error resetting pyautogui display state: {e}") - - def _cleanup_orphan_processes() -> int: """Kill orphan Chrome/Xvfb/ffmpeg processes. Only runs in Docker mode.""" if not env.DOCKERMODE: return 0 - processes_to_kill = ["chrome", "chromedriver", "Xvfb", "ffmpeg"] + _stop_ffmpeg_recording() + + processes_to_kill = ["chrome", "chromium", "Xvfb", "ffmpeg"] total_killed = 0 logger.debug("Checking for orphan processes...") @@ -231,18 +287,19 @@ def _cleanup_orphan_processes() -> int: return total_killed -def _get_page_info(sb) -> tuple[str, str, str]: +async def _get_page_info(page) -> tuple[str, str, str]: """Extract page title, body text, and current URL safely.""" try: - title = sb.get_title().lower() + title = (await page.get_title() or "").lower() except Exception: title = "" try: - body = sb.get_text("body").lower() + body = await page.evaluate("document.body ? document.body.innerText : ''") + body = (body or "").lower() except Exception: body = "" try: - current_url = sb.get_current_url() + current_url = await page.get_current_url() or "" except Exception: current_url = "" return title, body, current_url @@ -259,41 +316,41 @@ def _has_cloudflare_patterns(body: str, url: str) -> bool: """Check for Cloudflare-specific patterns in body or URL.""" return "cf-" in body or "cloudflare" in url.lower() or "/cdn-cgi/" in url -def _detect_challenge_type(sb) -> str: +async def _detect_challenge_type(page) -> str: """Detect challenge type: 'cloudflare', 'ddos_guard', or 'none'.""" try: - title, body, current_url = _get_page_info(sb) - + title, body, current_url = await _get_page_info(page) + # DDOS-Guard indicators if found := _check_indicators(title, body, DDOS_GUARD_INDICATORS): logger.debug(f"DDOS-Guard indicator found: '{found}'") return "ddos_guard" - + # Cloudflare indicators if found := _check_indicators(title, body, CLOUDFLARE_INDICATORS): logger.debug(f"Cloudflare indicator found: '{found}'") return "cloudflare" - + # Check URL patterns if _has_cloudflare_patterns(body, current_url): return "cloudflare" - + return "none" except Exception as e: logger.warning(f"Error detecting challenge type: {e}") return "none" -def _is_bypassed(sb, escape_emojis: bool = True) -> bool: +async def _is_bypassed(page, escape_emojis: bool = True) -> bool: """Check if the protection has been bypassed.""" try: - title, body, current_url = _get_page_info(sb) + title, body, current_url = await _get_page_info(page) body_len = len(body.strip()) - + # Long page content = probably bypassed if body_len > 100000: logger.debug(f"Page content too long, probably bypassed (len: {body_len})") return True - + # Multiple emojis = probably real content if escape_emojis: import emoji @@ -304,155 +361,71 @@ def _is_bypassed(sb, escape_emojis: bool = True) -> bool: # Check for protection indicators (means NOT bypassed) if _check_indicators(title, body, CLOUDFLARE_INDICATORS + DDOS_GUARD_INDICATORS): return False - + # Cloudflare URL patterns if _has_cloudflare_patterns(body, current_url): logger.debug("Cloudflare patterns detected in page") return False - + # Page too short = still loading if body_len < 50: logger.debug("Page content too short, might still be loading") return False - + logger.debug(f"Bypass check passed - Title: '{title[:100]}', Body length: {body_len}") return True - + except Exception as e: logger.warning(f"Error checking bypass status: {e}") return False -def _simulate_human_behavior(sb) -> None: - """Simulate human-like behavior before bypass attempt.""" - try: - time.sleep(random.uniform(0.5, 1.5)) - - if random.random() < 0.3: - sb.scroll_down(random.randint(20, 50)) - time.sleep(random.uniform(0.2, 0.5)) - sb.scroll_up(random.randint(10, 30)) - time.sleep(random.uniform(0.2, 0.4)) - - try: - import pyautogui - x, y = pyautogui.position() - pyautogui.moveTo( - x + random.randint(-10, 10), - y + random.randint(-10, 10), - duration=random.uniform(0.05, 0.15) - ) - except Exception as e: - logger.debug(f"Mouse jiggle failed: {e}") - except Exception as e: - logger.debug(f"Human simulation failed: {e}") - - -def _bypass_method_handle_captcha(sb) -> bool: - """Method 2: Use uc_gui_handle_captcha() - TAB+SPACEBAR approach, stealthier than click.""" - try: - logger.debug("Attempting bypass: uc_gui_handle_captcha (TAB+SPACEBAR)") - _simulate_human_behavior(sb) - sb.uc_gui_handle_captcha() - time.sleep(random.uniform(3, 5)) - return _is_bypassed(sb) - except Exception as e: - logger.debug(f"uc_gui_handle_captcha failed: {e}") - return False - - -def _bypass_method_click_captcha(sb) -> bool: - """Method 3: Use uc_gui_click_captcha() - direct click via PyAutoGUI.""" - try: - logger.debug("Attempting bypass: uc_gui_click_captcha (direct click)") - _simulate_human_behavior(sb) - sb.uc_gui_click_captcha() - time.sleep(random.uniform(3, 5)) - - if _is_bypassed(sb): - return True - - # Retry once with longer wait - logger.debug("First click attempt failed, retrying...") - time.sleep(random.uniform(4, 6)) - sb.uc_gui_click_captcha() - time.sleep(random.uniform(3, 5)) - return _is_bypassed(sb) - except Exception as e: - logger.debug(f"uc_gui_click_captcha failed: {e}") - return False - - -def _bypass_method_humanlike(sb) -> bool: +async def _bypass_method_humanlike(page) -> bool: """Human-like behavior with scroll, wait, and reload.""" try: logger.debug("Attempting bypass: human-like interaction") - time.sleep(random.uniform(6, 10)) + await asyncio.sleep(random.uniform(6, 10)) try: - sb.scroll_to_bottom() - time.sleep(random.uniform(1, 2)) - sb.scroll_to_top() - time.sleep(random.uniform(2, 3)) + await page.evaluate("window.scrollTo(0, 10000);") + await page.wait() + await asyncio.sleep(random.uniform(1, 2)) + await page.evaluate("window.scrollTo(0, 0);") + await page.wait() + await asyncio.sleep(random.uniform(2, 3)) except Exception as e: logger.debug(f"Scroll behavior failed: {e}") - if _is_bypassed(sb): + if await _is_bypassed(page): return True logger.debug("Trying page refresh...") - sb.refresh() - time.sleep(random.uniform(5, 8)) + await page.reload(ignore_cache=True) + await asyncio.sleep(random.uniform(5, 8)) - if _is_bypassed(sb): + if await _is_bypassed(page): return True try: - sb.uc_gui_click_captcha() - time.sleep(random.uniform(3, 5)) + await page.solve_captcha() + await asyncio.sleep(random.uniform(3, 5)) except Exception as e: logger.debug(f"Final captcha click failed: {e}") - return _is_bypassed(sb) + return await _is_bypassed(page) except Exception as e: logger.debug(f"Human-like method failed: {e}") return False -def _safe_reconnect(sb) -> None: - """Safely attempt to reconnect WebDriver after CDP mode.""" +async def _bypass_method_cdp_solve(page) -> bool: + """CDP Mode with solve_captcha() - auto-detects challenge type.""" try: - sb.reconnect() + logger.debug("Attempting bypass: CDP solve_captcha") + await page.solve_captcha() + await asyncio.sleep(random.uniform(3, 5)) + return await _is_bypassed(page) except Exception as e: - logger.debug(f"Reconnect failed: {e}") - - -def _bypass_method_cdp_solve(sb) -> bool: - """CDP Mode with solve_captcha() - WebDriver disconnected, no PyAutoGUI. - - CDP Mode disconnects WebDriver during interaction, making detection harder. - The solve_captcha() method auto-detects challenge type. - """ - try: - logger.debug("Attempting bypass: CDP Mode solve_captcha") - sb.activate_cdp_mode(sb.get_current_url()) - time.sleep(random.uniform(1, 2)) - - try: - sb.cdp.solve_captcha() - time.sleep(random.uniform(3, 5)) - sb.reconnect() - time.sleep(random.uniform(1, 2)) - - if _is_bypassed(sb): - return True - except Exception as e: - logger.debug(f"CDP solve_captcha failed: {e}") - _safe_reconnect(sb) - - return False - except Exception as e: - logger.debug(f"CDP Mode solve failed: {e}") - _safe_reconnect(sb) + logger.debug(f"CDP solve_captcha failed: {e}") return False @@ -466,41 +439,28 @@ CDP_CLICK_SELECTORS = [ ] -def _bypass_method_cdp_click(sb) -> bool: - """CDP Mode with native clicking - no PyAutoGUI dependency. - - Uses sb.cdp.click() which is native CDP clicking (SeleniumBase 4.45.6+). - """ +async def _bypass_method_cdp_click(page) -> bool: + """CDP Mode with native clicking - no PyAutoGUI dependency.""" try: - logger.debug("Attempting bypass: CDP Mode native click") - sb.activate_cdp_mode(sb.get_current_url()) - time.sleep(random.uniform(1, 2)) + logger.debug("Attempting bypass: CDP native click") for selector in CDP_CLICK_SELECTORS: try: - if not sb.cdp.is_element_visible(selector): + if not await page.is_element_visible(selector): continue logger.debug(f"CDP clicking: {selector}") - sb.cdp.click(selector) - time.sleep(random.uniform(2, 4)) + await page.click(selector) + await asyncio.sleep(random.uniform(2, 4)) - sb.reconnect() - time.sleep(random.uniform(1, 2)) - - if _is_bypassed(sb): + if await _is_bypassed(page): return True - - sb.activate_cdp_mode(sb.get_current_url()) - time.sleep(random.uniform(0.5, 1)) except Exception as e: logger.debug(f"CDP click on '{selector}' failed: {e}") - _safe_reconnect(sb) - return _is_bypassed(sb) + return await _is_bypassed(page) except Exception as e: logger.debug(f"CDP Mode click failed: {e}") - _safe_reconnect(sb) return False @@ -513,66 +473,45 @@ CDP_GUI_CLICK_SELECTORS = [ ] -def _bypass_method_cdp_gui_click(sb) -> bool: - """CDP Mode with PyAutoGUI-based clicking - uses actual mouse movement. - - Most human-like approach for advanced protections (Kasada, DataDome, Akamai). - """ +async def _bypass_method_cdp_gui_click(page) -> bool: + """CDP Mode with gui_click-style behavior.""" try: - logger.debug("Attempting bypass: CDP Mode gui_click (mouse-based)") - sb.activate_cdp_mode(sb.get_current_url()) - time.sleep(random.uniform(1, 2)) + logger.debug("Attempting bypass: CDP gui_click (mouse-based)") try: - logger.debug("Trying cdp.gui_click_captcha()") - sb.cdp.gui_click_captcha() - time.sleep(random.uniform(3, 5)) + logger.debug("Trying solve_captcha()") + await page.solve_captcha() + await asyncio.sleep(random.uniform(3, 5)) - sb.reconnect() - time.sleep(random.uniform(1, 2)) - - if _is_bypassed(sb): + if await _is_bypassed(page): return True - - sb.activate_cdp_mode(sb.get_current_url()) - time.sleep(random.uniform(0.5, 1)) except Exception as e: - logger.debug(f"cdp.gui_click_captcha() failed: {e}") + logger.debug(f"solve_captcha() failed: {e}") for selector in CDP_GUI_CLICK_SELECTORS: try: - if not sb.cdp.is_element_visible(selector): + if not await page.is_element_visible(selector): continue - logger.debug(f"CDP gui_click_element: {selector}") - sb.cdp.gui_click_element(selector) - time.sleep(random.uniform(3, 5)) + logger.debug(f"CDP click_with_offset: {selector}") + await page.click_with_offset(selector, 0, 0, center=True) + await asyncio.sleep(random.uniform(3, 5)) - sb.reconnect() - time.sleep(random.uniform(1, 2)) - - if _is_bypassed(sb): + if await _is_bypassed(page): return True - - sb.activate_cdp_mode(sb.get_current_url()) - time.sleep(random.uniform(0.5, 1)) except Exception as e: logger.debug(f"CDP gui_click on '{selector}' failed: {e}") - _safe_reconnect(sb) - return _is_bypassed(sb) + return await _is_bypassed(page) except Exception as e: logger.debug(f"CDP Mode gui_click failed: {e}") - _safe_reconnect(sb) return False BYPASS_METHODS = [ _bypass_method_cdp_solve, - _bypass_method_cdp_click, _bypass_method_cdp_gui_click, - _bypass_method_handle_captcha, - _bypass_method_click_captcha, + _bypass_method_cdp_click, _bypass_method_humanlike, ] @@ -586,7 +525,7 @@ def _check_cancellation(cancel_flag: Optional[Event], message: str) -> None: raise BypassCancelledException("Bypass cancelled") -def _bypass(sb, max_retries: Optional[int] = None, cancel_flag: Optional[Event] = None) -> bool: +async def _bypass(page, max_retries: Optional[int] = None, cancel_flag: Optional[Event] = None) -> bool: """Attempt to bypass Cloudflare/DDOS-Guard protection using multiple methods.""" max_retries = max_retries if max_retries is not None else app_config.MAX_RETRY @@ -598,29 +537,29 @@ def _bypass(sb, max_retries: Optional[int] = None, cancel_flag: Optional[Event] for try_count in range(max_retries): _check_cancellation(cancel_flag, "Bypass cancelled by user") - if _is_bypassed(sb): + if await _is_bypassed(page): if try_count == 0: logger.info("Page already bypassed") return True - challenge_type = _detect_challenge_type(sb) + challenge_type = await _detect_challenge_type(page) logger.debug(f"Challenge detected: {challenge_type}") # No challenge detected but page doesn't look bypassed - wait and retry if challenge_type == "none": logger.info("No challenge detected, waiting for page to settle...") - time.sleep(random.uniform(2, 3)) - if _is_bypassed(sb): + await asyncio.sleep(random.uniform(2, 3)) + if await _is_bypassed(page): return True - # Try a simple reconnect instead of captcha methods + # Try a simple refresh instead of captcha methods try: - sb.reconnect() - time.sleep(random.uniform(1, 2)) - if _is_bypassed(sb): - logger.info("Bypass successful after reconnect") + await page.reload(ignore_cache=True) + await asyncio.sleep(random.uniform(1, 2)) + if await _is_bypassed(page): + logger.info("Bypass successful after refresh") return True except Exception as e: - logger.debug(f"Reconnect during no-challenge wait failed: {e}") + logger.debug(f"Refresh during no-challenge wait failed: {e}") continue if challenge_type == last_challenge_type: @@ -642,11 +581,11 @@ def _bypass(sb, max_retries: Optional[int] = None, cancel_flag: Optional[Event] logger.info(f"Waiting {wait_time:.1f}s before trying...") for _ in range(int(wait_time)): _check_cancellation(cancel_flag, "Bypass cancelled during wait") - time.sleep(1) - time.sleep(wait_time - int(wait_time)) + await asyncio.sleep(1) + await asyncio.sleep(wait_time - int(wait_time)) try: - if method(sb): + if await method(page): logger.info(f"Bypass successful using {method.__name__}") return True except BypassCancelledException: @@ -659,8 +598,8 @@ def _bypass(sb, max_retries: Optional[int] = None, cancel_flag: Optional[Event] logger.warning("Exceeded maximum retries. Bypass failed.") return False -def _get_chromium_args() -> list[str]: - """Build Chrome arguments, pre-resolving hostnames via Python's patched DNS. +def _get_browser_args() -> list[str]: + """Build extra Chrome arguments, pre-resolving hostnames via patched DNS. Pre-resolves AA hostnames and passes IPs to Chrome via --host-resolver-rules, bypassing Chrome's DNS entirely for those hosts. @@ -670,7 +609,11 @@ def _get_chromium_args() -> list[str]: "--ignore-ssl-errors", "--allow-running-insecure-content", "--ignore-certificate-errors-spki-list", - "--ignore-certificate-errors-skip-list" + "--ignore-certificate-errors-skip-list", + # Chrome 144+ disabled automatic SwiftShader fallback for WebGL (security reasons). + # Without this flag, WebGL is broken in headless/Docker which triggers bot detection. + # See: https://issues.chromium.org/issues/40277080 + "--enable-unsafe-swiftshader", ] if app_config.get("DEBUG", False): @@ -680,12 +623,6 @@ def _get_chromium_args() -> list[str]: "--log-file=" + str(LOG_DIR / "chrome_browser.log") ]) - proxies = get_proxies() - if proxies: - proxy_url = proxies.get('https') or proxies.get('http') - if proxy_url: - arguments.append(f'--proxy-server={proxy_url}') - host_rules = _build_host_resolver_rules() if host_rules: arguments.append(f'--host-resolver-rules={", ".join(host_rules)}') @@ -721,42 +658,41 @@ def _build_host_resolver_rules() -> list[str]: return host_rules -DRIVER_RESET_ERRORS = {"WebDriverException", "SessionNotCreatedException", "TimeoutException", "MaxRetryError"} +DRIVER_RESET_ERRORS = {"ProtocolException", "RuntimeError", "TimeoutError"} -def _get(url: str, driver: Driver, cancel_flag: Optional[Event] = None) -> str: - """Fetch URL with Cloudflare bypass using provided driver.""" +async def _get(url: str, driver, cancel_flag: Optional[Event] = None) -> str: + """Fetch URL with Cloudflare bypass using a CDP browser.""" _check_cancellation(cancel_flag, "Bypass cancelled before starting") - logger.debug(f"SB_GET: {url}") + logger.debug(f"CDP_GET: {url}") - hostname = urlparse(url).hostname or "" - if has_valid_cf_cookies(hostname): - reconnect_time = 1.0 - logger.debug(f"Using fast reconnect ({reconnect_time}s) - valid cookies exist") - else: - reconnect_time = app_config.DEFAULT_SLEEP - logger.debug(f"Using standard reconnect ({reconnect_time}s) - no cached cookies") - - logger.debug("Opening URL with SeleniumBase...") - driver.uc_open_with_reconnect(url, reconnect_time) + logger.debug("Opening URL with SeleniumBase CDP...") + page = await driver.get(url) + try: + await page.wait() + except Exception: + pass _check_cancellation(cancel_flag, "Bypass cancelled after page load") try: - logger.debug(f"Page loaded - URL: {driver.get_current_url()}, Title: {driver.get_title()}") + current_url = await page.get_current_url() + title = await page.get_title() + logger.debug(f"Page loaded - URL: {current_url}, Title: {title}") except Exception as e: logger.debug(f"Could not get page info: {e}") logger.debug("Starting bypass process...") - if _bypass(driver, cancel_flag=cancel_flag): - _extract_cookies_from_driver(driver, url) - return driver.page_source + if await _bypass(page, cancel_flag=cancel_flag): + await _extract_cookies_from_cdp(driver, page, url) + return await page.get_page_source() logger.warning("Bypass completed but page still shows protection") try: - body = driver.get_text("body") - logger.debug(f"Page content: {body[:500]}..." if len(body) > 500 else body) + body = await page.evaluate("document.body ? document.body.innerText : ''") + if body: + logger.debug(f"Page content: {body[:500]}..." if len(body) > 500 else body) except Exception: pass @@ -769,82 +705,153 @@ def get(url: str, retry: Optional[int] = None, cancel_flag: Optional[Event] = No with LOCKED: # Try cookies first - another request may have completed bypass while waiting - cookies = get_cf_cookies_for_domain(urlparse(url).hostname or "") - if cookies: + cached_result = _try_with_cached_cookies(url, urlparse(url).hostname or "") + if cached_result: + return cached_result + + async def _run_bypass() -> str: + driver = None try: - response = requests.get(url, cookies=cookies, proxies=get_proxies(url), timeout=(5, 10)) - if response.status_code == 200: - logger.debug("Cookies available after lock wait - skipped Chrome") - return response.text - except Exception: - pass + driver = await _create_cdp_browser(url) - # Fresh Chrome for each bypass attempt - driver = None - try: - _ensure_display_initialized() - driver = _create_driver() + for attempt in range(retry): + _check_cancellation(cancel_flag, "Bypass cancelled before attempt") - for attempt in range(retry): - _check_cancellation(cancel_flag, "Bypass cancelled before attempt") + try: + result = await _get(url, driver, cancel_flag) + if result: + return result + except BypassCancelledException: + raise + except Exception as e: + error_details = f"{type(e).__name__}: {e}" + logger.warning(f"Bypass failed (attempt {attempt + 1}/{retry}): {error_details}") + logger.debug(f"Stack trace: {traceback.format_exc()}") - try: - result = _get(url, driver, cancel_flag) - if result: - return result - except BypassCancelledException: - raise - except Exception as e: - error_details = f"{type(e).__name__}: {e}" - logger.warning(f"Bypass failed (attempt {attempt + 1}/{retry}): {error_details}") - logger.debug(f"Stack trace: {traceback.format_exc()}") + # On CDP errors, quit and create a fresh browser + if type(e).__name__ in DRIVER_RESET_ERRORS: + logger.info("Restarting Chrome due to browser error...") + await _close_cdp_driver(driver) + driver = await _create_cdp_browser(url) - # On driver errors, quit and create fresh driver - if type(e).__name__ in DRIVER_RESET_ERRORS: - logger.info("Restarting Chrome due to browser error...") - _quit_driver(driver) - driver = _create_driver() + logger.error(f"Bypass failed after {retry} attempts") + return "" + finally: + if driver: + await _close_cdp_driver(driver) - logger.error(f"Bypass failed after {retry} attempts") - return "" - finally: - # Always quit Chrome when done - if driver: - _quit_driver(driver) + return _CDP_WORKER.run(_run_bypass()) -def _create_driver() -> Driver: - """Create a fresh Chrome driver instance.""" - chromium_args = _get_chromium_args() +def _get_proxy_string(url: str) -> Optional[str]: + """Return a single proxy string for CDP, honoring NO_PROXY.""" + proxies = get_proxies(url) + if not proxies: + return None + proxy_url = proxies.get("https") or proxies.get("http") + return proxy_url or None + + +async def _create_cdp_browser(url: str) -> Any: + """Create a fresh CDP browser instance.""" + browser_args = _get_browser_args() screen_width, screen_height = get_screen_size() + display_width = screen_width + 100 + display_height = screen_height + 150 + proxy = _get_proxy_string(url) - logger.debug(f"Creating Chrome driver with args: {chromium_args}") + logger.debug(f"Creating Pure CDP browser with args: {browser_args}") logger.debug(f"Browser screen size: {screen_width}x{screen_height}") - # Start FFmpeg recording if debug mode (record each bypass session) - if app_config.get("DEBUG", False) and DISPLAY["xvfb"] and not DISPLAY["ffmpeg"]: - _start_ffmpeg_recording() - - driver = Driver( - uc=True, + driver = await cdp_driver.start_async( headless=False, + headed=False, + xvfb=True, + xvfb_metrics=f"{display_width},{display_height}", + sandbox=False, + lang="en", incognito=True, - locale="en", ad_block=True, - size=f"{screen_width},{screen_height}", - chromium_arg=chromium_args, + proxy=proxy, + browser_args=browser_args, ) - driver.set_page_load_timeout(60) - time.sleep(app_config.DEFAULT_SLEEP) - logger.info("Chrome browser ready") + + try: + await driver.page.set_window_rect(0, 0, screen_width, screen_height) + except Exception as e: + logger.debug(f"Failed to set window size: {e}") + + # Start FFmpeg recording if debug mode (record each bypass session) + if app_config.get("DEBUG", False) and not DISPLAY.get("ffmpeg"): + _start_ffmpeg_recording(display=os.environ.get("DISPLAY", ":0")) + + await asyncio.sleep(app_config.DEFAULT_SLEEP) + logger.info("Chrome browser ready (Pure CDP)") logger.log_resource_usage() return driver -def _start_ffmpeg_recording() -> None: +async def _close_cdp_driver(driver) -> None: + """Close CDP connections and stop the browser.""" + if not driver: + return + + logger.debug("Quitting Chrome browser (CDP)...") + + _stop_ffmpeg_recording() + + try: + connections = [] + if hasattr(driver, "connection") and driver.connection: + connections.append(driver.connection) + if hasattr(driver, "targets") and driver.targets: + connections.extend(driver.targets) + for conn in connections: + try: + await conn.aclose() + except Exception as e: + logger.debug(f"Failed to close websocket connection: {e}") + except Exception as e: + logger.debug(f"Error during connection cleanup: {e}") + + try: + driver.stop() + logger.debug("Stopped CDP browser") + except Exception as e: + logger.debug(f"CDP stop: {e}") + + if env.DOCKERMODE: + await asyncio.sleep(0.3) + try: + pid = getattr(driver, "_process_pid", None) + + def _pid_alive(check_pid: int) -> bool: + try: + os.kill(check_pid, 0) + except ProcessLookupError: + return False + except PermissionError: + return True + return True + + if pid and _pid_alive(pid): + try: + os.kill(pid, signal.SIGTERM) + await asyncio.sleep(0.1) + if _pid_alive(pid): + os.kill(pid, signal.SIGKILL) + logger.debug(f"Killed Chrome pid {pid}") + except Exception as e: + logger.debug(f"Failed to kill Chrome pid {pid}: {e}") + except Exception as e: + logger.debug(f"Process cleanup failed: {e}") + + logger.log_resource_usage() + + +def _start_ffmpeg_recording(display: str) -> None: """Start FFmpeg screen recording for debug mode.""" global DISPLAY RECORDING_DIR.mkdir(parents=True, exist_ok=True) - display = DISPLAY["xvfb"] timestamp = datetime.now().strftime("%y%m%d-%H%M%S") output_file = RECORDING_DIR / f"screen_recording_{timestamp}.mp4" @@ -855,7 +862,7 @@ def _start_ffmpeg_recording() -> None: ffmpeg_cmd = [ "ffmpeg", "-y", "-f", "x11grab", "-video_size", f"{display_width}x{display_height}", - "-i", f":{display.display}", + "-i", display, "-c:v", "libx264", "-preset", "ultrafast", "-maxrate", "700k", "-bufsize", "1400k", "-crf", "36", "-pix_fmt", "yuv420p", "-tune", "animation", @@ -867,161 +874,39 @@ def _start_ffmpeg_recording() -> None: logger.debug("Starting FFmpeg recording to %s", output_file) logger.debug_trace(f"FFmpeg command: {' '.join(ffmpeg_cmd)}") DISPLAY["ffmpeg"] = subprocess.Popen(ffmpeg_cmd) + DISPLAY["ffmpeg_output"] = output_file -def _close_cdp_sockets() -> int: - """Find and close any sockets connected to CDP port 9222. - - This is a workaround for SeleniumBase not properly closing websocket - connections when using activate_cdp_mode(). Returns count of closed sockets. - """ - import os - closed = 0 - pid = os.getpid() - - try: - fd_path = f'/proc/{pid}/fd' - for fd_name in os.listdir(fd_path): - try: - fd = int(fd_name) - link = os.readlink(f'{fd_path}/{fd_name}') - if 'socket:' not in link: - continue - - # Check if this socket is connected to port 9222 (CDP) - # by reading /proc/net/tcp and matching inode - inode = link.split('[')[1].rstrip(']') - - with open('/proc/net/tcp', 'r') as f: - for line in f: - parts = line.split() - if len(parts) < 10: - continue - # Check if this is our socket and connects to port 9222 (0x2406) - if parts[9] == inode: - remote = parts[2] - remote_port = int(remote.split(':')[1], 16) - if remote_port == 9222: - logger.debug(f"Closing CDP socket fd={fd} inode={inode}") - os.close(fd) - closed += 1 - break - except (ValueError, OSError, IndexError): - continue - except Exception as e: - logger.debug(f"Error scanning for CDP sockets: {e}") - - return closed - - -def _quit_driver(driver: Driver) -> None: - """Quit Chrome driver and clean up resources. - - Proper cleanup sequence for SeleniumBase CDP mode: - 1. Stop CDP browser (closes websocket connections) - 2. Reconnect WebDriver - 3. Close window - 4. Quit driver - 5. Force-kill any lingering processes - - The CDP websocket connection must be explicitly closed before Chrome is killed, - otherwise the sockets end up in CLOSE_WAIT state causing gevent to busy-loop. - - References: - - https://github.com/seleniumbase/SeleniumBase/discussions/3768 - - https://www.selenium.dev/selenium/docs/api/py/selenium_webdriver_common_bidi/selenium.webdriver.common.bidi.cdp.html - """ - if driver is None: - return - - logger.debug("Quitting Chrome driver...") - - # Strategy 1: Stop CDP browser if in CDP mode (closes websocket connections) - # This is the proper SeleniumBase way to close CDP connections - try: - if hasattr(driver, 'cdp') and driver.cdp and hasattr(driver.cdp, 'driver'): - driver.cdp.driver.stop() - logger.debug("Stopped CDP browser (closed websocket)") - time.sleep(0.3) - except Exception as e: - logger.debug(f"CDP stop: {e}") - - # Strategy 2: Reconnect to re-establish WebDriver control before quitting - try: - driver.reconnect() - time.sleep(0.2) - except Exception as e: - logger.debug(f"Reconnect: {e}") - - # Strategy 3: Close the current window/tab - try: - driver.close() - time.sleep(0.2) - except Exception as e: - logger.debug(f"Close window: {e}") - - # Strategy 4: Fallback - explicitly close any remaining CDP sockets - # This catches any sockets that weren't closed by cdp.driver.stop() - closed = _close_cdp_sockets() - if closed: - logger.debug(f"Closed {closed} remaining CDP socket(s)") - - # Strategy 5: Standard quit - try: - driver.quit() - except Exception as e: - logger.debug(f"Quit: {e}") - - # Strategy 6: Force garbage collection - import gc - gc.collect() - - # Strategy 7: Force-kill any lingering Chrome/chromedriver processes - if env.DOCKERMODE: - time.sleep(0.3) - try: - subprocess.run(["pkill", "-9", "-f", "chrom"], capture_output=True, timeout=5) - except Exception as e: - logger.debug(f"pkill chrome: {e}") - - # Strategy 8: Stop ffmpeg recording if running +def _stop_ffmpeg_recording() -> None: + """Stop FFmpeg screen recording if running.""" + import signal global DISPLAY - if DISPLAY.get("ffmpeg"): - try: - DISPLAY["ffmpeg"].terminate() - DISPLAY["ffmpeg"].wait(timeout=2) - logger.debug("Stopped ffmpeg recording") - except Exception as e: - logger.debug(f"ffmpeg terminate: {e}") - try: - DISPLAY["ffmpeg"].kill() - except Exception: - pass + proc = DISPLAY.get("ffmpeg") + output_file = DISPLAY.get("ffmpeg_output") + if not proc: + return + if proc.poll() is not None: + logger.debug("FFmpeg already stopped") DISPLAY["ffmpeg"] = None - - logger.log_resource_usage() - - -def _ensure_display_initialized(): - """Initialize virtual display if needed. Must be called with LOCKED held.""" - global DISPLAY - if DISPLAY["xvfb"] is not None: + DISPLAY["ffmpeg_output"] = None return - if not (env.DOCKERMODE and app_config.get("USE_CF_BYPASS", True)): - return - - from pyvirtualdisplay import Display - # Get the screen size (generates a random one if not already set) - screen_width, screen_height = get_screen_size() - # Add padding for browser chrome (title bar, borders, taskbar space) - display_width = screen_width + 100 - display_height = screen_height + 150 - display = Display(visible=False, size=(display_width, display_height)) - display.start() - DISPLAY["xvfb"] = display - logger.info(f"Virtual display started: {display_width}x{display_height}") - time.sleep(app_config.DEFAULT_SLEEP) - _reset_pyautogui_display_state() + try: + proc.send_signal(signal.SIGINT) + proc.wait(timeout=5) + logger.debug("Stopped ffmpeg recording") + except Exception as e: + logger.debug(f"ffmpeg stop: {e}") + try: + proc.terminate() + proc.wait(timeout=2) + except Exception: + pass + try: + proc.kill() + except Exception: + pass + DISPLAY["ffmpeg"] = None + DISPLAY["ffmpeg_output"] = None def _try_with_cached_cookies(url: str, hostname: str) -> Optional[str]: diff --git a/shelfmark/config/env.py b/shelfmark/config/env.py index cc9ec9f..bc77d5f 100644 --- a/shelfmark/config/env.py +++ b/shelfmark/config/env.py @@ -133,6 +133,14 @@ TOR_VARIANT_AVAILABLE = shutil.which("tor") is not None USING_TOR = string_to_bool(os.getenv("USING_TOR", "false")) +# ============================================================================= +# Onboarding +# ============================================================================= + +# Set to false to skip the onboarding wizard entirely (useful for ephemeral storage) +ONBOARDING = string_to_bool(os.getenv("ONBOARDING", "true")) + + # ============================================================================= # Debug/development settings # ============================================================================= diff --git a/shelfmark/config/settings.py b/shelfmark/config/settings.py index f9b2ea9..1e633bf 100644 --- a/shelfmark/config/settings.py +++ b/shelfmark/config/settings.py @@ -177,8 +177,8 @@ _FORMAT_OPTIONS = [ _AUDIOBOOK_FORMAT_OPTIONS = [ {"value": "m4b", "label": "M4B"}, - {"value": "m4a", "label": "M4A"}, {"value": "mp3", "label": "MP3"}, + {"value": "m4a", "label": "M4A"}, {"value": "zip", "label": "ZIP"}, {"value": "rar", "label": "RAR"}, ] diff --git a/shelfmark/core/onboarding.py b/shelfmark/core/onboarding.py index 18e2f4e..0258fc9 100644 --- a/shelfmark/core/onboarding.py +++ b/shelfmark/core/onboarding.py @@ -34,6 +34,12 @@ def _get_config_dir() -> Path: def is_onboarding_complete() -> bool: """Check if onboarding has been completed.""" + from shelfmark.config.env import ONBOARDING + + # If onboarding is disabled via env var, treat as complete + if not ONBOARDING: + return True + config_file = _get_config_dir() / "settings.json" if not config_file.exists(): return False diff --git a/shelfmark/release_sources/direct_download.py b/shelfmark/release_sources/direct_download.py index fa15084..7796a0f 100644 --- a/shelfmark/release_sources/direct_download.py +++ b/shelfmark/release_sources/direct_download.py @@ -1056,6 +1056,7 @@ def _book_info_to_release(book_info: BookInfo) -> Release: source_id=book_info.id, title=book_info.title, format=book_info.format, + language=book_info.language, # Top-level language for filtering size=book_info.size, download_url=book_info.download_urls[0] if book_info.download_urls else None, info_url=f"{network.get_aa_base_url()}/md5/{book_info.id}", diff --git a/src/frontend/src/components/ReleaseCell.tsx b/src/frontend/src/components/ReleaseCell.tsx index 1d2a1ae..0764147 100644 --- a/src/frontend/src/components/ReleaseCell.tsx +++ b/src/frontend/src/components/ReleaseCell.tsx @@ -165,7 +165,7 @@ export const ReleaseCell = ({ column, release, compact = false, onlineServers }: const torznabAttrs = extra?.torznab_attrs as Record | undefined; const publishDate = extra?.publish_date as string | undefined; - // Helper to format relative time + // Helper to format age in days const formatRelativeTime = (dateStr: string): string | null => { try { const date = new Date(dateStr); @@ -174,14 +174,8 @@ export const ReleaseCell = ({ column, release, compact = false, onlineServers }: const diffMs = now.getTime() - date.getTime(); const diffDays = Math.floor(diffMs / (1000 * 60 * 60 * 24)); if (diffDays === 0) return 'Today'; - if (diffDays === 1) return '1 day ago'; - if (diffDays < 30) return `${diffDays} days ago`; - const diffMonths = Math.floor(diffDays / 30); - if (diffMonths === 1) return '1 month ago'; - if (diffMonths < 12) return `${diffMonths} months ago`; - const diffYears = Math.floor(diffDays / 365); - if (diffYears === 1) return '1 year ago'; - return `${diffYears} years ago`; + if (diffDays === 1) return '1 day'; + return `${diffDays} days`; } catch { return null; } diff --git a/src/frontend/src/components/ReleaseModal.tsx b/src/frontend/src/components/ReleaseModal.tsx index adce990..323d2ae 100644 --- a/src/frontend/src/components/ReleaseModal.tsx +++ b/src/frontend/src/components/ReleaseModal.tsx @@ -9,7 +9,7 @@ import { ReleaseCell } from './ReleaseCell'; import { getColorStyleFromHint } from '../utils/colorMaps'; import { getNestedValue } from '../utils/objectHelpers'; import { LanguageMultiSelect } from './LanguageMultiSelect'; -import { LANGUAGE_OPTION_ALL, LANGUAGE_OPTION_DEFAULT, getLanguageFilterValues, releaseLanguageMatchesFilter } from '../utils/languageFilters'; +import { LANGUAGE_OPTION_ALL, LANGUAGE_OPTION_DEFAULT, getLanguageFilterValues, releaseLanguageMatchesFilter, buildLanguageNormalizer } from '../utils/languageFilters'; // Module-level cache for release search results // Key format: `${provider}:${provider_id}:${source}:${contentType}` @@ -1069,6 +1069,11 @@ export const ReleaseModal = ({ return getLanguageFilterValues(languageFilter, bookLanguages, defaultLanguages); }, [languageFilter, bookLanguages, defaultLanguages]); + // Build language normalizer for release filtering (handles both codes like "en" and names like "English") + const languageNormalizer = useMemo(() => { + return buildLanguageNormalizer(bookLanguages); + }, [bookLanguages]); + // Get column config from response or use default (moved before filteredReleases for sorting) const columnConfig = useMemo((): ReleaseColumnConfig => { const response = releasesBySource[activeTab]; @@ -1164,7 +1169,7 @@ export const ReleaseModal = ({ // Language filtering - use r.language when provided by enriched indexers // Releases with no language (null/undefined) always pass const releaseLang = r.language as string | undefined; - if (!releaseLanguageMatchesFilter(releaseLang, resolvedLanguageCodes ?? defaultLanguages)) { + if (!releaseLanguageMatchesFilter(releaseLang, resolvedLanguageCodes ?? defaultLanguages, languageNormalizer)) { return false; } @@ -1184,7 +1189,7 @@ export const ReleaseModal = ({ } return filtered; - }, [releasesBySource, activeTab, formatFilter, resolvedLanguageCodes, effectiveFormats, defaultLanguages, indexerFilter, currentSort, sortableColumns, columnConfig]); + }, [releasesBySource, activeTab, formatFilter, resolvedLanguageCodes, effectiveFormats, defaultLanguages, languageNormalizer, indexerFilter, currentSort, sortableColumns, columnConfig]); // Pre-compute display field lookups to avoid repeated .find() calls in JSX const displayFields = useMemo(() => { diff --git a/src/frontend/src/utils/colorMaps.ts b/src/frontend/src/utils/colorMaps.ts index e551f38..77dc0fa 100644 --- a/src/frontend/src/utils/colorMaps.ts +++ b/src/frontend/src/utils/colorMaps.ts @@ -17,6 +17,7 @@ const FORMAT_COLORS: Record = { cbz: { bg: 'bg-amber-500/20', text: 'text-amber-700 dark:text-amber-300' }, // Audiobook formats m4b: { bg: 'bg-violet-500/20', text: 'text-violet-700 dark:text-violet-300' }, + m4a: { bg: 'bg-violet-500/20', text: 'text-violet-700 dark:text-violet-300' }, mp3: { bg: 'bg-rose-500/20', text: 'text-rose-700 dark:text-rose-300' }, flac: { bg: 'bg-indigo-500/20', text: 'text-indigo-700 dark:text-indigo-300' }, }; diff --git a/src/frontend/src/utils/languageFilters.ts b/src/frontend/src/utils/languageFilters.ts index 215599d..0ebb55f 100644 --- a/src/frontend/src/utils/languageFilters.ts +++ b/src/frontend/src/utils/languageFilters.ts @@ -87,12 +87,33 @@ export const formatDefaultLanguageLabel = ( return `Default (${joined}${suffix})`; }; +/** + * Build a mapping from language names to codes for normalization. + * Handles both directions: "english" -> "en" and "en" -> "en" + */ +export const buildLanguageNormalizer = (languages: Language[]): Map => { + const map = new Map(); + for (const lang of languages) { + const code = lang.code.toLowerCase(); + map.set(code, code); // code -> code + map.set(lang.language.toLowerCase(), code); // name -> code + } + return map; +}; + /** * Check if ALL languages in a multi-language release match the selected filter. + * Multi-language releases use separators like comma, slash, plus, or ampersand + * (e.g., "English, Spanish", "English/Spanish", "English + Spanish", "English & Spanish"). + * + * @param releaseLang - Language string from the release (can be code or full name) + * @param selectedCodes - Array of selected ISO language codes + * @param languageNormalizer - Optional map to normalize language names to codes */ export const releaseLanguageMatchesFilter = ( releaseLang: string | undefined, selectedCodes: string[] | null, + languageNormalizer?: Map, ): boolean => { if (!releaseLang || !selectedCodes) { return true; @@ -100,7 +121,18 @@ export const releaseLanguageMatchesFilter = ( if (selectedCodes.includes(LANGUAGE_OPTION_ALL)) { return true; } - const releaseCodes = releaseLang.split(/[,/]/).map(l => l.trim().toLowerCase()).filter(Boolean); + + // Split by common multi-language separators: comma, slash, plus, ampersand + const releaseParts = releaseLang.split(/[,/+&]/).map(l => l.trim().toLowerCase()).filter(Boolean); + + // Normalize release language parts to codes (handles both "en" and "english") + const releaseCodes = releaseParts.map(part => { + if (languageNormalizer) { + return languageNormalizer.get(part) ?? part; + } + return part; + }); + const selectedSet = new Set(selectedCodes.map(c => c.toLowerCase())); return releaseCodes.every(code => selectedSet.has(code)); }; diff --git a/tests/bypass/test_internal_bypasser.py b/tests/bypass/test_internal_bypasser.py index 7f79d81..c29b99f 100644 --- a/tests/bypass/test_internal_bypasser.py +++ b/tests/bypass/test_internal_bypasser.py @@ -22,3 +22,70 @@ def test_bypass_tries_all_methods_before_abort(monkeypatch): assert internal_bypasser._bypass(object(), max_retries=10) is False assert calls == [f"m{i}" for i in range(6)] + + +def test_extract_cookies_from_cdp_filters_and_stores_ua(): + import time + import shelfmark.bypass.internal_bypasser as internal_bypasser + + class FakeCookie: + def __init__(self, name, value, domain, path, expires, secure=True): + self.name = name + self.value = value + self.domain = domain + self.path = path + self.expires = expires + self.secure = secure + + class FakeSb: + def get_all_cookies(self, requests_cookie_format=False): + assert requests_cookie_format is True + return [ + FakeCookie("cf_clearance", "abc", "example.com", "/", int(time.time()) + 3600), + FakeCookie("sessionid", "zzz", "example.com", "/", int(time.time()) + 3600), + ] + + def get_user_agent(self): + return "TestUA/1.0" + + internal_bypasser.clear_cf_cookies() + internal_bypasser._extract_cookies_from_cdp(FakeSb(), "https://www.example.com/path") + + cookies = internal_bypasser.get_cf_cookies_for_domain("example.com") + assert cookies == {"cf_clearance": "abc"} + assert internal_bypasser.get_cf_user_agent_for_domain("example.com") == "TestUA/1.0" + + +def test_extract_cookies_from_cdp_normalizes_session_expiry(): + import time + import shelfmark.bypass.internal_bypasser as internal_bypasser + + class FakeCookie: + def __init__(self, name, value, domain, path, expires, secure=True): + self.name = name + self.value = value + self.domain = domain + self.path = path + self.expires = expires + self.secure = secure + + class FakeSb: + def get_all_cookies(self, requests_cookie_format=False): + assert requests_cookie_format is True + return [ + FakeCookie("cf_clearance", "abc", "example.com", "/", 0), + ] + + def get_user_agent(self): + return "TestUA/1.0" + + internal_bypasser.clear_cf_cookies() + internal_bypasser._extract_cookies_from_cdp(FakeSb(), "https://example.com") + + stored = internal_bypasser._cf_cookies.get("example.com", {}) + assert stored["cf_clearance"]["expiry"] is None + assert internal_bypasser.get_cf_cookies_for_domain("example.com") == {"cf_clearance": "abc"} + + # Verify fallback to "expires" key for expiry checks + internal_bypasser._cf_cookies["example.com"]["cf_clearance"]["expires"] = int(time.time()) - 10 + assert internal_bypasser.get_cf_cookies_for_domain("example.com") == {}