From 207cff96d3b87b8d569b5f3ab6dc7350311d96e6 Mon Sep 17 00:00:00 2001 From: Federico Della Rovere Date: Thu, 28 Aug 2025 23:37:59 +0200 Subject: [PATCH] External CloudFlare resolver (#245) Adding support for an external CloudFlare bypasser service and introducing a new Docker image build with a dedicated target. Key Changes - Added `cloudflare_bypasser_external.py` for external bypasser integration. - Updated Docker Compose files to support the new service. - Introduced a new Docker target for building a separate image for the external bypasser. - Refactored relevant modules to utilize the external bypasser when configured. - Documentation and configuration updates to reflect new options and Docker targets. Impact - Users can now choose between internal and external CloudFlare bypassing. - New Docker image and target streamline deployment of the external bypasser. - Improved modularity and maintainability. - No breaking changes for existing workflows. Testing - Manual and E2E tests performed for both bypasser modes. - Docker Compose setups and new image build verified for development and production. Notes Please review the new configuration options and Docker targets. Update your environment and deployment scripts as needed. Feedback and suggestions are welcome! 
--- .../build-and-publish-docker-image.yml | 3 ++ Dockerfile | 53 ++++++++++++------- app.py | 7 ++- cloudflare_bypasser.py | 19 +++++++ cloudflare_bypasser_external.py | 34 ++++++++++++ config.py | 10 ++-- docker-compose.extbp.dev.yml | 35 ++++++++++++ docker-compose.extbp.yml | 22 ++++++++ downloader.py | 14 +++-- entrypoint.sh | 4 +- env.py | 6 +++ readme.md | 46 +++++++++++++++- requirements.txt => requirements-base.txt | 3 -- requirements-cwa-bd.txt | 3 ++ 14 files changed, 221 insertions(+), 38 deletions(-) create mode 100644 cloudflare_bypasser_external.py create mode 100644 docker-compose.extbp.dev.yml create mode 100644 docker-compose.extbp.yml rename requirements.txt => requirements-base.txt (62%) create mode 100644 requirements-cwa-bd.txt diff --git a/.github/workflows/build-and-publish-docker-image.yml b/.github/workflows/build-and-publish-docker-image.yml index 6af59e5..3115aaf 100644 --- a/.github/workflows/build-and-publish-docker-image.yml +++ b/.github/workflows/build-and-publish-docker-image.yml @@ -26,6 +26,9 @@ jobs: - suffix: "-tor" target: cwa-bd-tor image_name_suffix: "-tor" + - suffix: "-extbp" + target: cwa-bd-extbp + image_name_suffix: "-extbp" steps: - name: Get current date id: date diff --git a/Dockerfile b/Dockerfile index ef4175d..0d44ca1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,18 +38,10 @@ RUN apt-get update && \ curl \ # For entrypoint dumb-init \ - # For dumb display - xvfb \ - # For screen recording - ffmpeg \ # For debug zip iputils-ping \ # For user switching - sudo \ - # --- Chromium Browser --- - chromium-driver \ - # For tkinter (pyautogui) - python3-tk && \ + sudo && \ # Cleanup APT cache *after* all installs in this layer apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \ apt-get clean && \ @@ -67,17 +59,12 @@ WORKDIR /app # Install Python dependencies using pip # Upgrade pip first, then copy requirements and install -# Copying requirements.txt separately leverages build cache -COPY 
requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt && \ +# Copying requirements-base.txt separately leverages build cache +COPY requirements-base.txt . +RUN pip install --no-cache-dir -r requirements-base.txt && \ # Clean root's pip cache rm -rf /root/.cache -# Add this line to grant read/execute permissions to others -RUN chmod -R o+rx /usr/bin/chromium && \ - chmod -R o+rx /usr/bin/chromedriver && \ - chmod -R o+w /usr/local/lib/python3.10/site-packages/seleniumbase/drivers/ - # Copy application code *after* dependencies are installed COPY . . @@ -101,10 +88,34 @@ ENTRYPOINT ["/usr/bin/dumb-init", "--"] FROM base AS cwa-bd +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + # For dumb display + xvfb \ + # For screen recording + ffmpeg \ + # --- Chromium --- + chromium \ + # --- ChromeDriver --- + chromium-driver \ + # For tkinter (pyautogui) + python3-tk + +# install additional dependencies +COPY requirements-cwa-bd.txt . +RUN pip install --no-cache-dir -r requirements-cwa-bd.txt && \ + # Clean root's pip cache + rm -rf /root/.cache + +# Add this line to grant read/execute permissions to others +RUN chmod -R o+rx /usr/bin/chromium && \ + chmod -R o+rx /usr/bin/chromedriver && \ + chmod -R o+w /usr/local/lib/python3.10/site-packages/seleniumbase/drivers/ + # Default command to run the application entrypoint script CMD ["/app/entrypoint.sh"] -FROM base AS cwa-bd-tor +FROM cwa-bd AS cwa-bd-tor ENV USING_TOR=true @@ -124,3 +135,9 @@ RUN apt-get update && \ # Override the default command to run Tor CMD ["/app/entrypoint.sh"] + +FROM base AS cwa-bd-extbp + +ENV USING_EXTERNAL_BYPASSER=true + +CMD ["/app/entrypoint.sh"] diff --git a/app.py b/app.py index d46ea00..04c74d0 100644 --- a/app.py +++ b/app.py @@ -13,7 +13,7 @@ import typing from logger import setup_logger from config import _SUPPORTED_BOOK_LANGUAGE, BOOK_LANGUAGE -from env import FLASK_HOST, FLASK_PORT, APP_ENV, CWA_DB_PATH, DEBUG +from env import FLASK_HOST, 
FLASK_PORT, APP_ENV, CWA_DB_PATH, DEBUG, USING_EXTERNAL_BYPASSER import backend from models import SearchFilters @@ -117,7 +117,10 @@ from typing import Union, Tuple if DEBUG: import subprocess import time - from cloudflare_bypasser import _reset_driver as STOP_GUI + if USING_EXTERNAL_BYPASSER: + STOP_GUI = lambda: None # No-op for external bypasser + else: + from cloudflare_bypasser import _reset_driver as STOP_GUI @app.route('/debug', methods=['GET']) @login_required def debug() -> Union[Response, Tuple[Response, int]]: diff --git a/cloudflare_bypasser.py b/cloudflare_bypasser.py index 2629306..5f64aea 100644 --- a/cloudflare_bypasser.py +++ b/cloudflare_bypasser.py @@ -8,6 +8,8 @@ from env import LOG_DIR, DEBUG import signal from datetime import datetime import subprocess +import requests +from typing import Optional # --- SeleniumBase Import --- from seleniumbase import Driver @@ -475,3 +477,20 @@ def wait_for_result(func, timeout : int = 10, condition : any = True): time.sleep(0.5) return None _init_cleanup_thread() + + +def get_bypassed_page(url: str) -> Optional[str]: + """Fetch HTML content from a URL using the internal Cloudflare Bypasser. + + Args: + url: Target URL + Returns: + str: HTML content if successful, None otherwise + """ + + response_html = get(url) + logger.debug(f"Cloudflare Bypasser response length: {len(response_html)}") + if response_html.strip() != "": + return response_html + else: + raise requests.exceptions.RequestException("Failed to bypass Cloudflare") diff --git a/cloudflare_bypasser_external.py b/cloudflare_bypasser_external.py new file mode 100644 index 0000000..3b64d6a --- /dev/null +++ b/cloudflare_bypasser_external.py @@ -0,0 +1,34 @@ +from logger import setup_logger +from typing import Optional +import requests + +try: + from env import EXT_BYPASSER_PATH, EXT_BYPASSER_TIMEOUT, EXT_BYPASSER_URL +except ImportError: + raise RuntimeError("Failed to import environment variables. 
Are you using an `extbp` image?") + +logger = setup_logger(__name__) + + +def get_bypassed_page(url: str) -> Optional[str]: + """Fetch HTML content from a URL using an External Cloudflare Resolver. + + Args: + url: Target URL + Returns: + str: HTML content if successful, None otherwise + """ + if not EXT_BYPASSER_URL or not EXT_BYPASSER_PATH: + logger.error("Wrong External Bypass configuration. Please check your environment configuration.") + return None + ext_url = f"{EXT_BYPASSER_URL}{EXT_BYPASSER_PATH}" + headers = {"Content-Type": "application/json"} + data = { + "cmd": "request.get", + "url": url, + "maxTimeout": EXT_BYPASSER_TIMEOUT + } + response = requests.post(ext_url, headers=headers, json=data) + response.raise_for_status() + logger.debug(f"External Bypass response for '{url}': {response.json()['status']} - {response.json()['message']}") + return response.json()['solution']['response'] diff --git a/config.py b/config.py index 3ab413e..8c33cce 100644 --- a/config.py +++ b/config.py @@ -93,7 +93,9 @@ if CUSTOM_SCRIPT: CUSTOM_SCRIPT = "" # Debugging settings -VIRTUAL_SCREEN_SIZE = (1024, 768) -RECORDING_DIR = env.LOG_DIR / "recording" -if env.DEBUG: - RECORDING_DIR.mkdir(parents=True, exist_ok=True) \ No newline at end of file +if not env.USING_EXTERNAL_BYPASSER: + # Virtual display settings for debugging internal cloudflare bypasser + VIRTUAL_SCREEN_SIZE = (1024, 768) + RECORDING_DIR = env.LOG_DIR / "recording" + if env.DEBUG: + RECORDING_DIR.mkdir(parents=True, exist_ok=True) diff --git a/docker-compose.extbp.dev.yml b/docker-compose.extbp.dev.yml new file mode 100644 index 0000000..c18e345 --- /dev/null +++ b/docker-compose.extbp.dev.yml @@ -0,0 +1,35 @@ +services: + calibre-web-automated-book-downloader-extbp-dev: + container_name: cwa-bd-extbp-dev + extends: + file: ./docker-compose.extbp.yml + service: calibre-web-automated-book-downloader-extbp + build: + context: . 
+ dockerfile: Dockerfile + target: cwa-bd-extbp + environment: + DEBUG: true + APP_ENV: dev + USE_DOH: true + CUSTOM_DNS: cloudflare + USE_CF_BYPASS: true # Enable Cloudflare bypass (default: true) + # External Cloudflare Bypass environment variables + EXT_BYPASSER_URL: "http://flaresolverr:8191" # URL of the external Cloudflare resolver service (used FlareSolverr) + EXT_BYPASSER_PATH: "/v1" # Path for external Cloudflare resolver API (default: /v1) + EXT_BYPASSER_TIMEOUT: 60000 # Timeout for external Cloudflare resolver requests (default: 60000) + volumes: + #- /tmp/cwa-book-downloader:/tmp/cwa-book-downloader + #- /tmp/cwa-book-downloader-log:/var/log/cwa-book-downloader + - ./deploy/ingest:/cwa-book-ingest + - ./deploy/log:/var/log/cwa-book-downloader + - ./deploy/tmp:/tmp/cwa-book-downloader + + flaresolverr: # External Cloudflare resolver service + image: ghcr.io/flaresolverr/flaresolverr:v3.3.22 + container_name: flaresolverr + environment: + LOG_LEVEL: info + LOG_HTML: false + CAPTCHA_SOLVER: none + TZ: Europe/Rome diff --git a/docker-compose.extbp.yml b/docker-compose.extbp.yml new file mode 100644 index 0000000..7365ebe --- /dev/null +++ b/docker-compose.extbp.yml @@ -0,0 +1,22 @@ +services: + calibre-web-automated-book-downloader-extbp: + image: ghcr.io/calibrain/calibre-web-automated-book-downloader-extbp:latest + environment: + FLASK_PORT: 8084 + LOG_LEVEL: info + BOOK_LANGUAGE: en + USE_BOOK_TITLE: true + TZ: America/New_York + APP_ENV: prod + UID: 1000 + GID: 100 + ports: + - 8084:8084 + restart: unless-stopped + volumes: + # This is where the books will be downloaded to, usually it would be + # the same as whatever you gave in "calibre-web-automated" + - /tmp/data/calibre-web/ingest:/cwa-book-ingest + # This is the location of CWA's app.db, which contains authentication + # details + #- /cwa/config/path/app.db:/auth/app.db:ro diff --git a/downloader.py b/downloader.py index cbbf5f9..f85d063 100644 --- a/downloader.py +++ b/downloader.py @@ -12,9 
+12,12 @@ from typing import Callable from threading import Event from logger import setup_logger from config import PROXIES -from env import MAX_RETRY, DEFAULT_SLEEP, USE_CF_BYPASS +from env import MAX_RETRY, DEFAULT_SLEEP, USE_CF_BYPASS, USING_EXTERNAL_BYPASSER if USE_CF_BYPASS: - import cloudflare_bypasser + if USING_EXTERNAL_BYPASSER: + from cloudflare_bypasser_external import get_bypassed_page + else: + from cloudflare_bypasser import get_bypassed_page logger = setup_logger(__name__) @@ -35,12 +38,7 @@ def html_get_page(url: str, retry: int = MAX_RETRY, use_bypasser: bool = False) logger.debug(f"html_get_page: {url}, retry: {retry}, use_bypasser: {use_bypasser}") if use_bypasser and USE_CF_BYPASS: logger.info(f"GET Using Cloudflare Bypasser for: {url}") - response_html = cloudflare_bypasser.get(url) - logger.debug(f"Cloudflare Bypasser response length: {len(response_html)}") - if response_html.strip() != "": - return response_html - else: - raise requests.exceptions.RequestException("Failed to bypass Cloudflare") + return get_bypassed_page(url) else: logger.info(f"GET: {url}") response = requests.get(url, proxies=PROXIES) diff --git a/entrypoint.sh b/entrypoint.sh index 8b3402b..1ff25e5 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -112,8 +112,8 @@ else command="python3 app.py" fi -# IF DEBUG -if [ "$DEBUG" = "true" ]; then +# If DEBUG and not using an external bypass +if [ "$DEBUG" = "true" ] && [ "$USING_EXTERNAL_BYPASSER" != "true" ]; then set +e set -x echo "vvvvvvvvvvvv DEBUG MODE vvvvvvvvvvvv" diff --git a/env.py b/env.py index 4224d90..e56723e 100644 --- a/env.py +++ b/env.py @@ -45,6 +45,12 @@ APP_ENV = os.getenv("APP_ENV", "prod").lower() # Logging settings LOG_FILE = LOG_DIR / "cwa-book-downloader.log" +USING_EXTERNAL_BYPASSER = string_to_bool(os.getenv("USING_EXTERNAL_BYPASSER", "false")) +if USING_EXTERNAL_BYPASSER: + EXT_BYPASSER_URL = os.getenv("EXT_BYPASSER_URL").strip() + EXT_BYPASSER_PATH = os.getenv("EXT_BYPASSER_PATH", "/v1").strip() + 
EXT_BYPASSER_TIMEOUT = int(os.getenv("EXT_BYPASSER_TIMEOUT", "60000")) + USING_TOR = string_to_bool(os.getenv("USING_TOR", "false")) # If using Tor, we don't need to set custom DNS, use DOH, or proxy if USING_TOR: diff --git a/readme.md b/readme.md index f69fdf8..8fa1501 100644 --- a/readme.md +++ b/readme.md @@ -175,7 +175,9 @@ volumes: Mount should align with your Calibre-Web-Automated ingest folder. -## 🧅 Tor Variant +## Variants: + +### 🧅 Tor Variant This application also offers a variant that routes all its traffic through the Tor network. This can be useful for enhanced privacy or bypassing network restrictions. @@ -196,6 +198,48 @@ To use the Tor variant: * **Timezone:** When running in Tor mode, the container will attempt to determine the timezone based on the Tor exit node's IP address and set it automatically. This will override the `TZ` environment variable if it is set. * **Network Settings:** Custom DNS, DoH, and HTTP(S) proxy settings (`CUSTOM_DNS`, `USE_DOH`, `HTTP_PROXY`, `HTTPS_PROXY`) are ignored when using the Tor variant, as all traffic goes through Tor. +### External Cloudflare resolver variant + +This variant allows the application to use an external service to bypass Cloudflare protection, instead of relying on the built-in bypasser. This is useful if you already have a dedicated Cloudflare resolver (such as [FlareSolverr](https://github.com/FlareSolverr/FlareSolverr) or compatible services like [ByParr](https://github.com/ThePhaseless/Byparr)) running elsewhere. + +#### How it works: + +- When enabled, all requests that require Cloudflare bypass are sent to your external resolver service. +- The application communicates with the resolver using its API. +- This approach can improve reliability and performance, especially if your external resolver is optimized or shared across multiple applications. 
+ +#### Configuration + +| Variable | Description | Default Value | +| ---------------------- | ----------------------------------------------------------- | ----------------------- | +| `EXT_BYPASSER_URL` | The base URL of your external resolver, without the API path (required) | | +| `EXT_BYPASSER_PATH` | API path for the resolver (usually `/v1`) | `/v1` | +| `EXT_BYPASSER_TIMEOUT` | Timeout for page loading (in milliseconds) | `60000` | + +#### Important + +This feature is gated by the same setting as the built-in Cloudflare bypasser: `USE_CF_BYPASS` must be enabled for the external resolver to be used. + +#### To use the External Cloudflare resolver variant: + +1. Get the extbp-specific docker-compose file: + ```bash + curl -O https://raw.githubusercontent.com/calibrain/calibre-web-automated-book-downloader/refs/heads/main/docker-compose.extbp.yml + ``` +2. Set `EXT_BYPASSER_URL` in that file's `environment` section, then start the service: + ```bash + docker compose -f docker-compose.extbp.yml up -d + ``` + +#### Compatibility: +This feature is designed to work with any resolver that implements the `FlareSolverr` API schema, including `ByParr` and similar projects. + +#### Benefits: + +- Centralizes Cloudflare bypass logic for easier maintenance. +- Can leverage more powerful or distributed resolver infrastructure. +- Reduces load on the main application container. + ## 🏗️ Architecture The application consists of a single service: diff --git a/requirements.txt b/requirements-base.txt similarity index 62% rename from requirements.txt rename to requirements-base.txt index bff5619..ecdacd2 100644 --- a/requirements.txt +++ b/requirements-base.txt @@ -2,10 +2,7 @@ flask requests[socks] beautifulsoup4 tqdm -pyvirtualdisplay dnspython -pyautogui -seleniumbase>=4.41.1 gunicorn python-xlib psutil diff --git a/requirements-cwa-bd.txt b/requirements-cwa-bd.txt new file mode 100644 index 0000000..3b0075e --- /dev/null +++ b/requirements-cwa-bd.txt @@ -0,0 +1,3 @@ +pyvirtualdisplay +pyautogui +seleniumbase>=4.41.1