From 3a92c5de786f0f98b548bb2d3f0dbfe4541f3ae3 Mon Sep 17 00:00:00 2001 From: CaliBrain Date: Sun, 16 Mar 2025 02:25:15 -0400 Subject: [PATCH] CF BYPASS (#24) The Calibre dependency was due to the script testing for validity of the downloaded file, as often they would be corrupted from aa. But CWA is already doing that, so we are just having redundant code here. For the cloudflarebypasser, I basically run my own version now, instead of depending on an external library, this way we have better control for debugging and on the docker image. Fixes #18, #33, #27, #48, #65, #78, #86, #88, #89 --------- Co-authored-by: mik593 <91991279+mik593@users.noreply.github.com> --- .dockerignore | 12 + .../build-and-publish-docker-image.yml | 5 +- .gitignore | 228 ++++++++++++++++++ .vscode/launch.json | 59 +++++ .vscode/tasks.json | 29 +++ Dockerfile | 77 +++--- app.py | 31 +-- backend.py | 33 +-- book_manager.py | 25 +- check_health.sh | 68 ------ cloudflare_bypasser.py | 176 ++++++++++++++ config.py | 38 ++- docker-compose.dev.yml | 14 +- docker-compose.yml | 8 +- entrypoint.sh | 29 ++- models.py | 15 +- network.py | 85 +++---- readme.md | 16 +- requirements.txt | 4 +- 19 files changed, 688 insertions(+), 264 deletions(-) create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 .vscode/launch.json create mode 100644 .vscode/tasks.json delete mode 100644 check_health.sh create mode 100644 cloudflare_bypasser.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..3422903 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +.git +.github +.vscode +.mypy_cache +README_images +.gitignore +Dockerfile +.dockerignore +.DS_Store +docker-compose.yml +docker-compose.*.yml +readme.md diff --git a/.github/workflows/build-and-publish-docker-image.yml b/.github/workflows/build-and-publish-docker-image.yml index c67e242..38322db 100644 --- a/.github/workflows/build-and-publish-docker-image.yml +++ 
b/.github/workflows/build-and-publish-docker-image.yml @@ -36,11 +36,14 @@ jobs: type=raw,value={{commit_date 'YYYYMMDD'}} type=sha type=ref,event=branch - type=ref,event=tag + type=ref,event=tag + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - name: Build and push Docker image id: push uses: docker/build-push-action@v5 with: + platforms: linux/amd64,linux/arm64 context: . push: true tags: ${{ steps.meta.outputs.tags }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c2ee6f7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,228 @@ +# Created by https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode,python +# Edit at https://www.toptal.com/developers/gitignore?templates=macos,visualstudiocode,python + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode,python \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..0428459 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,59 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Docker-compose Dev", + "type": "debugpy", // or "debugpy", Node, etc. 
+ "request": "launch", + "program": "${workspaceFolder}/app.py", + "preLaunchTask": "docker-compose up (dev)", // Spin up dev containers + "postDebugTask": "docker-compose down (dev)", // Optional: tear them down + "env": { + "INGEST_DIR": "/tmp/cwa-book-downloader" + }, + }, + { + "name": "Docker-compose Prod", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/app.py", + "preLaunchTask": "docker-compose up (prod)", + "postDebugTask": "docker-compose down (prod)", + "env": { + "INGEST_DIR": "/tmp/cwa-book-downloader" + }, + }, + { + "type": "debugpy", + "request": "launch", + "name": "Launch cwa-bd app.py", + "program": "${workspaceFolder}/app.py", + "env": { + "DOCKERMODE": "false", + "INGEST_DIR": "/tmp/cwa-book-downloader" + }, + "presentation": { + "hidden": true + } + }, + { + "type": "chrome", + "request": "launch", + "name": "Launch Browser", + "url": "http://localhost:8000", + "webRoot": "${workspaceFolder}", + "presentation": { + "hidden": true + } + } + ], + "compounds": [ + { + "name": "Launch CWA-BD", + "configurations": [ + "Launch cwa-bd app.py", + "Launch Browser" + ] + } + ] +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..1a1a646 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,29 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "docker-compose up (dev)", + "type": "shell", + "command": "docker-compose -f docker-compose.dev.yml up --build -d", + "problemMatcher": [] + }, + { + "label": "docker-compose down (dev)", + "type": "shell", + "command": "docker-compose -f docker-compose.dev.yml down", + "problemMatcher": [] + }, + { + "label": "docker-compose up (prod)", + "type": "shell", + "command": "docker-compose -f docker-compose.yml up -d", + "problemMatcher": [] + }, + { + "label": "docker-compose down (prod)", + "type": "shell", + "command": "docker-compose -f docker-compose.yml down", + "problemMatcher": [] + } + ] +} diff --git a/Dockerfile 
b/Dockerfile index ce4dfa7..e97e74a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,58 +1,49 @@ -# Use Python slim image for smaller size FROM python:3.12-slim -# Set environment variables -ENV DEBIAN_FRONTEND=noninteractive -ENV DOCKERMODE=true -ENV PYTHONUNBUFFERED=1 -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PIP_NO_CACHE_DIR=1 -ENV PIP_DISABLE_PIP_VERSION_CHECK=1 -ENV PIP_DEFAULT_TIMEOUT=100 -ENV NAME=Calibre-Web-Automated-Book-Downloader -ENV FLASK_HOST=0.0.0.0 -ENV FLASK_PORT=8084 -ENV FLASK_DEBUG=0 -ENV CLOUDFLARE_PROXY_URL=http://localhost:8000 -ENV INGEST_DIR=/cwa-book-ingest -ENV STATUS_TIMEOUT=3600 -ENV PYTHONPATH=/app -ENV USE_CF_BYPASS=true -ENV AA_BASE_URL=https://annas-archive.org +ENV DEBIAN_FRONTEND=noninteractive \ + DOCKERMODE=true \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_DEFAULT_TIMEOUT=100 \ + NAME=Calibre-Web-Automated-Book-Downloader \ + FLASK_HOST=0.0.0.0 \ + FLASK_PORT=8084 \ + FLASK_DEBUG=0 \ + STATUS_TIMEOUT=3600 \ + PYTHONPATH=/app \ + USE_CF_BYPASS=true \ + AA_BASE_URL=https://annas-archive.org \ + UID=1000 \ + GID=100 -# Default UID and GID (can be overridden at runtime) -ENV UID=1000 -ENV GID=100 +# Install minimal dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends --no-install-suggests \ + curl \ + xvfb \ + chromium-driver \ + dumb-init && \ + rm -rf /var/lib/apt/lists/* -# Set working directory WORKDIR /app -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends --no-install-suggests \ - calibre p7zip curl gosu \ - && rm -rf /var/lib/apt/lists/* - -# Copy requirements first for better caching +# Install Python dependencies including playwright COPY requirements.txt . - -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt +RUN pip install --no-cache-dir -r requirements.txt && \ + rm -rf /root/.cache /app/.cache COPY . . 
+RUN chmod +x /app/entrypoint.sh && \ + # Create necessary directories + mkdir -p /var/log/cwa-book-downloader && \ + mkdir -p /cwa-book-ingest -RUN chmod +x /app/check_health.sh && \ - chmod +x /app/entrypoint.sh - -# Expose port EXPOSE ${FLASK_PORT} -# Health check HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ CMD curl -f http://localhost:${FLASK_PORT}/request/api/status || exit 1 -# Entrypoint -ENTRYPOINT ["/app/entrypoint.sh"] - -# Start application -CMD ["python", "-m", "app"] - +ENTRYPOINT ["/usr/bin/dumb-init", "--"] +CMD ["/app/entrypoint.sh"] \ No newline at end of file diff --git a/app.py b/app.py index ba91120..af56054 100644 --- a/app.py +++ b/app.py @@ -4,8 +4,9 @@ import logging import io, re, os from flask import Flask, request, jsonify, render_template, send_file, send_from_directory from werkzeug.middleware.proxy_fix import ProxyFix +from werkzeug.wrappers import Response from flask import url_for as flask_url_for -from functools import partial +import typing from logger import setup_logger from config import FLASK_HOST, FLASK_PORT, FLASK_DEBUG, _SUPPORTED_BOOK_LANGUAGE, BOOK_LANGUAGE @@ -27,7 +28,7 @@ werkzeug_logger = logging.getLogger('werkzeug') werkzeug_logger.handlers = logger.handlers werkzeug_logger.setLevel(logger.level) -def register_dual_routes(app): +def register_dual_routes(app : Flask) -> None: """ Register each route both with and without the /request prefix. This function should be called after all routes are defined. 
@@ -58,7 +59,7 @@ def register_dual_routes(app): methods=rule.methods) app.jinja_env.globals['url_for'] = url_for_with_request -def url_for_with_request(endpoint, **values): +def url_for_with_request(endpoint : str, **values : typing.Any) -> str: """Generate URLs with /request prefix by default.""" if endpoint == 'static': # For static files, add /request prefix @@ -67,7 +68,7 @@ def url_for_with_request(endpoint, **values): return flask_url_for(endpoint, **values) @app.route('/') -def index(): +def index() -> str: """ Render main page with search and status table. """ @@ -76,12 +77,14 @@ def index(): @app.route('/favico') @app.route('/request/favico') @app.route('/request/static/favico') -def favicon(_): +def favicon(_ : typing.Any) -> Response: return send_from_directory(os.path.join(app.root_path, 'static', 'media'), 'favicon.ico', mimetype='image/vnd.microsoft.icon') +from typing import Union, Tuple + @app.route('/api/search', methods=['GET']) -def api_search(): +def api_search() -> Union[Response, Tuple[Response, int]]: """ Search for books matching the provided query. @@ -95,7 +98,7 @@ def api_search(): content (str): Content type of book Returns: - flask.Response: JSON array of matching books or empty array if no query. + flask.Response: JSON array of matching books or error response. """ query = request.args.get('query', '') @@ -119,7 +122,7 @@ def api_search(): return jsonify({"error": str(e)}), 500 @app.route('/api/info', methods=['GET']) -def api_info(): +def api_info() -> Union[Response, Tuple[Response, int]]: """ Get detailed book information. @@ -143,7 +146,7 @@ def api_info(): return jsonify({"error": str(e)}), 500 @app.route('/api/download', methods=['GET']) -def api_download(): +def api_download() -> Union[Response, Tuple[Response, int]]: """ Queue a book for download. 
@@ -167,7 +170,7 @@ def api_download(): return jsonify({"error": str(e)}), 500 @app.route('/api/status', methods=['GET']) -def api_status(): +def api_status() -> Union[Response, Tuple[Response, int]]: """ Get current download queue status. @@ -182,7 +185,7 @@ def api_status(): return jsonify({"error": str(e)}), 500 @app.route('/api/localdownload', methods=['GET']) -def api_local_download(): +def api_local_download() -> Union[Response, Tuple[Response, int]]: """ Download an EPUB file from local storage if available. @@ -218,7 +221,7 @@ def api_local_download(): return jsonify({"error": str(e)}), 500 @app.errorhandler(404) -def not_found_error(error): +def not_found_error(error: Exception) -> Union[Response, Tuple[Response, int]]: """ Handle 404 (Not Found) errors. @@ -228,11 +231,11 @@ def not_found_error(error): Returns: flask.Response: JSON error message with 404 status. """ - logger.warning(f"404 error: {request.url}") + logger.warning(f"404 error: {request.url} : {error}") return jsonify({"error": "Resource not found"}), 404 @app.errorhandler(500) -def internal_error(error): +def internal_error(error: Exception) -> Union[Response, Tuple[Response, int]]: """ Handle 500 (Internal Server) errors. diff --git a/backend.py b/backend.py index 7fe2222..351682f 100644 --- a/backend.py +++ b/backend.py @@ -1,7 +1,7 @@ """Backend logic for the book download application.""" import threading, time -import subprocess +import shutil from pathlib import Path from typing import Dict, List, Optional, Any, Tuple @@ -101,29 +101,6 @@ def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]: if value is not None } -def _process_book(book_path: str) -> bool: - """Check if downloaded book is valid. 
- - Args: - book_path: Path to downloaded book file - - Returns: - bool: True if book is valid - """ - try: - logger.info(f"Verifying book health: {book_path}") - script_path = Path(__file__).parent / "check_health.sh" - result = subprocess.run( - [str(script_path), book_path], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - logger.info(f"Health check result: {result.stdout.decode()}") - return result.returncode == 0 - except Exception as e: - logger.error(f"Error checking book health: {e}") - return False - def _download_book(book_id: str) -> bool: """Download and process a book. @@ -136,16 +113,20 @@ def _download_book(book_id: str) -> bool: try: book_info = book_queue._book_data[book_id] book_path = TMP_DIR / f"{book_id}.{book_info.format}" + success = book_manager.download_book(book_info, book_path) if not success: raise Exception("Unkown error downloading book") - return _process_book(str(book_path)) + + final_path = INGEST_DIR / f"{book_id}.{book_info.format}" + shutil.move(book_path, final_path) + return True except Exception as e: logger.error(f"Error downloading book: {e}") return False -def download_loop(): +def download_loop() -> None: """Background thread for processing download queue.""" logger.info("Starting download loop") diff --git a/book_manager.py b/book_manager.py index e1d595a..d8feb82 100644 --- a/book_manager.py +++ b/book_manager.py @@ -3,9 +3,8 @@ import time, json from pathlib import Path from urllib.parse import quote -from typing import List, Optional, Dict -from bs4 import BeautifulSoup, Tag, NavigableString -from io import BytesIO +from typing import List, Optional, Dict, Union +from bs4 import BeautifulSoup, Tag, NavigableString, ResultSet from logger import setup_logger from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY, AA_BASE_URL, USE_CF_BYPASS @@ -95,15 +94,15 @@ def search_books(query: str, filters: SearchFilters) -> List[BookInfo]: return books -def _parse_search_result_row(row) -> Optional[BookInfo]: 
+def _parse_search_result_row(row: Tag) -> Optional[BookInfo]: """Parse a single search result row into a BookInfo object.""" try: cells = row.find_all('td') preview_img = cells[0].find('img') preview = preview_img['src'] if preview_img else None - + return BookInfo( - id=row.find('a')['href'].split('/')[-1], + id=row.find_all('a')[0]['href'].split('/')[-1], preview=preview, title=cells[1].find('span').next, author=cells[2].find('span').next, @@ -232,7 +231,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: return book_info -def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]: +def _extract_book_metadata(metadata_divs: Union[ResultSet[Tag], List[Tag]]) -> Dict[str, List[str]]: """Extract metadata from book info divs.""" info : Dict[str, List[str]] = {} @@ -247,7 +246,7 @@ def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]: # Process the second set of metadata (spans) # Find elements where aria-label="code tabs" - meta_spans = [] + meta_spans: List[Tag] = [] for div in metadata_divs: if div.find_all('div', {'aria-label': 'code tabs'}): meta_spans = div.find_all('span') @@ -294,7 +293,7 @@ def download_book(book_info: BookInfo, book_path: Path) -> bool: try: download_url = _get_download_url(link, book_info.title) if download_url != "": - logger.info(f"Downloading {book_info.title} from {download_url}") + logger.info(f"Downloading `{book_info.title}` from `{download_url}`") data = network.download_url(download_url, book_info.size or "") if not data: raise Exception("No data received") @@ -302,7 +301,7 @@ def download_book(book_info: BookInfo, book_path: Path) -> bool: logger.info(f"Download finished. 
Writing to {book_path}") with open(book_path, "wb") as f: f.write(data.getbuffer()) - logger.info(f"Writing {book_info.title} successfully") + logger.info(f"Writing `{book_info.title}` successfully") return True except Exception as e: @@ -315,14 +314,12 @@ def _get_download_url(link: str, title: str) -> str: """Extract actual download URL from various source pages.""" url = "" - + if link.startswith(f"{AA_BASE_URL}/dyn/api/fast_download.json"): page = network.html_get_page(link) url = json.loads(page).get("download_url") else: - html = network.html_get_page(link, retry=0, skip_403=True) - if html == "": - html = network.html_get_page_cf(link) + html = network.html_get_page(link, retry=0) if html == "": return "" diff --git a/check_health.sh b/check_health.sh deleted file mode 100644 index 774e5a2..0000000 --- a/check_health.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -OUTPUTFOLDER=${TMP_DIR:-/tmp/cwa-book-downloader} -mkdir -p $TMP_DIR - -OUTPUTFOLDER=${INGEST_DIR:-/cwa-book-ingest} -mkdir -p $OUTPUTFOLDER - -# Get a list of files to process - -# Check if a file was supplied through command line arguments -if [ "$#" -gt 0 ]; then - files=("$@") -else - files=($TMP_DIR*) -fi - -# Total number of files -total_files=${#files[@]} - - -good=0 -bad=0 -manual=0 - - -# Process files in the 'downloads' directory -for file in "${files[@]}"; do - - # Skip if it's not a regular file - [ -f "$file" ] || continue - - # Extract filename and extension - filenamewithext="${file##*/}" - filename="${filenamewithext%.*}" - fileextension="${filenamewithext##*.}" - - case "$fileextension" in - epub|mobi|azw3|fb2|djvu|cbz|cbr) - # Attempt to convert the file to EPUB - ebook-convert "$file" "$OUTPUTFOLDER/$filename.epub" >/dev/null 2>&1 - # if file exists in $OUTPUTFOLDER/$filename.epub then it is a good file - if [ -f "$OUTPUTFOLDER/$filename.epub" ]; then - good=$((good + 1)) - else - bad=$((bad + 1)) - fi - rm "$file" - ;; - *) - # Move other files to the 'other' directory - rm 
"$file" - bad=$((manual + 1)) - ;; - esac -done - -# Move to a new line after the progress bar completes -echo - -echo "Out of $total_files, $good are good, $bad are corrupt and $manual need manual inspection" - -if [ "$bad" -gt 0 ]; then - exit 2 -fi -if [ "$manual" -gt 0 ]; then - exit 1 -fi -exit 0 diff --git a/cloudflare_bypasser.py b/cloudflare_bypasser.py new file mode 100644 index 0000000..f3e0c74 --- /dev/null +++ b/cloudflare_bypasser.py @@ -0,0 +1,176 @@ +import time, os +from DrissionPage import ChromiumPage # type: ignore +from DrissionPage import ChromiumOptions +from DrissionPage._functions.elements import ChromiumElementsList # type: ignore +from DrissionPage._pages.chromium_tab import ChromiumTab # type: ignore +from logger import setup_logger +from config import MAX_RETRY, DOCKERMODE, DEFAULT_SLEEP, PROXIES + +logger = setup_logger(__name__) + +def _search_recursively_shadow_root_with_iframe(ele : ChromiumElementsList) -> ChromiumElementsList | None: + if ele.shadow_root: + if ele.shadow_root.child().tag == "iframe": + return ele.shadow_root.child() + else: + for child in ele.children(): + result = _search_recursively_shadow_root_with_iframe(child) + if result: + return result + return None + +def _search_recursively_shadow_root_with_cf_input(ele : ChromiumElementsList) -> ChromiumElementsList | None: + if ele.shadow_root: + if ele.shadow_root.ele("tag:input"): + return ele.shadow_root.ele("tag:input") + else: + for child in ele.children(): + result = _search_recursively_shadow_root_with_cf_input(child) + if result: + return result + return None + +def _locate_cf_button(driver : ChromiumTab) -> ChromiumElementsList | None: + button : ChromiumElementsList = None + eles = driver.eles("tag:input") + for ele in eles: + if "name" in ele.attrs.keys() and "type" in ele.attrs.keys(): + if "turnstile" in ele.attrs["name"] and ele.attrs["type"] == "hidden": + button = ele.parent().shadow_root.child()("tag:body").shadow_root("tag:input") + break + + if 
button: + return button + else: + # If the button is not found, search it recursively + logger.debug("Basic search failed. Searching for button recursively.") + ele = driver.ele("tag:body") + iframe = _search_recursively_shadow_root_with_iframe(ele) + if iframe: + button = _search_recursively_shadow_root_with_cf_input(iframe("tag:body")) + else: + logger.debug("Iframe not found. Button search failed.") + return button + +def _click_verification_button(driver: ChromiumTab) -> None: + try: + button = _locate_cf_button(driver) + if button: + logger.debug("Verification button found. Attempting to click.") + button.click() + else: + logger.debug("Verification button not found.") + + except Exception as e: + logger.debug(f"Error clicking verification button: {e}") + +def _is_bypassed(driver: ChromiumTab) -> bool: + try: + title = driver.title.lower() + body = driver.ele("tag:body").text.lower() + # TODO check body + return "just a moment" not in title + except Exception as e: + logger.debug(f"Error checking page title: {e}") + return False + +def _bypass(driver: ChromiumTab, max_retries: int = MAX_RETRY) -> None: + try_count = 0 + + while not _is_bypassed(driver): + logger.info(f"Starting Cloudflare bypass... Retry: {try_count + 1} / {max_retries}") + if try_count >= max_retries: + logger.warning("Exceeded maximum retries. Bypass failed.") + break + + logger.info(f"Attempt {try_count + 1}: Verification page detected. 
Trying to bypass...") + _click_verification_button(driver) + + try_count += 1 + time.sleep(DEFAULT_SLEEP) + + if _is_bypassed(driver): + logger.info("Bypass successful.") + else: + logger.info("Bypass failed.") + +def _get_chromium_options(arguments: list[str]) -> ChromiumOptions: + options = ChromiumOptions() + for argument in arguments: + options.set_argument(argument) + + # Add proxy settings if configured + if PROXIES: + if 'http' in PROXIES: + options.set_argument(f'--proxy-server={PROXIES["http"]}') + logger.debug(f"Setting HTTP proxy: {PROXIES['http']}") + elif 'https' in PROXIES: + options.set_argument(f'--proxy-server={PROXIES["https"]}') + logger.debug(f"Setting HTTPS proxy: {PROXIES['https']}") + + return options + +def _genScraper() -> ChromiumPage: + arguments = [ + "-no-first-run", + "-force-color-profile=srgb", + "-metrics-recording-only", + "-password-store=basic", + "-use-mock-keychain", + "-export-tagged-pdf", + "-no-default-browser-check", + "-disable-background-mode", + "-enable-features=NetworkService,NetworkServiceInProcess,LoadCryptoTokenExtension,PermuteTLSExtensions", + "-disable-features=FlashDeprecationWarning,EnablePasswordsAccountStorage", + "-deny-permission-prompts", + "-disable-gpu", + "-no-sandbox", + "-accept-lang=en-US", + "-remote-debugging-port=9222" + ] + + options = _get_chromium_options(arguments) + # Initialize the browser + driver = ChromiumPage(addr_or_opts=options) + return driver + +_defaultTab = None + +def _reset_browser() -> None: + if not DOCKERMODE: + return + global _defaultTab + # Kill the browser + if _defaultTab: + _defaultTab.close() + _defaultTab = None + # Force kill the browser + os.system("pkill -f -i *chrom*") + os.system("pkill -f -i xvfb") + time.sleep(1) + +def _init_browser(retry : int = MAX_RETRY) -> ChromiumTab: + global _defaultTab + if _defaultTab: + return _defaultTab + else: + try: + driver = _genScraper() + _defaultTab = driver.get_tabs()[0] + except Exception as e: + if retry > 0: + 
_reset_browser() + else: + raise e + return _init_browser(retry - 1) + +def get(url : str, retry : int = MAX_RETRY) -> ChromiumTab: + defaultTab = _init_browser() + defaultTab.get(url) + try: + _bypass(defaultTab) + except Exception as e: + if retry > 0: + return get(url, retry - 1) + raise e + return defaultTab diff --git a/config.py b/config.py index c0975d4..3ceb124 100644 --- a/config.py +++ b/config.py @@ -15,7 +15,7 @@ LOG_DIR.mkdir(exist_ok=True) TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader")) -INGEST_DIR = Path(os.getenv("INGEST_DIR", "/tmp/cwa-book-ingest")) +INGEST_DIR = Path(os.getenv("INGEST_DIR", "/cwa-book-ingest")) STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", 3600)) # Create necessary directories @@ -26,12 +26,37 @@ INGEST_DIR.mkdir(exist_ok=True) # Network settings MAX_RETRY = int(os.getenv("MAX_RETRY", 3)) DEFAULT_SLEEP = int(os.getenv("DEFAULT_SLEEP", 5)) -CLOUDFLARE_PROXY = os.getenv("CLOUDFLARE_PROXY_URL", "http://localhost:8000") USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "true").lower() in ["true", "yes", "1", "y"] +# Proxy settings +PROXIES = {} +http_proxy = os.getenv("HTTP_PROXY", "").strip() +https_proxy = os.getenv("HTTPS_PROXY", "").strip() +if http_proxy: + PROXIES["http"] = http_proxy +if https_proxy: + PROXIES["https"] = https_proxy +if not PROXIES: + PROXIES = None + # Anna's Archive settings +aa_available_urls = ["https://annas-archive.org", "https://annas-archive.se", "https://annas-archive.li"] AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", "").strip() -AA_BASE_URL = os.getenv("AA_BASE_URL", "https://annas-archive.org").strip("/") +AA_BASE_URL = os.getenv("AA_BASE_URL", "auto").strip("/") +if AA_BASE_URL == "auto": + for url in aa_available_urls: + try: + logger.debug(f"Checking {url}") + response = requests.get(url) + if response.status_code == 200: + logger.debug(f"Found good url: {url}") + AA_BASE_URL = url + break + except Exception as e: + logger.debug(f"Error checking {url}: {e}") +if AA_BASE_URL == 
"auto": + logger.error("No good url found for Anna's Archive, falling back to default") + AA_BASE_URL = aa_available_urls[0] # File format settings SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr").split(",") @@ -50,3 +75,10 @@ FLASK_DEBUG = os.getenv("FLASK_DEBUG", "False").lower() == "true" ENABLE_LOGGING = os.getenv("ENABLE_LOGGING", "true").lower() in ["true", "yes", "1", "y"] LOG_FILE = LOG_DIR / "cwa-bookd-downloader.log" MAIN_LOOP_SLEEP_TIME = int(os.getenv("MAIN_LOOP_SLEEP_TIME", 5)) + +# Docker settings +DOCKERMODE = os.getenv('DOCKERMODE', 'false').lower().strip() in ['true', '1', 'yes', 'y'] +if DOCKERMODE: + from pyvirtualdisplay import Display # type: ignore + display = Display(visible=False, size=(800, 600)) + display.start() \ No newline at end of file diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 53ef7ef..19b731b 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -7,23 +7,11 @@ services: environment: FLASK_PORT: 8084 FLASK_DEBUG: false - CLOUDFLARE_PROXY_URL: http://cloudflarebypassforscraping:8000 - INGEST_DIR: /cwa-book-ingest BOOK_LANGUAGE: en ports: - 8084:8084 - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8084/request/api/status"] - interval: 30s - timeout: 30s - retries: 3 - start_period: 5s restart: unless-stopped volumes: # This is where the books will be downloaded to, usually it would be # the same as whatever you gave in "calibre-web-automated" - - /tmp/data/calibre-web/ingest:${INGEST_DIR:-/cwa-book-ingest} - - cloudflarebypassforscraping: - image: ghcr.io/sarperavci/cloudflarebypassforscraping:latest - restart: unless-stopped + - /tmp/data/calibre-web/ingest:/cwa-book-ingest diff --git a/docker-compose.yml b/docker-compose.yml index de22948..55f1493 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,8 +5,6 @@ services: environment: FLASK_PORT: 8084 FLASK_DEBUG: false - CLOUDFLARE_PROXY_URL: http://cloudflarebypassforscraping:8000 - 
INGEST_DIR: /cwa-book-ingest BOOK_LANGUAGE: en ports: - 8084:8084 @@ -21,8 +19,4 @@ services: volumes: # This is where the books will be downloaded to, usually it would be # the same as whatever you gave in "calibre-web-automated" - - /tmp/data/calibre-web/ingest:${INGEST_DIR:-/cwa-book-ingest} - - cloudflarebypassforscraping: - image: ghcr.io/sarperavci/cloudflarebypassforscraping:latest - restart: unless-stopped + - /tmp/data/calibre-web/ingest:/cwa-book-ingest diff --git a/entrypoint.sh b/entrypoint.sh index f1ff2b7..974e5be 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,21 +1,30 @@ #!/bin/bash -# set -e +set -e -mkdir -p /var/log/cwa-book-downloader -mkdir -p "$INGEST_DIR" +# Set UID if not set +if [ -z "$UID" ]; then + UID=1000 +fi + +# Set GID if not set +if [ -z "$GID" ]; then + GID=100 +fi -# Create group if it doesn't exist if ! getent group "$GID" >/dev/null; then - groupadd -g "$GID" abc + groupadd -g "$GID" appuser fi # Create user if it doesn't exist if ! id -u "$UID" >/dev/null 2>&1; then - useradd -u "$UID" -g "$GID" -d /app -s /sbin/nologin abc + useradd -u "$UID" -g "$GID" -d /app -s /sbin/nologin appuser fi -# Adjust ownership of application directories -chown -R $UID:$GID /app "$INGEST_DIR" /var/log/cwa-book-downloader +# Get username for the UID (whether we just created it or it existed) +USERNAME=$(getent passwd "$UID" | cut -d: -f1) -# Switch to the created user and execute the main command -exec gosu $UID "$@" +# Ensure proper ownership of application directories +chown -R "${UID}:${GID}" /app /var/log/cwa-book-downloader /cwa-book-ingest + +# Switch to the user (either newly created or existing) and execute the main command +exec su -s /bin/bash "$USERNAME" -c "python -m app" \ No newline at end of file diff --git a/models.py b/models.py index c7f73f4..811a0cf 100644 --- a/models.py +++ b/models.py @@ -3,8 +3,10 @@ from dataclasses import dataclass, field from typing import Dict, List, Optional from enum import Enum -from config import 
INGEST_DIR, STATUS_TIMEOUT from datetime import datetime, timedelta +from threading import Lock + +from config import INGEST_DIR, STATUS_TIMEOUT class QueueStatus(str, Enum): """Enum for possible book queue statuses.""" @@ -31,13 +33,12 @@ class BookInfo: class BookQueue: """Thread-safe book queue manager.""" - def __init__(self): - from threading import Lock - self._queue = set() + def __init__(self) -> None: + self._queue: set[str] = set() self._lock = Lock() - self._status = {} - self._book_data = {} - self._status_timestamps = {} # Track when each status was last updated + self._status: dict[str, QueueStatus] = {} + self._book_data: dict[str, BookInfo]= {} + self._status_timestamps: dict[str, datetime] = {} # Track when each status was last updated self._status_timeout = timedelta(seconds=STATUS_TIMEOUT) # 1 hour timeout def add(self, book_id: str, book_data: BookInfo) -> None: diff --git a/network.py b/network.py index ebfecc8..bc0d0ce 100644 --- a/network.py +++ b/network.py @@ -8,24 +8,22 @@ from typing import Optional from urllib.parse import urlparse from tqdm import tqdm +import cloudflare_bypasser from logger import setup_logger -from config import MAX_RETRY, DEFAULT_SLEEP, CLOUDFLARE_PROXY, USE_CF_BYPASS +from config import MAX_RETRY, DEFAULT_SLEEP, USE_CF_BYPASS, PROXIES logger = setup_logger(__name__) +"""Configure urllib opener with appropriate headers.""" +opener = urllib.request.build_opener() +opener.addheaders = [ + ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/129.0.0.0 Safari/537.3') +] +urllib.request.install_opener(opener) -def setup_urllib_opener(): - """Configure urllib opener with appropriate headers.""" - opener = urllib.request.build_opener() - opener.addheaders = [ - ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' - 'AppleWebKit/537.36 (KHTML, like Gecko) ' - 'Chrome/129.0.0.0 Safari/537.3') - ] - urllib.request.install_opener(opener) -setup_urllib_opener() 
- -def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> str: +def html_get_page(url: str, retry: int = MAX_RETRY, use_bypasser: bool = False) -> str: """Fetch HTML content from a URL with retry mechanism. Args: @@ -37,25 +35,37 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip str: HTML content if successful, None otherwise """ try: + + if use_bypasser and USE_CF_BYPASS: + logger.info(f"GET Using Cloudflare Bypasser for: {url}") + response = cloudflare_bypasser.get(url) + logger.debug(f"Cloudflare Bypasser response: {response}") + if response: + return str(response.html) + else: + raise requests.exceptions.RequestException("Failed to bypass Cloudflare") + logger.info(f"GET: {url}") - response = requests.get(url) - + response = requests.get(url, proxies=PROXIES) response.raise_for_status() + logger.debug(f"Success getting: {url}") time.sleep(1) - return response.text + return str(response.text) except requests.exceptions.RequestException as e: if retry == 0: logger.error(f"Failed to fetch page: {url}, error: {e}") return "" - if skip_404 and response.status_code == 404: + if response.status_code == 404: logger.warning(f"404 error for URL: {url}") return "" - if skip_403 and response.status_code == 403: + if response.status_code == 403: logger.warning(f"403 error for URL: {url}. Should retry using cloudflare bypass.") - return "" + if use_bypasser: + return "" + use_bypasser = True sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1) @@ -63,40 +73,7 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip f"Retrying GET {url} in {sleep_time} seconds due to error: {e}" ) time.sleep(sleep_time) - return html_get_page(url, retry - 1) - -def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> str: - """Fetch HTML content through Cloudflare proxy. 
- - Args: - url: Target URL - retry: Number of retry attempts - - Returns: - str: HTML content if successful, None otherwise - """ - if USE_CF_BYPASS == False: - logger.warning("Cloudflare bypass is disabled, trying without it.") - return html_get_page(url, retry, skip_403=True) - try: - logger.info(f"GET_CF: {url}") - response = requests.get( - f"{CLOUDFLARE_PROXY}/html?url={url}&retries=3" - ) - time.sleep(1) - return response.text - - except Exception as e: - if retry == 0: - logger.error(f"Failed to fetch page through CF: {url}, error: {e}") - return "" - - sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1) - logger.warning( - f"Retrying GET_CF {url} in {sleep_time} seconds due to error: {e}" - ) - time.sleep(sleep_time) - return html_get_page_cf(url, retry - 1) + return html_get_page(url, retry - 1, use_bypasser) def download_url(link: str, size: str = "") -> Optional[BytesIO]: """Download content from URL into a BytesIO buffer. @@ -109,7 +86,7 @@ def download_url(link: str, size: str = "") -> Optional[BytesIO]: """ try: logger.info(f"Downloading from: {link}") - response = requests.get(link, stream=True) + response = requests.get(link, stream=True, proxies=PROXIES) response.raise_for_status() total_size : float = 0.0 diff --git a/readme.md b/readme.md index a25dd3a..361821a 100644 --- a/readme.md +++ b/readme.md @@ -61,7 +61,7 @@ An intuitive web interface for searching and requesting book downloads, designed | `GID` | Runtime group ID | `100` | | `ENABLE_LOGGING` | Enable log file | `true` | -If logging is enabld, log folder default location is `var/log/cwa-book-downloader` +If logging is enabled, log folder default location is `/var/log/cwa-book-downloader` #### Download Settings @@ -90,10 +90,20 @@ If disabling the cloudflare bypass, you will be using alternative download hosts | Variable | Description | Default Value | | ---------------------- | ----------------------------- | ----------------------- | -| `CLOUDFLARE_PROXY_URL` | Cloudflare bypass 
service URL | `http://localhost:8000` | | `PORT` | Container external port | `8084` | +| `HTTP_PROXY` | HTTP proxy URL | `` | +| `HTTPS_PROXY` | HTTPS proxy URL | `` | -`CLOUDFLARE_PROXY_URL` is ignored if `USE_CF_BYPASS` is set to `false` +For proxy configuration, you can specify URLs in the following format: +```bash +# Basic proxy +HTTP_PROXY=http://proxy.example.com:8080 +HTTPS_PROXY=http://proxy.example.com:8080 + +# Proxy with authentication +HTTP_PROXY=http://username:password@proxy.example.com:8080 +HTTPS_PROXY=http://username:password@proxy.example.com:8080 +``` ### Volume Configuration diff --git a/requirements.txt b/requirements.txt index aad7e15..42affc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,8 @@ flask requests beautifulsoup4 tqdm +DrissionPage +pyvirtualdisplay types-requests types-beautifulsoup4 -types-tqdm \ No newline at end of file +types-tqdm