shelfmark/downloader.py
Federico Della Rovere 207cff96d3 External CloudFlare resolver (#245)
Adds support for an external Cloudflare bypasser service and
introduces a new Docker image build with a dedicated target.

Key Changes
- Added `cloudflare_bypasser_external.py` for external bypasser
integration (see the sketch after this list).
- Updated Docker Compose files to support the new service.
- Introduced a new Docker target for building a separate image for the
external bypasser.
- Refactored relevant modules to use the external bypasser when
configured.
- Updated documentation and configuration to reflect the new options and
Docker targets.
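
For reference, a minimal sketch of what `get_bypassed_page` in
`cloudflare_bypasser_external.py` might look like against a
FlareSolverr-style HTTP API. The endpoint, environment variable, and
timeouts here are illustrative assumptions, not the actual implementation:

    import os
    import requests

    # Assumed: the external bypasser exposes a FlareSolverr-style JSON API.
    EXT_BYPASSER_URL = os.environ.get("EXT_BYPASSER_URL", "http://bypasser:8191/v1")

    def get_bypassed_page(url: str) -> str:
        """Fetch a Cloudflare-protected page through the external service."""
        payload = {"cmd": "request.get", "url": url, "maxTimeout": 60000}
        response = requests.post(EXT_BYPASSER_URL, json=payload, timeout=90)
        response.raise_for_status()
        data = response.json()
        # FlareSolverr-style responses nest the rendered HTML under solution.response.
        return data.get("solution", {}).get("response", "")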

Impact
- Users can now choose between the internal and external Cloudflare
bypassers (see the configuration sketch after this list).
- The new Docker image and target streamline deployment of the external
bypasser.
- Improved modularity and maintainability.
- No breaking changes for existing workflows.
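
The mode is selected entirely through environment flags. A minimal
sketch of how `env.py` might expose them (the exact parsing and defaults
in the real module may differ; this is an assumption for illustration):

    import os

    def _env_bool(name: str, default: bool = False) -> bool:
        # Treat common truthy strings as True; anything else falls back to default.
        value = os.environ.get(name)
        if value is None:
            return default
        return value.strip().lower() in ("1", "true", "yes", "on")

    USE_CF_BYPASS = _env_bool("USE_CF_BYPASS", True)
    USING_EXTERNAL_BYPASSER = _env_bool("USING_EXTERNAL_BYPASSER", False)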

Testing
- Manual and E2E tests performed for both bypasser modes.
- Docker Compose setups and new image build verified for development and
production.

Notes
Please review the new configuration options and Docker targets. Update
your environment and deployment scripts as needed. Feedback and
suggestions are welcome!
2025-08-28 17:37:59 -04:00


"""Network operations manager for the book downloader application."""
import network
network.init()
import requests
import time
from io import BytesIO
from typing import Optional
from urllib.parse import urlparse
from tqdm import tqdm
from typing import Callable
from threading import Event
from logger import setup_logger
from config import PROXIES
from env import MAX_RETRY, DEFAULT_SLEEP, USE_CF_BYPASS, USING_EXTERNAL_BYPASSER
if USE_CF_BYPASS:
if USING_EXTERNAL_BYPASSER:
from cloudflare_bypasser_external import get_bypassed_page
else:
from cloudflare_bypasser import get_bypassed_page
logger = setup_logger(__name__)

def html_get_page(url: str, retry: int = MAX_RETRY, use_bypasser: bool = False) -> str:
    """Fetch HTML content from a URL with a retry mechanism.

    Args:
        url: Target URL
        retry: Number of retry attempts remaining
        use_bypasser: Whether to fetch the page through the Cloudflare bypasser

    Returns:
        str: HTML content if successful, empty string otherwise
    """
    response = None
    try:
        logger.debug(f"html_get_page: {url}, retry: {retry}, use_bypasser: {use_bypasser}")
        if use_bypasser and USE_CF_BYPASS:
            logger.info(f"GET Using Cloudflare Bypasser for: {url}")
            return get_bypassed_page(url)
        else:
            logger.info(f"GET: {url}")
            response = requests.get(url, proxies=PROXIES)
            response.raise_for_status()
            logger.debug(f"Success getting: {url}")
            time.sleep(1)  # Brief pause between successful requests to avoid hammering the server.
            return str(response.text)
    except Exception as e:
        if retry == 0:
            logger.error_trace(f"Failed to fetch page: {url}, error: {e}")
            return ""

        if use_bypasser and USE_CF_BYPASS:
            logger.warning(f"Exception while using Cloudflare bypasser for URL: {url}")
            logger.warning(f"Exception: {e}")
            logger.warning(f"Response: {response}")
        elif response is not None and response.status_code == 404:
            # 404 is not transient; retrying will not help.
            logger.warning(f"404 error for URL: {url}")
            return ""
        elif response is not None and response.status_code == 403:
            # 403 usually indicates a Cloudflare challenge; retry through the bypasser.
            logger.warning(f"403 detected for URL: {url}. Retrying using Cloudflare bypass.")
            return html_get_page(url, retry - 1, True)

        # Linear backoff: each retry waits DEFAULT_SLEEP seconds longer than the previous one.
        sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1)
        logger.warning(f"Retrying GET {url} in {sleep_time} seconds due to error: {e}")
        time.sleep(sleep_time)
        return html_get_page(url, retry - 1, use_bypasser)

def download_url(link: str, size: str = "", progress_callback: Optional[Callable[[float], None]] = None, cancel_flag: Optional[Event] = None) -> Optional[BytesIO]:
    """Download content from URL into a BytesIO buffer.

    Args:
        link: URL to download from
        size: Expected size as a human-readable string (e.g. "3.5 MB"), used
            as a fallback estimate for progress reporting
        progress_callback: Called with the completion percentage after each chunk
        cancel_flag: When set, aborts the download

    Returns:
        BytesIO: Buffer containing downloaded content if successful, None otherwise
    """
    try:
        logger.info(f"Downloading from: {link}")
        response = requests.get(link, stream=True, proxies=PROXIES)
        response.raise_for_status()

        total_size: float = 0.0
        try:
            # We assume `size` looks like "3.5 MB": normalize separators, drop the
            # two-character unit suffix, and convert megabytes to bytes.
            total_size = float(size.strip().replace(" ", "").replace(",", ".").upper()[:-2].strip()) * 1024 * 1024
        except (ValueError, IndexError):
            # Fall back to the Content-Length header (0 if absent).
            total_size = float(response.headers.get('content-length', 0))

        buffer = BytesIO()
        # Initialize the progress bar with the estimated total size.
        pbar = tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading')
        for chunk in response.iter_content(chunk_size=1000):
            buffer.write(chunk)
            pbar.update(len(chunk))
            if progress_callback is not None and total_size > 0:
                progress_callback(pbar.n * 100.0 / total_size)
            if cancel_flag is not None and cancel_flag.is_set():
                logger.info(f"Download cancelled: {link}")
                pbar.close()
                return None
        pbar.close()

        if buffer.tell() < total_size * 0.9:
            # Received noticeably less than expected; if the server returned an
            # HTML page (e.g. an error or challenge page) instead of the file,
            # treat the download as failed.
            if response.headers.get('content-type', '').startswith('text/html'):
                logger.warning(f"Failed to download content for {link}. Found HTML content instead.")
                return None
        return buffer
    except requests.exceptions.RequestException as e:
        logger.error_trace(f"Failed to download from {link}: {e}")
        return None

def get_absolute_url(base_url: str, url: str) -> str:
    """Get an absolute URL from a relative URL and a base URL.

    Args:
        base_url: Base URL
        url: Relative URL

    Returns:
        str: Absolute URL, or an empty string for empty or anchor-only inputs
    """
    if url.strip() == "":
        return ""
    if url.strip("#") == "":
        return ""
    if url.startswith("http"):
        return url
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    if parsed_url.netloc == "" or parsed_url.scheme == "":
        # Inherit the scheme and host from the base URL.
        parsed_url = parsed_url._replace(netloc=parsed_base.netloc, scheme=parsed_base.scheme)
    return parsed_url.geturl()
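
A minimal usage sketch of this module, assuming it is importable as
`downloader` (the URL, size string, and output path below are
placeholders, not values from the application):

    from threading import Event
    from downloader import html_get_page, get_absolute_url, download_url

    def print_progress(percent: float) -> None:
        print(f"\rprogress: {percent:.1f}%", end="")

    page_url = "https://example.com/book/123"
    html = html_get_page(page_url)
    if html:
        file_url = get_absolute_url(page_url, "/files/book.epub")
        buffer = download_url(file_url, size="3.5 MB",
                              progress_callback=print_progress,
                              cancel_flag=Event())
        if buffer is not None:
            with open("book.epub", "wb") as f:
                f.write(buffer.getvalue())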