mirror of
https://github.com/calibrain/shelfmark.git
synced 2026-04-22 14:58:15 -04:00
Adding support for an external CloudFlare bypasser service and introducing a new Docker image build with a dedicated target. Key Changes - Added `cloudflare_bypasser_external.py` for external bypasser integration. - Updated Docker Compose files to support the new service. - Introduced a new Docker target for building a separate image for the external bypasser. - Refactored relevant modules to utilize the external bypasser when configured. - Documentation and configuration updates to reflect new options and Docker targets. Impact - Users can now choose between internal and external CloudFlare bypassing. - New Docker image and target streamline deployment of the external bypasser. - Improved modularity and maintainability. - No breaking changes for existing workflows. Testing - Manual and E2E tests performed for both bypasser modes. - Docker Compose setups and new image build verified for development and production. Notes Please review the new configuration options and Docker targets. Update your environment and deployment scripts as needed. Feedback and suggestions are welcome!
136 lines
5.0 KiB
Python
"""Network operations manager for the book downloader application."""
|
|
|
|
import network
|
|
network.init()
|
|
import requests
|
|
import time
|
|
from io import BytesIO
|
|
from typing import Optional
|
|
from urllib.parse import urlparse
|
|
from tqdm import tqdm
|
|
from typing import Callable
|
|
from threading import Event
|
|
from logger import setup_logger
|
|
from config import PROXIES
|
|
from env import MAX_RETRY, DEFAULT_SLEEP, USE_CF_BYPASS, USING_EXTERNAL_BYPASSER
|
|
if USE_CF_BYPASS:
|
|
if USING_EXTERNAL_BYPASSER:
|
|
from cloudflare_bypasser_external import get_bypassed_page
|
|
else:
|
|
from cloudflare_bypasser import get_bypassed_page
|
|
|
|
logger = setup_logger(__name__)
|
|
|
|
|
|
def html_get_page(url: str, retry: int = MAX_RETRY, use_bypasser: bool = False) -> str:
    """Fetch HTML content from a URL with a retry mechanism.

    Args:
        url: Target URL.
        retry: Number of retry attempts remaining.
        use_bypasser: Route the request through the Cloudflare bypasser
            (only honored when USE_CF_BYPASS is enabled).

    Returns:
        str: HTML content if successful, empty string otherwise.
    """
    response = None
    try:
        logger.debug(f"html_get_page: {url}, retry: {retry}, use_bypasser: {use_bypasser}")
        if use_bypasser and USE_CF_BYPASS:
            logger.info(f"GET Using Cloudflare Bypasser for: {url}")
            return get_bypassed_page(url)
        logger.info(f"GET: {url}")
        response = requests.get(url, proxies=PROXIES)
        response.raise_for_status()
        logger.debug(f"Success getting: {url}")
        # Light rate limiting between successful requests.
        time.sleep(1)
        return str(response.text)

    except Exception as e:
        if retry == 0:
            logger.error_trace(f"Failed to fetch page: {url}, error: {e}")
            return ""

        if use_bypasser and USE_CF_BYPASS:
            logger.warning(f"Exception while using cloudflare bypass for URL: {url}")
            logger.warning(f"Exception: {e}")
            logger.warning(f"Response: {response}")
        elif response is not None and response.status_code == 404:
            # 404 is permanent: retrying will not help, give up immediately.
            logger.warning(f"404 error for URL: {url}")
            return ""
        elif response is not None and response.status_code == 403:
            # 403 usually means a Cloudflare challenge: retry via the bypasser.
            logger.warning(f"403 detected for URL: {url}. Should retry using cloudflare bypass.")
            return html_get_page(url, retry - 1, True)

        # Linear backoff: wait longer the more attempts have been burned.
        sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1)
        logger.warning(
            f"Retrying GET {url} in {sleep_time} seconds due to error: {e}"
        )
        time.sleep(sleep_time)
        return html_get_page(url, retry - 1, use_bypasser)
|
|
|
|
def download_url(link: str, size: str = "", progress_callback: Optional[Callable[[float], None]] = None, cancel_flag: Optional[Event] = None) -> Optional[BytesIO]:
    """Download content from URL into a BytesIO buffer.

    Args:
        link: URL to download from.
        size: Human-readable expected size assumed to be in MB
            (e.g. "5,2 MB"); used to seed the progress bar, with
            the Content-Length header as fallback.
        progress_callback: Optional callable receiving progress in percent.
        cancel_flag: Optional event; when set, the download is aborted.

    Returns:
        BytesIO: Buffer containing downloaded content if successful,
        None on failure or cancellation.
    """
    try:
        logger.info(f"Downloading from: {link}")
        response = requests.get(link, stream=True, proxies=PROXIES)
        response.raise_for_status()

        total_size : float = 0.0
        try:
            # We assume size is in MB: "5,2 MB" -> 5.2 * 1024 * 1024.
            total_size = float(size.strip().replace(" ", "").replace(",", ".").upper()[:-2].strip()) * 1024 * 1024
        except Exception:
            # Fall back to the server-reported length (0.0 if absent).
            total_size = float(response.headers.get('content-length', 0))

        buffer = BytesIO()

        # Initialize the progress bar with the best available size estimate.
        pbar = tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading')
        try:
            for chunk in response.iter_content(chunk_size=1000):
                buffer.write(chunk)
                pbar.update(len(chunk))
                # Guard against division by zero when no size is known.
                if progress_callback is not None and total_size > 0:
                    progress_callback(pbar.n * 100.0 / total_size)
                if cancel_flag is not None and cancel_flag.is_set():
                    logger.info(f"Download cancelled: {link}")
                    return None
        finally:
            # Always release the progress bar, including on cancellation.
            pbar.close()

        # NOTE(review): the 0.1/0.9 scaling makes this condition true unless we
        # downloaded far more than expected; presumably the intent is "got less
        # than expected" -- confirm the intended threshold.
        if buffer.tell() * 0.1 < total_size * 0.9:
            # Check whether the buffer holds an HTML error page instead of binary.
            if response.headers.get('content-type', '').startswith('text/html'):
                logger.warning(f"Failed to download content for {link}. Found HTML content instead.")
                return None
        return buffer
    except requests.exceptions.RequestException as e:
        logger.error_trace(f"Failed to download from {link}: {e}")
        return None
|
|
|
|
def get_absolute_url(base_url: str, url: str) -> str:
    """Resolve a possibly-relative URL against a base URL.

    Args:
        base_url: Base URL supplying scheme and host when missing.
        url: Relative or absolute URL.

    Returns:
        str: Absolute URL, or "" for empty / fragment-only input.
    """
    # Empty or fragment-only links resolve to nothing.
    if not url.strip() or not url.strip("#"):
        return ""
    # Already absolute: pass through untouched.
    if url.startswith("http"):
        return url
    target = urlparse(url)
    base = urlparse(base_url)
    # Borrow scheme/host from the base when the link lacks either.
    if not target.netloc or not target.scheme:
        target = target._replace(netloc=base.netloc, scheme=base.scheme)
    return target.geturl()
|