mirror of
https://github.com/calibrain/shelfmark.git
synced 2026-04-20 13:59:46 -04:00
152 lines
4.8 KiB
Python
152 lines
4.8 KiB
Python
"""Network operations manager for the book downloader application."""
|
|
|
|
import requests
|
|
import time
|
|
from io import BytesIO
|
|
import urllib.request
|
|
from typing import Optional
|
|
from urllib.parse import urlparse
|
|
from tqdm import tqdm
|
|
|
|
from logger import setup_logger
|
|
from config import MAX_RETRY, DEFAULT_SLEEP, CLOUDFLARE_PROXY, USE_CF_BYPASS
|
|
|
|
logger = setup_logger(__name__)
|
|
|
|
def setup_urllib_opener():
|
|
"""Configure urllib opener with appropriate headers."""
|
|
opener = urllib.request.build_opener()
|
|
opener.addheaders = [
|
|
('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
|
'AppleWebKit/537.36 (KHTML, like Gecko) '
|
|
'Chrome/129.0.0.0 Safari/537.3')
|
|
]
|
|
urllib.request.install_opener(opener)
|
|
|
|
setup_urllib_opener()
|
|
|
|
def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> str:
|
|
"""Fetch HTML content from a URL with retry mechanism.
|
|
|
|
Args:
|
|
url: Target URL
|
|
retry: Number of retry attempts
|
|
skip_404: Whether to skip 404 errors
|
|
|
|
Returns:
|
|
str: HTML content if successful, None otherwise
|
|
"""
|
|
try:
|
|
logger.info(f"GET: {url}")
|
|
response = requests.get(url)
|
|
|
|
response.raise_for_status()
|
|
time.sleep(1)
|
|
return response.text
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
if retry == 0:
|
|
logger.error(f"Failed to fetch page: {url}, error: {e}")
|
|
return ""
|
|
|
|
if skip_404 and response.status_code == 404:
|
|
logger.warning(f"404 error for URL: {url}")
|
|
return ""
|
|
|
|
if skip_403 and response.status_code == 403:
|
|
logger.warning(f"403 error for URL: {url}. Should retry using cloudflare bypass.")
|
|
return ""
|
|
|
|
|
|
sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1)
|
|
logger.warning(
|
|
f"Retrying GET {url} in {sleep_time} seconds due to error: {e}"
|
|
)
|
|
time.sleep(sleep_time)
|
|
return html_get_page(url, retry - 1)
|
|
|
|
def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> str:
|
|
"""Fetch HTML content through Cloudflare proxy.
|
|
|
|
Args:
|
|
url: Target URL
|
|
retry: Number of retry attempts
|
|
|
|
Returns:
|
|
str: HTML content if successful, None otherwise
|
|
"""
|
|
if USE_CF_BYPASS == False:
|
|
logger.warning("Cloudflare bypass is disabled, trying without it.")
|
|
return html_get_page(url, retry, skip_403=True)
|
|
try:
|
|
logger.info(f"GET_CF: {url}")
|
|
response = requests.get(
|
|
f"{CLOUDFLARE_PROXY}/html?url={url}&retries=3"
|
|
)
|
|
time.sleep(1)
|
|
return response.text
|
|
|
|
except Exception as e:
|
|
if retry == 0:
|
|
logger.error(f"Failed to fetch page through CF: {url}, error: {e}")
|
|
return ""
|
|
|
|
sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1)
|
|
logger.warning(
|
|
f"Retrying GET_CF {url} in {sleep_time} seconds due to error: {e}"
|
|
)
|
|
time.sleep(sleep_time)
|
|
return html_get_page_cf(url, retry - 1)
|
|
|
|
def download_url(link: str, size: str = "") -> Optional[BytesIO]:
|
|
"""Download content from URL into a BytesIO buffer.
|
|
|
|
Args:
|
|
link: URL to download from
|
|
|
|
Returns:
|
|
BytesIO: Buffer containing downloaded content if successful
|
|
"""
|
|
try:
|
|
logger.info(f"Downloading from: {link}")
|
|
response = requests.get(link, stream=True)
|
|
response.raise_for_status()
|
|
|
|
total_size : float = 0.0
|
|
try:
|
|
# we assume size is in MB
|
|
total_size = float(size.strip().replace(" ", "").replace(",", ".").upper()[:-2].strip()) * 1024 * 1024
|
|
except:
|
|
total_size = float(response.headers.get('content-length', 0))
|
|
|
|
buffer = BytesIO()
|
|
|
|
# Initialize the progress bar with your guess
|
|
pbar = tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading')
|
|
for chunk in response.iter_content(chunk_size=1000):
|
|
buffer.write(chunk)
|
|
pbar.update(len(chunk))
|
|
|
|
pbar.close()
|
|
return buffer
|
|
except requests.exceptions.RequestException as e:
|
|
logger.error(f"Failed to download from {link}: {e}")
|
|
return None
|
|
|
|
def get_absolute_url(base_url: str, url: str) -> str:
|
|
"""Get absolute URL from relative URL and base URL.
|
|
|
|
Args:
|
|
base_url: Base URL
|
|
url: Relative URL
|
|
"""
|
|
if url.strip() == "":
|
|
return ""
|
|
if url.startswith("http"):
|
|
return url
|
|
parsed_url = urlparse(url)
|
|
parsed_base = urlparse(base_url)
|
|
if parsed_url.netloc == "" or parsed_url.scheme == "":
|
|
parsed_url = parsed_url._replace(netloc=parsed_base.netloc, scheme=parsed_base.scheme)
|
|
return parsed_url.geturl()
|