mirror of
https://github.com/calibrain/shelfmark.git
synced 2026-04-20 13:59:46 -04:00
Emoji check fix Fix multi language books Fix DNS in Chromium Headless Fix DNS IPv6 address by un-abbreviating them Fix typo in Quad9 DNS
492 lines
17 KiB
Python
492 lines
17 KiB
Python
import time
|
|
import os
|
|
import socket
|
|
from urllib.parse import urlparse
|
|
import threading
|
|
import env
|
|
from env import LOG_DIR, DEBUG
|
|
import signal
|
|
from datetime import datetime
|
|
import subprocess
|
|
import requests
|
|
from typing import Optional
|
|
|
|
# --- SeleniumBase Import ---
|
|
from seleniumbase import Driver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.common.exceptions import TimeoutException
|
|
|
|
import network
|
|
from logger import setup_logger
|
|
from env import MAX_RETRY, DEFAULT_SLEEP
|
|
from config import PROXIES, CUSTOM_DNS, DOH_SERVER, VIRTUAL_SCREEN_SIZE, RECORDING_DIR
|
|
|
|
# Module-level logger, named after this module.
logger = setup_logger(__name__)
|
|
# Import-time side effect: initialise the project's `network` module before any
# browser state is created.
# NOTE(review): presumably applies the proxy/DNS configuration - confirm in network.py.
network.init()
|
|
|
|
# --- Shared browser-session state -------------------------------------------
# These globals are mutated by the driver helpers in this module.

# The SeleniumBase driver currently in use, or None when no browser is running.
DRIVER = None

# Handles for the helper processes: "xvfb" holds the virtual display object,
# "ffmpeg" holds the screen-recorder subprocess. Both start out as None.
DISPLAY = dict.fromkeys(("xvfb", "ffmpeg"))

# time.time() of the most recent driver use; None when there is nothing to clean up.
LAST_USED = None

# Lock taken by get() and by the idle cleanup so they never overlap.
LOCKED = threading.Lock()

# URL most recently handed to get().
TENTATIVE_CURRENT_URL = None
|
|
|
|
def _reset_pyautogui_display_state():
    """Point pyautogui's X11 backend at the display named by ``$DISPLAY``.

    Opens a new Xlib connection to ``os.environ['DISPLAY']`` and stores it in
    pyautogui's private ``_pyautogui_x11._display`` slot. Any failure (missing
    packages, unset variable, connection error) is logged as a warning and
    otherwise ignored.
    """
    try:
        import pyautogui
        import Xlib.display

        open_display = Xlib.display.Display
        pyautogui._pyautogui_x11._display = open_display(os.environ['DISPLAY'])
    except Exception as exc:
        logger.warning(f"Error resetting pyautogui display state: {exc}")
|
|
|
|
def _is_bypassed(sb, escape_emojis: bool = True) -> bool:
    """Heuristically decide whether the current page is past the Cloudflare challenge.

    Args:
        sb: Browser driver exposing ``get_title()``, ``get_text(selector)`` and
            ``get_current_url()``.
        escape_emojis: When True, a body containing three or more emoji is
            treated as real content and short-circuits to True.

    Returns:
        True when the page looks like real content; False when a Cloudflare
        indicator is found, the body is nearly empty, or the check itself
        raised.
    """
    try:
        # Each probe is best-effort: a failing probe degrades to an empty
        # string instead of aborting the whole check. Only ordinary errors are
        # absorbed, so KeyboardInterrupt / SystemExit still propagate.
        try:
            title = sb.get_title().lower()
        except Exception:
            title = ""

        try:
            body = sb.get_text("body").lower()
        except Exception:
            body = ""

        try:
            current_url = sb.get_current_url()
        except Exception:
            current_url = ""

        body_length = len(body.strip())

        # A very long page is assumed to be real content, not a challenge.
        if body_length > 100000:
            logger.debug(f"Page content too long, we are probably bypassed len: {body_length}")
            return True

        # Several emoji in the body are also taken as a sign of real content.
        if escape_emojis:
            import emoji
            emoji_list = emoji.emoji_list(body)
            if len(emoji_list) >= 3:
                logger.debug(f"Detected emoji in page, we are probably bypassed len: {len(emoji_list)}")
                return True

        # Phrases shown by Cloudflare challenge pages (all lowercase, because
        # title and body were lowercased above).
        verification_texts = [
            "just a moment",
            "verify you are human",
            "verifying you are human",
            "cloudflare.com/products/turnstile/?utm_source=turnstile"
        ]

        for text in verification_texts:
            if text in title or text in body:
                logger.debug(f"Cloudflare indicator found: '{text}' in page")
                return False

        # Additional Cloudflare markers in the body text or the URL.
        if "cf-" in body or "cloudflare" in current_url.lower():
            logger.debug("Cloudflare patterns detected in page")
            return False

        # Challenge pages are served from /cdn-cgi/.
        if "/cdn-cgi/" in current_url:
            logger.debug("Still on Cloudflare CDN challenge page")
            return False

        # A nearly empty body may simply mean the page is still loading.
        if body_length < 50:
            logger.debug("Page content too short, might still be loading")
            return False

        logger.debug(f"Bypass check passed - Title: '{title[:100]}', Body length: {len(body)}")
        return True

    except Exception as e:
        logger.warning(f"Error checking bypass status: {e}")
        # If we can't check, assume we're not bypassed.
        return False
|
|
|
|
def _bypass_method_1(sb) -> bool:
    """Bypass strategy 1: click the captcha via ``uc_gui_click_captcha``.

    Up to three attempts are made, and each later attempt only runs when the
    previous one raised:
      1. click, pause 3s;
      2. pause 5s, wait for ``body`` to be visible, click, pause 3s;
      3. pause ``DEFAULT_SLEEP``, click, pause 5s.

    Returns:
        The verdict of ``_is_bypassed`` from the first attempt that completes
        without raising, or False when all three attempts raise.
    """
    def _announce():
        logger.debug("Attempting bypass method 1: uc_gui_click_captcha")

    def _pause_and_wait_for_body():
        time.sleep(5)
        sb.wait_for_element_visible('body', timeout=10)

    def _default_pause():
        time.sleep(DEFAULT_SLEEP)

    # (step run before the click, pause after the click, log prefix on failure)
    attempts = (
        (_announce, 3, "Method 1 failed on first try"),
        (_pause_and_wait_for_body, 3, "Method 1 failed on second try"),
        (_default_pause, 5, "Method 1 completely failed"),
    )

    for before_click, settle_seconds, failure_prefix in attempts:
        try:
            before_click()
            sb.uc_gui_click_captcha()
            time.sleep(settle_seconds)
            return _is_bypassed(sb)
        except Exception as exc:
            logger.debug(f"{failure_prefix}: {exc}")

    return False
|
|
|
|
def _bypass_method_2(sb) -> bool:
    """Bypass strategy 2: wait, reload the page, then click the body.

    Args:
        sb: Browser driver exposing ``refresh()`` and ``click_if_visible()``.

    Returns:
        True when ``_is_bypassed`` reports success after the reload or after
        the follow-up click; False otherwise, including when any step raises.
    """
    try:
        logger.debug("Attempting bypass method 2: wait and reload")
        # Give the page a long settle time before touching it.
        time.sleep(10)

        # Reload and let the refreshed page settle.
        sb.refresh()
        time.sleep(8)

        if _is_bypassed(sb):
            return True

        # Clicking the page is a best-effort nudge; a failure here must not
        # abort the method, but it is logged instead of silently swallowed.
        try:
            sb.click_if_visible("body", timeout=5)
            time.sleep(5)
        except Exception as click_error:
            logger.debug(f"Method 2 body click failed: {click_error}")

        return _is_bypassed(sb)
    except Exception as e:
        logger.debug(f"Method 2 failed: {e}")
        return False
|
|
|
|
def _bypass_method_3(sb) -> bool:
    """Bypass strategy 3: random wait, scroll the page, then click the captcha.

    Args:
        sb: Browser driver exposing ``scroll_to_bottom()``, ``scroll_to_top()``
            and ``uc_gui_click_captcha()``.

    Returns:
        True when ``_is_bypassed`` reports success after scrolling or after the
        final captcha click; False otherwise, including when any step raises.
    """
    try:
        logger.debug("Attempting bypass method 3: stealth approach")
        # Wait a random 8-15 seconds so the timing looks less mechanical.
        import random
        wait_time = random.uniform(8, 15)
        time.sleep(wait_time)

        # Scroll down and back up (human-like behaviour). Best-effort: a
        # failure is logged and the method carries on.
        try:
            sb.scroll_to_bottom()
            time.sleep(2)
            sb.scroll_to_top()
            time.sleep(3)
        except Exception as scroll_error:
            logger.debug(f"Method 3 scrolling failed: {scroll_error}")

        if _is_bypassed(sb):
            return True

        # Last resort: the captcha click used by method 1. Also best-effort.
        try:
            sb.uc_gui_click_captcha()
            time.sleep(5)
        except Exception as click_error:
            logger.debug(f"Method 3 captcha click failed: {click_error}")

        return _is_bypassed(sb)
    except Exception as e:
        logger.debug(f"Method 3 failed: {e}")
        return False
|
|
|
|
def _bypass(sb, max_retries: int = MAX_RETRY) -> None:
    """Cycle through the bypass strategies until the page is through or retries run out.

    Before every attempt the page is re-checked with ``_is_bypassed``; the
    function returns as soon as that check passes or a strategy reports
    success. Strategies are used round-robin (1, 2, 3, 1, ...), and from the
    second attempt on there is a pause of ``DEFAULT_SLEEP * attempt`` seconds,
    capped at 15, before trying.

    Args:
        sb: Browser driver handed to each strategy.
        max_retries: Maximum number of strategy invocations.
    """
    strategies = (_bypass_method_1, _bypass_method_2, _bypass_method_3)
    attempt = 0

    while True:
        if _is_bypassed(sb):
            return

        if attempt >= max_retries:
            logger.warning("Exceeded maximum retries. Bypass failed.")
            return

        strategy = strategies[attempt % len(strategies)]
        logger.info(f"Bypass attempt {attempt + 1} / {max_retries} using {strategy.__name__}")

        # Progressive backoff: nothing before the first attempt, then longer
        # and longer pauses.
        backoff = min(DEFAULT_SLEEP * attempt, 15)
        attempt += 1
        if backoff > 0:
            logger.info(f"Waiting {backoff}s before trying...")
            time.sleep(backoff)

        try:
            succeeded = strategy(sb)
        except Exception as exc:
            logger.warning(f"Exception in {strategy.__name__}: {exc}")
        else:
            if succeeded:
                logger.info(f"Bypass successful using {strategy.__name__}")
                return

        logger.info(f"Bypass method {strategy.__name__} failed.")
|
|
|
|
def _get_chromium_args():
    """Assemble the Chromium command-line flags for browser launches.

    Reads the module-level settings ``DEBUG``, ``LOG_DIR``, ``PROXIES``,
    ``CUSTOM_DNS`` and ``DOH_SERVER``. Errors while building the DNS flags are
    logged and do not prevent the remaining flags from being returned.

    Returns:
        list: The flag strings, in the order they were added.
    """
    # Ignore certificate and SSL errors (comparable to curl's --insecure).
    chromium_flags = [
        "--ignore-certificate-errors",
        "--ignore-ssl-errors",
        "--allow-running-insecure-content",
        "--ignore-certificate-errors-spki-list",
        "--ignore-certificate-errors-skip-list"
    ]

    # Verbose Chrome logging, only in debug mode.
    if DEBUG:
        chrome_log_flag = "--log-file=" + str(LOG_DIR / "chrome_browser.log")
        chromium_flags.extend(["--enable-logging", "--v=1", chrome_log_flag])

    # Route traffic through the configured proxy, preferring the https entry.
    proxy_url = (PROXIES.get('https') or PROXIES.get('http')) if PROXIES else None
    if proxy_url:
        chromium_flags.append(f'--proxy-server={proxy_url}')

    # --- Custom DNS settings ---
    try:
        if len(CUSTOM_DNS) > 0:
            if DOH_SERVER:
                logger.info(f"Configuring DNS over HTTPS (DoH) with server: {DOH_SERVER}")

                # TODO: these DoH switches are unverified and may not be
                # honoured by Chromium; the original author expected a fallback
                # to Google's DoH in that case.
                doh_flags = [
                    '--enable-features=DnsOverHttps',
                    '--dns-over-https-mode=secure',
                    f'--dns-over-https-servers="{DOH_SERVER}"',
                ]
                chromium_flags.extend(doh_flags)

                # Map the DoH host to an IP resolved here, so the browser does
                # not need DNS to reach its own DNS server.
                doh_host = urlparse(DOH_SERVER).hostname
                if doh_host:
                    try:
                        chromium_flags.append(
                            f'--host-resolver-rules=MAP {doh_host} {socket.gethostbyname(doh_host)}'
                        )
                    except socket.gaierror:
                        logger.warning(f"Could not resolve DoH hostname: {doh_host}")
            elif CUSTOM_DNS:
                joined_servers = ",".join(CUSTOM_DNS)
                chromium_flags.append(f'--dns-server="{joined_servers}"')
                chromium_flags.append('--disable-features=DnsOverHttps')
    except Exception as exc:
        logger.error_trace(f"Error configuring DNS settings: {exc}")

    return chromium_flags
|
|
|
|
# Computed once at import time; _init_driver() passes these flags to every
# browser it launches.
CHROMIUM_ARGS = _get_chromium_args()
|
|
|
|
def _get(url, retry: int = MAX_RETRY):
    """Load ``url`` in the shared browser, run the Cloudflare bypass, and return the HTML.

    One call makes one load-and-bypass attempt. If the attempt raises, or the
    page still shows the challenge afterwards, the function calls itself with
    ``retry - 1`` until the budget is used up.

    Args:
        url: Page to open.
        retry: Remaining retry budget. Values of zero or below mean "this is
            the last attempt", so the recursion always terminates.

    Returns:
        The page source once the bypass check passes, or an empty string when
        the challenge is still showing after the last attempt.

    Raises:
        Exception: Re-raises the error of the last attempt when it failed with
            an exception; the driver is reset first.
    """
    try:
        logger.info(f"SB_GET: {url}")
        sb = _get_driver()

        logger.debug("Opening URL with SeleniumBase...")
        sb.uc_open_with_reconnect(url, DEFAULT_SLEEP)
        time.sleep(DEFAULT_SLEEP)

        # Log the landing page for debugging; failure to read it is harmless.
        try:
            current_url = sb.get_current_url()
            current_title = sb.get_title()
            logger.debug(f"Page loaded - URL: {current_url}, Title: {current_title}")
        except Exception as debug_e:
            logger.debug(f"Could not get page info: {debug_e}")

        logger.debug("Starting bypass process...")
        _bypass(sb)

        if _is_bypassed(sb):
            logger.info("Bypass successful.")
            return sb.page_source

        logger.warning("Bypass completed but page still shows Cloudflare protection")
        # Log a truncated copy of the page text for debugging. The text is
        # fetched once so the logged snippet matches the measured length.
        try:
            page_text = sb.get_text("body")
            if len(page_text) > 500:
                page_text = page_text[:500] + "..."
            logger.debug(f"Page content: {page_text}")
        except Exception as debug_e:
            logger.debug(f"Could not read page content: {debug_e}")

    except Exception as e:
        import traceback
        error_details = f"Exception type: {type(e).__name__}, Message: {str(e)}"
        stack_trace = traceback.format_exc()

        # `<=` rather than `==`: a budget that has gone negative must also stop.
        if retry <= 0:
            logger.error(f"Failed to initialize browser after all retries: {error_details}")
            logger.debug(f"Full stack trace: {stack_trace}")
            _reset_driver()
            raise

        logger.warning(f"Failed to bypass Cloudflare (retry {MAX_RETRY - retry + 1}/{MAX_RETRY}): {error_details}")
        logger.debug(f"Stack trace: {stack_trace}")

        # A broken WebDriver session cannot be reused; start the next attempt
        # with a fresh browser.
        if "WebDriverException" in str(type(e)) or "SessionNotCreatedException" in str(type(e)):
            logger.info("Resetting driver due to WebDriver error...")
            _reset_driver()

    # Reached when this attempt did not produce a page (challenge still showing
    # or a recoverable exception). Stop once the budget is spent so the
    # recursion cannot run away.
    if retry <= 0:
        logger.error("Cloudflare challenge still present after all retries")
        return ""

    return _get(url, retry - 1)
|
|
|
|
def get(url, retry : int = MAX_RETRY):
    """Fetch ``url`` through the bypass browser while holding the module lock.

    Holds ``LOCKED`` for the whole fetch, records the URL in
    ``TENTATIVE_CURRENT_URL`` before starting, and stamps ``LAST_USED`` after
    ``_get`` returns (the stamp is skipped when ``_get`` raises).

    Args:
        url: Page to open.
        retry: Retry budget forwarded to ``_get``.

    Returns:
        Whatever ``_get`` returns.
    """
    global TENTATIVE_CURRENT_URL, LAST_USED
    LOCKED.acquire()
    try:
        TENTATIVE_CURRENT_URL = url
        page = _get(url, retry)
        LAST_USED = time.time()
        return page
    finally:
        LOCKED.release()
|
|
|
|
def _init_driver():
    """Launch a new undetected-Chrome driver and store it as the shared ``DRIVER``.

    Any driver that is already registered is torn down first. After the launch
    the function pauses for ``DEFAULT_SLEEP`` seconds.

    Returns:
        The newly created driver.
    """
    global DRIVER
    if DRIVER:
        _reset_driver()

    window_size = f"{VIRTUAL_SCREEN_SIZE[0]},{VIRTUAL_SCREEN_SIZE[1]}"
    browser = Driver(
        uc=True,
        headless=False,
        size=window_size,
        chromium_arg=CHROMIUM_ARGS,
    )
    DRIVER = browser
    time.sleep(DEFAULT_SLEEP)
    return browser
|
|
|
|
def _get_driver():
    """Return the shared browser driver, creating display and driver on demand.

    Stamps ``LAST_USED``. When ``env.DOCKERMODE`` and ``env.USE_CF_BYPASS`` are
    set and no virtual display is registered, a pyvirtualdisplay ``Display`` is
    started and stored in ``DISPLAY["xvfb"]``; with ``env.DEBUG`` an ffmpeg
    screen recording of that display is started as well and stored in
    ``DISPLAY["ffmpeg"]``.

    Returns:
        The existing ``DRIVER`` if there is one, otherwise a freshly
        initialised driver.
    """
    global DRIVER, DISPLAY
    global LAST_USED
    logger.info("Getting driver...")
    LAST_USED = time.time()

    display_needed = env.DOCKERMODE and env.USE_CF_BYPASS and not DISPLAY["xvfb"]
    if display_needed:
        from pyvirtualdisplay import Display

        virtual_display = Display(visible=False, size=VIRTUAL_SCREEN_SIZE)
        virtual_display.start()
        logger.info("Display started")
        DISPLAY["xvfb"] = virtual_display
        time.sleep(DEFAULT_SLEEP)
        _reset_pyautogui_display_state()

        if env.DEBUG:
            stamp = datetime.now().strftime("%y%m%d-%H%M%S")
            recording_path = RECORDING_DIR / f"screen_recording_{stamp}.mp4"

            # Input side: grab the virtual X11 display at its full size.
            capture_opts = [
                "ffmpeg",
                "-y",
                "-f", "x11grab",
                "-video_size", f"{VIRTUAL_SCREEN_SIZE[0]}x{VIRTUAL_SCREEN_SIZE[1]}",
                "-i", f":{virtual_display.display}",
            ]
            # Encoder side: small, fast, text-friendly H.264.
            encode_opts = [
                "-c:v", "libx264",
                "-preset", "ultrafast",  # fastest encode, weaker compression
                "-maxrate", "700k",  # bitrate cap
                "-bufsize", "1400k",  # rate-control buffer (2x maxrate)
                "-crf", "36",  # higher = smaller file, lower = better quality
                "-pix_fmt", "yuv420p",  # widest player compatibility
                "-tune", "animation",
                "-x264-params", "bframes=0:deblock=-1,-1",  # no b-frames, reduced deblocking
                "-r", "15",  # reduced frame rate
                "-an",  # no audio
            ]
            output_opts = [
                recording_path.as_posix(),
                "-nostats", "-loglevel", "0"
            ]
            recorder_cmd = capture_opts + encode_opts + output_opts

            logger.info("Starting FFmpeg recording to %s", recording_path)
            logger.debug_trace(f"FFmpeg command: {' '.join(recorder_cmd)}")
            DISPLAY["ffmpeg"] = subprocess.Popen(recorder_cmd)

    if DRIVER:
        logger.log_resource_usage()
        return DRIVER
    return _init_driver()
|
|
|
|
def _reset_driver():
    """Tear down the browser, the virtual display and the screen recorder.

    Every step is best-effort: a failure is logged and the teardown continues.
    The stored handles (``DRIVER``, ``DISPLAY["xvfb"]``, ``DISPLAY["ffmpeg"]``)
    are cleared even when the matching shutdown call raises, so a broken
    handle is never left behind for ``_get_driver`` to reuse. Leftover
    processes are then killed by name with ``pkill``.
    """
    global DRIVER, DISPLAY
    logger.log_resource_usage()
    logger.info("Resetting driver...")

    if DRIVER:
        try:
            DRIVER.quit()
        except Exception as e:
            logger.warning(f"Error quitting driver: {e}")
        finally:
            # Drop the handle even if quit() failed; a dead session must not
            # be handed out again.
            DRIVER = None
        time.sleep(0.5)

    if DISPLAY["xvfb"]:
        try:
            DISPLAY["xvfb"].stop()
        except Exception as e:
            logger.warning(f"Error stopping display: {e}")
        finally:
            # Xvfb is killed by name just below, so the handle is stale
            # whether or not stop() succeeded.
            DISPLAY["xvfb"] = None
        time.sleep(0.5)

    try:
        os.system("pkill -f Xvfb")
    except Exception as e:
        logger.debug(f"Error killing Xvfb: {e}")
    time.sleep(0.5)

    if DISPLAY["ffmpeg"]:
        try:
            # SIGINT asks ffmpeg to stop on its own rather than being killed outright.
            DISPLAY["ffmpeg"].send_signal(signal.SIGINT)
        except Exception as e:
            logger.debug(f"Error stopping ffmpeg: {e}")
        finally:
            DISPLAY["ffmpeg"] = None
        time.sleep(0.5)

    try:
        os.system("pkill -f ffmpeg")
    except Exception as e:
        logger.debug(f"Error killing ffmpeg: {e}")
    time.sleep(0.5)

    try:
        # Matches every process whose command line contains "chrom".
        os.system("pkill -f chrom")
    except Exception as e:
        logger.debug(f"Error killing chrom: {e}")
    time.sleep(0.5)

    logger.info("Driver reset.")
    logger.log_resource_usage()
|
|
|
|
def _cleanup_driver():
    """Reset the driver if it has been idle for too long.

    Runs under ``LOCKED``. Does nothing while ``LAST_USED`` is unset. Otherwise
    the driver is reset once at least ``env.BYPASS_RELEASE_INACTIVE_MIN``
    minutes have passed since ``LAST_USED``, and ``LAST_USED`` is cleared.
    """
    global LAST_USED
    with LOCKED:
        if not LAST_USED:
            return
        if time.time() - LAST_USED >= env.BYPASS_RELEASE_INACTIVE_MIN * 60:
            _reset_driver()
            LAST_USED = None
            logger.info("Driver reset due to inactivity.")
|
|
|
|
def _cleanup_loop():
    """Run the idle-driver check forever, sleeping between checks.

    The pause is ``env.BYPASS_RELEASE_INACTIVE_MIN / 2`` seconds, with a
    minimum of one second. The setting is re-read on every iteration.
    """
    while True:
        _cleanup_driver()
        pause_seconds = max(env.BYPASS_RELEASE_INACTIVE_MIN / 2, 1)
        time.sleep(pause_seconds)
|
|
|
|
def _init_cleanup_thread():
    """Start ``_cleanup_loop`` in a background daemon thread."""
    reaper = threading.Thread(target=_cleanup_loop, daemon=True)
    reaper.start()
|
|
|
|
def wait_for_result(func, timeout: int = 10, condition: object = True):
    """Poll ``func`` until its result satisfies ``condition`` or the timeout expires.

    Args:
        func: Zero-argument callable that is invoked repeatedly.
        timeout: Maximum time to keep polling, in seconds.
        condition: Either a predicate called with each result, or a plain
            value that the result is compared against with ``==``. The default
            ``True`` therefore waits until ``func()`` returns a value equal to
            True.

    Returns:
        The first result that satisfies ``condition``, or None when the
        timeout expires first (including a timeout of zero or less, in which
        case ``func`` is never called).
    """
    # A plain value such as the default True cannot be called, so it is
    # wrapped in an equality check.
    if callable(condition):
        is_satisfied = condition
    else:
        def is_satisfied(result):
            return result == condition

    start_time = time.time()
    while time.time() - start_time < timeout:
        result = func()
        if is_satisfied(result):
            return result
        time.sleep(0.5)
    return None
|
|
# Import-time side effect: start the background thread that resets an idle driver.
_init_cleanup_thread()
|
|
|
|
|
|
def get_bypassed_page(url: str) -> Optional[str]:
    """Fetch HTML content from a URL using the internal Cloudflare Bypasser.

    Args:
        url: Target URL

    Returns:
        str: HTML content of the page.

    Raises:
        requests.exceptions.RequestException: When no HTML, or only
            whitespace, was obtained.
    """
    # get() may hand back None when the bypass did not succeed; treat that the
    # same as an empty page so the caller gets the documented exception rather
    # than a TypeError from len().
    response_html = get(url) or ""
    logger.debug(f"Cloudflare Bypasser response length: {len(response_html)}")
    if response_html.strip() == "":
        raise requests.exceptions.RequestException("Failed to bypass Cloudflare")
    return response_html
|