From 8ea2fee0bbdfdff225a313b95e495b5ba15faf6d Mon Sep 17 00:00:00 2001 From: CaliBrain Date: Sat, 4 Oct 2025 14:44:10 -0400 Subject: [PATCH] Fixing the title and book details from AA (#289) Should fix #288 --- book_manager.py | 56 +++++++++++++++++++++++++++++++------------------ env.py | 1 + readme.md | 1 + 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/book_manager.py b/book_manager.py index 26ce1dc..c2dc466 100644 --- a/book_manager.py +++ b/book_manager.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString, ResultSet import downloader from logger import setup_logger from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_BASE_URL -from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB +from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB from models import BookInfo, SearchFilters logger = setup_logger(__name__) @@ -169,21 +169,6 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: data = soup.find_all("div", {"class": "main-inner"})[0].find_next("div") divs = list(data.children) - _details = divs[13].text.strip().lower().split(" · ") - format = "" - size = "" - for f in _details: - if format == "" and f.strip().lower() in SUPPORTED_FORMATS: - format = f.strip().lower() - if size == "" and any(u in f.strip().lower() for u in ["mb", "kb", "gb"]): - size = f.strip().lower() - - if format == "" or size == "": - for f in _details: - if f == "" and not " " in f.strip().lower(): - format = f.strip().lower() - if size == "" and "." in f.strip().lower(): - size = f.strip().lower() every_url = soup.find_all("a") slow_urls_no_waitlist = set() @@ -237,20 +222,49 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: # Remove empty urls urls = [url for url in urls if url != ""] + # Filter out divs that are not text + original_divs = divs + divs = [div.text.strip() for div in divs if div.text.strip() != ""] + + separator_index = 6 + for i, div in enumerate(divs): + if "·" in div.strip(): + separator_index = i + break + + _details = divs[separator_index].lower().split(" · ") + format = "" + size = "" + for f in _details: + if format == "" and f.strip().lower() in SUPPORTED_FORMATS: + format = f.strip().lower() + if size == "" and any(u in f.strip().lower() for u in ["mb", "kb", "gb"]): + size = f.strip().lower() + + if format == "" or size == "": + for f in _details: + if f == "" and not " " in f.strip().lower(): + format = f.strip().lower() + if size == "" and "." in f.strip().lower(): + size = f.strip().lower() + + + book_title = divs[separator_index-3].strip("🔍") + # Extract basic information book_info = BookInfo( id=book_id, preview=preview, - title=divs[7].text.strip(), - publisher=divs[11].text.strip(), - author=divs[9].text.strip(), + title=book_title, + publisher=divs[separator_index-1], + author=divs[separator_index-2], format=format, size=size, download_urls=urls, ) # Extract additional metadata - info = _extract_book_metadata(divs[-6]) + info = _extract_book_metadata(original_divs[-6]) book_info.info = info # Set language and year from metadata if available @@ -262,6 +276,8 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: return book_info def _get_download_urls_from_welib(book_id: str) -> set[str]: + if ALLOW_USE_WELIB == False: + return set() """Get download urls from welib.org.""" url = f"https://welib.org/md5/{book_id}" logger.info(f"Getting download urls from welib.org for {book_id}. While this uses the bypasser, it will not start downloading them yet.") diff --git a/env.py b/env.py index c825bb8..7c8d288 100644 --- a/env.py +++ b/env.py @@ -28,6 +28,7 @@ FLASK_PORT = int(os.getenv("FLASK_PORT", "8084")) DEBUG = string_to_bool(os.getenv("DEBUG", "false")) APP_ENV = os.getenv("APP_ENV", "N/A").lower() PRIORITIZE_WELIB = string_to_bool(os.getenv("PRIORITIZE_WELIB", "false")) +ALLOW_USE_WELIB = string_to_bool(os.getenv("ALLOW_USE_WELIB", "true")) # Version information from Docker build BUILD_VERSION = os.getenv("BUILD_VERSION", "N/A") diff --git a/readme.md b/readme.md index 8fa1501..c887e1a 100644 --- a/readme.md +++ b/readme.md @@ -83,6 +83,7 @@ Note that if using TOR, the TZ will be calculated automatically based on IP. | `AA_DONATOR_KEY` | Optional Donator key for Anna's Archive fast download API | `` | | `USE_BOOK_TITLE` | Use book title as filename instead of ID | `false` | | `PRIORITIZE_WELIB` | When downloading, download from WELIB first instead of AA | `false` | +| `ALLOW_USE_WELIB` | Allow usage of welib for downloading books if found there | `true` | If you change `BOOK_LANGUAGE`, you can add multiple comma separated languages, such as `en,fr,ru` etc.