Fixing the title and book details from AA (#289)

Should fix #288
This commit is contained in:
CaliBrain
2025-10-04 14:44:10 -04:00
committed by GitHub
parent 1c24312eb0
commit 8ea2fee0bb
3 changed files with 38 additions and 20 deletions

View File

@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString, ResultSet
import downloader
from logger import setup_logger
from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_BASE_URL
from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB
from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB
from models import BookInfo, SearchFilters
logger = setup_logger(__name__)
@@ -169,21 +169,6 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
data = soup.find_all("div", {"class": "main-inner"})[0].find_next("div")
divs = list(data.children)
_details = divs[13].text.strip().lower().split(" · ")
format = ""
size = ""
for f in _details:
if format == "" and f.strip().lower() in SUPPORTED_FORMATS:
format = f.strip().lower()
if size == "" and any(u in f.strip().lower() for u in ["mb", "kb", "gb"]):
size = f.strip().lower()
if format == "" or size == "":
for f in _details:
if f == "" and not " " in f.strip().lower():
format = f.strip().lower()
if size == "" and "." in f.strip().lower():
size = f.strip().lower()
every_url = soup.find_all("a")
slow_urls_no_waitlist = set()
@@ -237,20 +222,49 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
# Remove empty urls
urls = [url for url in urls if url != ""]
# Filter out divs that are not text
original_divs = divs
divs = [div.text.strip() for div in divs if div.text.strip() != ""]
separator_index = 6
for i, div in enumerate(divs):
if "·" in div.strip():
separator_index = i
break
_details = divs[separator_index].lower().split(" · ")
format = ""
size = ""
for f in _details:
if format == "" and f.strip().lower() in SUPPORTED_FORMATS:
format = f.strip().lower()
if size == "" and any(u in f.strip().lower() for u in ["mb", "kb", "gb"]):
size = f.strip().lower()
if format == "" or size == "":
for f in _details:
if f == "" and not " " in f.strip().lower():
format = f.strip().lower()
if size == "" and "." in f.strip().lower():
size = f.strip().lower()
book_title = divs[separator_index-3].strip("🔍")
# Extract basic information
book_info = BookInfo(
id=book_id,
preview=preview,
title=divs[7].text.strip(),
publisher=divs[11].text.strip(),
author=divs[9].text.strip(),
title=book_title,
publisher=divs[separator_index-1],
author=divs[separator_index-2],
format=format,
size=size,
download_urls=urls,
)
# Extract additional metadata
info = _extract_book_metadata(divs[-6])
info = _extract_book_metadata(original_divs[-6])
book_info.info = info
# Set language and year from metadata if available
@@ -262,6 +276,8 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
return book_info
def _get_download_urls_from_welib(book_id: str) -> set[str]:
if ALLOW_USE_WELIB == False:
return set()
"""Get download urls from welib.org."""
url = f"https://welib.org/md5/{book_id}"
logger.info(f"Getting download urls from welib.org for {book_id}. While this uses the bypasser, it will not start downloading them yet.")

1
env.py
View File

@@ -28,6 +28,7 @@ FLASK_PORT = int(os.getenv("FLASK_PORT", "8084"))
# DEBUG is read from the DEBUG environment variable and converted with string_to_bool; unset means "false".
DEBUG = string_to_bool(os.getenv("DEBUG", "false"))
# APP_ENV is lower-cased after lookup, so the unset default "N/A" is stored as "n/a".
APP_ENV = os.getenv("APP_ENV", "N/A").lower()
# When enabled, downloads are attempted from WELIB first instead of AA (off by default).
PRIORITIZE_WELIB = string_to_bool(os.getenv("PRIORITIZE_WELIB", "false"))
# On by default. When false, _get_download_urls_from_welib returns an empty set
# before building the welib.org URL, so no WELIB links are collected.
ALLOW_USE_WELIB = string_to_bool(os.getenv("ALLOW_USE_WELIB", "true"))
# Version information from Docker build
BUILD_VERSION = os.getenv("BUILD_VERSION", "N/A")

View File

@@ -83,6 +83,7 @@ Note that if using TOR, the TZ will be calculated automatically based on IP.
| `AA_DONATOR_KEY` | Optional Donator key for Anna's Archive fast download API | `` |
| `USE_BOOK_TITLE` | Use book title as filename instead of ID | `false` |
| `PRIORITIZE_WELIB` | When downloading, download from WELIB first instead of AA | `false` |
| `ALLOW_USE_WELIB` | Allow downloading books from WELIB when they are available there | `true` |
If you change `BOOK_LANGUAGE`, you can specify multiple comma-separated languages, such as `en,fr,ru`.