diff --git a/backend.py b/backend.py index 3fab469..46e265e 100644 --- a/backend.py +++ b/backend.py @@ -11,7 +11,8 @@ from threading import Event from logger import setup_logger from config import CUSTOM_SCRIPT -from env import INGEST_DIR, TMP_DIR, MAIN_LOOP_SLEEP_TIME, USE_BOOK_TITLE, MAX_CONCURRENT_DOWNLOADS, DOWNLOAD_PROGRESS_UPDATE_INTERVAL +from env import (INGEST_DIR, DOWNLOAD_PATHS, TMP_DIR, MAIN_LOOP_SLEEP_TIME, USE_BOOK_TITLE, + MAX_CONCURRENT_DOWNLOADS, DOWNLOAD_PROGRESS_UPDATE_INTERVAL) from models import book_queue, BookInfo, QueueStatus, SearchFilters import book_manager @@ -135,6 +136,13 @@ def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]: if value is not None } +def _prepare_download_folder(book_info: BookInfo) -> Path: + """Prepare final content-type subdir""" + content = book_info.content + content_dir = DOWNLOAD_PATHS.get(content) if content and content in DOWNLOAD_PATHS else INGEST_DIR + os.makedirs(content_dir, exist_ok=True) + return content_dir + def _download_book_with_cancellation(book_id: str, cancel_flag: Event) -> Optional[str]: """Download and process a book with cancellation support. 
@@ -212,8 +220,9 @@ def _download_book_with_cancellation(book_id: str, cancel_flag: Event) -> Option book_info.format = success_download_url.split(".")[-1] book_name += f".{book_info.format}" - intermediate_path = INGEST_DIR / f"{book_id}.crdownload" - final_path = INGEST_DIR / book_name + final_dir = _prepare_download_folder(book_info) + intermediate_path = final_dir / f"{book_id}.crdownload" + final_path = final_dir / book_name if os.path.exists(book_path): logger.info(f"Moving book to ingest directory: {book_path} -> {final_path}") diff --git a/book_manager.py b/book_manager.py index 1920ead..3579036 100644 --- a/book_manager.py +++ b/book_manager.py @@ -1,6 +1,6 @@ """Book download manager handling search and retrieval operations.""" -import time, json, re +import time, json, os, re from pathlib import Path from urllib.parse import quote from typing import List, Optional, Dict, Union, Callable @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString, ResultSet import downloader from logger import setup_logger from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_BASE_URL -from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB +from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB, DOWNLOAD_PATHS from models import BookInfo, SearchFilters logger = setup_logger(__name__) @@ -125,6 +125,7 @@ def _parse_search_result_row(row: Tag) -> Optional[BookInfo]: publisher=cells[3].find("span").next, year=cells[4].find("span").next, language=cells[7].find("span").next, + content=cells[8].find("span").next.lower(), format=cells[9].find("span").next.lower(), size=cells[10].find("span").next, ) @@ -232,6 +233,8 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: all_details = _find_in_divs(divs, " ยท ") format = "" size = "" + content = "" + for _details in all_details: _details = _details.split(" ยท ") for f in _details: @@ -239,7 +242,11 @@ def _parse_book_info_page(soup: 
BeautifulSoup, book_id: str) -> BookInfo: format = f.strip().lower() if size == "" and any(u in f.strip().lower() for u in ["mb", "kb", "gb"]): size = f.strip().lower() - + if content == "": + for ct in DOWNLOAD_PATHS.keys(): + if ct in f.strip().lower(): + content = ct + break if format == "" or size == "": for f in _details: stripped = f.strip().lower() @@ -247,7 +254,6 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: format = stripped if size == "" and "." in stripped: size = stripped - book_title = _find_in_divs(divs, "๐Ÿ”")[0].strip("๐Ÿ”").strip() @@ -258,6 +264,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: id=book_id, preview=preview, title=book_title, + content=content, publisher=_find_in_divs(divs, "icon-[mdi--company]", isClass=True)[0], author=_find_in_divs(divs, "icon-[mdi--user-edit]", isClass=True)[0], format=format, @@ -277,7 +284,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo: book_info.year = info["Year"][0] # TODO : - # Backfill missing metadata from original book + # Backfill missing metadata from original book # To do this, we need to cache the results of search_books() in some kind of LRU return book_info @@ -423,13 +430,13 @@ def download_book(book_info: BookInfo, book_path: Path, progress_callback: Optio # Update status to resolving before attempting download URL fetch if status_callback: status_callback("resolving") - + download_url = _get_download_url(link, book_info.title, cancel_flag, status_callback) if download_url != "": # Update status to downloading before starting actual download if status_callback: status_callback("downloading") - + logger.info(f"Downloading `{book_info.title}` from `{download_url}`") data = downloader.download_url(download_url, book_info.size or "", progress_callback, cancel_flag) diff --git a/env.py b/env.py index a7d60a3..6d01017 100644 --- a/env.py +++ b/env.py @@ -17,6 +17,27 @@ LOG_ROOT = Path(os.getenv("LOG_ROOT", 
"/var/log/")) LOG_DIR = LOG_ROOT / "cwa-book-downloader" TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader")) INGEST_DIR = Path(os.getenv("INGEST_DIR", "/cwa-book-ingest")) +INGEST_DIR_BOOK_FICTION = os.getenv("INGEST_DIR_BOOK_FICTION", "") +INGEST_DIR_BOOK_NON_FICTION = os.getenv("INGEST_DIR_BOOK_NON_FICTION", "") +INGEST_DIR_BOOK_UNKNOWN = os.getenv("INGEST_DIR_BOOK_UNKNOWN", "") +INGEST_DIR_MAGAZINE = os.getenv("INGEST_DIR_MAGAZINE", "") +INGEST_DIR_COMIC_BOOK = os.getenv("INGEST_DIR_COMIC_BOOK", "") +INGEST_DIR_AUDIOBOOK = os.getenv("INGEST_DIR_AUDIOBOOK", "") +INGEST_DIR_STANDARDS_DOCUMENT = os.getenv("INGEST_DIR_STANDARDS_DOCUMENT", "") +INGEST_DIR_MUSICAL_SCORE = os.getenv("INGEST_DIR_MUSICAL_SCORE", "") +INGEST_DIR_OTHER = os.getenv("INGEST_DIR_OTHER", "") +DOWNLOAD_PATHS = { + "book (fiction)": Path(INGEST_DIR_BOOK_FICTION) if INGEST_DIR_BOOK_FICTION else INGEST_DIR, + "book (non-fiction)": Path(INGEST_DIR_BOOK_NON_FICTION) if INGEST_DIR_BOOK_NON_FICTION else INGEST_DIR, + "book (unknown)": Path(INGEST_DIR_BOOK_UNKNOWN) if INGEST_DIR_BOOK_UNKNOWN else INGEST_DIR, + "magazine": Path(INGEST_DIR_MAGAZINE) if INGEST_DIR_MAGAZINE else INGEST_DIR, + "comic book": Path(INGEST_DIR_COMIC_BOOK) if INGEST_DIR_COMIC_BOOK else INGEST_DIR, + "audiobook": Path(INGEST_DIR_AUDIOBOOK) if INGEST_DIR_AUDIOBOOK else INGEST_DIR, + "standards document": Path(INGEST_DIR_STANDARDS_DOCUMENT) if INGEST_DIR_STANDARDS_DOCUMENT else INGEST_DIR, + "musical score": Path(INGEST_DIR_MUSICAL_SCORE) if INGEST_DIR_MUSICAL_SCORE else INGEST_DIR, + "other": Path(INGEST_DIR_OTHER) if INGEST_DIR_OTHER else INGEST_DIR, +} + STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", "3600")) USE_BOOK_TITLE = string_to_bool(os.getenv("USE_BOOK_TITLE", "false")) MAX_RETRY = int(os.getenv("MAX_RETRY", "10")) diff --git a/models.py b/models.py index f0a77d6..c327c23 100644 --- a/models.py +++ b/models.py @@ -47,6 +47,7 @@ class BookInfo: publisher: Optional[str] = None year: Optional[str] = None 
language: Optional[str] = None + content: Optional[str] = None format: Optional[str] = None size: Optional[str] = None info: Optional[Dict[str, List[str]]] = None diff --git a/readme.md b/readme.md index dd5d4f4..37dcff3 100644 --- a/readme.md +++ b/readme.md @@ -87,6 +87,31 @@ Note that if using TOR, the TZ will be calculated automatically based on IP. If you change `BOOK_LANGUAGE`, you can add multiple comma separated languages, such as `en,fr,ru` etc. +Use the following environment variables to set specific folders in which to download +different content types (Book, Magazine, Comic, etc.): + +| Variable | Description | Default Value | +|---------------------------------|--------------------------------|---------------| +| `INGEST_DIR_BOOK_FICTION` | Book (fiction) folder path | `` | +| `INGEST_DIR_BOOK_NON_FICTION` | Book (non-fiction) folder path | `` | +| `INGEST_DIR_BOOK_UNKNOWN` | Book (unknown) folder path | `` | +| `INGEST_DIR_MAGAZINE` | Magazine folder path | `` | +| `INGEST_DIR_COMIC_BOOK` | Comic book folder path | `` | +| `INGEST_DIR_AUDIOBOOK` | Audiobook folder path | `` | +| `INGEST_DIR_STANDARDS_DOCUMENT` | Standards document folder path | `` | +| `INGEST_DIR_MUSICAL_SCORE` | Musical score folder path | `` | +| `INGEST_DIR_OTHER` | Other content folder path | `` | + +If no specific path is set for a content type, the default is `INGEST_DIR`. +Remember to map the specified paths to where your instance of Calibre-Web-Automated (CWA) will find them, e.g.: +``` +volumes: + - /tmp/data/calibre-web/comicbook-ingest:/cwa-comicbook-ingest +``` +if `INGEST_DIR_COMIC_BOOK=/cwa-comicbook-ingest` and your CWA is configured to use `/tmp/data/calibre-web/comicbook-ingest` +for comic books.
+ + #### AA | Variable | Description | Default Value | diff --git a/testing/E2E_test.py b/testing/E2E_test.py index 816d407..bea816f 100644 --- a/testing/E2E_test.py +++ b/testing/E2E_test.py @@ -13,6 +13,7 @@ port = SERVER_ENV.FLASK_PORT server_url = f"http://localhost:{port}" book_title = "077484a10743e5dd5d151013e8c732f4" # "Moby Dick" # Directory where downloads should appear +download_paths = SERVER_ENV.DOWNLOAD_PATHS download_dir = SERVER_ENV.INGEST_DIR # Timeout for waiting for download download_timeout_seconds = 60 * 5 @@ -38,7 +39,7 @@ def check_download_status(book_id): continue # Check success conditions based on download_path - for status_key in ["available", "done"]: + for status_key in ["available", "done", "complete"]: if status_key in status_data and book_id in status_data[status_key]: book_status_info = status_data[status_key].get(book_id) # Check if the status info is a dictionary and has a non-empty download_path @@ -117,6 +118,12 @@ if SERVER_ENV.USE_BOOK_TITLE: else: expected_filename = f"{book_id}.epub" +if book_details.get("content"): + content = book_details.get("content") + for key, path in SERVER_ENV.DOWNLOAD_PATHS.items(): + if key in content: + download_dir = path + break expected_filepath = os.path.join(download_dir, expected_filename) assert os.path.exists(expected_filepath), f"Expected downloaded file not found at: {expected_filepath}"