[FEATURE] Separate download folders #122 (#297)

Re: Issue #122
Fetches content type from search results - displays it on thumbnails in
results grid;
Fetches content type from book id detail page (defaults to "Other") and
uses it to construct the `final_path`.

---------

Co-authored-by: Patricia Ritter <pritter@events.com>
Co-authored-by: CaliBrain <calibrain@l4n.xyz>
This commit is contained in:
TigreModerata
2025-11-23 23:11:43 +01:00
committed by GitHub
parent 78c61e88b3
commit 2b831dcfa5
6 changed files with 80 additions and 11 deletions

View File

@@ -11,7 +11,8 @@ from threading import Event
from logger import setup_logger
from config import CUSTOM_SCRIPT
from env import INGEST_DIR, TMP_DIR, MAIN_LOOP_SLEEP_TIME, USE_BOOK_TITLE, MAX_CONCURRENT_DOWNLOADS, DOWNLOAD_PROGRESS_UPDATE_INTERVAL
from env import (INGEST_DIR, DOWNLOAD_PATHS, TMP_DIR, MAIN_LOOP_SLEEP_TIME, USE_BOOK_TITLE,
MAX_CONCURRENT_DOWNLOADS, DOWNLOAD_PROGRESS_UPDATE_INTERVAL)
from models import book_queue, BookInfo, QueueStatus, SearchFilters
import book_manager
@@ -135,6 +136,13 @@ def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]:
if value is not None
}
def _prepare_download_folder(book_info: BookInfo) -> Path:
    """Resolve and create the destination directory for a book.

    The book's ``content`` value is looked up in ``DOWNLOAD_PATHS``; when it
    is empty/``None`` or not a known key, ``INGEST_DIR`` is used instead.
    The chosen directory is created if it does not exist yet.

    Args:
        book_info: Book whose ``content`` attribute selects the directory.

    Returns:
        The directory the book should be written to.
    """
    content_type = book_info.content
    if content_type and content_type in DOWNLOAD_PATHS:
        target_dir = DOWNLOAD_PATHS[content_type]
    else:
        # Unknown or missing content type: fall back to the shared ingest dir.
        target_dir = INGEST_DIR
    os.makedirs(target_dir, exist_ok=True)
    return target_dir
def _download_book_with_cancellation(book_id: str, cancel_flag: Event) -> Optional[str]:
"""Download and process a book with cancellation support.
@@ -212,8 +220,9 @@ def _download_book_with_cancellation(book_id: str, cancel_flag: Event) -> Option
book_info.format = success_download_url.split(".")[-1]
book_name += f".{book_info.format}"
intermediate_path = INGEST_DIR / f"{book_id}.crdownload"
final_path = INGEST_DIR / book_name
final_dir = _prepare_download_folder(book_info)
intermediate_path = final_dir / f"{book_id}.crdownload"
final_path = final_dir / book_name
if os.path.exists(book_path):
logger.info(f"Moving book to ingest directory: {book_path} -> {final_path}")

View File

@@ -1,6 +1,6 @@
"""Book download manager handling search and retrieval operations."""
import time, json, re
import time, json, os, re
from pathlib import Path
from urllib.parse import quote
from typing import List, Optional, Dict, Union, Callable
@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString, ResultSet
import downloader
from logger import setup_logger
from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_BASE_URL
from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB
from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB, DOWNLOAD_PATHS
from models import BookInfo, SearchFilters
logger = setup_logger(__name__)
@@ -125,6 +125,7 @@ def _parse_search_result_row(row: Tag) -> Optional[BookInfo]:
publisher=cells[3].find("span").next,
year=cells[4].find("span").next,
language=cells[7].find("span").next,
content=cells[8].find("span").next.lower(),
format=cells[9].find("span").next.lower(),
size=cells[10].find("span").next,
)
@@ -232,6 +233,8 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
all_details = _find_in_divs(divs, " · ")
format = ""
size = ""
content = ""
for _details in all_details:
_details = _details.split(" · ")
for f in _details:
@@ -239,7 +242,11 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
format = f.strip().lower()
if size == "" and any(u in f.strip().lower() for u in ["mb", "kb", "gb"]):
size = f.strip().lower()
if content == "":
for ct in DOWNLOAD_PATHS.keys():
if ct in f.strip().lower():
content = ct
break
if format == "" or size == "":
for f in _details:
stripped = f.strip().lower()
@@ -247,7 +254,6 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
format = stripped
if size == "" and "." in stripped:
size = stripped
book_title = _find_in_divs(divs, "🔍")[0].strip("🔍").strip()
@@ -258,6 +264,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
id=book_id,
preview=preview,
title=book_title,
content=content,
publisher=_find_in_divs(divs, "icon-[mdi--company]", isClass=True)[0],
author=_find_in_divs(divs, "icon-[mdi--user-edit]", isClass=True)[0],
format=format,
@@ -277,7 +284,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
book_info.year = info["Year"][0]
# TODO :
# Backfill missing metadata from original book
# Backfill missing metadata from original book
# To do this, we need to cache the results of search_books() in some kind of LRU
return book_info
@@ -423,13 +430,13 @@ def download_book(book_info: BookInfo, book_path: Path, progress_callback: Optio
# Update status to resolving before attempting download URL fetch
if status_callback:
status_callback("resolving")
download_url = _get_download_url(link, book_info.title, cancel_flag, status_callback)
if download_url != "":
# Update status to downloading before starting actual download
if status_callback:
status_callback("downloading")
logger.info(f"Downloading `{book_info.title}` from `{download_url}`")
data = downloader.download_url(download_url, book_info.size or "", progress_callback, cancel_flag)

21
env.py
View File

@@ -17,6 +17,27 @@ LOG_ROOT = Path(os.getenv("LOG_ROOT", "/var/log/"))
LOG_DIR = LOG_ROOT / "cwa-book-downloader"
TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader"))
INGEST_DIR = Path(os.getenv("INGEST_DIR", "/cwa-book-ingest"))
# Optional per-content-type ingest directories. Each one defaults to an empty
# string, which means "no override configured" for that content type.
INGEST_DIR_BOOK_FICTION = os.getenv("INGEST_DIR_BOOK_FICTION", "")
INGEST_DIR_BOOK_NON_FICTION = os.getenv("INGEST_DIR_BOOK_NON_FICTION", "")
INGEST_DIR_BOOK_UNKNOWN = os.getenv("INGEST_DIR_BOOK_UNKNOWN", "")
INGEST_DIR_MAGAZINE = os.getenv("INGEST_DIR_MAGAZINE", "")
INGEST_DIR_COMIC_BOOK = os.getenv("INGEST_DIR_COMIC_BOOK", "")
INGEST_DIR_AUDIOBOOK = os.getenv("INGEST_DIR_AUDIOBOOK", "")
INGEST_DIR_STANDARDS_DOCUMENT = os.getenv("INGEST_DIR_STANDARDS_DOCUMENT", "")
INGEST_DIR_MUSICAL_SCORE = os.getenv("INGEST_DIR_MUSICAL_SCORE", "")
INGEST_DIR_OTHER = os.getenv("INGEST_DIR_OTHER", "")

# Maps a lowercase content-type label to the directory its downloads go to.
# A content type without a configured override falls back to INGEST_DIR.
DOWNLOAD_PATHS = {
    content_type: (Path(override) if override else INGEST_DIR)
    for content_type, override in (
        ("book (fiction)", INGEST_DIR_BOOK_FICTION),
        ("book (non-fiction)", INGEST_DIR_BOOK_NON_FICTION),
        ("book (unknown)", INGEST_DIR_BOOK_UNKNOWN),
        ("magazine", INGEST_DIR_MAGAZINE),
        ("comic book", INGEST_DIR_COMIC_BOOK),
        ("audiobook", INGEST_DIR_AUDIOBOOK),
        ("standards document", INGEST_DIR_STANDARDS_DOCUMENT),
        ("musical score", INGEST_DIR_MUSICAL_SCORE),
        ("other", INGEST_DIR_OTHER),
    )
}
STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", "3600"))
USE_BOOK_TITLE = string_to_bool(os.getenv("USE_BOOK_TITLE", "false"))
MAX_RETRY = int(os.getenv("MAX_RETRY", "10"))

View File

@@ -47,6 +47,7 @@ class BookInfo:
publisher: Optional[str] = None
year: Optional[str] = None
language: Optional[str] = None
content: Optional[str] = None
format: Optional[str] = None
size: Optional[str] = None
info: Optional[Dict[str, List[str]]] = None

View File

@@ -87,6 +87,30 @@ Note that if using TOR, the TZ will be calculated automatically based on IP.
If you change `BOOK_LANGUAGE`, you can add multiple comma separated languages, such as `en,fr,ru` etc.
Use the following environment variables to set specific folders in which to download
different content types (Book, Magazine, Comic, etc.):
| Variable | Description | Default Value |
|---------------------------------|--------------------------------|---------------|
| `INGEST_DIR_BOOK_FICTION` | Book (fiction) folder name | `` |
| `INGEST_DIR_BOOK_NON_FICTION` | Book (non-fiction) folder name | `` |
| `INGEST_DIR_BOOK_UNKNOWN` | Book (unknown) folder name | `` |
| `INGEST_DIR_MAGAZINE` | Magazine folder name | `` |
| `INGEST_DIR_COMIC_BOOK` | Comic book folder name | `` |
| `INGEST_DIR_AUDIOBOOK` | Audiobook folder name | `` |
| `INGEST_DIR_STANDARDS_DOCUMENT` | Standards document folder name | `` |
| `INGEST_DIR_MUSICAL_SCORE` | Musical score folder name | `` |
| `INGEST_DIR_OTHER` | Other content folder name | `` |
If no specific path is set for a content type, the default is `INGEST_DIR`.
Remember to map the specified paths to where your instance of Calibre-Web-Automated (CWA) will find them, e.g.:
```
volumes:
- /tmp/data/calibre-web/comicbook-ingest:/cwa-comicbook-ingest
```
if `INGEST_DIR_COMIC_BOOK=/cwa-comicbook-ingest` and your CWA is configured to use `/tmp/data/calibre-web/comicbook-ingest`
for comic books.
#### AA
| Variable | Description | Default Value |

View File

@@ -13,6 +13,7 @@ port = SERVER_ENV.FLASK_PORT
server_url = f"http://localhost:{port}"
book_title = "077484a10743e5dd5d151013e8c732f4" # "Moby Dick"
# Directory where downloads should appear
download_paths = SERVER_ENV.DOWNLOAD_PATHS
download_dir = SERVER_ENV.INGEST_DIR
# Timeout for waiting for download
download_timeout_seconds = 60 * 5
@@ -38,7 +39,7 @@ def check_download_status(book_id):
continue
# Check success conditions based on download_path
for status_key in ["available", "done"]:
for status_key in ["available", "done", "complete"]:
if status_key in status_data and book_id in status_data[status_key]:
book_status_info = status_data[status_key].get(book_id)
# Check if the status info is a dictionary and has a non-empty download_path
@@ -117,6 +118,12 @@ if SERVER_ENV.USE_BOOK_TITLE:
else:
expected_filename = f"{book_id}.epub"
if book_details.get("content"):
content = book_details.get("content")
for key, path in SERVER_ENV.DOWNLOAD_PATHS.items():
if key in content:
download_dir = path
break
expected_filepath = os.path.join(download_dir, expected_filename)
assert os.path.exists(expected_filepath), f"Expected downloaded file not found at: {expected_filepath}"