[FEATURE] Separate download folders #122 (#297)

Re: Issue #122 Fetches content type from search results - displays it on thumbnails in results grid; Fetches content type from book id detail page (defaults to "Other") and uses it to construct the `final_path`. --------- Co-authored-by: Patricia Ritter <pritter@events.com> Co-authored-by: CaliBrain <calibrain@l4n.xyz>
2026-02-19 23:37:51 -05:00 · 2025-11-23 23:11:43 +01:00
parent 78c61e88b3
commit 2b831dcfa5
6 changed files with 80 additions and 11 deletions
--- a/backend.py
+++ b/backend.py
@@ -11,7 +11,8 @@ from threading import Event

 from logger import setup_logger
 from config import CUSTOM_SCRIPT
-from env import INGEST_DIR, TMP_DIR, MAIN_LOOP_SLEEP_TIME, USE_BOOK_TITLE, MAX_CONCURRENT_DOWNLOADS, DOWNLOAD_PROGRESS_UPDATE_INTERVAL
+from env import (INGEST_DIR, DOWNLOAD_PATHS, TMP_DIR, MAIN_LOOP_SLEEP_TIME, USE_BOOK_TITLE,
+                 MAX_CONCURRENT_DOWNLOADS, DOWNLOAD_PROGRESS_UPDATE_INTERVAL)
 from models import book_queue, BookInfo, QueueStatus, SearchFilters
 import book_manager

@@ -135,6 +136,13 @@ def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]:
        if value is not None
    }

+def _prepare_download_folder(book_info: BookInfo) -> Path:
+    """Return the ingest folder mapped to the book's content type, creating it if missing."""
+    # A missing or unmapped content type falls back to the default INGEST_DIR.
+    target_dir = DOWNLOAD_PATHS.get(book_info.content, INGEST_DIR)
+    os.makedirs(target_dir, exist_ok=True)
+    return target_dir
+
 def _download_book_with_cancellation(book_id: str, cancel_flag: Event) -> Optional[str]:
    """Download and process a book with cancellation support.
    
@@ -212,8 +220,9 @@ def _download_book_with_cancellation(book_id: str, cancel_flag: Event) -> Option
            book_info.format = success_download_url.split(".")[-1]
            book_name += f".{book_info.format}"

-        intermediate_path = INGEST_DIR / f"{book_id}.crdownload"
-        final_path = INGEST_DIR / book_name
+        final_dir = _prepare_download_folder(book_info)
+        intermediate_path = final_dir / f"{book_id}.crdownload"
+        final_path = final_dir / book_name
        
        if os.path.exists(book_path):
            logger.info(f"Moving book to ingest directory: {book_path} -> {final_path}")
--- a/book_manager.py
+++ b/book_manager.py
@@ -1,6 +1,6 @@
 """Book download manager handling search and retrieval operations."""

-import time, json, re
+import time, json, os, re
 from pathlib import Path
 from urllib.parse import quote
 from typing import List, Optional, Dict, Union, Callable
@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString, ResultSet
 import downloader
 from logger import setup_logger
 from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_BASE_URL
-from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB
+from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB, DOWNLOAD_PATHS
 from models import BookInfo, SearchFilters
 logger = setup_logger(__name__)

@@ -125,6 +125,7 @@ def _parse_search_result_row(row: Tag) -> Optional[BookInfo]:
            publisher=cells[3].find("span").next,
            year=cells[4].find("span").next,
            language=cells[7].find("span").next,
+            content=cells[8].find("span").next.lower(),
            format=cells[9].find("span").next.lower(),
            size=cells[10].find("span").next,
        )
@@ -232,6 +233,8 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
    all_details = _find_in_divs(divs, " · ")
    format = ""
    size = ""
+    content = ""
+    
    for _details in all_details:
        _details = _details.split(" · ")
        for f in _details:
@@ -239,7 +242,11 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
                format = f.strip().lower()
            if size == "" and any(u in f.strip().lower() for u in ["mb", "kb", "gb"]):
                size = f.strip().lower()
-
+            if content == "":
+                for ct in DOWNLOAD_PATHS.keys():
+                    if ct in f.strip().lower():
+                        content = ct
+                        break
        if format == "" or size == "":
            for f in _details:
                stripped = f.strip().lower()
@@ -247,7 +254,6 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
                    format = stripped
                if size == "" and "." in stripped:
                    size = stripped
-
    
    book_title = _find_in_divs(divs, "🔍")[0].strip("🔍").strip()

@@ -258,6 +264,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
        id=book_id,
        preview=preview,
        title=book_title,
+        content=content,
        publisher=_find_in_divs(divs, "icon-[mdi--company]", isClass=True)[0],
        author=_find_in_divs(divs, "icon-[mdi--user-edit]", isClass=True)[0],
        format=format,
@@ -277,7 +284,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
        book_info.year = info["Year"][0]

    # TODO :
-    # Backfill missing metadata from original book 
+    # Backfill missing metadata from original book
    # To do this, we need to cache the results of search_books() in some kind of LRU

    return book_info
@@ -423,13 +430,13 @@ def download_book(book_info: BookInfo, book_path: Path, progress_callback: Optio
            # Update status to resolving before attempting download URL fetch
            if status_callback:
                status_callback("resolving")
-            
+
            download_url = _get_download_url(link, book_info.title, cancel_flag, status_callback)
            if download_url != "":
                # Update status to downloading before starting actual download
                if status_callback:
                    status_callback("downloading")
-                    
+
                logger.info(f"Downloading `{book_info.title}` from `{download_url}`")

                data = downloader.download_url(download_url, book_info.size or "", progress_callback, cancel_flag)
--- a/env.py
+++ b/env.py
@@ -17,6 +17,27 @@ LOG_ROOT = Path(os.getenv("LOG_ROOT", "/var/log/"))
 LOG_DIR = LOG_ROOT / "cwa-book-downloader"
 TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader"))
 INGEST_DIR = Path(os.getenv("INGEST_DIR", "/cwa-book-ingest"))
+INGEST_DIR_BOOK_FICTION = os.getenv("INGEST_DIR_BOOK_FICTION", "")  # this and the INGEST_DIR_* values below: optional per-content-type folder; "" means not set
+INGEST_DIR_BOOK_NON_FICTION = os.getenv("INGEST_DIR_BOOK_NON_FICTION", "")
+INGEST_DIR_BOOK_UNKNOWN = os.getenv("INGEST_DIR_BOOK_UNKNOWN", "")
+INGEST_DIR_MAGAZINE = os.getenv("INGEST_DIR_MAGAZINE", "")
+INGEST_DIR_COMIC_BOOK = os.getenv("INGEST_DIR_COMIC_BOOK", "")
+INGEST_DIR_AUDIOBOOK = os.getenv("INGEST_DIR_AUDIOBOOK", "")
+INGEST_DIR_STANDARDS_DOCUMENT = os.getenv("INGEST_DIR_STANDARDS_DOCUMENT", "")
+INGEST_DIR_MUSICAL_SCORE = os.getenv("INGEST_DIR_MUSICAL_SCORE", "")
+INGEST_DIR_OTHER = os.getenv("INGEST_DIR_OTHER", "")
+DOWNLOAD_PATHS = {  # lower-case content-type label -> target folder; each entry falls back to INGEST_DIR when its variable is empty
+    "book (fiction)": Path(INGEST_DIR_BOOK_FICTION) if INGEST_DIR_BOOK_FICTION else INGEST_DIR,
+    "book (non-fiction)": Path(INGEST_DIR_BOOK_NON_FICTION) if INGEST_DIR_BOOK_NON_FICTION else INGEST_DIR,
+    "book (unknown)": Path(INGEST_DIR_BOOK_UNKNOWN) if INGEST_DIR_BOOK_UNKNOWN else INGEST_DIR,
+    "magazine": Path(INGEST_DIR_MAGAZINE) if INGEST_DIR_MAGAZINE else INGEST_DIR,
+    "comic book": Path(INGEST_DIR_COMIC_BOOK) if INGEST_DIR_COMIC_BOOK else INGEST_DIR,
+    "audiobook": Path(INGEST_DIR_AUDIOBOOK) if INGEST_DIR_AUDIOBOOK else INGEST_DIR,
+    "standards document": Path(INGEST_DIR_STANDARDS_DOCUMENT) if INGEST_DIR_STANDARDS_DOCUMENT else INGEST_DIR,
+    "musical score": Path(INGEST_DIR_MUSICAL_SCORE) if INGEST_DIR_MUSICAL_SCORE else INGEST_DIR,
+    "other": Path(INGEST_DIR_OTHER) if INGEST_DIR_OTHER else INGEST_DIR,
+}
+
 STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", "3600"))
 USE_BOOK_TITLE = string_to_bool(os.getenv("USE_BOOK_TITLE", "false"))
 MAX_RETRY = int(os.getenv("MAX_RETRY", "10"))
--- a/models.py
+++ b/models.py
@@ -47,6 +47,7 @@ class BookInfo:
    publisher: Optional[str] = None
    year: Optional[str] = None
    language: Optional[str] = None
+    content: Optional[str] = None
    format: Optional[str] = None
    size: Optional[str] = None
    info: Optional[Dict[str, List[str]]] = None
--- a/readme.md
+++ b/readme.md
@@ -87,6 +87,30 @@ Note that if using TOR, the TZ will be calculated automatically based on IP.

 If you change `BOOK_LANGUAGE`, you can add multiple comma separated languages, such as `en,fr,ru` etc.  

+Use the following environment variables to set specific folders in which to download 
+different content types (Book, Magazine, Comic, etc.):
+
+| Variable                        | Description                    | Default Value |
+|---------------------------------|--------------------------------|---------------|
+| `INGEST_DIR_BOOK_FICTION`       | Book (fiction) folder path     | (empty)       |
+| `INGEST_DIR_BOOK_NON_FICTION`   | Book (non-fiction) folder path | (empty)       |
+| `INGEST_DIR_BOOK_UNKNOWN`       | Book (unknown) folder path     | (empty)       |
+| `INGEST_DIR_MAGAZINE`           | Magazine folder path           | (empty)       |
+| `INGEST_DIR_COMIC_BOOK`         | Comic book folder path         | (empty)       |
+| `INGEST_DIR_AUDIOBOOK`          | Audiobook folder path          | (empty)       |
+| `INGEST_DIR_STANDARDS_DOCUMENT` | Standards document folder path | (empty)       |
+| `INGEST_DIR_MUSICAL_SCORE`      | Musical score folder path      | (empty)       |
+
+If no specific path is set for a content type, the default is `INGEST_DIR`. The same applies to `INGEST_DIR_OTHER`, which covers the "other" content type.  
+Remember to map the specified paths to where your instance of Calibre-Web-Automated (CWA) will find them, e.g.:  
+```
+volumes:
+    - /tmp/data/calibre-web/comicbook-ingest:/cwa-comicbook-ingest
+```
+The mapping above applies when `INGEST_DIR_COMIC_BOOK=/cwa-comicbook-ingest` and your CWA is configured to use `/tmp/data/calibre-web/comicbook-ingest` 
+for comic books.
+
+
 #### AA 

 | Variable               | Description                                               | Default Value                     |
--- a/testing/E2E_test.py
+++ b/testing/E2E_test.py
@@ -13,6 +13,7 @@ port = SERVER_ENV.FLASK_PORT
 server_url = f"http://localhost:{port}"
 book_title = "077484a10743e5dd5d151013e8c732f4" # "Moby Dick"
 # Directory where downloads should appear
+download_paths = SERVER_ENV.DOWNLOAD_PATHS
 download_dir = SERVER_ENV.INGEST_DIR
 # Timeout for waiting for download
 download_timeout_seconds = 60 * 5
@@ -38,7 +39,7 @@ def check_download_status(book_id):
            continue

        # Check success conditions based on download_path
-        for status_key in ["available", "done"]:
+        for status_key in ["available", "done", "complete"]:
            if status_key in status_data and book_id in status_data[status_key]:
                book_status_info = status_data[status_key].get(book_id)
                # Check if the status info is a dictionary and has a non-empty download_path
@@ -117,6 +118,12 @@ if SERVER_ENV.USE_BOOK_TITLE:
 else:
    expected_filename = f"{book_id}.epub"

+if book_details.get("content"):
+    content = book_details.get("content")
+    for key, path in SERVER_ENV.DOWNLOAD_PATHS.items():
+        if key in content:
+            download_dir = path
+            break
 expected_filepath = os.path.join(download_dir, expected_filename)

 assert os.path.exists(expected_filepath), f"Expected downloaded file not found at: {expected_filepath}"