mirror of
https://github.com/calibrain/shelfmark.git
synced 2026-02-19 23:37:51 -05:00
Re: Issue #122 Fetches content type from search results - displays it on thumbnails in results grid; Fetches content type from book id detail page (defaults to "Other") and uses it to construct the `final_path`. --------- Co-authored-by: Patricia Ritter <pritter@events.com> Co-authored-by: CaliBrain <calibrain@l4n.xyz>
This commit is contained in:
15
backend.py
15
backend.py
@@ -11,7 +11,8 @@ from threading import Event
|
||||
|
||||
from logger import setup_logger
|
||||
from config import CUSTOM_SCRIPT
|
||||
from env import INGEST_DIR, TMP_DIR, MAIN_LOOP_SLEEP_TIME, USE_BOOK_TITLE, MAX_CONCURRENT_DOWNLOADS, DOWNLOAD_PROGRESS_UPDATE_INTERVAL
|
||||
from env import (INGEST_DIR, DOWNLOAD_PATHS, TMP_DIR, MAIN_LOOP_SLEEP_TIME, USE_BOOK_TITLE,
|
||||
MAX_CONCURRENT_DOWNLOADS, DOWNLOAD_PROGRESS_UPDATE_INTERVAL)
|
||||
from models import book_queue, BookInfo, QueueStatus, SearchFilters
|
||||
import book_manager
|
||||
|
||||
@@ -135,6 +136,13 @@ def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]:
|
||||
if value is not None
|
||||
}
|
||||
|
||||
def _prepare_download_folder(book_info: BookInfo) -> Path:
    """Resolve and create the destination directory for a book's content type.

    Args:
        book_info: Book metadata; only its ``content`` attribute is read.

    Returns:
        The ``DOWNLOAD_PATHS`` entry for ``book_info.content`` when that value
        is non-empty and is a key of ``DOWNLOAD_PATHS``; otherwise
        ``INGEST_DIR``. The directory is created if it does not exist yet.
    """
    content_type = book_info.content
    target_dir = INGEST_DIR
    # A missing/empty content type, or one with no configured entry,
    # keeps the general ingest directory as the destination.
    if content_type and content_type in DOWNLOAD_PATHS:
        target_dir = DOWNLOAD_PATHS.get(content_type)
    os.makedirs(target_dir, exist_ok=True)
    return target_dir
|
||||
|
||||
def _download_book_with_cancellation(book_id: str, cancel_flag: Event) -> Optional[str]:
|
||||
"""Download and process a book with cancellation support.
|
||||
|
||||
@@ -212,8 +220,9 @@ def _download_book_with_cancellation(book_id: str, cancel_flag: Event) -> Option
|
||||
book_info.format = success_download_url.split(".")[-1]
|
||||
book_name += f".{book_info.format}"
|
||||
|
||||
intermediate_path = INGEST_DIR / f"{book_id}.crdownload"
|
||||
final_path = INGEST_DIR / book_name
|
||||
final_dir = _prepare_download_folder(book_info)
|
||||
intermediate_path = final_dir / f"{book_id}.crdownload"
|
||||
final_path = final_dir / book_name
|
||||
|
||||
if os.path.exists(book_path):
|
||||
logger.info(f"Moving book to ingest directory: {book_path} -> {final_path}")
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Book download manager handling search and retrieval operations."""
|
||||
|
||||
import time, json, re
|
||||
import time, json, os, re
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
from typing import List, Optional, Dict, Union, Callable
|
||||
@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString, ResultSet
|
||||
import downloader
|
||||
from logger import setup_logger
|
||||
from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_BASE_URL
|
||||
from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB
|
||||
from env import AA_DONATOR_KEY, USE_CF_BYPASS, PRIORITIZE_WELIB, ALLOW_USE_WELIB, DOWNLOAD_PATHS
|
||||
from models import BookInfo, SearchFilters
|
||||
logger = setup_logger(__name__)
|
||||
|
||||
@@ -125,6 +125,7 @@ def _parse_search_result_row(row: Tag) -> Optional[BookInfo]:
|
||||
publisher=cells[3].find("span").next,
|
||||
year=cells[4].find("span").next,
|
||||
language=cells[7].find("span").next,
|
||||
content=cells[8].find("span").next.lower(),
|
||||
format=cells[9].find("span").next.lower(),
|
||||
size=cells[10].find("span").next,
|
||||
)
|
||||
@@ -232,6 +233,8 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
|
||||
all_details = _find_in_divs(divs, " · ")
|
||||
format = ""
|
||||
size = ""
|
||||
content = ""
|
||||
|
||||
for _details in all_details:
|
||||
_details = _details.split(" · ")
|
||||
for f in _details:
|
||||
@@ -239,7 +242,11 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
|
||||
format = f.strip().lower()
|
||||
if size == "" and any(u in f.strip().lower() for u in ["mb", "kb", "gb"]):
|
||||
size = f.strip().lower()
|
||||
|
||||
if content == "":
|
||||
for ct in DOWNLOAD_PATHS.keys():
|
||||
if ct in f.strip().lower():
|
||||
content = ct
|
||||
break
|
||||
if format == "" or size == "":
|
||||
for f in _details:
|
||||
stripped = f.strip().lower()
|
||||
@@ -247,7 +254,6 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
|
||||
format = stripped
|
||||
if size == "" and "." in stripped:
|
||||
size = stripped
|
||||
|
||||
|
||||
book_title = _find_in_divs(divs, "🔍")[0].strip("🔍").strip()
|
||||
|
||||
@@ -258,6 +264,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
|
||||
id=book_id,
|
||||
preview=preview,
|
||||
title=book_title,
|
||||
content=content,
|
||||
publisher=_find_in_divs(divs, "icon-[mdi--company]", isClass=True)[0],
|
||||
author=_find_in_divs(divs, "icon-[mdi--user-edit]", isClass=True)[0],
|
||||
format=format,
|
||||
@@ -277,7 +284,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
|
||||
book_info.year = info["Year"][0]
|
||||
|
||||
# TODO :
|
||||
# Backfill missing metadata from original book
|
||||
# Backfill missing metadata from original book
|
||||
# To do this, we need to cache the results of search_books() in some kind of LRU
|
||||
|
||||
return book_info
|
||||
@@ -423,13 +430,13 @@ def download_book(book_info: BookInfo, book_path: Path, progress_callback: Optio
|
||||
# Update status to resolving before attempting download URL fetch
|
||||
if status_callback:
|
||||
status_callback("resolving")
|
||||
|
||||
|
||||
download_url = _get_download_url(link, book_info.title, cancel_flag, status_callback)
|
||||
if download_url != "":
|
||||
# Update status to downloading before starting actual download
|
||||
if status_callback:
|
||||
status_callback("downloading")
|
||||
|
||||
|
||||
logger.info(f"Downloading `{book_info.title}` from `{download_url}`")
|
||||
|
||||
data = downloader.download_url(download_url, book_info.size or "", progress_callback, cancel_flag)
|
||||
|
||||
21
env.py
21
env.py
@@ -17,6 +17,27 @@ LOG_ROOT = Path(os.getenv("LOG_ROOT", "/var/log/"))
|
||||
LOG_DIR = LOG_ROOT / "cwa-book-downloader"
|
||||
TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader"))
|
||||
INGEST_DIR = Path(os.getenv("INGEST_DIR", "/cwa-book-ingest"))
|
||||
# Optional per-content-type ingest directories, read from the environment.
# An empty string means no override was configured for that content type.
INGEST_DIR_BOOK_FICTION = os.getenv("INGEST_DIR_BOOK_FICTION", "")
INGEST_DIR_BOOK_NON_FICTION = os.getenv("INGEST_DIR_BOOK_NON_FICTION", "")
INGEST_DIR_BOOK_UNKNOWN = os.getenv("INGEST_DIR_BOOK_UNKNOWN", "")
INGEST_DIR_MAGAZINE = os.getenv("INGEST_DIR_MAGAZINE", "")
INGEST_DIR_COMIC_BOOK = os.getenv("INGEST_DIR_COMIC_BOOK", "")
INGEST_DIR_AUDIOBOOK = os.getenv("INGEST_DIR_AUDIOBOOK", "")
INGEST_DIR_STANDARDS_DOCUMENT = os.getenv("INGEST_DIR_STANDARDS_DOCUMENT", "")
INGEST_DIR_MUSICAL_SCORE = os.getenv("INGEST_DIR_MUSICAL_SCORE", "")
INGEST_DIR_OTHER = os.getenv("INGEST_DIR_OTHER", "")

# Maps each lower-case content-type label to the directory its downloads are
# placed in. A label whose override is empty falls back to INGEST_DIR.
DOWNLOAD_PATHS = {
    label: Path(override) if override else INGEST_DIR
    for label, override in (
        ("book (fiction)", INGEST_DIR_BOOK_FICTION),
        ("book (non-fiction)", INGEST_DIR_BOOK_NON_FICTION),
        ("book (unknown)", INGEST_DIR_BOOK_UNKNOWN),
        ("magazine", INGEST_DIR_MAGAZINE),
        ("comic book", INGEST_DIR_COMIC_BOOK),
        ("audiobook", INGEST_DIR_AUDIOBOOK),
        ("standards document", INGEST_DIR_STANDARDS_DOCUMENT),
        ("musical score", INGEST_DIR_MUSICAL_SCORE),
        ("other", INGEST_DIR_OTHER),
    )
}
|
||||
|
||||
STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", "3600"))
|
||||
USE_BOOK_TITLE = string_to_bool(os.getenv("USE_BOOK_TITLE", "false"))
|
||||
MAX_RETRY = int(os.getenv("MAX_RETRY", "10"))
|
||||
|
||||
@@ -47,6 +47,7 @@ class BookInfo:
|
||||
publisher: Optional[str] = None
|
||||
year: Optional[str] = None
|
||||
language: Optional[str] = None
|
||||
content: Optional[str] = None
|
||||
format: Optional[str] = None
|
||||
size: Optional[str] = None
|
||||
info: Optional[Dict[str, List[str]]] = None
|
||||
|
||||
24
readme.md
24
readme.md
@@ -87,6 +87,30 @@ Note that if using TOR, the TZ will be calculated automatically based on IP.
|
||||
|
||||
If you change `BOOK_LANGUAGE`, you can add multiple comma separated languages, such as `en,fr,ru` etc.
|
||||
|
||||
Use the following environment variables to set specific folders in which to download
|
||||
different content types (Book, Magazine, Comic, etc.):
|
||||
|
||||
| Variable | Description | Default Value |
|
||||
|---------------------------------|--------------------------------|---------------|
|
||||
| `INGEST_DIR_BOOK_FICTION` | Book (fiction) folder name | `` |
|
||||
| `INGEST_DIR_BOOK_NON_FICTION` | Book (non-fiction) folder name | `` |
|
||||
| `INGEST_DIR_BOOK_UNKNOWN` | Book (unknown) folder name | `` |
|
||||
| `INGEST_DIR_MAGAZINE` | Magazine folder name | `` |
|
||||
| `INGEST_DIR_COMIC_BOOK` | Comic book folder name | `` |
|
||||
| `INGEST_DIR_AUDIOBOOK` | Audiobook folder name | `` |
|
||||
| `INGEST_DIR_STANDARDS_DOCUMENT` | Standards document folder name | `` |
|
||||
| `INGEST_DIR_MUSICAL_SCORE` | Musical score folder name | `` |
| `INGEST_DIR_OTHER` | Other content folder name | `` |
|
||||
|
||||
If no specific path is set for a content type, the default is `INGEST_DIR`.
|
||||
Remember to map the specified paths to where your instance of Calibre-Web-Automated (CWA) will find them, e.g.:
|
||||
```
|
||||
volumes:
|
||||
- /tmp/data/calibre-web/comicbook-ingest:/cwa-comicbook-ingest
|
||||
```
|
||||
if `INGEST_DIR_COMIC_BOOK=/cwa-comicbook-ingest` and your CWA is configured to use `/tmp/data/calibre-web/comicbook-ingest`
|
||||
for comic books.
|
||||
|
||||
|
||||
#### AA
|
||||
|
||||
| Variable | Description | Default Value |
|
||||
|
||||
@@ -13,6 +13,7 @@ port = SERVER_ENV.FLASK_PORT
|
||||
server_url = f"http://localhost:{port}"
|
||||
book_title = "077484a10743e5dd5d151013e8c732f4" # "Moby Dick"
|
||||
# Directory where downloads should appear
|
||||
download_paths = SERVER_ENV.DOWNLOAD_PATHS
|
||||
download_dir = SERVER_ENV.INGEST_DIR
|
||||
# Timeout for waiting for download
|
||||
download_timeout_seconds = 60 * 5
|
||||
@@ -38,7 +39,7 @@ def check_download_status(book_id):
|
||||
continue
|
||||
|
||||
# Check success conditions based on download_path
|
||||
for status_key in ["available", "done"]:
|
||||
for status_key in ["available", "done", "complete"]:
|
||||
if status_key in status_data and book_id in status_data[status_key]:
|
||||
book_status_info = status_data[status_key].get(book_id)
|
||||
# Check if the status info is a dictionary and has a non-empty download_path
|
||||
@@ -117,6 +118,12 @@ if SERVER_ENV.USE_BOOK_TITLE:
|
||||
else:
|
||||
expected_filename = f"{book_id}.epub"
|
||||
|
||||
if book_details.get("content"):
|
||||
content = book_details.get("content")
|
||||
for key, path in SERVER_ENV.DOWNLOAD_PATHS.items():
|
||||
if key in content:
|
||||
download_dir = path
|
||||
break
|
||||
expected_filepath = os.path.join(download_dir, expected_filename)
|
||||
|
||||
assert os.path.exists(expected_filepath), f"Expected downloaded file not found at: {expected_filepath}"
|
||||
|
||||
Reference in New Issue
Block a user