feat: fix Nextory series null handling and add PDF book support

- Fix _extract_series_name to handle null series values
- Add PDF book support for Nextory (served via epub endpoint)
- Convert PDF-in-epub to merged PDF after download
- Add pypdf dependency for PDF merging
- Add Pdf output format (unused but available)
- Update metadata transformer to handle both epub and pdf formats
2026-05-12 14:09:36 -06:00 · 2025-12-18 12:25:37 +01:00 · 2025-12-18 12:25:37 +01:00 · 6199e02b9b
commit 6199e02b9b
parent 08af04ab32
7 changed files with 127 additions and 29 deletions
--- a/grawlix/main.py
+++ b/grawlix/main.py
@ -163,6 +163,18 @@ async def download_with_progress(book: Book, progress: Progress, template: str,
    # Download the book
    await download_book(book, update_function, template)

+    # Convert PDF-in-epub to PDF if needed
+    if book.source_data and book.source_data.get('format_type') == 'pdf':
+        from .output import format_output_location, get_default_format
+        from .pdf_converter import convert_pdf_epub_to_pdf, is_pdf_in_epub
+
+        output_format = get_default_format(book)
+        location = format_output_location(book, output_format, template)
+
+        if location.endswith('.epub') and os.path.exists(location) and is_pdf_in_epub(location):
+            convert_pdf_epub_to_pdf(location)
+            logging.debug(f"Converted PDF-in-epub to PDF: {location}")
+
    # Write metadata if requested and available
    if write_metadata and book.source_data:
        from .output import format_output_location, get_default_format, find_output_format, get_valid_extensions
--- a/grawlix/epub_metadata_writers.py
+++ b/grawlix/epub_metadata_writers.py
@ -61,27 +61,30 @@ def nextory_transformer(details: dict) -> dict:
    :param details: Nextory book details JSON
    :return: Standardized metadata dict
    """
-    # Extract epub format
-    epub_format = None
-    for fmt in details.get("formats", []):
-        if fmt.get("type") == "epub":
-            epub_format = fmt
+    # Extract ebook format (epub or pdf - Nextory serves both as epub)
+    ebook_format = None
+    for fmt_type in ("epub", "pdf"):
+        for fmt in details.get("formats", []):
+            if fmt.get("type") == fmt_type:
+                ebook_format = fmt
+                break
+        if ebook_format:
            break

    metadata = {
        "title": details.get("title"),
        "authors": [author.get("name", "") for author in details.get("authors", [])],
-        "translators": [translator.get("name", "") for translator in epub_format.get("translators", []) if epub_format],
+        "translators": [translator.get("name", "") for translator in ebook_format.get("translators", [])] if ebook_format else [],
        "description": details.get("description_full"),
        "language": details.get("language"),
    }

-    # Epub-specific metadata
-    if epub_format:
-        metadata["publisher"] = epub_format.get("publisher", {}).get("name")
-        metadata["isbn"] = epub_format.get("isbn")
+    # Format-specific metadata
+    if ebook_format:
+        metadata["publisher"] = ebook_format.get("publisher", {}).get("name")
+        metadata["isbn"] = ebook_format.get("isbn")

-        publication_date = epub_format.get("publication_date")
+        publication_date = ebook_format.get("publication_date")
        if publication_date:
            # Already in YYYY-MM-DD format
            metadata["release_date"] = publication_date
--- a/grawlix/output/__init__.py
+++ b/grawlix/output/__init__.py
@ -6,6 +6,7 @@ from .output_format import OutputFormat
 from .acsm import Acsm
 from .cbz import Cbz
 from .epub import Epub
+from .pdf import Pdf

 from typing import Callable, Iterable
 from pathlib import Path
@ -184,4 +185,5 @@ def get_output_formats() -> list[type[OutputFormat]]:
        Acsm,
        Cbz,
        Epub,
+        Pdf,
    ]
--- a/grawlix/output/pdf.py
+++ b/grawlix/output/pdf.py
@ -0,0 +1,10 @@
+from grawlix.book import Book, SingleFile
+from .output_format import OutputFormat, Update
+
+
+class Pdf(OutputFormat):
+    """Output format for single-file books, registered with the "pdf" extension."""
+
+    # File extension associated with this output format
+    extension = "pdf"
+    # Book data types this output format accepts as input
+    input_types = [SingleFile]
+
+    async def download(self, book: Book, location: str, update_func: Update) -> None:
+        """
+        Download the book by delegating to the inherited single-file helper.
+
+        :param book: Book to download
+        :param location: Output location handed to the helper (presumably the destination path - confirm in OutputFormat)
+        :param update_func: Callback handed to the helper (presumably for progress reporting - confirm in OutputFormat)
+        """
+        await self._download_single_file(book, location, update_func)
--- a/grawlix/pdf_converter.py
+++ b/grawlix/pdf_converter.py
@ -0,0 +1,68 @@
+"""
+Convert PDF-in-epub files to proper PDF format.
+Some sources (like Nextory) wrap PDF pages in epub containers.
+"""
+
+import os
+import re
+import zipfile
+from io import BytesIO
+from pypdf import PdfWriter, PdfReader
+
+
+def convert_pdf_epub_to_pdf(epub_path: str) -> str:
+    """
+    Extract embedded PDFs from an epub and merge them into a single PDF.
+
+    The merged file is written next to the epub, with the epub's extension
+    replaced by ".pdf". The original epub is deleted once the merged PDF
+    has been written.
+
+    :param epub_path: Path to the epub file containing embedded PDFs
+    :return: Path to the created PDF file
+    :raises ValueError: If the archive contains no member ending in ".pdf"
+    """
+    # splitext only strips an extension from the last path component, so a
+    # dot in a directory name can never truncate the output path.
+    pdf_path = os.path.splitext(epub_path)[0] + '.pdf'
+
+    def extract_number(path: str) -> int:
+        """Return the numeric file stem of an archive member, or 0 if none."""
+        # Anchor on start-of-name or a slash so that members stored at the
+        # archive root ("10.pdf") sort like nested ones ("pages/10.pdf").
+        match = re.search(r'(?:^|/)(\d+)\.pdf$', path)
+        return int(match.group(1)) if match else 0
+
+    with zipfile.ZipFile(epub_path, 'r') as zf:
+        # Sort by numeric order (1.pdf, 2.pdf, ..., 10.pdf, 11.pdf, ...).
+        # The sort is stable, so names sharing a key keep archive order.
+        pdf_files = sorted(
+            (name for name in zf.namelist() if name.endswith('.pdf')),
+            key=extract_number,
+        )
+
+        if not pdf_files:
+            raise ValueError("No PDF files found in epub")
+
+        # Merge all PDFs page by page, in sorted order
+        writer = PdfWriter()
+        for pdf_file in pdf_files:
+            reader = PdfReader(BytesIO(zf.read(pdf_file)))
+            for page in reader.pages:
+                writer.add_page(page)
+
+        # Write merged PDF
+        with open(pdf_path, 'wb') as out_file:
+            writer.write(out_file)
+
+    # Remove the original epub only after the merged PDF has been written
+    os.remove(epub_path)
+
+    return pdf_path
+
+
+def is_pdf_in_epub(epub_path: str) -> bool:
+    """
+    Report whether an epub archive holds at least one embedded PDF file.
+
+    A missing file, or one that is not a valid zip archive, counts as "no".
+
+    :param epub_path: Path to the epub file
+    :return: True if any archive member name ends with ".pdf"
+    """
+    try:
+        with zipfile.ZipFile(epub_path, 'r') as archive:
+            member_names = archive.namelist()
+    except (zipfile.BadZipFile, FileNotFoundError):
+        # Unreadable or absent archive: nothing to convert
+        return False
+    return any(member.endswith('.pdf') for member in member_names)
--- a/grawlix/sources/nextory.py
+++ b/grawlix/sources/nextory.py
@ -3,7 +3,7 @@ from grawlix.encryption import AESEncryption
 from grawlix.exceptions import InvalidUrl
 from .source import Source

-from typing import Optional
+from typing import Optional, Tuple
 import uuid
 import base64

@ -24,14 +24,12 @@ class Nextory(Source):
        self._client.headers.update(
            {
                "X-Application-Id": "200",
-                "X-App-Version": "5.47.0",
+                "X-App-Version": "2025.12.1",
                "X-Locale": LOCALE,
                "X-Model": "Personal Computer",
                "X-Device-Id": device_id,
                "X-OS-INFO": "Personal Computer",
                "locale": LOCALE,
-                "device": device_id,
-                "appid": "200",
            }
        )
        # Login for account
@ -115,10 +113,12 @@ class Nextory(Source):

    async def _download_book(self, book_id: str) -> Book:
        product_data = await self._get_product_data(book_id)
-        epub_id = self._find_epub_id(product_data)
-        pages = await self._get_pages(epub_id)
+        format_type, format_id = self._find_format(product_data)
+        # Nextory serves all books via epub endpoint regardless of original format
+        data = await self._get_epub_data(format_id)
+
        return Book(
-            data = pages,
+            data = data,
            metadata = Metadata(
                title = product_data["title"],
                authors = [author["name"] for author in product_data["authors"]],
@ -126,6 +126,7 @@ class Nextory(Source):
            ),
            source_data = {
                "source_name": "nextory",
+                "format_type": format_type,
                "details": product_data
            }
        )
@ -145,27 +146,29 @@ class Nextory(Source):


    @staticmethod
-    def _find_epub_id(product_data) -> str:
-        """Find id of book format of type epub for given book"""
-        for format in product_data["formats"]:
-            if format["type"] == "epub":
-                return format["identifier"]
+    def _find_format(product_data) -> Tuple[str, str]:
+        """Find a supported book format (epub or pdf)"""
+        for format_type in ("epub", "pdf"):
+            for fmt in product_data["formats"]:
+                if fmt["type"] == format_type:
+                    return (format_type, fmt["identifier"])
        raise InvalidUrl


    @staticmethod
    def _extract_series_name(product_info: dict) -> Optional[str]:
-        if "series" not in product_info:
+        series = product_info.get("series")
+        if series is None:
            return None
-        return product_info["series"]["name"]
+        return series["name"]


-    async def _get_pages(self, epub_id: str) -> BookData:
+    async def _get_epub_data(self, epub_id: str) -> BookData:
        """
-        Download page information for book
+        Download epub data for book

        :param epub_id: Id of epub file
-        :return: Page data
+        :return: Epub data
        """
        # Nextory books are for some reason split up into multiple epub files -
        # one for each chapter file. All of these files has to be decrypted and
@ -197,7 +200,6 @@ class Nextory(Source):
            files_in_toc
        )

-
    @staticmethod
    def _fix_key(value: str) -> bytes:
        """Remove unused data and decode key"""
--- a/pyproject.toml
+++ b/pyproject.toml
@ -25,6 +25,7 @@ dependencies = [
    "lxml>=4.6.0",
    "platformdirs>=3.0.0",
    "pycryptodome>=3.10.0",
+    "pypdf>=3.0.0",
    "rich>=10.0.0",
    "tomli>=1.0.0; python_version<'3.11'",
 ]