feat: fix Nextory series null handling and add PDF book support

- Fix _extract_series_name to handle null series values
- Add PDF book support for Nextory (served via epub endpoint)
- Convert PDF-in-epub to merged PDF after download
- Add pypdf dependency for PDF merging
- Add Pdf output format (unused but available)
- Update metadata transformer to handle both epub and pdf formats
This commit is contained in:
^_^ 2025-12-18 12:25:37 +01:00
parent 08af04ab32
commit 6199e02b9b
7 changed files with 127 additions and 29 deletions

View File

@ -163,6 +163,18 @@ async def download_with_progress(book: Book, progress: Progress, template: str,
# Download the book
await download_book(book, update_function, template)
# Convert PDF-in-epub to PDF if needed
if book.source_data and book.source_data.get('format_type') == 'pdf':
from .output import format_output_location, get_default_format
from .pdf_converter import convert_pdf_epub_to_pdf, is_pdf_in_epub
output_format = get_default_format(book)
location = format_output_location(book, output_format, template)
if location.endswith('.epub') and os.path.exists(location) and is_pdf_in_epub(location):
convert_pdf_epub_to_pdf(location)
logging.debug(f"Converted PDF-in-epub to PDF: {location}")
# Write metadata if requested and available
if write_metadata and book.source_data:
from .output import format_output_location, get_default_format, find_output_format, get_valid_extensions

View File

@ -61,27 +61,30 @@ def nextory_transformer(details: dict) -> dict:
:param details: Nextory book details JSON
:return: Standardized metadata dict
"""
# Extract epub format
epub_format = None
for fmt in details.get("formats", []):
if fmt.get("type") == "epub":
epub_format = fmt
# Extract ebook format (epub or pdf - Nextory serves both as epub)
ebook_format = None
for fmt_type in ("epub", "pdf"):
for fmt in details.get("formats", []):
if fmt.get("type") == fmt_type:
ebook_format = fmt
break
if ebook_format:
break
metadata = {
"title": details.get("title"),
"authors": [author.get("name", "") for author in details.get("authors", [])],
"translators": [translator.get("name", "") for translator in epub_format.get("translators", []) if epub_format],
"translators": [translator.get("name", "") for translator in ebook_format.get("translators", [])] if ebook_format else [],
"description": details.get("description_full"),
"language": details.get("language"),
}
# Epub-specific metadata
if epub_format:
metadata["publisher"] = epub_format.get("publisher", {}).get("name")
metadata["isbn"] = epub_format.get("isbn")
# Format-specific metadata
if ebook_format:
metadata["publisher"] = ebook_format.get("publisher", {}).get("name")
metadata["isbn"] = ebook_format.get("isbn")
publication_date = epub_format.get("publication_date")
publication_date = ebook_format.get("publication_date")
if publication_date:
# Already in YYYY-MM-DD format
metadata["release_date"] = publication_date

View File

@ -6,6 +6,7 @@ from .output_format import OutputFormat
from .acsm import Acsm
from .cbz import Cbz
from .epub import Epub
from .pdf import Pdf
from typing import Callable, Iterable
from pathlib import Path
@ -184,4 +185,5 @@ def get_output_formats() -> list[type[OutputFormat]]:
Acsm,
Cbz,
Epub,
Pdf,
]

10
grawlix/output/pdf.py Normal file
View File

@ -0,0 +1,10 @@
from grawlix.book import Book, SingleFile
from .output_format import OutputFormat, Update
class Pdf(OutputFormat):
    """Output format for books that consist of a single PDF file"""

    # File extension associated with this output format
    extension = "pdf"
    # Book data types this output format accepts
    input_types = [SingleFile]

    async def download(self, book: Book, location: str, update_func: Update) -> None:
        """
        Download the book as a single file

        :param book: Book to download
        :param location: Output location handed to the single file downloader
        :param update_func: Update callback handed to the single file downloader
        """
        await self._download_single_file(book, location, update_func)

68
grawlix/pdf_converter.py Normal file
View File

@ -0,0 +1,68 @@
"""
Convert PDF-in-epub files to proper PDF format.
Some sources (like Nextory) wrap PDF pages in epub containers.
"""
import os
import re
import zipfile
from io import BytesIO
from pypdf import PdfWriter, PdfReader
def convert_pdf_epub_to_pdf(epub_path: str) -> str:
    """
    Extract embedded PDFs from an epub and merge them into a single PDF.

    The merged PDF is written next to the epub, using the same file name with
    a ``.pdf`` extension. The original epub is removed once the PDF has been
    written.

    :param epub_path: Path to the epub file containing embedded PDFs
    :return: Path to the created PDF file
    :raises ValueError: If the epub does not contain any PDF files
    """
    # splitext only inspects the last path component, so a dot in a parent
    # directory name can never truncate the output path
    pdf_path = os.path.splitext(epub_path)[0] + '.pdf'

    def extract_number(path: str) -> int:
        """Return the number encoded in the file name, or 0 if there is none"""
        # Match nested entries ("OEBPS/12.pdf") as well as entries stored at
        # the archive root ("12.pdf")
        match = re.search(r'(?:^|/)(\d+)\.pdf$', path)
        return int(match.group(1)) if match else 0

    with zipfile.ZipFile(epub_path, 'r') as zf:
        # Find all PDF files in the epub
        pdf_files = [name for name in zf.namelist() if name.endswith('.pdf')]
        if not pdf_files:
            raise ValueError("No PDF files found in epub")

        # Sort by numeric order (1.pdf, 2.pdf, ..., 10.pdf, 11.pdf, ...).
        # The sort is stable, so files without a number keep archive order.
        pdf_files.sort(key=extract_number)

        # Merge all PDFs. Each file is read fully into memory, so the pages
        # stay valid after the archive has been closed.
        writer = PdfWriter()
        for pdf_file in pdf_files:
            reader = PdfReader(BytesIO(zf.read(pdf_file)))
            for page in reader.pages:
                writer.add_page(page)

    # Write merged PDF
    with open(pdf_path, 'wb') as out_file:
        writer.write(out_file)

    # Remove the original epub, but never the file that was just written
    if pdf_path != epub_path:
        os.remove(epub_path)
    return pdf_path
def is_pdf_in_epub(epub_path: str) -> bool:
    """
    Tell whether an epub archive holds at least one embedded PDF file.

    A missing file or a file that is not a valid zip archive counts as
    "no PDF" rather than raising.

    :param epub_path: Path to the epub file
    :return: True if any entry name in the archive ends with ".pdf"
    """
    try:
        with zipfile.ZipFile(epub_path, 'r') as archive:
            return any(member.endswith('.pdf') for member in archive.namelist())
    except (zipfile.BadZipFile, FileNotFoundError):
        return False

View File

@ -3,7 +3,7 @@ from grawlix.encryption import AESEncryption
from grawlix.exceptions import InvalidUrl
from .source import Source
from typing import Optional
from typing import Optional, Tuple
import uuid
import base64
@ -24,14 +24,12 @@ class Nextory(Source):
self._client.headers.update(
{
"X-Application-Id": "200",
"X-App-Version": "5.47.0",
"X-App-Version": "2025.12.1",
"X-Locale": LOCALE,
"X-Model": "Personal Computer",
"X-Device-Id": device_id,
"X-OS-INFO": "Personal Computer",
"locale": LOCALE,
"device": device_id,
"appid": "200",
}
)
# Login for account
@ -115,10 +113,12 @@ class Nextory(Source):
async def _download_book(self, book_id: str) -> Book:
product_data = await self._get_product_data(book_id)
epub_id = self._find_epub_id(product_data)
pages = await self._get_pages(epub_id)
format_type, format_id = self._find_format(product_data)
# Nextory serves all books via epub endpoint regardless of original format
data = await self._get_epub_data(format_id)
return Book(
data = pages,
data = data,
metadata = Metadata(
title = product_data["title"],
authors = [author["name"] for author in product_data["authors"]],
@ -126,6 +126,7 @@ class Nextory(Source):
),
source_data = {
"source_name": "nextory",
"format_type": format_type,
"details": product_data
}
)
@ -145,27 +146,29 @@ class Nextory(Source):
@staticmethod
def _find_epub_id(product_data) -> str:
"""Find id of book format of type epub for given book"""
for format in product_data["formats"]:
if format["type"] == "epub":
return format["identifier"]
def _find_format(product_data) -> Tuple[str, str]:
"""Find a supported book format (epub or pdf)"""
for format_type in ("epub", "pdf"):
for fmt in product_data["formats"]:
if fmt["type"] == format_type:
return (format_type, fmt["identifier"])
raise InvalidUrl
@staticmethod
def _extract_series_name(product_info: dict) -> Optional[str]:
if "series" not in product_info:
series = product_info.get("series")
if series is None:
return None
return product_info["series"]["name"]
return series["name"]
async def _get_pages(self, epub_id: str) -> BookData:
async def _get_epub_data(self, epub_id: str) -> BookData:
"""
Download page information for book
Download epub data for book
:param epub_id: Id of epub file
:return: Page data
:return: Epub data
"""
# Nextory books are for some reason split up into multiple epub files -
# one for each chapter file. All of these files have to be decrypted and
@ -197,7 +200,6 @@ class Nextory(Source):
files_in_toc
)
@staticmethod
def _fix_key(value: str) -> bytes:
"""Remove unused data and decode key"""

View File

@ -25,6 +25,7 @@ dependencies = [
"lxml>=4.6.0",
"platformdirs>=3.0.0",
"pycryptodome>=3.10.0",
"pypdf>=3.0.0",
"rich>=10.0.0",
"tomli>=1.0.0; python_version<'3.11'",
]