diff --git a/grawlix/__main__.py b/grawlix/__main__.py
index 5ad8b7e..6370d4e 100644
--- a/grawlix/__main__.py
+++ b/grawlix/__main__.py
@@ -163,6 +163,18 @@ async def download_with_progress(book: Book, progress: Progress, template: str,
     # Download the book
     await download_book(book, update_function, template)
 
+    # Convert PDF-in-epub to PDF if needed
+    if book.source_data and book.source_data.get('format_type') == 'pdf':
+        from .output import format_output_location, get_default_format
+        from .pdf_converter import convert_pdf_epub_to_pdf, is_pdf_in_epub
+
+        output_format = get_default_format(book)
+        location = format_output_location(book, output_format, template)
+
+        if location.endswith('.epub') and os.path.exists(location) and is_pdf_in_epub(location):
+            pdf_path = convert_pdf_epub_to_pdf(location)
+            logging.debug(f"Converted PDF-in-epub to PDF: {pdf_path}")
+
     # Write metadata if requested and available
     if write_metadata and book.source_data:
         from .output import format_output_location, get_default_format, find_output_format, get_valid_extensions
diff --git a/grawlix/epub_metadata_writers.py b/grawlix/epub_metadata_writers.py
index 5a8b063..0f3dc3a 100644
--- a/grawlix/epub_metadata_writers.py
+++ b/grawlix/epub_metadata_writers.py
@@ -61,27 +61,30 @@ def nextory_transformer(details: dict) -> dict:
     :param details: Nextory book details JSON
     :return: Standardized metadata dict
     """
-    # Extract epub format
-    epub_format = None
-    for fmt in details.get("formats", []):
-        if fmt.get("type") == "epub":
-            epub_format = fmt
+    # Extract ebook format (epub or pdf - Nextory serves both as epub)
+    ebook_format = None
+    for fmt_type in ("epub", "pdf"):
+        for fmt in details.get("formats", []):
+            if fmt.get("type") == fmt_type:
+                ebook_format = fmt
+                break
+        if ebook_format:
             break
 
     metadata = {
         "title": details.get("title"),
         "authors": [author.get("name", "") for author in details.get("authors", [])],
-        "translators": [translator.get("name", "") for translator in epub_format.get("translators", []) if epub_format],
+        "translators": [translator.get("name", "") for translator in ebook_format.get("translators", [])] if ebook_format else [],
         "description": details.get("description_full"),
         "language": details.get("language"),
     }
 
-    # Epub-specific metadata
-    if epub_format:
-        metadata["publisher"] = epub_format.get("publisher", {}).get("name")
-        metadata["isbn"] = epub_format.get("isbn")
+    # Format-specific metadata
+    if ebook_format:
+        metadata["publisher"] = ebook_format.get("publisher", {}).get("name")
+        metadata["isbn"] = ebook_format.get("isbn")
 
-        publication_date = epub_format.get("publication_date")
+        publication_date = ebook_format.get("publication_date")
         if publication_date:
             # Already in YYYY-MM-DD format
             metadata["release_date"] = publication_date
diff --git a/grawlix/output/__init__.py b/grawlix/output/__init__.py
index 0444170..1a88936 100644
--- a/grawlix/output/__init__.py
+++ b/grawlix/output/__init__.py
@@ -6,6 +6,7 @@ from .output_format import OutputFormat
 from .acsm import Acsm
 from .cbz import Cbz
 from .epub import Epub
+from .pdf import Pdf
 
 from typing import Callable, Iterable
 from pathlib import Path
@@ -184,4 +185,5 @@ def get_output_formats() -> list[type[OutputFormat]]:
         Acsm,
         Cbz,
         Epub,
+        Pdf,
     ]
diff --git a/grawlix/output/pdf.py b/grawlix/output/pdf.py
new file mode 100644
index 0000000..17468fd
--- /dev/null
+++ b/grawlix/output/pdf.py
@@ -0,0 +1,10 @@
+from grawlix.book import Book, SingleFile
+from .output_format import OutputFormat, Update
+
+
+class Pdf(OutputFormat):
+    extension = "pdf"
+    input_types = [SingleFile]
+
+    async def download(self, book: Book, location: str, update_func: Update) -> None:
+        await self._download_single_file(book, location, update_func)
diff --git a/grawlix/pdf_converter.py b/grawlix/pdf_converter.py
new file mode 100644
index 0000000..a352e3c
--- /dev/null
+++ b/grawlix/pdf_converter.py
@@ -0,0 +1,70 @@
+"""
+Convert PDF-in-epub files to proper PDF format.
+Some sources (like Nextory) wrap PDF pages in epub containers.
+"""
+
+import os
+import re
+import zipfile
+from io import BytesIO
+from pypdf import PdfWriter, PdfReader
+
+
+def convert_pdf_epub_to_pdf(epub_path: str) -> str:
+    """
+    Extract embedded PDFs from an epub and merge them into a single PDF.
+
+    :param epub_path: Path to the epub file containing embedded PDFs
+    :return: Path to the created PDF file
+    :raises ValueError: If the epub contains no PDF files
+    """
+    pdf_path = os.path.splitext(epub_path)[0] + '.pdf'
+
+    with zipfile.ZipFile(epub_path, 'r') as zf:
+        # Find all PDF files in the epub
+        pdf_files = [f for f in zf.namelist() if f.endswith('.pdf')]
+
+        if not pdf_files:
+            raise ValueError("No PDF files found in epub")
+
+        # Sort by numeric order (1.pdf, 2.pdf, ..., 10.pdf, 11.pdf, ...)
+        # Entries may sit at the archive root or inside a folder
+        def extract_number(path: str) -> int:
+            match = re.search(r'(?:^|/)(\d+)\.pdf$', path)
+            return int(match.group(1)) if match else 0
+
+        pdf_files.sort(key=extract_number)
+
+        # Merge all PDFs
+        writer = PdfWriter()
+        for pdf_file in pdf_files:
+            pdf_data = zf.read(pdf_file)
+            reader = PdfReader(BytesIO(pdf_data))
+            for page in reader.pages:
+                writer.add_page(page)
+
+        # Write merged PDF
+        with open(pdf_path, 'wb') as out_file:
+            writer.write(out_file)
+
+    # Remove the original epub
+    os.remove(epub_path)
+
+    return pdf_path
+
+
+def is_pdf_in_epub(epub_path: str) -> bool:
+    """
+    Check if an epub contains embedded PDF files instead of HTML.
+
+    :param epub_path: Path to the epub file
+    :return: True if the epub contains PDF files
+    """
+    try:
+        with zipfile.ZipFile(epub_path, 'r') as zf:
+            for name in zf.namelist():
+                if name.endswith('.pdf'):
+                    return True
+    except (zipfile.BadZipFile, FileNotFoundError):
+        pass
+    return False
diff --git a/grawlix/sources/nextory.py b/grawlix/sources/nextory.py
index 9f09861..4fe89f9 100644
--- a/grawlix/sources/nextory.py
+++ b/grawlix/sources/nextory.py
@@ -3,7 +3,7 @@ from grawlix.encryption import AESEncryption
 from grawlix.exceptions import InvalidUrl
 from .source import Source
 
-from typing import Optional
+from typing import Optional, Tuple
 import uuid
 import base64
 
@@ -24,14 +24,12 @@ class Nextory(Source):
         self._client.headers.update(
             {
                 "X-Application-Id": "200",
-                "X-App-Version": "5.47.0",
+                "X-App-Version": "2025.12.1",
                 "X-Locale": LOCALE,
                 "X-Model": "Personal Computer",
                 "X-Device-Id": device_id,
                 "X-OS-INFO": "Personal Computer",
                 "locale": LOCALE,
-                "device": device_id,
-                "appid": "200",
             }
         )
         # Login for account
@@ -115,10 +113,12 @@ class Nextory(Source):
 
     async def _download_book(self, book_id: str) -> Book:
         product_data = await self._get_product_data(book_id)
-        epub_id = self._find_epub_id(product_data)
-        pages = await self._get_pages(epub_id)
+        format_type, format_id = self._find_format(product_data)
+        # Nextory serves all books via epub endpoint regardless of original format
+        data = await self._get_epub_data(format_id)
+
         return Book(
-            data = pages,
+            data = data,
             metadata = Metadata(
                 title = product_data["title"],
                 authors = [author["name"] for author in product_data["authors"]],
@@ -126,6 +126,7 @@ class Nextory(Source):
             ),
             source_data = {
                 "source_name": "nextory",
+                "format_type": format_type,
                 "details": product_data
             }
         )
@@ -145,27 +146,29 @@ class Nextory(Source):
 
 
     @staticmethod
-    def _find_epub_id(product_data) -> str:
-        """Find id of book format of type epub for given book"""
-        for format in product_data["formats"]:
-            if format["type"] == "epub":
-                return format["identifier"]
+    def _find_format(product_data) -> Tuple[str, str]:
+        """Find a supported book format (epub or pdf)"""
+        for format_type in ("epub", "pdf"):
+            for fmt in product_data["formats"]:
+                if fmt["type"] == format_type:
+                    return (format_type, fmt["identifier"])
         raise InvalidUrl
 
 
     @staticmethod
     def _extract_series_name(product_info: dict) -> Optional[str]:
-        if "series" not in product_info:
+        series = product_info.get("series")
+        if series is None:
             return None
-        return product_info["series"]["name"]
+        return series["name"]
 
 
-    async def _get_pages(self, epub_id: str) -> BookData:
+    async def _get_epub_data(self, epub_id: str) -> BookData:
         """
-        Download page information for book
+        Download epub data for book
 
         :param epub_id: Id of epub file
-        :return: Page data
+        :return: Epub data
         """
         # Nextory books are for some reason split up into multiple epub files -
         # one for each chapter file. All of these files has to be decrypted and
@@ -197,7 +200,6 @@ class Nextory(Source):
             files_in_toc
         )
 
-
     @staticmethod
     def _fix_key(value: str) -> bytes:
         """Remove unused data and decode key"""
diff --git a/pyproject.toml b/pyproject.toml
index 0dd718f..db548bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "lxml>=4.6.0",
     "platformdirs>=3.0.0",
     "pycryptodome>=3.10.0",
+    "pypdf>=3.0.0",
     "rich>=10.0.0",
     "tomli>=1.0.0; python_version<'3.11'",
 ]