mirror of
https://github.com/jo1gi/grawlix.git
synced 2026-03-26 20:58:28 -06:00
feat: fix Nextory series null handling and add PDF book support
- Fix _extract_series_name to handle null series values - Add PDF book support for Nextory (served via epub endpoint) - Convert PDF-in-epub to merged PDF after download - Add pypdf dependency for PDF merging - Add Pdf output format (unused but available) - Update metadata transformer to handle both epub and pdf formats
This commit is contained in:
parent
08af04ab32
commit
6199e02b9b
@ -163,6 +163,18 @@ async def download_with_progress(book: Book, progress: Progress, template: str,
|
||||
# Download the book
|
||||
await download_book(book, update_function, template)
|
||||
|
||||
# Convert PDF-in-epub to PDF if needed
|
||||
if book.source_data and book.source_data.get('format_type') == 'pdf':
|
||||
from .output import format_output_location, get_default_format
|
||||
from .pdf_converter import convert_pdf_epub_to_pdf, is_pdf_in_epub
|
||||
|
||||
output_format = get_default_format(book)
|
||||
location = format_output_location(book, output_format, template)
|
||||
|
||||
if location.endswith('.epub') and os.path.exists(location) and is_pdf_in_epub(location):
|
||||
convert_pdf_epub_to_pdf(location)
|
||||
logging.debug(f"Converted PDF-in-epub to PDF: {location}")
|
||||
|
||||
# Write metadata if requested and available
|
||||
if write_metadata and book.source_data:
|
||||
from .output import format_output_location, get_default_format, find_output_format, get_valid_extensions
|
||||
|
||||
@ -61,27 +61,30 @@ def nextory_transformer(details: dict) -> dict:
|
||||
:param details: Nextory book details JSON
|
||||
:return: Standardized metadata dict
|
||||
"""
|
||||
# Extract epub format
|
||||
epub_format = None
|
||||
for fmt in details.get("formats", []):
|
||||
if fmt.get("type") == "epub":
|
||||
epub_format = fmt
|
||||
# Extract ebook format (epub or pdf - Nextory serves both as epub)
|
||||
ebook_format = None
|
||||
for fmt_type in ("epub", "pdf"):
|
||||
for fmt in details.get("formats", []):
|
||||
if fmt.get("type") == fmt_type:
|
||||
ebook_format = fmt
|
||||
break
|
||||
if ebook_format:
|
||||
break
|
||||
|
||||
metadata = {
|
||||
"title": details.get("title"),
|
||||
"authors": [author.get("name", "") for author in details.get("authors", [])],
|
||||
"translators": [translator.get("name", "") for translator in epub_format.get("translators", []) if epub_format],
|
||||
"translators": [translator.get("name", "") for translator in ebook_format.get("translators", [])] if ebook_format else [],
|
||||
"description": details.get("description_full"),
|
||||
"language": details.get("language"),
|
||||
}
|
||||
|
||||
# Epub-specific metadata
|
||||
if epub_format:
|
||||
metadata["publisher"] = epub_format.get("publisher", {}).get("name")
|
||||
metadata["isbn"] = epub_format.get("isbn")
|
||||
# Format-specific metadata
|
||||
if ebook_format:
|
||||
metadata["publisher"] = ebook_format.get("publisher", {}).get("name")
|
||||
metadata["isbn"] = ebook_format.get("isbn")
|
||||
|
||||
publication_date = epub_format.get("publication_date")
|
||||
publication_date = ebook_format.get("publication_date")
|
||||
if publication_date:
|
||||
# Already in YYYY-MM-DD format
|
||||
metadata["release_date"] = publication_date
|
||||
|
||||
@ -6,6 +6,7 @@ from .output_format import OutputFormat
|
||||
from .acsm import Acsm
|
||||
from .cbz import Cbz
|
||||
from .epub import Epub
|
||||
from .pdf import Pdf
|
||||
|
||||
from typing import Callable, Iterable
|
||||
from pathlib import Path
|
||||
@ -184,4 +185,5 @@ def get_output_formats() -> list[type[OutputFormat]]:
|
||||
Acsm,
|
||||
Cbz,
|
||||
Epub,
|
||||
Pdf,
|
||||
]
|
||||
|
||||
10
grawlix/output/pdf.py
Normal file
10
grawlix/output/pdf.py
Normal file
@ -0,0 +1,10 @@
|
||||
from grawlix.book import Book, SingleFile
|
||||
from .output_format import OutputFormat, Update
|
||||
|
||||
|
||||
class Pdf(OutputFormat):
    """Output format that saves a single-file book with a ``pdf`` extension."""

    # File extension associated with this output format
    extension = "pdf"
    # Book data types this output format accepts
    input_types = [SingleFile]

    async def download(self, book: Book, location: str, update_func: Update) -> None:
        """
        Download the book to the given location.

        Delegates all work to ``self._download_single_file``.

        :param book: Book to download
        :param location: Output location passed through to the helper
        :param update_func: Update function passed through to the helper
        """
        await self._download_single_file(book, location, update_func)
|
||||
68
grawlix/pdf_converter.py
Normal file
68
grawlix/pdf_converter.py
Normal file
@ -0,0 +1,68 @@
|
||||
"""
|
||||
Convert PDF-in-epub files to proper PDF format.
|
||||
Some sources (like Nextory) wrap PDF pages in epub containers.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from pypdf import PdfWriter, PdfReader
|
||||
|
||||
|
||||
def convert_pdf_epub_to_pdf(epub_path: str) -> str:
    """
    Extract embedded PDFs from an epub and merge them into a single PDF.

    The merged PDF is written next to the epub, using the same path with the
    extension replaced by ``.pdf``. The original epub is removed once the
    merged PDF has been written.

    :param epub_path: Path to the epub file containing embedded PDFs
    :return: Path to the created PDF file
    :raises ValueError: If the epub contains no PDF files
    """
    # splitext only inspects the final path component, so a dot in a parent
    # directory name cannot truncate the output path.
    pdf_path = os.path.splitext(epub_path)[0] + '.pdf'

    def extract_number(path: str) -> int:
        # Match the numeric file name both inside a folder ("OEBPS/12.pdf")
        # and at the archive root ("12.pdf"); anything else sorts as 0.
        match = re.search(r'(?:^|/)(\d+)\.pdf$', path)
        return int(match.group(1)) if match else 0

    with zipfile.ZipFile(epub_path, 'r') as zf:
        # Find all PDF files in the epub
        pdf_files = [f for f in zf.namelist() if f.endswith('.pdf')]

        if not pdf_files:
            raise ValueError("No PDF files found in epub")

        # Sort by numeric order (1.pdf, 2.pdf, ..., 10.pdf, 11.pdf, ...).
        # The sort is stable, so files with equal keys keep archive order.
        pdf_files.sort(key=extract_number)

        # Merge all PDFs
        writer = PdfWriter()
        for pdf_file in pdf_files:
            reader = PdfReader(BytesIO(zf.read(pdf_file)))
            for page in reader.pages:
                writer.add_page(page)

        # Write merged PDF
        with open(pdf_path, 'wb') as out_file:
            writer.write(out_file)

    # Remove the original epub only after the merged PDF was written
    os.remove(epub_path)

    return pdf_path
|
||||
|
||||
|
||||
def is_pdf_in_epub(epub_path: str) -> bool:
    """
    Check if an epub archive contains any embedded PDF files.

    This is a best-effort probe: a path that is missing, cannot be opened,
    or is not a valid zip archive is reported as ``False`` instead of raising.

    :param epub_path: Path to the epub file
    :return: True if the epub contains PDF files
    """
    try:
        with zipfile.ZipFile(epub_path, 'r') as zf:
            return any(name.endswith('.pdf') for name in zf.namelist())
    except (zipfile.BadZipFile, OSError):
        # OSError covers FileNotFoundError as well as directories and
        # permission problems, none of which can be a PDF-in-epub.
        return False
|
||||
@ -3,7 +3,7 @@ from grawlix.encryption import AESEncryption
|
||||
from grawlix.exceptions import InvalidUrl
|
||||
from .source import Source
|
||||
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
import uuid
|
||||
import base64
|
||||
|
||||
@ -24,14 +24,12 @@ class Nextory(Source):
|
||||
self._client.headers.update(
|
||||
{
|
||||
"X-Application-Id": "200",
|
||||
"X-App-Version": "5.47.0",
|
||||
"X-App-Version": "2025.12.1",
|
||||
"X-Locale": LOCALE,
|
||||
"X-Model": "Personal Computer",
|
||||
"X-Device-Id": device_id,
|
||||
"X-OS-INFO": "Personal Computer",
|
||||
"locale": LOCALE,
|
||||
"device": device_id,
|
||||
"appid": "200",
|
||||
}
|
||||
)
|
||||
# Login for account
|
||||
@ -115,10 +113,12 @@ class Nextory(Source):
|
||||
|
||||
async def _download_book(self, book_id: str) -> Book:
|
||||
product_data = await self._get_product_data(book_id)
|
||||
epub_id = self._find_epub_id(product_data)
|
||||
pages = await self._get_pages(epub_id)
|
||||
format_type, format_id = self._find_format(product_data)
|
||||
# Nextory serves all books via epub endpoint regardless of original format
|
||||
data = await self._get_epub_data(format_id)
|
||||
|
||||
return Book(
|
||||
data = pages,
|
||||
data = data,
|
||||
metadata = Metadata(
|
||||
title = product_data["title"],
|
||||
authors = [author["name"] for author in product_data["authors"]],
|
||||
@ -126,6 +126,7 @@ class Nextory(Source):
|
||||
),
|
||||
source_data = {
|
||||
"source_name": "nextory",
|
||||
"format_type": format_type,
|
||||
"details": product_data
|
||||
}
|
||||
)
|
||||
@ -145,27 +146,29 @@ class Nextory(Source):
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _find_epub_id(product_data) -> str:
|
||||
"""Find id of book format of type epub for given book"""
|
||||
for format in product_data["formats"]:
|
||||
if format["type"] == "epub":
|
||||
return format["identifier"]
|
||||
def _find_format(product_data) -> Tuple[str, str]:
|
||||
"""Find a supported book format (epub or pdf)"""
|
||||
for format_type in ("epub", "pdf"):
|
||||
for fmt in product_data["formats"]:
|
||||
if fmt["type"] == format_type:
|
||||
return (format_type, fmt["identifier"])
|
||||
raise InvalidUrl
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _extract_series_name(product_info: dict) -> Optional[str]:
|
||||
if "series" not in product_info:
|
||||
series = product_info.get("series")
|
||||
if series is None:
|
||||
return None
|
||||
return product_info["series"]["name"]
|
||||
return series["name"]
|
||||
|
||||
|
||||
async def _get_pages(self, epub_id: str) -> BookData:
|
||||
async def _get_epub_data(self, epub_id: str) -> BookData:
|
||||
"""
|
||||
Download page information for book
|
||||
Download epub data for book
|
||||
|
||||
:param epub_id: Id of epub file
|
||||
:return: Page data
|
||||
:return: Epub data
|
||||
"""
|
||||
# Nextory books are for some reason split up into multiple epub files -
|
||||
# one for each chapter file. All of these files has to be decrypted and
|
||||
@ -197,7 +200,6 @@ class Nextory(Source):
|
||||
files_in_toc
|
||||
)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _fix_key(value: str) -> bytes:
|
||||
"""Remove unused data and decode key"""
|
||||
|
||||
@ -25,6 +25,7 @@ dependencies = [
|
||||
"lxml>=4.6.0",
|
||||
"platformdirs>=3.0.0",
|
||||
"pycryptodome>=3.10.0",
|
||||
"pypdf>=3.0.0",
|
||||
"rich>=10.0.0",
|
||||
"tomli>=1.0.0; python_version<'3.11'",
|
||||
]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user