From 08ddad3a74794ad6d47cd5d8aa29e2604abe0b68 Mon Sep 17 00:00:00 2001 From: ^_^ <8480595+ballaballaballa@users.noreply.github.com> Date: Fri, 16 Jan 2026 11:05:33 +0100 Subject: [PATCH] feat: add fixed-layout support for split EPUB sources - Add rendition properties to Metadata (layout, spread, orientation) - Extract and preserve rendition properties when merging EPUB parts - Fix viewport meta tags for fixed-layout pages - Use EpubItem instead of EpubHtml to preserve original content Fix EPUB validation errors in merged output: - Skip directory entries, mimetype, and META-INF from manifest - Exclude nav/toc from spine for fixed-layout books Improve CSS merging across parts: - Keep longer version of duplicate selectors (more complete rules) - Return None from _get_css_rule_key for invalid font-faces Add cover detection fallback: - Detect cover from largest image on first page when OPF lacks cover info - Optimize lookup with dict instead of nested loops Clean up redundant imports in epub_metadata.py --- grawlix/book.py | 4 + grawlix/output/epub.py | 362 ++++++++++++++++++++++- grawlix/output/metadata/epub_metadata.py | 89 +++--- 3 files changed, 383 insertions(+), 72 deletions(-) diff --git a/grawlix/book.py b/grawlix/book.py index 726eaf8..64d2933 100644 --- a/grawlix/book.py +++ b/grawlix/book.py @@ -20,6 +20,10 @@ class Metadata: translators: list[str] = field(default_factory=list) category: Optional[str] = None tags: list[str] = field(default_factory=list) + # EPUB 3 rendition properties (fixed-layout support) + rendition_layout: Optional[str] = None # "pre-paginated" or "reflowable" + rendition_spread: Optional[str] = None # "none", "auto", "landscape", "portrait", "both" + rendition_orientation: Optional[str] = None # "auto", "landscape", "portrait" def as_dict(self) -> dict: return { diff --git a/grawlix/output/epub.py b/grawlix/output/epub.py index 333ebc4..faf6dce 100644 --- a/grawlix/output/epub.py +++ b/grawlix/output/epub.py @@ -3,12 +3,162 @@ from 
grawlix.exceptions import UnsupportedOutputFormat from .output_format import OutputFormat, Update import asyncio -from bs4 import BeautifulSoup import os -from ebooklib import epub +import re +import xml.etree.ElementTree as ET from zipfile import ZipFile + +from bs4 import BeautifulSoup +from ebooklib import epub import rich + +def _fix_fixed_layout_page(html_content: bytes, css_content: bytes = None) -> bytes: + """ + Fix fixed-layout XHTML pages by adding viewport and fixing broken styles. + + Extracts dimensions from CSS and applies them to viewport and inline styles. + """ + try: + html_str = html_content.decode('utf-8') + except UnicodeDecodeError: + return html_content + + # Extract dimensions from CSS if provided + width = None + height = None + if css_content: + try: + css_str = css_content.decode('utf-8') + # Look for body width/height + width_match = re.search(r'body\s*\{[^}]*width:\s*(\d+)px', css_str) + height_match = re.search(r'body\s*\{[^}]*height:\s*(\d+)px', css_str) + if width_match: + width = width_match.group(1) + if height_match: + height = height_match.group(1) + except UnicodeDecodeError: + pass + + if not width or not height: + return html_content + + # Add viewport meta tag if missing + if 'name="viewport"' not in html_str and '' in html_str: + viewport_tag = f'' + html_str = html_str.replace('', f'\n {viewport_tag}', 1) + + # Fix broken inline styles (width:px; height:px;) + html_str = re.sub( + r'style="width:px;\s*height:px;"', + f'style="width:{width}px; height:{height}px;"', + html_str + ) + + return html_str.encode('utf-8') + + +def _get_css_rule_key(rule_text: str) -> str | None: + """Get unique key for a CSS rule. 
For @font-face, include font-family.""" + selector = rule_text.split('{')[0].strip() + if selector == '@font-face': + # Extract font-family to distinguish different font-faces + match = re.search(r'font-family:\s*["\']?([^"\';}]+)', rule_text) + if match: + return f'@font-face:{match.group(1).strip()}' + return None # Skip font-face without font-family + return selector if selector else None + + +def _extract_opf_metadata(opf_content: bytes) -> dict: + """ + Extract rendition properties, cover info, and spine properties from OPF content. + + Returns dict with keys: rendition_layout, rendition_spread, + rendition_orientation, cover_id, cover_href, spine_properties + """ + result = { + 'rendition_layout': None, + 'rendition_spread': None, + 'rendition_orientation': None, + 'cover_id': None, + 'cover_href': None, + 'spine_properties': {}, # Maps href -> properties (e.g., 'page-spread-left') + } + + try: + root = ET.fromstring(opf_content) + ns = { + 'opf': 'http://www.idpf.org/2007/opf', + 'dc': 'http://purl.org/dc/elements/1.1/', + } + + # Find metadata element + metadata = root.find('opf:metadata', ns) + if metadata is None: + metadata = root.find('{http://www.idpf.org/2007/opf}metadata') + if metadata is None: + return result + + # Extract rendition properties from <meta property="rendition:..."> elements + for meta in metadata.iter(): + if meta.tag.endswith('}meta') or meta.tag == 'meta': + prop = meta.get('property', '') + if prop == 'rendition:layout': + result['rendition_layout'] = meta.text + elif prop == 'rendition:spread': + result['rendition_spread'] = meta.text + elif prop == 'rendition:orientation': + result['rendition_orientation'] = meta.text + + # Cover reference: <meta name="cover" content="..."/> + name = meta.get('name', '') + if name == 'cover': + result['cover_id'] = meta.get('content') + + # Parse manifest once for cover info and id->href mapping + manifest = root.find('opf:manifest', ns) + if manifest is None: + manifest = root.find('{http://www.idpf.org/2007/opf}manifest') + + id_to_href = {} + if manifest is not None:
+ for item in manifest.iter(): + item_id = item.get('id') + item_href = item.get('href') + if item_id and item_href: + id_to_href[item_id] = item_href + + # Check for cover by ID match + if result['cover_id'] and item_id == result['cover_id'] and not result['cover_href']: + result['cover_href'] = item_href + + # Check for cover-image property + props = item.get('properties', '') + if 'cover-image' in props and not result['cover_href']: + result['cover_href'] = item_href + result['cover_id'] = item_id + + # Extract spine properties (page-spread-left, page-spread-right) + spine = root.find('opf:spine', ns) + if spine is None: + spine = root.find('{http://www.idpf.org/2007/opf}spine') + if spine is not None: + # Extract spine itemref properties + for itemref in spine.iter(): + if itemref.tag.endswith('}itemref') or itemref.tag == 'itemref': + idref = itemref.get('idref') + props = itemref.get('properties') + if idref and props and idref in id_to_href: + href = id_to_href[idref] + result['spine_properties'][href] = props + + except ET.ParseError: + pass + + return result + + class Epub(OutputFormat): extension = "epub" input_types = [SingleFile, HtmlFiles, EpubInParts] @@ -88,22 +238,102 @@ class Epub(OutputFormat): progress = 1/(file_count) temporary_file_location = f"{location}.tmp" - added_files: set[str] = set() - def get_new_files(zipfile: ZipFile): - """Returns files in zipfile not already added to file""" - for filename in zipfile.namelist(): - if filename in added_files or filename.endswith(".opf") or filename.endswith(".ncx"): - continue - yield filename + added_files: dict[str, int] = {} # Track filepath -> content size + opf_metadata: dict = {} + css_cache: dict[str, bytes] = {} # Store CSS content for fixing HTML pages + cover_href: str = None # Store cover image path from OPF + spine_properties: dict[str, str] = {} # Store spine properties (href -> properties) + + def should_add_file(zipfile: ZipFile, filename: str) -> bool: + """Check if file should be 
added (new or larger than existing)""" + # Skip directory entries, container files (ebooklib handles these), and OPF/NCX + if filename.endswith("/"): + return False + if filename == "mimetype" or filename.startswith("META-INF/"): + return False + if filename.endswith(".opf") or filename.endswith(".ncx"): + return False + if filename not in added_files: + return True + # If file exists, only replace if new version is larger (non-empty beats empty) + new_size = zipfile.getinfo(filename).file_size + return new_size > added_files[filename] output = epub.EpubBook() + opf_extracted = False for file in files: await self._download_and_write_file(file, temporary_file_location) with ZipFile(temporary_file_location, "r") as zipfile: - for filepath in get_new_files(zipfile): + # Extract OPF metadata from first OPF file (before skipping) + if not opf_extracted: + for filename in zipfile.namelist(): + if filename.endswith(".opf"): + opf_content = zipfile.read(filename) + opf_metadata = _extract_opf_metadata(opf_content) + # Store rendition properties in metadata + if opf_metadata.get('rendition_layout'): + metadata.rendition_layout = opf_metadata['rendition_layout'] + if opf_metadata.get('rendition_spread'): + metadata.rendition_spread = opf_metadata['rendition_spread'] + if opf_metadata.get('rendition_orientation'): + metadata.rendition_orientation = opf_metadata['rendition_orientation'] + if opf_metadata.get('cover_href'): + cover_href = opf_metadata['cover_href'] + if opf_metadata.get('spine_properties'): + spine_properties.update(opf_metadata['spine_properties']) + opf_extracted = True + break + + # Collect CSS files, merging content from all parts + for filepath in zipfile.namelist(): + if filepath.endswith(".css"): + content = zipfile.read(filepath) + if not content: + continue # Skip empty files + if filepath not in css_cache: + css_cache[filepath] = content + else: + # Merge: combine rules, keeping the longer version for duplicate selectors + existing_str = 
css_cache[filepath].decode('utf-8', errors='ignore') + new_str = content.decode('utf-8', errors='ignore') + + # Parse existing rules into dict: key -> full rule + existing_rules = {} + for rule in existing_str.split('}'): + if '{' in rule: + rule_key = _get_css_rule_key(rule) + if rule_key: + existing_rules[rule_key] = rule.strip() + '}' + + # Process new rules: add new ones, replace if longer + for rule in new_str.split('}'): + if '{' in rule: + rule_key = _get_css_rule_key(rule) + if rule_key: + new_rule = rule.strip() + '}' + if rule_key not in existing_rules or len(new_rule) > len(existing_rules[rule_key]): + existing_rules[rule_key] = new_rule + + # Rebuild CSS from merged rules + css_cache[filepath] = '\n'.join(existing_rules.values()).encode('utf-8') + + for filepath in zipfile.namelist(): + # Skip CSS files here - they'll be added after all parts are merged + if filepath.endswith(".css"): + continue + if not should_add_file(zipfile, filepath): + continue content = zipfile.read(filepath) + file_size = len(content) if filepath.endswith("html"): filename = os.path.basename(filepath) + # Fix fixed-layout pages if we have rendition:layout + if metadata.rendition_layout == 'pre-paginated': + # Find matching CSS (e.g., page1.xhtml -> page1.css) + css_path = filepath.replace('.xhtml', '.css').replace('.html', '.css') + css_content = css_cache.get(css_path) + if css_content: + content = _fix_fixed_layout_page(content, css_content) is_in_toc = False title = None for key, value in data.files_in_toc.items(): @@ -112,13 +342,28 @@ class Epub(OutputFormat): title = value is_in_toc = True break - epub_file = epub.EpubHtml( - title = title, + # Use EpubItem to preserve original content (link tags, viewport, etc.) 
+ # EpubHtml parses and regenerates HTML, stripping these + epub_file = epub.EpubItem( file_name = filepath, - content = content + content = content, + media_type = 'application/xhtml+xml' ) output.add_item(epub_file) - output.spine.append(epub_file) + # Skip nav.xhtml from spine for fixed-layout (causes blank first page) + is_nav = any(x in filepath.lower() for x in ['nav.xhtml', 'nav.html', 'toc.xhtml', 'toc.html']) + if not (is_nav and metadata.rendition_layout == 'pre-paginated'): + # Check for spine properties (page-spread-left/right) + # Try matching with different path variations + props = None + for href, prop_value in spine_properties.items(): + if filepath.endswith(href) or href.endswith(os.path.basename(filepath)): + props = prop_value + break + if props: + output.spine.append((epub_file, props)) + else: + output.spine.append(epub_file) if is_in_toc: output.toc.append(epub_file) else: @@ -127,11 +372,96 @@ class Epub(OutputFormat): content = content ) output.add_item(epub_file) - added_files.add(filepath) + added_files[filepath] = file_size if update: update(progress) os.remove(temporary_file_location) + # Add merged CSS files after all parts have been processed + for css_path, css_content in css_cache.items(): + css_item = epub.EpubItem( + file_name=css_path, + content=css_content, + media_type='text/css' + ) + output.add_item(css_item) + + # Set cover image if found in source OPF, or detect from first page for fixed-layout + if not cover_href and metadata.rendition_layout == 'pre-paginated': + # Find first content page from spine (excluding nav/toc) + first_page = None + for spine_item in output.spine: + item = spine_item[0] if isinstance(spine_item, tuple) else spine_item + if hasattr(item, 'file_name') and item.file_name: + fname = item.file_name.lower() + # Skip nav and toc files + if 'nav.' in fname or 'toc.' 
in fname: + continue + if fname.endswith('.xhtml') or fname.endswith('.html'): + first_page = item + break + + if first_page and hasattr(first_page, 'content') and first_page.content: + # Parse HTML to find all images and pick the largest one + try: + content = first_page.content.decode('utf-8') if isinstance(first_page.content, bytes) else first_page.content + img_matches = re.findall(r']+src=["\']([^"\']+)["\']', content) + if img_matches: + page_dir = os.path.dirname(first_page.file_name) + # Build lookup dict for item sizes + item_sizes = { + item.file_name: len(item.content) + for item in output.items + if hasattr(item, 'file_name') and item.file_name + and hasattr(item, 'content') and item.content + } + best_img = None + best_size = 0 + for img_src in img_matches: + img_path = os.path.normpath(os.path.join(page_dir, img_src)) + # Find matching item by suffix + for file_name, size in item_sizes.items(): + if file_name.endswith(img_path): + if size > best_size: + best_size = size + best_img = img_path + break + if best_img: + cover_href = best_img + except (UnicodeDecodeError, AttributeError): + pass + + if cover_href: + # Find the cover image item and mark it as cover + for item in output.items: + if hasattr(item, 'file_name') and item.file_name and item.file_name.endswith(cover_href): + # Get or create item ID + item_id = item.id if hasattr(item, 'id') and item.id else os.path.basename(cover_href).replace('.', '-') + if not item.id: + item.id = item_id + # Add EPUB 2 cover metadata: <meta name="cover" content="item-id"/> + output.add_metadata('OPF', 'meta', '', {'name': 'cover', 'content': item_id}) + # Mark item with EPUB 3 cover-image property + if not hasattr(item, 'properties') or item.properties is None: + item.properties = [] + if 'cover-image' not in item.properties: + item.properties.append('cover-image') + break + + # Apply rendition properties to output (fixed-layout support) + if metadata.rendition_layout: + output.add_metadata(None, 'meta', metadata.rendition_layout, {'property':
'rendition:layout'}) + if metadata.rendition_spread: + output.add_metadata(None, 'meta', metadata.rendition_spread, {'property': 'rendition:spread'}) + if metadata.rendition_orientation: + output.add_metadata(None, 'meta', metadata.rendition_orientation, {'property': 'rendition:orientation'}) + output.add_item(epub.EpubNcx()) - output.add_item(epub.EpubNav()) + nav = epub.EpubNav() + output.add_item(nav) + + # For fixed-layout, remove nav from spine (it shouldn't be in reading order) + if metadata.rendition_layout == 'pre-paginated': + output.spine = [item for item in output.spine if item != nav and not (isinstance(item, tuple) and item[0] == nav)] + epub.write_epub(location, output) diff --git a/grawlix/output/metadata/epub_metadata.py b/grawlix/output/metadata/epub_metadata.py index 626027c..3212b18 100644 --- a/grawlix/output/metadata/epub_metadata.py +++ b/grawlix/output/metadata/epub_metadata.py @@ -118,6 +118,10 @@ def _find_opf_file(epub_dir: str) -> str: def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxml: bool) -> None: """Update EPUB metadata elements from Metadata object""" + if using_lxml: + from lxml import etree as ET + else: + import xml.etree.ElementTree as ET # Helper function to create/update element def update_or_create_element(tag: str, text: str, attribs: dict = None): @@ -129,13 +133,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm metadata_elem.remove(elem) # Create new element - if using_lxml: - from lxml import etree as ET - elem = ET.SubElement(metadata_elem, tag) - else: - import xml.etree.ElementTree as ET - elem = ET.SubElement(metadata_elem, tag) - + elem = ET.SubElement(metadata_elem, tag) elem.text = str(text) if attribs: for key, value in attribs.items(): @@ -145,14 +143,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm def create_meta(name: str, content): if content is None: return - - if using_lxml: - from lxml import etree as ET 
- meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta") - else: - import xml.etree.ElementTree as ET - meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta") - + meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta") meta.set('name', name) meta.set('content', str(content)) @@ -166,44 +157,25 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm elem.set('id', 'main-title') # Add original title - if using_lxml: - from lxml import etree as ET - orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title") - else: - import xml.etree.ElementTree as ET - orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title") - + orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title") orig_title.set('id', 'original-title') orig_title.text = metadata.original_title # Add meta refinement for original title - if using_lxml: - meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta") - else: - meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta") + meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta") meta.set('refines', '#original-title') meta.set('property', 'title-type') meta.text = 'original' # Authors for author in metadata.authors: - if using_lxml: - from lxml import etree as ET - creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator") - else: - import xml.etree.ElementTree as ET - creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator") + creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator") creator.text = author creator.set(f"{{{ns['opf']}}}role", "aut") # Translators for translator in metadata.translators: - if using_lxml: - from lxml import etree as ET - contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor") - else: - import xml.etree.ElementTree as ET - contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor") + contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor") contributor.text = translator 
contributor.set(f"{{{ns['opf']}}}role", "trl") @@ -225,12 +197,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm metadata_elem.remove(elem) # Add new ISBN - if using_lxml: - from lxml import etree as ET - identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier") - else: - import xml.etree.ElementTree as ET - identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier") + identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier") identifier.text = metadata.isbn identifier.set(f"{{{ns['opf']}}}scheme", "ISBN") @@ -240,22 +207,12 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm # Category if metadata.category: - if using_lxml: - from lxml import etree as ET - subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject") - else: - import xml.etree.ElementTree as ET - subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject") + subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject") subject.text = metadata.category # Tags for tag in metadata.tags: - if using_lxml: - from lxml import etree as ET - subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject") - else: - import xml.etree.ElementTree as ET - subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject") + subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject") subject.text = tag # Series info (Calibre format) - using series and index fields @@ -263,6 +220,26 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm create_meta("calibre:series", metadata.series) create_meta("calibre:series_index", metadata.index) + # EPUB 3 rendition properties (fixed-layout support) + # These use <meta property="...">value</meta> format, not name/content + def create_meta_property(property_name: str, value: str): + if not value: + return + # Remove existing property if present + for elem in list(metadata_elem): + if elem.get('property') == property_name: + metadata_elem.remove(elem) + meta =
ET.SubElement(metadata_elem, 'meta') + meta.set('property', property_name) + meta.text = value + + if metadata.rendition_layout: + create_meta_property('rendition:layout', metadata.rendition_layout) + if metadata.rendition_spread: + create_meta_property('rendition:spread', metadata.rendition_spread) + if metadata.rendition_orientation: + create_meta_property('rendition:orientation', metadata.rendition_orientation) + def _repack_epub(epub_dir: str, output_path: str) -> None: """Repack EPUB directory into ZIP file"""