feat: add fixed-layout support for split EPUB sources

- Add rendition properties to Metadata (layout, spread, orientation)
- Extract and preserve rendition properties when merging EPUB parts
- Fix viewport meta tags for fixed-layout pages
- Use EpubItem instead of EpubHtml to preserve original content

Fix EPUB validation errors in merged output:
- Skip directory entries, mimetype, and META-INF from manifest
- Exclude nav/toc from spine for fixed-layout books

Improve CSS merging across parts:
- Keep longer version of duplicate selectors (more complete rules)
- Return None from _get_css_rule_key for invalid font-faces

Add cover detection fallback:
- Detect cover from largest image on first page when OPF lacks cover info
- Optimize lookup with dict instead of nested loops

Clean up redundant imports in epub_metadata.py
This commit is contained in:
^_^ 2026-01-16 11:05:33 +01:00
parent ed8fe9eafa
commit 08ddad3a74
3 changed files with 383 additions and 72 deletions

View File

@ -20,6 +20,10 @@ class Metadata:
translators: list[str] = field(default_factory=list)
category: Optional[str] = None
tags: list[str] = field(default_factory=list)
# EPUB 3 rendition properties (fixed-layout support)
rendition_layout: Optional[str] = None # "pre-paginated" or "reflowable"
rendition_spread: Optional[str] = None # "none", "auto", "landscape", "portrait", "both"
rendition_orientation: Optional[str] = None # "auto", "landscape", "portrait"
def as_dict(self) -> dict:
return {

View File

@ -3,12 +3,162 @@ from grawlix.exceptions import UnsupportedOutputFormat
from .output_format import OutputFormat, Update
import asyncio
from bs4 import BeautifulSoup
import os
from ebooklib import epub
import re
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from bs4 import BeautifulSoup
from ebooklib import epub
import rich
def _fix_fixed_layout_page(html_content: bytes, css_content: bytes = None) -> bytes:
"""
Fix fixed-layout XHTML pages by adding viewport and fixing broken styles.
Extracts dimensions from CSS and applies them to viewport and inline styles.
"""
try:
html_str = html_content.decode('utf-8')
except UnicodeDecodeError:
return html_content
# Extract dimensions from CSS if provided
width = None
height = None
if css_content:
try:
css_str = css_content.decode('utf-8')
# Look for body width/height
width_match = re.search(r'body\s*\{[^}]*width:\s*(\d+)px', css_str)
height_match = re.search(r'body\s*\{[^}]*height:\s*(\d+)px', css_str)
if width_match:
width = width_match.group(1)
if height_match:
height = height_match.group(1)
except UnicodeDecodeError:
pass
if not width or not height:
return html_content
# Add viewport meta tag if missing
if 'name="viewport"' not in html_str and '<head>' in html_str:
viewport_tag = f'<meta name="viewport" content="width={width}, height={height}"/>'
html_str = html_str.replace('<head>', f'<head>\n {viewport_tag}', 1)
# Fix broken inline styles (width:px; height:px;)
html_str = re.sub(
r'style="width:px;\s*height:px;"',
f'style="width:{width}px; height:{height}px;"',
html_str
)
return html_str.encode('utf-8')
def _get_css_rule_key(rule_text: str) -> str | None:
"""Get unique key for a CSS rule. For @font-face, include font-family."""
selector = rule_text.split('{')[0].strip()
if selector == '@font-face':
# Extract font-family to distinguish different font-faces
match = re.search(r'font-family:\s*["\']?([^"\';}]+)', rule_text)
if match:
return f'@font-face:{match.group(1).strip()}'
return None # Skip font-face without font-family
return selector if selector else None
def _extract_opf_metadata(opf_content: bytes) -> dict:
"""
Extract rendition properties, cover info, and spine properties from OPF content.
Returns dict with keys: rendition_layout, rendition_spread,
rendition_orientation, cover_id, cover_href, spine_properties
"""
result = {
'rendition_layout': None,
'rendition_spread': None,
'rendition_orientation': None,
'cover_id': None,
'cover_href': None,
'spine_properties': {}, # Maps href -> properties (e.g., 'page-spread-left')
}
try:
root = ET.fromstring(opf_content)
ns = {
'opf': 'http://www.idpf.org/2007/opf',
'dc': 'http://purl.org/dc/elements/1.1/',
}
# Find metadata element
metadata = root.find('opf:metadata', ns)
if metadata is None:
metadata = root.find('{http://www.idpf.org/2007/opf}metadata')
if metadata is None:
return result
# Extract rendition properties from <meta property="rendition:X">
for meta in metadata.iter():
if meta.tag.endswith('}meta') or meta.tag == 'meta':
prop = meta.get('property', '')
if prop == 'rendition:layout':
result['rendition_layout'] = meta.text
elif prop == 'rendition:spread':
result['rendition_spread'] = meta.text
elif prop == 'rendition:orientation':
result['rendition_orientation'] = meta.text
# Cover reference: <meta name="cover" content="image-id"/>
name = meta.get('name', '')
if name == 'cover':
result['cover_id'] = meta.get('content')
# Parse manifest once for cover info and id->href mapping
manifest = root.find('opf:manifest', ns)
if manifest is None:
manifest = root.find('{http://www.idpf.org/2007/opf}manifest')
id_to_href = {}
if manifest is not None:
for item in manifest.iter():
item_id = item.get('id')
item_href = item.get('href')
if item_id and item_href:
id_to_href[item_id] = item_href
# Check for cover by ID match
if result['cover_id'] and item_id == result['cover_id'] and not result['cover_href']:
result['cover_href'] = item_href
# Check for cover-image property
props = item.get('properties', '')
if 'cover-image' in props and not result['cover_href']:
result['cover_href'] = item_href
result['cover_id'] = item_id
# Extract spine properties (page-spread-left, page-spread-right)
spine = root.find('opf:spine', ns)
if spine is None:
spine = root.find('{http://www.idpf.org/2007/opf}spine')
if spine is not None:
# Extract spine itemref properties
for itemref in spine.iter():
if itemref.tag.endswith('}itemref') or itemref.tag == 'itemref':
idref = itemref.get('idref')
props = itemref.get('properties')
if idref and props and idref in id_to_href:
href = id_to_href[idref]
result['spine_properties'][href] = props
except ET.ParseError:
pass
return result
class Epub(OutputFormat):
extension = "epub"
input_types = [SingleFile, HtmlFiles, EpubInParts]
@ -88,22 +238,102 @@ class Epub(OutputFormat):
progress = 1/(file_count)
temporary_file_location = f"{location}.tmp"
added_files: set[str] = set()
def get_new_files(zipfile: ZipFile):
    """Yield entries of *zipfile* not yet merged, skipping package files (.opf/.ncx)."""
    for name in zipfile.namelist():
        already_merged = name in added_files
        is_package_file = name.endswith(".opf") or name.endswith(".ncx")
        if already_merged or is_package_file:
            continue
        yield name
added_files: dict[str, int] = {} # Track filepath -> content size
opf_metadata: dict = {}
css_cache: dict[str, bytes] = {} # Store CSS content for fixing HTML pages
cover_href: str = None # Store cover image path from OPF
spine_properties: dict[str, str] = {} # Store spine properties (href -> properties)
def should_add_file(zipfile: ZipFile, filename: str) -> bool:
    """
    Decide whether *filename* from *zipfile* should be merged into the output.

    Directory entries, the mimetype file, META-INF container files (ebooklib
    regenerates those) and package files (.opf/.ncx) are always skipped.
    A file already seen in an earlier part is only replaced when the new copy
    is larger, so a non-empty version beats an empty placeholder.
    """
    always_skipped = (
        filename.endswith("/")
        or filename == "mimetype"
        or filename.startswith("META-INF/")
        or filename.endswith(".opf")
        or filename.endswith(".ncx")
    )
    if always_skipped:
        return False
    previous_size = added_files.get(filename)
    if previous_size is None:
        # First time we see this path.
        return True
    return zipfile.getinfo(filename).file_size > previous_size
output = epub.EpubBook()
opf_extracted = False
for file in files:
await self._download_and_write_file(file, temporary_file_location)
with ZipFile(temporary_file_location, "r") as zipfile:
for filepath in get_new_files(zipfile):
# Extract OPF metadata from first OPF file (before skipping)
if not opf_extracted:
for filename in zipfile.namelist():
if filename.endswith(".opf"):
opf_content = zipfile.read(filename)
opf_metadata = _extract_opf_metadata(opf_content)
# Store rendition properties in metadata
if opf_metadata.get('rendition_layout'):
metadata.rendition_layout = opf_metadata['rendition_layout']
if opf_metadata.get('rendition_spread'):
metadata.rendition_spread = opf_metadata['rendition_spread']
if opf_metadata.get('rendition_orientation'):
metadata.rendition_orientation = opf_metadata['rendition_orientation']
if opf_metadata.get('cover_href'):
cover_href = opf_metadata['cover_href']
if opf_metadata.get('spine_properties'):
spine_properties.update(opf_metadata['spine_properties'])
opf_extracted = True
break
# Collect CSS files, merging content from all parts
for filepath in zipfile.namelist():
if filepath.endswith(".css"):
content = zipfile.read(filepath)
if not content:
continue # Skip empty files
if filepath not in css_cache:
css_cache[filepath] = content
else:
# Merge: combine rules, keeping the longer version for duplicate selectors
existing_str = css_cache[filepath].decode('utf-8', errors='ignore')
new_str = content.decode('utf-8', errors='ignore')
# Parse existing rules into dict: key -> full rule
existing_rules = {}
for rule in existing_str.split('}'):
if '{' in rule:
rule_key = _get_css_rule_key(rule)
if rule_key:
existing_rules[rule_key] = rule.strip() + '}'
# Process new rules: add new ones, replace if longer
for rule in new_str.split('}'):
if '{' in rule:
rule_key = _get_css_rule_key(rule)
if rule_key:
new_rule = rule.strip() + '}'
if rule_key not in existing_rules or len(new_rule) > len(existing_rules[rule_key]):
existing_rules[rule_key] = new_rule
# Rebuild CSS from merged rules
css_cache[filepath] = '\n'.join(existing_rules.values()).encode('utf-8')
for filepath in zipfile.namelist():
# Skip CSS files here - they'll be added after all parts are merged
if filepath.endswith(".css"):
continue
if not should_add_file(zipfile, filepath):
continue
content = zipfile.read(filepath)
file_size = len(content)
if filepath.endswith("html"):
filename = os.path.basename(filepath)
# Fix fixed-layout pages if we have rendition:layout
if metadata.rendition_layout == 'pre-paginated':
# Find matching CSS (e.g., page1.xhtml -> page1.css)
css_path = filepath.replace('.xhtml', '.css').replace('.html', '.css')
css_content = css_cache.get(css_path)
if css_content:
content = _fix_fixed_layout_page(content, css_content)
is_in_toc = False
title = None
for key, value in data.files_in_toc.items():
@ -112,13 +342,28 @@ class Epub(OutputFormat):
title = value
is_in_toc = True
break
epub_file = epub.EpubHtml(
title = title,
# Use EpubItem to preserve original content (link tags, viewport, etc.)
# EpubHtml parses and regenerates HTML, stripping these
epub_file = epub.EpubItem(
file_name = filepath,
content = content
content = content,
media_type = 'application/xhtml+xml'
)
output.add_item(epub_file)
output.spine.append(epub_file)
# Skip nav.xhtml from spine for fixed-layout (causes blank first page)
is_nav = any(x in filepath.lower() for x in ['nav.xhtml', 'nav.html', 'toc.xhtml', 'toc.html'])
if not (is_nav and metadata.rendition_layout == 'pre-paginated'):
# Check for spine properties (page-spread-left/right)
# Try matching with different path variations
props = None
for href, prop_value in spine_properties.items():
if filepath.endswith(href) or href.endswith(os.path.basename(filepath)):
props = prop_value
break
if props:
output.spine.append((epub_file, props))
else:
output.spine.append(epub_file)
if is_in_toc:
output.toc.append(epub_file)
else:
@ -127,11 +372,96 @@ class Epub(OutputFormat):
content = content
)
output.add_item(epub_file)
added_files.add(filepath)
added_files[filepath] = file_size
if update:
update(progress)
os.remove(temporary_file_location)
# Add merged CSS files after all parts have been processed
for css_path, css_content in css_cache.items():
css_item = epub.EpubItem(
file_name=css_path,
content=css_content,
media_type='text/css'
)
output.add_item(css_item)
# Set cover image if found in source OPF, or detect from first page for fixed-layout
if not cover_href and metadata.rendition_layout == 'pre-paginated':
# Find first content page from spine (excluding nav/toc)
first_page = None
for spine_item in output.spine:
item = spine_item[0] if isinstance(spine_item, tuple) else spine_item
if hasattr(item, 'file_name') and item.file_name:
fname = item.file_name.lower()
# Skip nav and toc files
if 'nav.' in fname or 'toc.' in fname:
continue
if fname.endswith('.xhtml') or fname.endswith('.html'):
first_page = item
break
if first_page and hasattr(first_page, 'content') and first_page.content:
# Parse HTML to find all images and pick the largest one
try:
content = first_page.content.decode('utf-8') if isinstance(first_page.content, bytes) else first_page.content
img_matches = re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', content)
if img_matches:
page_dir = os.path.dirname(first_page.file_name)
# Build lookup dict for item sizes
item_sizes = {
item.file_name: len(item.content)
for item in output.items
if hasattr(item, 'file_name') and item.file_name
and hasattr(item, 'content') and item.content
}
best_img = None
best_size = 0
for img_src in img_matches:
img_path = os.path.normpath(os.path.join(page_dir, img_src))
# Find matching item by suffix
for file_name, size in item_sizes.items():
if file_name.endswith(img_path):
if size > best_size:
best_size = size
best_img = img_path
break
if best_img:
cover_href = best_img
except (UnicodeDecodeError, AttributeError):
pass
if cover_href:
# Find the cover image item and mark it as cover
for item in output.items:
if hasattr(item, 'file_name') and item.file_name and item.file_name.endswith(cover_href):
# Get or create item ID
item_id = item.id if hasattr(item, 'id') and item.id else os.path.basename(cover_href).replace('.', '-')
if not item.id:
item.id = item_id
# Add EPUB 2 cover metadata: <meta name="cover" content="image-id"/>
output.add_metadata('OPF', 'meta', '', {'name': 'cover', 'content': item_id})
# Mark item with EPUB 3 cover-image property
if not hasattr(item, 'properties') or item.properties is None:
item.properties = []
if 'cover-image' not in item.properties:
item.properties.append('cover-image')
break
# Apply rendition properties to output (fixed-layout support)
if metadata.rendition_layout:
output.add_metadata(None, 'meta', metadata.rendition_layout, {'property': 'rendition:layout'})
if metadata.rendition_spread:
output.add_metadata(None, 'meta', metadata.rendition_spread, {'property': 'rendition:spread'})
if metadata.rendition_orientation:
output.add_metadata(None, 'meta', metadata.rendition_orientation, {'property': 'rendition:orientation'})
output.add_item(epub.EpubNcx())
output.add_item(epub.EpubNav())
nav = epub.EpubNav()
output.add_item(nav)
# For fixed-layout, remove nav from spine (it shouldn't be in reading order)
if metadata.rendition_layout == 'pre-paginated':
output.spine = [item for item in output.spine if item != nav and not (isinstance(item, tuple) and item[0] == nav)]
epub.write_epub(location, output)

View File

@ -118,6 +118,10 @@ def _find_opf_file(epub_dir: str) -> str:
def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxml: bool) -> None:
"""Update EPUB metadata elements from Metadata object"""
if using_lxml:
from lxml import etree as ET
else:
import xml.etree.ElementTree as ET
# Helper function to create/update element
def update_or_create_element(tag: str, text: str, attribs: dict = None):
@ -129,13 +133,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
metadata_elem.remove(elem)
# Create new element
if using_lxml:
from lxml import etree as ET
elem = ET.SubElement(metadata_elem, tag)
else:
import xml.etree.ElementTree as ET
elem = ET.SubElement(metadata_elem, tag)
elem = ET.SubElement(metadata_elem, tag)
elem.text = str(text)
if attribs:
for key, value in attribs.items():
@ -145,14 +143,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
def create_meta(name: str, content):
if content is None:
return
if using_lxml:
from lxml import etree as ET
meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
else:
import xml.etree.ElementTree as ET
meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
meta.set('name', name)
meta.set('content', str(content))
@ -166,44 +157,25 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
elem.set('id', 'main-title')
# Add original title
if using_lxml:
from lxml import etree as ET
orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title")
else:
import xml.etree.ElementTree as ET
orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title")
orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title")
orig_title.set('id', 'original-title')
orig_title.text = metadata.original_title
# Add meta refinement for original title
if using_lxml:
meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
else:
meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
meta.set('refines', '#original-title')
meta.set('property', 'title-type')
meta.text = 'original'
# Authors
for author in metadata.authors:
if using_lxml:
from lxml import etree as ET
creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator")
else:
import xml.etree.ElementTree as ET
creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator")
creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator")
creator.text = author
creator.set(f"{{{ns['opf']}}}role", "aut")
# Translators
for translator in metadata.translators:
if using_lxml:
from lxml import etree as ET
contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor")
else:
import xml.etree.ElementTree as ET
contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor")
contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor")
contributor.text = translator
contributor.set(f"{{{ns['opf']}}}role", "trl")
@ -225,12 +197,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
metadata_elem.remove(elem)
# Add new ISBN
if using_lxml:
from lxml import etree as ET
identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier")
else:
import xml.etree.ElementTree as ET
identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier")
identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier")
identifier.text = metadata.isbn
identifier.set(f"{{{ns['opf']}}}scheme", "ISBN")
@ -240,22 +207,12 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
# Category
if metadata.category:
if using_lxml:
from lxml import etree as ET
subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
else:
import xml.etree.ElementTree as ET
subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
subject.text = metadata.category
# Tags
for tag in metadata.tags:
if using_lxml:
from lxml import etree as ET
subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
else:
import xml.etree.ElementTree as ET
subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
subject.text = tag
# Series info (Calibre format) - using series and index fields
@ -263,6 +220,26 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
create_meta("calibre:series", metadata.series)
create_meta("calibre:series_index", metadata.index)
# EPUB 3 rendition properties (fixed-layout support)
# These use <meta property="...">value</meta> format, not name/content
def create_meta_property(property_name: str, value: str):
    """Add <meta property="...">value</meta>, replacing any existing element."""
    if not value:
        return
    # Drop stale elements carrying the same property before re-adding.
    stale = [e for e in list(metadata_elem) if e.get('property') == property_name]
    for elem in stale:
        metadata_elem.remove(elem)
    node = ET.SubElement(metadata_elem, 'meta')
    node.set('property', property_name)
    node.text = value
if metadata.rendition_layout:
create_meta_property('rendition:layout', metadata.rendition_layout)
if metadata.rendition_spread:
create_meta_property('rendition:spread', metadata.rendition_spread)
if metadata.rendition_orientation:
create_meta_property('rendition:orientation', metadata.rendition_orientation)
def _repack_epub(epub_dir: str, output_path: str) -> None:
"""Repack EPUB directory into ZIP file"""