feat: add fixed-layout support for split EPUB sources

- Add rendition properties to Metadata (layout, spread, orientation) - Extract and preserve rendition properties when merging EPUB parts - Fix viewport meta tags for fixed-layout pages - Use EpubItem instead of EpubHtml to preserve original content Fix EPUB validation errors in merged output: - Skip directory entries, mimetype, and META-INF from manifest - Exclude nav/toc from spine for fixed-layout books Improve CSS merging across parts: - Keep longer version of duplicate selectors (more complete rules) - Return None from _get_css_rule_key for invalid font-faces Add cover detection fallback: - Detect cover from largest image on first page when OPF lacks cover info - Optimize lookup with dict instead of nested loops Clean up redundant imports in epub_metadata.py
2026-06-06 14:34:57 -06:00 · 2026-01-16 11:05:33 +01:00 · 2026-01-16 11:05:33 +01:00 · 08ddad3a74
commit 08ddad3a74
parent ed8fe9eafa
3 changed files with 383 additions and 72 deletions
--- a/grawlix/book.py
+++ b/grawlix/book.py
@ -20,6 +20,10 @@ class Metadata:
    translators: list[str] = field(default_factory=list)
    category: Optional[str] = None
    tags: list[str] = field(default_factory=list)
    # EPUB 3 rendition properties (fixed-layout support)
    rendition_layout: Optional[str] = None      # "pre-paginated" or "reflowable"
    rendition_spread: Optional[str] = None      # "none", "auto", "landscape", "portrait", "both"
    rendition_orientation: Optional[str] = None # "auto", "landscape", "portrait"
    def as_dict(self) -> dict:
        return {
--- a/grawlix/output/epub.py
+++ b/grawlix/output/epub.py
@ -3,12 +3,162 @@ from grawlix.exceptions import UnsupportedOutputFormat
 from .output_format import OutputFormat, Update
 import asyncio
 from bs4 import BeautifulSoup
 import os
-from ebooklib import epub
+import re
 import xml.etree.ElementTree as ET
 from zipfile import ZipFile
 from bs4 import BeautifulSoup
 from ebooklib import epub
 import rich
 def _fix_fixed_layout_page(html_content: bytes, css_content: bytes = None) -> bytes:
    """
    Fix fixed-layout XHTML pages by adding viewport and fixing broken styles.
    Extracts dimensions from CSS and applies them to viewport and inline styles.
    """
    try:
        html_str = html_content.decode('utf-8')
    except UnicodeDecodeError:
        return html_content
    # Extract dimensions from CSS if provided
    width = None
    height = None
    if css_content:
        try:
            css_str = css_content.decode('utf-8')
            # Look for body width/height
            width_match = re.search(r'body\s*\{[^}]*width:\s*(\d+)px', css_str)
            height_match = re.search(r'body\s*\{[^}]*height:\s*(\d+)px', css_str)
            if width_match:
                width = width_match.group(1)
            if height_match:
                height = height_match.group(1)
        except UnicodeDecodeError:
            pass
    if not width or not height:
        return html_content
    # Add viewport meta tag if missing
    if 'name="viewport"' not in html_str and '<head>' in html_str:
        viewport_tag = f'<meta name="viewport" content="width={width}, height={height}"/>'
        html_str = html_str.replace('<head>', f'<head>\n    {viewport_tag}', 1)
    # Fix broken inline styles (width:px; height:px;)
    html_str = re.sub(
        r'style="width:px;\s*height:px;"',
        f'style="width:{width}px; height:{height}px;"',
        html_str
    )
    return html_str.encode('utf-8')
 def _get_css_rule_key(rule_text: str) -> str | None:
    """Get unique key for a CSS rule. For @font-face, include font-family."""
    selector = rule_text.split('{')[0].strip()
    if selector == '@font-face':
        # Extract font-family to distinguish different font-faces
        match = re.search(r'font-family:\s*["\']?([^"\';}]+)', rule_text)
        if match:
            return f'@font-face:{match.group(1).strip()}'
        return None  # Skip font-face without font-family
    return selector if selector else None
 def _extract_opf_metadata(opf_content: bytes) -> dict:
    """
    Extract rendition properties, cover info, and spine properties from OPF content.
    Returns dict with keys: rendition_layout, rendition_spread,
    rendition_orientation, cover_id, cover_href, spine_properties
    """
    result = {
        'rendition_layout': None,
        'rendition_spread': None,
        'rendition_orientation': None,
        'cover_id': None,
        'cover_href': None,
        'spine_properties': {},  # Maps href -> properties (e.g., 'page-spread-left')
    }
    try:
        root = ET.fromstring(opf_content)
        ns = {
            'opf': 'http://www.idpf.org/2007/opf',
            'dc': 'http://purl.org/dc/elements/1.1/',
        }
        # Find metadata element
        metadata = root.find('opf:metadata', ns)
        if metadata is None:
            metadata = root.find('{http://www.idpf.org/2007/opf}metadata')
        if metadata is None:
            return result
        # Extract rendition properties from <meta property="rendition:X">
        for meta in metadata.iter():
            if meta.tag.endswith('}meta') or meta.tag == 'meta':
                prop = meta.get('property', '')
                if prop == 'rendition:layout':
                    result['rendition_layout'] = meta.text
                elif prop == 'rendition:spread':
                    result['rendition_spread'] = meta.text
                elif prop == 'rendition:orientation':
                    result['rendition_orientation'] = meta.text
                # Cover reference: <meta name="cover" content="image-id"/>
                name = meta.get('name', '')
                if name == 'cover':
                    result['cover_id'] = meta.get('content')
        # Parse manifest once for cover info and id->href mapping
        manifest = root.find('opf:manifest', ns)
        if manifest is None:
            manifest = root.find('{http://www.idpf.org/2007/opf}manifest')
        id_to_href = {}
        if manifest is not None:
            for item in manifest.iter():
                item_id = item.get('id')
                item_href = item.get('href')
                if item_id and item_href:
                    id_to_href[item_id] = item_href
                # Check for cover by ID match
                if result['cover_id'] and item_id == result['cover_id'] and not result['cover_href']:
                    result['cover_href'] = item_href
                # Check for cover-image property
                props = item.get('properties', '')
                if 'cover-image' in props and not result['cover_href']:
                    result['cover_href'] = item_href
                    result['cover_id'] = item_id
        # Extract spine properties (page-spread-left, page-spread-right)
        spine = root.find('opf:spine', ns)
        if spine is None:
            spine = root.find('{http://www.idpf.org/2007/opf}spine')
        if spine is not None:
            # Extract spine itemref properties
            for itemref in spine.iter():
                if itemref.tag.endswith('}itemref') or itemref.tag == 'itemref':
                    idref = itemref.get('idref')
                    props = itemref.get('properties')
                    if idref and props and idref in id_to_href:
                        href = id_to_href[idref]
                        result['spine_properties'][href] = props
    except ET.ParseError:
        pass
    return result
 class Epub(OutputFormat):
    extension = "epub"
    input_types = [SingleFile, HtmlFiles, EpubInParts]
@ -88,22 +238,102 @@ class Epub(OutputFormat):
        progress = 1/(file_count)
        temporary_file_location = f"{location}.tmp"
-        added_files: set[str] = set()
+        added_files: dict[str, int] = {}  # Track filepath -> content size
-        def get_new_files(zipfile: ZipFile):
+        opf_metadata: dict = {}
-            """Returns files in zipfile not already added to file"""
+        css_cache: dict[str, bytes] = {}  # Store CSS content for fixing HTML pages
-            for filename in zipfile.namelist():
+        cover_href: str = None  # Store cover image path from OPF
-                if filename in added_files or filename.endswith(".opf") or filename.endswith(".ncx"):
+        spine_properties: dict[str, str] = {}  # Store spine properties (href -> properties)
-                    continue
+
-                yield filename
+        def should_add_file(zipfile: ZipFile, filename: str) -> bool:
            """Check if file should be added (new or larger than existing)"""
            # Skip directory entries, container files (ebooklib handles these), and OPF/NCX
            if filename.endswith("/"):
                return False
            if filename == "mimetype" or filename.startswith("META-INF/"):
                return False
            if filename.endswith(".opf") or filename.endswith(".ncx"):
                return False
            if filename not in added_files:
                return True
            # If file exists, only replace if new version is larger (non-empty beats empty)
            new_size = zipfile.getinfo(filename).file_size
            return new_size > added_files[filename]
        output = epub.EpubBook()
        opf_extracted = False
        for file in files:
            await self._download_and_write_file(file, temporary_file_location)
            with ZipFile(temporary_file_location, "r") as zipfile:
-                for filepath in get_new_files(zipfile):
+                # Extract OPF metadata from first OPF file (before skipping)
                if not opf_extracted:
                    for filename in zipfile.namelist():
                        if filename.endswith(".opf"):
                            opf_content = zipfile.read(filename)
                            opf_metadata = _extract_opf_metadata(opf_content)
                            # Store rendition properties in metadata
                            if opf_metadata.get('rendition_layout'):
                                metadata.rendition_layout = opf_metadata['rendition_layout']
                            if opf_metadata.get('rendition_spread'):
                                metadata.rendition_spread = opf_metadata['rendition_spread']
                            if opf_metadata.get('rendition_orientation'):
                                metadata.rendition_orientation = opf_metadata['rendition_orientation']
                            if opf_metadata.get('cover_href'):
                                cover_href = opf_metadata['cover_href']
                            if opf_metadata.get('spine_properties'):
                                spine_properties.update(opf_metadata['spine_properties'])
                            opf_extracted = True
                            break
                # Collect CSS files, merging content from all parts
                for filepath in zipfile.namelist():
                    if filepath.endswith(".css"):
                        content = zipfile.read(filepath)
                        if not content:
                            continue  # Skip empty files
                        if filepath not in css_cache:
                            css_cache[filepath] = content
                        else:
                            # Merge: combine rules, keeping the longer version for duplicate selectors
                            existing_str = css_cache[filepath].decode('utf-8', errors='ignore')
                            new_str = content.decode('utf-8', errors='ignore')
                            # Parse existing rules into dict: key -> full rule
                            existing_rules = {}
                            for rule in existing_str.split('}'):
                                if '{' in rule:
                                    rule_key = _get_css_rule_key(rule)
                                    if rule_key:
                                        existing_rules[rule_key] = rule.strip() + '}'
                            # Process new rules: add new ones, replace if longer
                            for rule in new_str.split('}'):
                                if '{' in rule:
                                    rule_key = _get_css_rule_key(rule)
                                    if rule_key:
                                        new_rule = rule.strip() + '}'
                                        if rule_key not in existing_rules or len(new_rule) > len(existing_rules[rule_key]):
                                            existing_rules[rule_key] = new_rule
                            # Rebuild CSS from merged rules
                            css_cache[filepath] = '\n'.join(existing_rules.values()).encode('utf-8')
                for filepath in zipfile.namelist():
                    # Skip CSS files here - they'll be added after all parts are merged
                    if filepath.endswith(".css"):
                        continue
                    if not should_add_file(zipfile, filepath):
                        continue
                    content = zipfile.read(filepath)
                    file_size = len(content)
                    if filepath.endswith("html"):
                        filename = os.path.basename(filepath)
                        # Fix fixed-layout pages if we have rendition:layout
                        if metadata.rendition_layout == 'pre-paginated':
                            # Find matching CSS (e.g., page1.xhtml -> page1.css)
                            css_path = filepath.replace('.xhtml', '.css').replace('.html', '.css')
                            css_content = css_cache.get(css_path)
                            if css_content:
                                content = _fix_fixed_layout_page(content, css_content)
                        is_in_toc = False
                        title = None
                        for key, value in data.files_in_toc.items():
@ -112,13 +342,28 @@ class Epub(OutputFormat):
                                title = value
                                is_in_toc = True
                                break
-                        epub_file = epub.EpubHtml(
+                        # Use EpubItem to preserve original content (link tags, viewport, etc.)
-                            title = title,
+                        # EpubHtml parses and regenerates HTML, stripping these
                        epub_file = epub.EpubItem(
                            file_name = filepath,
-                            content = content
+                            content = content,
                            media_type = 'application/xhtml+xml'
                        )
                        output.add_item(epub_file)
-                        output.spine.append(epub_file)
+                        # Skip nav.xhtml from spine for fixed-layout (causes blank first page)
                        is_nav = any(x in filepath.lower() for x in ['nav.xhtml', 'nav.html', 'toc.xhtml', 'toc.html'])
                        if not (is_nav and metadata.rendition_layout == 'pre-paginated'):
                            # Check for spine properties (page-spread-left/right)
                            # Try matching with different path variations
                            props = None
                            for href, prop_value in spine_properties.items():
                                if filepath.endswith(href) or href.endswith(os.path.basename(filepath)):
                                    props = prop_value
                                    break
                            if props:
                                output.spine.append((epub_file, props))
                            else:
                                output.spine.append(epub_file)
                        if is_in_toc:
                            output.toc.append(epub_file)
                    else:
@ -127,11 +372,96 @@ class Epub(OutputFormat):
                            content = content
                        )
                        output.add_item(epub_file)
-                    added_files.add(filepath)
+                    added_files[filepath] = file_size
            if update:
                update(progress)
        os.remove(temporary_file_location)
        # Add merged CSS files after all parts have been processed
        for css_path, css_content in css_cache.items():
            css_item = epub.EpubItem(
                file_name=css_path,
                content=css_content,
                media_type='text/css'
            )
            output.add_item(css_item)
        # Set cover image if found in source OPF, or detect from first page for fixed-layout
        if not cover_href and metadata.rendition_layout == 'pre-paginated':
            # Find first content page from spine (excluding nav/toc)
            first_page = None
            for spine_item in output.spine:
                item = spine_item[0] if isinstance(spine_item, tuple) else spine_item
                if hasattr(item, 'file_name') and item.file_name:
                    fname = item.file_name.lower()
                    # Skip nav and toc files
                    if 'nav.' in fname or 'toc.' in fname:
                        continue
                    if fname.endswith('.xhtml') or fname.endswith('.html'):
                        first_page = item
                        break
            if first_page and hasattr(first_page, 'content') and first_page.content:
                # Parse HTML to find all images and pick the largest one
                try:
                    content = first_page.content.decode('utf-8') if isinstance(first_page.content, bytes) else first_page.content
                    img_matches = re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', content)
                    if img_matches:
                        page_dir = os.path.dirname(first_page.file_name)
                        # Build lookup dict for item sizes
                        item_sizes = {
                            item.file_name: len(item.content)
                            for item in output.items
                            if hasattr(item, 'file_name') and item.file_name
                            and hasattr(item, 'content') and item.content
                        }
                        best_img = None
                        best_size = 0
                        for img_src in img_matches:
                            img_path = os.path.normpath(os.path.join(page_dir, img_src))
                            # Find matching item by suffix
                            for file_name, size in item_sizes.items():
                                if file_name.endswith(img_path):
                                    if size > best_size:
                                        best_size = size
                                        best_img = img_path
                                    break
                        if best_img:
                            cover_href = best_img
                except (UnicodeDecodeError, AttributeError):
                    pass
        if cover_href:
            # Find the cover image item and mark it as cover
            for item in output.items:
                if hasattr(item, 'file_name') and item.file_name and item.file_name.endswith(cover_href):
                    # Get or create item ID
                    item_id = item.id if hasattr(item, 'id') and item.id else os.path.basename(cover_href).replace('.', '-')
                    if not item.id:
                        item.id = item_id
                    # Add EPUB 2 cover metadata: <meta name="cover" content="image-id"/>
                    output.add_metadata('OPF', 'meta', '', {'name': 'cover', 'content': item_id})
                    # Mark item with EPUB 3 cover-image property
                    if not hasattr(item, 'properties') or item.properties is None:
                        item.properties = []
                    if 'cover-image' not in item.properties:
                        item.properties.append('cover-image')
                    break
        # Apply rendition properties to output (fixed-layout support)
        if metadata.rendition_layout:
            output.add_metadata(None, 'meta', metadata.rendition_layout, {'property': 'rendition:layout'})
        if metadata.rendition_spread:
            output.add_metadata(None, 'meta', metadata.rendition_spread, {'property': 'rendition:spread'})
        if metadata.rendition_orientation:
            output.add_metadata(None, 'meta', metadata.rendition_orientation, {'property': 'rendition:orientation'})
        output.add_item(epub.EpubNcx())
-        output.add_item(epub.EpubNav())
+        nav = epub.EpubNav()
        output.add_item(nav)
        # For fixed-layout, remove nav from spine (it shouldn't be in reading order)
        if metadata.rendition_layout == 'pre-paginated':
            output.spine = [item for item in output.spine if item != nav and not (isinstance(item, tuple) and item[0] == nav)]
        epub.write_epub(location, output)
--- a/grawlix/output/metadata/epub_metadata.py
+++ b/grawlix/output/metadata/epub_metadata.py
@ -118,6 +118,10 @@ def _find_opf_file(epub_dir: str) -> str:
 def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxml: bool) -> None:
    """Update EPUB metadata elements from Metadata object"""
    if using_lxml:
        from lxml import etree as ET
    else:
        import xml.etree.ElementTree as ET
    # Helper function to create/update element
    def update_or_create_element(tag: str, text: str, attribs: dict = None):
@ -129,13 +133,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
            metadata_elem.remove(elem)
        # Create new element
-        if using_lxml:
+        elem = ET.SubElement(metadata_elem, tag)
            from lxml import etree as ET
            elem = ET.SubElement(metadata_elem, tag)
        else:
            import xml.etree.ElementTree as ET
            elem = ET.SubElement(metadata_elem, tag)
        elem.text = str(text)
        if attribs:
            for key, value in attribs.items():
@ -145,14 +143,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
    def create_meta(name: str, content):
        if content is None:
            return
-
+        meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
        if using_lxml:
            from lxml import etree as ET
            meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
        else:
            import xml.etree.ElementTree as ET
            meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
        meta.set('name', name)
        meta.set('content', str(content))
@ -166,44 +157,25 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
            elem.set('id', 'main-title')
        # Add original title
-        if using_lxml:
+        orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title")
            from lxml import etree as ET
            orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title")
        else:
            import xml.etree.ElementTree as ET
            orig_title = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}title")
        orig_title.set('id', 'original-title')
        orig_title.text = metadata.original_title
        # Add meta refinement for original title
-        if using_lxml:
+        meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
            meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
        else:
            meta = ET.SubElement(metadata_elem, f"{{{ns['opf']}}}meta")
        meta.set('refines', '#original-title')
        meta.set('property', 'title-type')
        meta.text = 'original'
    # Authors
    for author in metadata.authors:
-        if using_lxml:
+        creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator")
            from lxml import etree as ET
            creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator")
        else:
            import xml.etree.ElementTree as ET
            creator = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}creator")
        creator.text = author
        creator.set(f"{{{ns['opf']}}}role", "aut")
    # Translators
    for translator in metadata.translators:
-        if using_lxml:
+        contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor")
            from lxml import etree as ET
            contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor")
        else:
            import xml.etree.ElementTree as ET
            contributor = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}contributor")
        contributor.text = translator
        contributor.set(f"{{{ns['opf']}}}role", "trl")
@ -225,12 +197,7 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
                metadata_elem.remove(elem)
        # Add new ISBN
-        if using_lxml:
+        identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier")
            from lxml import etree as ET
            identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier")
        else:
            import xml.etree.ElementTree as ET
            identifier = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}identifier")
        identifier.text = metadata.isbn
        identifier.set(f"{{{ns['opf']}}}scheme", "ISBN")
@ -240,22 +207,12 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
    # Category
    if metadata.category:
-        if using_lxml:
+        subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
            from lxml import etree as ET
            subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
        else:
            import xml.etree.ElementTree as ET
            subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
        subject.text = metadata.category
    # Tags
    for tag in metadata.tags:
-        if using_lxml:
+        subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
            from lxml import etree as ET
            subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
        else:
            import xml.etree.ElementTree as ET
            subject = ET.SubElement(metadata_elem, f"{{{ns['dc']}}}subject")
        subject.text = tag
    # Series info (Calibre format) - using series and index fields
@ -263,6 +220,26 @@ def _update_epub_metadata(metadata_elem, metadata: Metadata, ns: dict, using_lxm
        create_meta("calibre:series", metadata.series)
        create_meta("calibre:series_index", metadata.index)
    # EPUB 3 rendition properties (fixed-layout support)
    # These use <meta property="...">value</meta> format, not name/content
    def create_meta_property(property_name: str, value: str):
        if not value:
            return
        # Remove existing property if present
        for elem in list(metadata_elem):
            if elem.get('property') == property_name:
                metadata_elem.remove(elem)
        meta = ET.SubElement(metadata_elem, 'meta')
        meta.set('property', property_name)
        meta.text = value
    if metadata.rendition_layout:
        create_meta_property('rendition:layout', metadata.rendition_layout)
    if metadata.rendition_spread:
        create_meta_property('rendition:spread', metadata.rendition_spread)
    if metadata.rendition_orientation:
        create_meta_property('rendition:orientation', metadata.rendition_orientation)
 def _repack_epub(epub_dir: str, output_path: str) -> None:
    """Repack EPUB directory into ZIP file"""