From fc136038734609f6690e8646407f74b8a5a45731 Mon Sep 17 00:00:00 2001
From: Joakim Holm <mail@joakimholm.xyz>
Date: Thu, 4 May 2023 21:30:50 +0200
Subject: [PATCH 1/3] Add basic Internet Archive support

---
 README.md                           |   1 +
 grawlix/__main__.py                 |   2 +-
 grawlix/book.py                     |  11 ++-
 grawlix/output/__init__.py          |   2 +
 grawlix/output/acsm.py              |  22 ++++++
 grawlix/output/output_format.py     |  23 +++++-
 grawlix/sources/__init__.py         |   2 +
 grawlix/sources/ereolen.py          |   2 +-
 grawlix/sources/internet_archive.py | 111 ++++++++++++++++++++++++++++
 9 files changed, 170 insertions(+), 6 deletions(-)
 create mode 100644 grawlix/output/acsm.py
 create mode 100644 grawlix/sources/internet_archive.py

diff --git a/README.md b/README.md
index 36bc222..dbbd7b8 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ CLI ebook downloader
 grawlix currently supports downloading from the following sources:
 - [eReolen](https://ereolen.dk)
 - [Flipp](https://flipp.dk)
+- [Internet Archive](https://archive.org)
 - [Manga Plus](https://mangaplus.shueisha.co.jp)
 - [Royal Road](https://www.royalroad.com)
 - [Saxo](https://saxo.com)
diff --git a/grawlix/__main__.py b/grawlix/__main__.py
index be376ea..d05a947 100644
--- a/grawlix/__main__.py
+++ b/grawlix/__main__.py
@@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
     :param options: Command line options
     :returns: Login credentials
     """
-    source_name = source.name.lower()
+    source_name = source.name.lower().replace(" ", "")
     if source_name in config.sources:
         username = config.sources[source_name].username or options.username
         password = config.sources[source_name].password or options.password
diff --git a/grawlix/book.py b/grawlix/book.py
index c1ec9d8..ae49564 100644
--- a/grawlix/book.py
+++ b/grawlix/book.py
@@ -31,11 +31,20 @@ class OnlineFile:
     encryption: Optional[Encryption] = None
     headers: Optional[dict[str, str]] = None
 
+@dataclass(slots=True)
+class OfflineFile:
+    """Stores the content of a file that is already held in memory"""
+    content: bytes  # Raw bytes of the file
+    extension: str  # File extension, e.g. "acsm"
+    encryption: Optional[Encryption] = None  # If set, `content` is decrypted before it is written to disk
+
+File = Union[OnlineFile, OfflineFile]
+
 
 @dataclass(slots=True)
 class SingleFile:
     """Bookdata in the form of a single file"""
-    file: OnlineFile
+    file: File
 
 
 @dataclass(slots=True)
diff --git a/grawlix/output/__init__.py b/grawlix/output/__init__.py
index 33f5c2c..fcca4b9 100644
--- a/grawlix/output/__init__.py
+++ b/grawlix/output/__init__.py
@@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError
 from grawlix.logging import info
 
 from .output_format import OutputFormat
+from .acsm import Acsm
 from .cbz import Cbz
 from .epub import Epub
 
@@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
     :returns: List of available output format classes
     """
     return [
+        Acsm,
         Cbz,
         Epub,
     ]
diff --git a/grawlix/output/acsm.py b/grawlix/output/acsm.py
new file mode 100644
index 0000000..ca5230f
--- /dev/null
+++ b/grawlix/output/acsm.py
@@ -0,0 +1,22 @@
+from grawlix.book import Book
+from .output_format import OutputFormat, Update
+import shutil
+import subprocess
+
+class Acsm(OutputFormat):
+    extension = "acsm"
+
+    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
+        # Download and write acsm file to disk
+        await super().dl_single_file(book, location, update_func)
+        # TODO: Implement more general solution
+        # Decrypt if knock is available
+        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
+        if shutil.which("knock") is not None:
+            subprocess.run(
+                ["knock", location],
+                capture_output = True
+            )
+        else:
+            # TODO: Print warning
+            pass
diff --git a/grawlix/output/output_format.py b/grawlix/output/output_format.py
index 83406e6..f9a60ea 100644
--- a/grawlix/output/output_format.py
+++ b/grawlix/output/output_format.py
@@ -1,4 +1,4 @@
-from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
+from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile
 from grawlix.exceptions import UnsupportedOutputFormat
 from grawlix.encryption import decrypt
 
@@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]]
 
 class OutputFormat:
     # Extension for output files
-    extension: str = ""
+    extension: str
 
     def __init__(self) -> None:
         self._client = httpx.AsyncClient()
@@ -32,7 +32,10 @@ class OutputFormat:
             raise UnsupportedOutputFormat
         if not book.data.file.extension == self.extension:
             raise UnsupportedOutputFormat
-        await self._download_and_write_file(book.data.file, location, update_func)
+        if isinstance(book.data.file, OnlineFile):
+            await self._download_and_write_file(book.data.file, location, update_func)
+        elif isinstance(book.data.file, OfflineFile):
+            self._write_offline_file(book.data.file, location)
 
 
     async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
@@ -88,3 +91,17 @@ class OutputFormat:
         content = await self._download_file(file, update)
         with open(location, "wb") as f:
             f.write(content)
+
+
+    def _write_offline_file(self, file: OfflineFile, location: str) -> None:
+        """
+        Decrypt (if encrypted) and write the content of an `OfflineFile` to disk
+
+        :param file: File to write to disk
+        :param location: Path to where the file is written
+        """
+        content = file.content
+        if file.encryption:
+            content = decrypt(content, file.encryption)
+        with open(location, "wb") as f:  # Opened last so a failed decrypt leaves no empty file behind
+            f.write(content)
diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py
index cd87be7..7e1fbf4 100644
--- a/grawlix/sources/__init__.py
+++ b/grawlix/sources/__init__.py
@@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
 from .source import Source
 from .ereolen import Ereolen
 from .flipp import Flipp
+from .internet_archive import InternetArchive
 from .mangaplus import MangaPlus
 from .royal_road import RoyalRoad
 from .saxo import Saxo
@@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
     return [
         Ereolen,
         Flipp,
+        InternetArchive,
         MangaPlus,
         RoyalRoad,
         Saxo,
diff --git a/grawlix/sources/ereolen.py b/grawlix/sources/ereolen.py
index 8360260..3047e17 100644
--- a/grawlix/sources/ereolen.py
+++ b/grawlix/sources/ereolen.py
@@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
 from grawlix.encryption import AESCTREncryption
 from grawlix.exceptions import InvalidUrl, DataNotFound
 from grawlix.utils import nearest_string
-
 from .source import Source
+
 from bs4 import BeautifulSoup
 import json
 import re
diff --git a/grawlix/sources/internet_archive.py b/grawlix/sources/internet_archive.py
new file mode 100644
index 0000000..d158723
--- /dev/null
+++ b/grawlix/sources/internet_archive.py
@@ -0,0 +1,111 @@
+from grawlix.book import Book, SingleFile, Metadata, OfflineFile
+from .source import Source
+
+import random
+import string
+from bs4 import BeautifulSoup
+import asyncio
+
+class InternetArchive(Source):
+    name: str = "Internet Archive"
+    match: list[str] = [
+        r"https://archive.org/details/.+"
+    ]
+    _authentication_methods = [ "login", "cookies" ]
+    _login_credentials = [ "username", "password" ]
+
+    @staticmethod
+    def _format_data(content_type: str, fields):
+        data = ""  # `content_type` is the multipart boundary string, `fields` maps field names to values
+        for name, value in fields.items():
+            data += f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
+        data += f"--{content_type}--"  # Closing delimiter is "--" + boundary + "--" (RFC 2046)
+        return data
+
+
+    async def login(self, username: str, password: str, **kwargs) -> None:
+        await self._client.get("https://archive.org/account/login")  # presumably picks up session cookies - verify
+        content_type = "----WebKitFormBoundary"+"".join(random.sample(string.ascii_letters + string.digits, 16))
+        headers = {'Content-Type': 'multipart/form-data; boundary='+content_type}
+        data = self._format_data(content_type, {"username":username, "password":password, "submit_by_js":"true"})
+        response = await self._client.post(
+            "https://archive.org/account/login",
+            content=data,  # Body is already-encoded multipart text, not a form mapping
+            headers=headers
+        )
+        if "Successful login" not in response.text:
+            print("Failed login")
+            raise SystemExit(1)  # Same effect as exit(1) without relying on the `site`-provided builtin
+
+
+    async def _download_acsm(self, book_id: str) -> bytes:
+        """
+        Loan book on archive.org and download acsm file
+
+        :param book_id: Id of book
+        """
+        await self._client.post(
+            "https://archive.org/services/loans/loan/searchInside.php",
+            data = {
+                "action": "grant_access",
+                "identifier": book_id
+            }
+        )
+        await self._client.post(
+            "https://archive.org/services/loans/loan/",
+            data = {
+                "action": "browse_book",
+                "identifier": book_id
+            }
+        )
+        # TODO: Error handling
+        await self._client.post(
+            "https://archive.org/services/loans/loan/",
+            data = {
+                "action": "create_token",
+                "identifier": book_id
+            }
+        )
+        acsm_response = await self._client.get(
+            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
+            follow_redirects = True
+        )
+        return acsm_response.content
+
+
+    async def download(self, url: str) -> Book:
+        book_id = url.split("/")[4].split("?")[0].split("#")[0]  # Id is the path segment after /details/, without query string or fragment
+        metadata, acsm_file = await asyncio.gather(
+            self._download_metadata(book_id),
+            self._download_acsm(book_id)
+        )
+        return Book(
+            data = SingleFile(
+                OfflineFile(
+                    content = acsm_file,
+                    extension = "acsm",
+                )
+            ),
+            metadata = Metadata(
+                title = metadata["title"],
+                authors = [ metadata.get("creator") ] if "creator" in metadata else []
+            )
+        )
+
+
+    async def _download_metadata(self, book_id: str) -> dict:
+        """
+        Download metadata for book
+
+        :param book_id: Id of book
+        :returns: Dictionary with metadata
+        """
+        page_response = await self._client.get(
+            f"https://archive.org/details/{book_id}"
+        )
+        soup = BeautifulSoup(page_response.text, "lxml")
+        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
+        metadata_response = await self._client.get(
+            f"https:{metadata_url}"
+        )
+        return metadata_response.json()["data"]["metadata"]

From c2545b871fc5ee2ae784773daa5d2bf2023e7ddd Mon Sep 17 00:00:00 2001
From: Joakim Holm <mail@joakimholm.xyz>
Date: Fri, 5 May 2023 12:04:59 +0200
Subject: [PATCH 2/3] Small changes

- Fix config to load source configuration correctly
- Add a newline in readme to be more readable
---
 README.md         | 1 +
 grawlix/config.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dbbd7b8..a601e3a 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 ![GitHub top language](https://img.shields.io/github/languages/top/jo1gi/grawlix)
 ![License](https://img.shields.io/github/license/jo1gi/grawlix)
 [![Donate using Ko-Fi](https://img.shields.io/badge/donate-kofi-00b9fe?logo=ko-fi&logoColor=00b9fe)](https://ko-fi.com/jo1gi)
+
 CLI ebook downloader
 
 ## Supported services
diff --git a/grawlix/config.py b/grawlix/config.py
index 4599fea..ce73133 100644
--- a/grawlix/config.py
+++ b/grawlix/config.py
@@ -33,7 +33,7 @@ def load_config() -> Config:
     else:
         config_dict = {}
     sources = {}
-    if "source" in config_dict:
+    if "sources" in config_dict:
         for key, values in config_dict["sources"].items():
             sources[key] = SourceConfig (
                 username = values.get("username"),

From f91a32b0abf8dbd8e63632b4aaefc98a9442a449 Mon Sep 17 00:00:00 2001
From: Joakim Holm <mail@joakimholm.xyz>
Date: Fri, 5 May 2023 12:05:52 +0200
Subject: [PATCH 3/3] Restructure output system

Output formats are now selected based on both the type of the book data
and the file extension of the output file. The extension of the output
location is used when it names a known format; otherwise the default
format for the book data is used.
---
 grawlix/output/__init__.py      | 53 +++++++++++++++++++--------------
 grawlix/output/acsm.py          |  7 +++--
 grawlix/output/cbz.py           | 13 ++++----
 grawlix/output/epub.py          | 18 +++++++----
 grawlix/output/output_format.py | 38 +++++++++--------------
 5 files changed, 68 insertions(+), 61 deletions(-)

diff --git a/grawlix/output/__init__.py b/grawlix/output/__init__.py
index fcca4b9..35eb1d2 100644
--- a/grawlix/output/__init__.py
+++ b/grawlix/output/__init__.py
@@ -1,5 +1,5 @@
 from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles
-from grawlix.exceptions import GrawlixError
+from grawlix.exceptions import GrawlixError, UnsupportedOutputFormat
 from grawlix.logging import info
 
 from .output_format import OutputFormat
@@ -17,7 +17,12 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non
 
     :param book: Book to download
     """
-    output_format = get_default_format(book.data)
+    _, ext = os.path.splitext(template)
+    ext = ext[1:]
+    if ext in get_valid_extensions():
+        output_format = find_output_format(book, ext)()
+    else:
+        output_format = get_default_format(book)
     location = format_output_location(book, output_format, template)
     if not book.overwrite and os.path.exists(location):
         info("Skipping - File already exists")
@@ -25,14 +30,7 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non
     parent = Path(location).parent
     if not parent.exists():
         os.makedirs(parent)
-    if isinstance(book.data, SingleFile):
-        await output_format.dl_single_file(book, location, update_func)
-    elif isinstance(book.data, ImageList):
-        await output_format.dl_image_list(book, location, update_func)
-    elif isinstance(book.data, HtmlFiles):
-        await output_format.dl_html_files(book, location, update_func)
-    else:
-        raise NotImplementedError
+    await output_format.download(book, location, update_func)
     await output_format.close()
 
 
@@ -49,34 +47,43 @@ def format_output_location(book: Book, output_format: OutputFormat, template: st
     return template.format(**values, ext = output_format.extension)
 
 
-def get_default_format(bookdata: BookData) -> OutputFormat:
+def get_default_format(book: Book) -> OutputFormat:
     """
     Get default output format for bookdata.
     Should only be used if no format was specified by the user
 
-    :param bookdata: Content of book
+    :param book: Book to find the default output format for
     :returns: OutputFormat object matching the default
     """
-    if isinstance(bookdata, SingleFile):
-        return output_format_from_str(bookdata.file.extension)
-    if isinstance(bookdata, ImageList):
-        return Cbz()
-    if isinstance(bookdata, HtmlFiles):
-        return Epub()
-    raise GrawlixError
+    if isinstance(book.data, SingleFile):
+        extension = book.data.file.extension
+    elif isinstance(book.data, ImageList):
+        extension = "cbz"
+    elif isinstance(book.data, HtmlFiles):
+        extension = "epub"
+    else:
+        raise GrawlixError  # Unknown book data type; previously left `extension` unbound
+    return find_output_format(book, extension)()
 
 
-def output_format_from_str(name: str) -> OutputFormat:
+def find_output_format(book: Book, extension: str) -> type[OutputFormat]:
     """
-    Convert string to outputformat object
+    Find a compatible output format
 
-    :param name: Name of output format
-    :returns: OutputFormat object
+    :param book: Book to download
+    :param extension: Extension of output file
+    :returns: Compatible OutputFormat type
+    :raises: UnsupportedOutputFormat if nothing is found
     """
     for output_format in get_output_formats():
-        if output_format.extension == name:
-            return output_format()
-    raise GrawlixError
+        matches_extension = output_format.extension == extension
+        supports_bookdata = type(book.data) in output_format.input_types
+        if matches_extension and supports_bookdata:
+            return output_format
+    raise UnsupportedOutputFormat
+
+def get_valid_extensions() -> list[str]:
+    return [output_format.extension for output_format in get_output_formats()]
 
 
 def get_output_formats() -> list[type[OutputFormat]]:
diff --git a/grawlix/output/acsm.py b/grawlix/output/acsm.py
index ca5230f..0d12e46 100644
--- a/grawlix/output/acsm.py
+++ b/grawlix/output/acsm.py
@@ -1,14 +1,15 @@
-from grawlix.book import Book
+from grawlix.book import Book, SingleFile
 from .output_format import OutputFormat, Update
 import shutil
 import subprocess
 
 class Acsm(OutputFormat):
     extension = "acsm"
+    input_types = [SingleFile]
 
-    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
+    async def download(self, book: Book, location: str, update_func: Update) -> None:
         # Download and write acsm file to disk
-        await super().dl_single_file(book, location, update_func)
+        await self._download_single_file(book, location, update_func)
         # TODO: Implement more general solution
         # Decrypt if knock is available
         # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
diff --git a/grawlix/output/cbz.py b/grawlix/output/cbz.py
index 54ac63a..b374c5a 100644
--- a/grawlix/output/cbz.py
+++ b/grawlix/output/cbz.py
@@ -9,18 +9,21 @@ class Cbz(OutputFormat):
     """Comic book zip file"""
 
     extension: str = "cbz"
+    input_types = [ImageList]
 
-    async def dl_image_list(self, book: Book, location: str, update: Update) -> None:
+    async def download(self, book: Book, location: str, update: Update) -> None:
         if not isinstance(book.data, ImageList):
             raise UnsupportedOutputFormat
+        semaphore = asyncio.Semaphore(10)
         images = book.data.images
         image_count = len(images)
         with ZipFile(location, mode="w") as zip:
             async def download_page(index: int, file: OnlineFile):
-                content = await self._download_file(file)
-                zip.writestr(f"Image {index}.{file.extension}", content)
-                if update:
-                    update(1/image_count)
+                async with semaphore:
+                    content = await self._download_file(file)
+                    zip.writestr(f"Image {index}.{file.extension}", content)
+                    if update:
+                        update(1/image_count)
             tasks = [
                 asyncio.create_task(download_page(index, file))
                 for index, file in enumerate(images)
diff --git a/grawlix/output/epub.py b/grawlix/output/epub.py
index 39bc683..7a5d4fa 100644
--- a/grawlix/output/epub.py
+++ b/grawlix/output/epub.py
@@ -1,4 +1,4 @@
-from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book
+from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata
 from grawlix.exceptions import UnsupportedOutputFormat
 from .output_format import OutputFormat, Update
 
@@ -9,14 +9,20 @@ from ebooklib import epub
 
 class Epub(OutputFormat):
     extension = "epub"
+    input_types = [SingleFile, HtmlFiles]
 
-    async def dl_html_files(self, book: Book, location: str, update: Update) -> None:
-        if not isinstance(book.data, HtmlFiles):
+    async def download(self, book: Book, location: str, update: Update) -> None:
+        if isinstance(book.data, SingleFile):
+            await self._download_single_file(book, location, update)
+        elif isinstance(book.data, HtmlFiles):
+            await self._download_html_files(book.data, book.metadata, location, update)
+        else:
             raise UnsupportedOutputFormat
-        html = book.data
+
+    async def _download_html_files(self, html: HtmlFiles, metadata: Metadata, location: str, update: Update) -> None:
         output = epub.EpubBook()
-        output.set_title(book.metadata.title)
-        for author in book.metadata.authors:
+        output.set_title(metadata.title)
+        for author in metadata.authors:
             output.add_author(author)
         file_count = len(html.htmlfiles) + 1 # Html files + cover
 
diff --git a/grawlix/output/output_format.py b/grawlix/output/output_format.py
index f9a60ea..1ed91e9 100644
--- a/grawlix/output/output_format.py
+++ b/grawlix/output/output_format.py
@@ -1,4 +1,4 @@
-from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile
+from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile, BookData
 from grawlix.exceptions import UnsupportedOutputFormat
 from grawlix.encryption import decrypt
 
@@ -10,6 +10,7 @@ Update = Optional[Callable[[float], None]]
 class OutputFormat:
     # Extension for output files
     extension: str
+    input_types: list[type[BookData]]
 
     def __init__(self) -> None:
         self._client = httpx.AsyncClient()
@@ -20,7 +21,18 @@ class OutputFormat:
         await self._client.aclose()
 
 
-    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
+    async def download(self, book: Book, location: str, update_func: Update) -> None:
+        """
+        Download book
+
+        :param book: Book to download
+        :param location: Path to where the file is written
+        :param update_func: Function to update progress bar
+        """
+        raise UnsupportedOutputFormat
+
+
+    async def _download_single_file(self, book: Book, location: str, update_func: Update) -> None:
         """
         Download and write an `grawlix.SingleFile` to disk
 
@@ -38,28 +50,6 @@ class OutputFormat:
             self._write_offline_file(book.data.file, location)
 
 
-    async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
-        """
-        Download and write an `grawlix.ImageList` to disk
-
-        :param book: Book to download
-        :param location: Path to where the file is written
-        :raises UnsupportedOutputFormat: If datatype is not supported by format
-        """
-        raise UnsupportedOutputFormat
-
-
-    async def dl_html_files(self, book: Book, location: str, update_func: Update) -> None:
-        """
-        Download and write a `grawlix.HtmlFiles` to disk
-
-        :param book: Book to download
-        :param location: Path to where the file is written
-        :raises UnsupportedOutputFormat: If datatype is not supported by format
-        """
-        raise UnsupportedOutputFormat
-
-
     async def _download_file(self, file: OnlineFile, update: Update = None) -> bytes:
         """
         Download `grawlix.OnlineFile`