From fc136038734609f6690e8646407f74b8a5a45731 Mon Sep 17 00:00:00 2001 From: Joakim Holm Date: Thu, 4 May 2023 21:30:50 +0200 Subject: [PATCH 1/3] Add basic Internet Archive support --- README.md | 1 + grawlix/__main__.py | 2 +- grawlix/book.py | 11 ++- grawlix/output/__init__.py | 2 + grawlix/output/acsm.py | 22 ++++++ grawlix/output/output_format.py | 23 +++++- grawlix/sources/__init__.py | 2 + grawlix/sources/ereolen.py | 2 +- grawlix/sources/internet_archive.py | 111 ++++++++++++++++++++++++++++ 9 files changed, 170 insertions(+), 6 deletions(-) create mode 100644 grawlix/output/acsm.py create mode 100644 grawlix/sources/internet_archive.py diff --git a/README.md b/README.md index 36bc222..dbbd7b8 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ CLI ebook downloader grawlix currently supports downloading from the following sources: - [eReolen](https://ereolen.dk) - [Flipp](https://flipp.dk) +- [Internet Archive](https://archive.org) - [Manga Plus](https://mangaplus.shueisha.co.jp) - [Royal Road](https://www.royalroad.com) - [Saxo](https://saxo.com) diff --git a/grawlix/__main__.py b/grawlix/__main__.py index be376ea..d05a947 100644 --- a/grawlix/__main__.py +++ b/grawlix/__main__.py @@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option :param options: Command line options :returns: Login credentials """ - source_name = source.name.lower() + source_name = source.name.lower().replace(" ", "") if source_name in config.sources: username = config.sources[source_name].username or options.username password = config.sources[source_name].password or options.password diff --git a/grawlix/book.py b/grawlix/book.py index c1ec9d8..ae49564 100644 --- a/grawlix/book.py +++ b/grawlix/book.py @@ -31,11 +31,20 @@ class OnlineFile: encryption: Optional[Encryption] = None headers: Optional[dict[str, str]] = None +@dataclass(slots=True) +class OfflineFile: + """Stores content of a file""" + content: bytes + extension: str + encryption: Optional[Encryption] = None + +File = Union[OnlineFile, OfflineFile] + @dataclass(slots=True) class SingleFile: """Bookdata in the form of a single file""" - file: OnlineFile + file: File @dataclass(slots=True) diff --git a/grawlix/output/__init__.py b/grawlix/output/__init__.py index 33f5c2c..fcca4b9 100644 --- a/grawlix/output/__init__.py +++ b/grawlix/output/__init__.py @@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError from grawlix.logging import info from .output_format import OutputFormat +from .acsm import Acsm from .cbz import Cbz from .epub import Epub @@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]: :returns: List of available output format classes """ return [ + Acsm, Cbz, Epub, ] diff --git a/grawlix/output/acsm.py b/grawlix/output/acsm.py new file mode 100644 index 0000000..ca5230f --- /dev/null +++ b/grawlix/output/acsm.py @@ -0,0 +1,22 @@ +from grawlix.book import Book +from .output_format import OutputFormat, Update +import shutil +import subprocess + +class Acsm(OutputFormat): + extension = "acsm" + + async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None: + # Download and write acsm file to disk + await super().dl_single_file(book, location, update_func) + # TODO: Implement more general solution + # Decrypt if knock is available + # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock + if shutil.which("knock") is not None: + subprocess.run( + ["knock", location], + capture_output = True + ) + else: + # TODO: Print warning + pass diff --git a/grawlix/output/output_format.py b/grawlix/output/output_format.py index 83406e6..f9a60ea 100644 --- a/grawlix/output/output_format.py +++ b/grawlix/output/output_format.py @@ -1,4 +1,4 @@ -from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book +from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile from grawlix.exceptions import UnsupportedOutputFormat from grawlix.encryption import decrypt @@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]] class OutputFormat: # Extension for output files - extension: str = "" + extension: str def __init__(self) -> None: self._client = httpx.AsyncClient() @@ -32,7 +32,10 @@ class OutputFormat: raise UnsupportedOutputFormat if not book.data.file.extension == self.extension: raise UnsupportedOutputFormat - await self._download_and_write_file(book.data.file, location, update_func) + if isinstance(book.data.file, OnlineFile): + await self._download_and_write_file(book.data.file, location, update_func) + elif isinstance(book.data.file, OfflineFile): + self._write_offline_file(book.data.file, location) async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None: @@ -88,3 +91,17 @@ class OutputFormat: content = await self._download_file(file, update) with open(location, "wb") as f: f.write(content) + + + def _write_offline_file(self, file: OfflineFile, location: str) -> None: + """ + Write the content of an `OfflineFile` to disk + + :param file: File to write to disk + :param location: Path to where the file is written + """ + with open(location, "wb") as f: + content = file.content + if file.encryption: + content = decrypt(content, file.encryption) + f.write(content) diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py index cd87be7..7e1fbf4 100644 --- a/grawlix/sources/__init__.py +++ b/grawlix/sources/__init__.py @@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound from .source import Source from .ereolen import Ereolen from .flipp import Flipp +from .internet_archive import InternetArchive from .mangaplus import MangaPlus from .royal_road import RoyalRoad from .saxo import Saxo @@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]: return [ Ereolen, Flipp, + InternetArchive, MangaPlus, RoyalRoad, Saxo, diff --git a/grawlix/sources/ereolen.py b/grawlix/sources/ereolen.py index 8360260..3047e17 100644 --- a/grawlix/sources/ereolen.py +++ b/grawlix/sources/ereolen.py @@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile from grawlix.encryption import AESCTREncryption from grawlix.exceptions import InvalidUrl, DataNotFound from grawlix.utils import nearest_string - from .source import Source + from bs4 import BeautifulSoup import json import re diff --git a/grawlix/sources/internet_archive.py b/grawlix/sources/internet_archive.py new file mode 100644 index 0000000..d158723 --- /dev/null +++ b/grawlix/sources/internet_archive.py @@ -0,0 +1,111 @@ +from grawlix.book import Book, SingleFile, Metadata, OfflineFile +from .source import Source + +import random +import string +from bs4 import BeautifulSoup +import asyncio + +class InternetArchive(Source): + name: str = "Internet Archive" + match: list[str] = [ + r"https://archive.org/details/.+" + ] + _authentication_methods = [ "login", "cookies" ] + _login_credentials = [ "username", "password" ] + + @staticmethod + def _format_data(content_type: str, fields): + data = "" + for name, value in fields.items(): + data += f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a" + data += content_type+"--" + return data + + + async def login(self, username: str, password: str, **kwargs) -> None: + await self._client.get("https://archive.org/account/login") + content_type = "----WebKitFormBoundary"+"".join(random.sample(string.ascii_letters + string.digits, 16)) + headers = {'Content-Type': 'multipart/form-data; boundary='+content_type} + data = self._format_data(content_type, {"username":username, "password":password, "submit_by_js":"true"}) + response = await self._client.post( + "https://archive.org/account/login", + data=data, + headers=headers + ) + if not "Successful login" in response.text: + print("Failed login") + exit(1) + + + async def _download_acsm(self, book_id: str) -> bytes: + """ + Loan book on archive.org and download acsm file + + :param book_id: Id of book + """ + await self._client.post( + "https://archive.org/services/loans/loan/searchInside.php", + data = { + "action": "grant_access", + "identifier": book_id + } + ) + await self._client.post( + "https://archive.org/services/loans/loan/", + data = { + "action": "browse_book", + "identifier": book_id + } + ) + # TODO: Error handling + await self._client.post( + "https://archive.org/services/loans/loan/", + data = { + "action": "create_token", + "identifier": book_id + } + ) + acsm_response = await self._client.get( + f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1", + follow_redirects = True + ) + return acsm_response.content + + + async def download(self, url: str) -> Book: + book_id = url.split("/")[4] + metadata, acsm_file = await asyncio.gather( + self._download_metadata(book_id), + self._download_acsm(book_id) + ) + return Book( + data = SingleFile( + OfflineFile( + content = acsm_file, + extension = "acsm", + ) + ), + metadata = Metadata( + title = metadata["title"], + authors = [ metadata.get("creator") ] if "creator" in metadata else [] + ) + ) + + + async def _download_metadata(self, book_id: str) -> dict: + """ + Download metadata for book + + :param book_id: Id of book + :returns: Dictionary with metadata + """ + page_response = await self._client.get( + f"https://archive.org/details/{book_id}" + ) + soup = BeautifulSoup(page_response.text, "lxml") + metadata_url = soup.find("ia-book-theater").get("bookmanifesturl") + metadata_response = await self._client.get( + f"https:{metadata_url}" + ) + return metadata_response.json()["data"]["metadata"] From c2545b871fc5ee2ae784773daa5d2bf2023e7ddd Mon Sep 17 00:00:00 2001 From: Joakim Holm Date: Fri, 5 May 2023 12:04:59 +0200 Subject: [PATCH 2/3] Small changes - Fix config to load source configuration correctly - Add a newline in readme to be more readable --- README.md | 1 + grawlix/config.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dbbd7b8..a601e3a 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ ![GitHub top language](https://img.shields.io/github/languages/top/jo1gi/grawlix) ![License](https://img.shields.io/github/license/jo1gi/grawlix) [![Donate using Ko-Fi](https://img.shields.io/badge/donate-kofi-00b9fe?logo=ko-fi&logoColor=00b9fe)](https://ko-fi.com/jo1gi) + CLI ebook downloader ## Supported services diff --git a/grawlix/config.py b/grawlix/config.py index 4599fea..ce73133 100644 --- a/grawlix/config.py +++ b/grawlix/config.py @@ -33,7 +33,7 @@ def load_config() -> Config: else: config_dict = {} sources = {} - if "source" in config_dict: + if "sources" in config_dict: for key, values in config_dict["sources"].items(): sources[key] = SourceConfig ( username = values.get("username"), From f91a32b0abf8dbd8e63632b4aaefc98a9442a449 Mon Sep 17 00:00:00 2001 From: Joakim Holm Date: Fri, 5 May 2023 12:05:52 +0200 Subject: [PATCH 3/3] Restructure output system Formats can now be loaded based on book data format and file extension of the output file. Will also try to use the extension of the output location instead of using the default filetype every time. --- grawlix/output/__init__.py | 53 +++++++++++++++++++-------------- grawlix/output/acsm.py | 7 +++-- grawlix/output/cbz.py | 13 ++++---- grawlix/output/epub.py | 18 +++++++---- grawlix/output/output_format.py | 38 +++++++++-------------- 5 files changed, 68 insertions(+), 61 deletions(-) diff --git a/grawlix/output/__init__.py b/grawlix/output/__init__.py index fcca4b9..35eb1d2 100644 --- a/grawlix/output/__init__.py +++ b/grawlix/output/__init__.py @@ -1,5 +1,5 @@ from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles -from grawlix.exceptions import GrawlixError +from grawlix.exceptions import GrawlixError, UnsupportedOutputFormat from grawlix.logging import info from .output_format import OutputFormat @@ -17,7 +17,12 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non :param book: Book to download """ - output_format = get_default_format(book.data) + _, ext = os.path.splitext(template) + ext = ext[1:] + if ext in get_valid_extensions(): + output_format = find_output_format(book, ext)() + else: + output_format = get_default_format(book) location = format_output_location(book, output_format, template) if not book.overwrite and os.path.exists(location): info("Skipping - File already exists") @@ -25,14 +30,7 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non parent = Path(location).parent if not parent.exists(): os.makedirs(parent) - if isinstance(book.data, SingleFile): - await output_format.dl_single_file(book, location, update_func) - elif isinstance(book.data, ImageList): - await output_format.dl_image_list(book, location, update_func) - elif isinstance(book.data, HtmlFiles): - await output_format.dl_html_files(book, location, update_func) - else: - raise NotImplementedError + await output_format.download(book, location, update_func) await output_format.close() @@ -49,34 +47,43 @@ def format_output_location(book: Book, output_format: OutputFormat, template: st return template.format(**values, ext = output_format.extension) -def get_default_format(bookdata: BookData) -> OutputFormat: +def get_default_format(book: Book) -> OutputFormat: """ Get default output format for bookdata. Should only be used if no format was specified by the user - :param bookdata: Content of book + :param book: Content of book :returns: OutputFormat object matching the default """ + bookdata = book.data if isinstance(bookdata, SingleFile): - return output_format_from_str(bookdata.file.extension) + extension = bookdata.file.extension if isinstance(bookdata, ImageList): - return Cbz() + extension = "cbz" if isinstance(bookdata, HtmlFiles): - return Epub() - raise GrawlixError + extension = "epub" + output_format = find_output_format(book, extension) + return output_format() -def output_format_from_str(name: str) -> OutputFormat: +def find_output_format(book: Book, extension: str) -> type[OutputFormat]: """ - Convert string to outputformat object + Find a compatible output format - :param name: Name of output format - :returns: OutputFormat object + :param book: Book to download + :param extension: Extension of output file + :returns: Compatible OutputFormat type + :raises: UnsupportedOutputFormat if nothing is found """ for output_format in get_output_formats(): - if output_format.extension == name: - return output_format() - raise GrawlixError + matches_extension = output_format.extension == extension + supports_bookdata = type(book.data) in output_format.input_types + if matches_extension and supports_bookdata: + return output_format + raise UnsupportedOutputFormat + +def get_valid_extensions() -> list[str]: + return [output_format.extension for output_format in get_output_formats()] def get_output_formats() -> list[type[OutputFormat]]: diff --git a/grawlix/output/acsm.py b/grawlix/output/acsm.py index ca5230f..0d12e46 100644 --- a/grawlix/output/acsm.py +++ b/grawlix/output/acsm.py @@ -1,14 +1,15 @@ -from grawlix.book import Book +from grawlix.book import Book, SingleFile from .output_format import OutputFormat, Update import shutil import subprocess class Acsm(OutputFormat): extension = "acsm" + input_types = [SingleFile] - async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None: + async def download(self, book: Book, location: str, update_func: Update) -> None: # Download and write acsm file to disk - await super().dl_single_file(book, location, update_func) + await self._download_single_file(book, location, update_func) # TODO: Implement more general solution # Decrypt if knock is available # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock diff --git a/grawlix/output/cbz.py b/grawlix/output/cbz.py index 54ac63a..b374c5a 100644 --- a/grawlix/output/cbz.py +++ b/grawlix/output/cbz.py @@ -9,18 +9,21 @@ class Cbz(OutputFormat): """Comic book zip file""" extension: str = "cbz" + input_types = [ImageList] - async def dl_image_list(self, book: Book, location: str, update: Update) -> None: + async def download(self, book: Book, location: str, update: Update) -> None: if not isinstance(book.data, ImageList): raise UnsupportedOutputFormat + semaphore = asyncio.Semaphore(10) images = book.data.images image_count = len(images) with ZipFile(location, mode="w") as zip: async def download_page(index: int, file: OnlineFile): - content = await self._download_file(file) - zip.writestr(f"Image {index}.{file.extension}", content) - if update: - update(1/image_count) + async with semaphore: + content = await self._download_file(file) + zip.writestr(f"Image {index}.{file.extension}", content) + if update: + update(1/image_count) tasks = [ asyncio.create_task(download_page(index, file)) for index, file in enumerate(images) diff --git a/grawlix/output/epub.py b/grawlix/output/epub.py index 39bc683..7a5d4fa 100644 --- a/grawlix/output/epub.py +++ b/grawlix/output/epub.py @@ -1,4 +1,4 @@ -from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book +from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata from grawlix.exceptions import UnsupportedOutputFormat from .output_format import OutputFormat, Update @@ -9,14 +9,20 @@ from ebooklib import epub class Epub(OutputFormat): extension = "epub" + input_types = [SingleFile, HtmlFiles] - async def dl_html_files(self, book: Book, location: str, update: Update) -> None: - if not isinstance(book.data, HtmlFiles): + async def download(self, book: Book, location: str, update: Update) -> None: + if isinstance(book.data, SingleFile): + await self._download_single_file(book, location, update) + elif isinstance(book.data, HtmlFiles): + await self._download_html_files(book.data, book.metadata, location, update) + else: raise UnsupportedOutputFormat - html = book.data + + async def _download_html_files(self, html: HtmlFiles, metadata: Metadata, location: str, update: Update) -> None: output = epub.EpubBook() - output.set_title(book.metadata.title) - for author in book.metadata.authors: + output.set_title(metadata.title) + for author in metadata.authors: output.add_author(author) file_count = len(html.htmlfiles) + 1 # Html files + cover diff --git a/grawlix/output/output_format.py b/grawlix/output/output_format.py index f9a60ea..1ed91e9 100644 --- a/grawlix/output/output_format.py +++ b/grawlix/output/output_format.py @@ -1,4 +1,4 @@ -from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile +from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile, BookData from grawlix.exceptions import UnsupportedOutputFormat from grawlix.encryption import decrypt @@ -10,6 +10,7 @@ Update = Optional[Callable[[float], None]] class OutputFormat: # Extension for output files extension: str + input_types: list[type[BookData]] def __init__(self) -> None: self._client = httpx.AsyncClient() @@ -20,7 +21,18 @@ class OutputFormat: await self._client.aclose() - async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None: + async def download(self, book: Book, location: str, update_func: Update) -> None: + """ + Download book + + :param book: Book to download + :param location: Path to where the file is written + :param update_func: Function to update progress bar + """ + raise UnsupportedOutputFormat + + + async def _download_single_file(self, book: Book, location: str, update_func: Update) -> None: """ Download and write an `grawlix.SingleFile` to disk @@ -38,28 +50,6 @@ class OutputFormat: self._write_offline_file(book.data.file, location) - async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None: - """ - Download and write an `grawlix.ImageList` to disk - - :param book: Book to download - :param location: Path to where the file is written - :raises UnsupportedOutputFormat: If datatype is not supported by format - """ - raise UnsupportedOutputFormat - - - async def dl_html_files(self, book: Book, location: str, update_func: Update) -> None: - """ - Download and write a `grawlix.HtmlFiles` to disk - - :param book: Book to download - :param location: Path to where the file is written - :raises UnsupportedOutputFormat: If datatype is not supported by format - """ - raise UnsupportedOutputFormat - - async def _download_file(self, file: OnlineFile, update: Update = None) -> bytes: """ Download `grawlix.OnlineFile`