From 8d2fe3d7d55bfa8a3064ec40433dda5ce97f20d7 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Wed, 19 Apr 2023 22:53:33 +0200
Subject: [PATCH 01/12] Add authentication with Netscape cookie file

---
 grawlix/__main__.py       | 21 ++++++++++++++++++++-
 grawlix/arguments.py      |  6 ++++++
 grawlix/sources/source.py | 36 +++++++++++++++++++++++++++++++++++-
 3 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/grawlix/__main__.py b/grawlix/__main__.py
index 6052ad1..55b33ac 100644
--- a/grawlix/__main__.py
+++ b/grawlix/__main__.py
@@ -5,9 +5,10 @@
 from .sources import load_source, Source
 from .output import download_book
 from . import arguments, logging
-from typing import Tuple
+from typing import Tuple, Optional
 from rich.progress import Progress
 from functools import partial
+import os


 def get_login(source: Source, config: Config, options) -> Tuple[str, str]:
@@ -45,6 +46,20 @@
     return urls


+def get_cookie_file(options) -> Optional[str]:
+    """
+    Get path to cookie file
+
+    :param options: CLI arguments
+    :returns: Path to cookie file
+    """
+    if options.cookie_file is not None and os.path.exists(options.cookie_file):
+        return options.cookie_file
+    if os.path.exists("./cookies.txt"):
+        return "./cookies.txt"
+    return None
+
+
 def authenticate(source: Source, config: Config, options):
     """
     Authenticate with source
@@ -58,6 +73,10 @@
         username, password = get_login(source, config, options)
         source.login(username, password)
         source.authenticated = True
+    if source.supports_cookies:
+        cookie_file = get_cookie_file(options)
+        if cookie_file:
+            source.load_cookies(cookie_file)
     else:
         raise SourceNotAuthenticated

diff --git a/grawlix/arguments.py b/grawlix/arguments.py
index f4cdbf7..2ec087a 100644
--- a/grawlix/arguments.py
+++ b/grawlix/arguments.py
@@ -39,6 +39,12 @@
         help = "Password for login",
         dest = "password",
     )
+    parser.add_argument(
+        '-c',
+        '--cookies',
+        help = "Path to Netscape cookie file",
+        dest = "cookie_file"
+    )
     # Outputs
     parser.add_argument(
         '-o',

diff --git a/grawlix/sources/source.py b/grawlix/sources/source.py
index ac32744..1ddd9bc 100644
--- a/grawlix/sources/source.py
+++ b/grawlix/sources/source.py
@@ -1,7 +1,9 @@
 from grawlix.book import Book, Series, Result

-from typing import Generic, TypeVar, Tuple
+from typing import Generic, TypeVar, Tuple, Optional
+from http.cookiejar import MozillaCookieJar
 import requests
+import re

 T = TypeVar("T")

@@ -41,6 +43,25 @@
         raise NotImplementedError


+    @property
+    def supports_cookies(self) -> bool:
+        """Does the source support authentication with a cookie file"""
+        return "cookies" in self._authentication_methods
+
+
+    def load_cookies(self, cookie_file: str):
+        """
+        Authenticate with source using a Netscape cookie file
+
+        :param cookie_file: Path to Netscape cookie file
+        """
+        if self.supports_cookies:
+            cookie_jar = MozillaCookieJar()
+            cookie_jar.load(cookie_file, ignore_expires=True)
+            self._session.cookies.update(cookie_jar)
+            self.authenticated = True
+
+
     def download(self, url: str) -> Result[T]:
         """
         Download book metadata from source
@@ -59,3 +80,16 @@
         :returns: Downloaded book metadata
         """
         raise NotImplementedError
+
+
+    def get_match_index(self, url: str) -> Optional[int]:
+        """
+        Find the first regex in `self.match` that matches url
+
+        :param url: Url to match
+        :returns: Index of regex
+        """
+        for index, match in enumerate(self.match):
+            if re.match(match, url):
+                return index
+        return None
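For orientation, a minimal sketch of how this cookie support is meant to be driven — the source class, URL pattern, and file name below are hypothetical, and `Source.__init__` is assumed to set up `self._session` as elsewhere in the codebase:

```python
from grawlix.sources.source import Source

class ExampleSource(Source):
    name = "Example"
    match = [r"https://example\.com/book/\d+"]
    _authentication_methods = ["cookies"]

source = ExampleSource()
if source.supports_cookies:             # True, since "cookies" is listed above
    source.load_cookies("cookies.txt")  # fills the session with the Netscape cookies
assert source.authenticated
```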
From ada2005c2ef4d892ce1e2652058e449531162000 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Sun, 14 May 2023 17:35:25 +0200
Subject: [PATCH 02/12] Add support for Marvel Unlimited

---
 .gitignore                  |  1 +
 README.md                   |  1 +
 grawlix/sources/__init__.py |  2 ++
 grawlix/sources/marvel.py   | 32 ++++++++++++++++++++++++++++----
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index c5aed4a..b379466 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ MANIFEST
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
+cookies.txt

diff --git a/README.md b/README.md
index 36bc222..638229e 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ grawlix currently supports downloading from the following sources:
 - [eReolen](https://ereolen.dk)
 - [Flipp](https://flipp.dk)
 - [Manga Plus](https://mangaplus.shueisha.co.jp)
+- [Marvel Unlimited](https://marvel.com)
 - [Royal Road](https://www.royalroad.com)
 - [Saxo](https://saxo.com)
 - [Webtoons](https://webtoons.com)

diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py
index cd87be7..4fa4e46 100644
--- a/grawlix/sources/__init__.py
+++ b/grawlix/sources/__init__.py
@@ -4,6 +4,7 @@ from .source import Source
 from .ereolen import Ereolen
 from .flipp import Flipp
 from .mangaplus import MangaPlus
+from .marvel import Marvel
 from .royal_road import RoyalRoad
 from .saxo import Saxo
 from .webtoons import Webtoons
@@ -54,6 +55,7 @@
        Ereolen,
        Flipp,
        MangaPlus,
+       Marvel,
        RoyalRoad,
        Saxo,
        Webtoons

diff --git a/grawlix/sources/marvel.py b/grawlix/sources/marvel.py
index a2d8779..d42910f 100644
--- a/grawlix/sources/marvel.py
+++ b/grawlix/sources/marvel.py
@@ -1,14 +1,18 @@
 from grawlix.book import Book, Metadata, ImageList, OnlineFile, Series, Result
-from grawlix.exceptions import InvalidUrl
+from grawlix.exceptions import InvalidUrl, DataNotFound
 from .source import Source

+import re
+
 # Personal marvel API key
 API_KEY = "83ac0da31d3f6801f2c73c7e07ad76e8"

 class Marvel(Source[str]):
     name: str = "Marvel"
     match = [
+        r"https://www.marvel.com/comics/issue/\d+/.+",
+        r"https://read.marvel.com/#/book/\d+",
         r"https://www.marvel.com/comics/series/\d+/.+"
     ]
     _authentication_methods: list[str] = [ "cookies" ]
@@ -17,6 +21,12 @@
     async def download(self, url: str) -> Result[str]:
         match_index = self.get_match_index(url)
         if match_index == 0:
+            issue_id = await self._get_issue_id(url)
+            return await self.download_book_from_id(issue_id)
+        if match_index == 1:
+            issue_id = url.split("/")[-1]
+            return await self.download_book_from_id(issue_id)
+        if match_index == 2:
             return await self._download_series(url)
         raise InvalidUrl
@@ -44,10 +54,10 @@
         :param series_id: Id of comic series on marvel.com
         :returns: List of comic ids for marvel comics
         """
-        response = self._client.get(
+        response = await self._client.get(
             f"https://api.marvel.com/browse/comics?byType=comic_series&isDigital=1&limit=10000&byId={series_id}",
-        ).json()
-        issue_ids = [issue["digital_id"] for issue in response["data"]["results"]]
+        )
+        issue_ids = [issue["digital_id"] for issue in response.json()["data"]["results"]]
         return issue_ids
@@ -66,6 +76,20 @@
         )
         return response.json()

+    async def _get_issue_id(self, url: str) -> str:
+        """
+        Download issue id from url
+
+        :param url: Url to issue info page
+        :return: Issue id
+        """
+        response = await self._client.get(url)
+        search = re.search(r"digital_comic_id: \"(\d+)\"", response.text)
+        if not search:
+            raise DataNotFound
+        return search.group(1)
+
+
     async def download_book_from_id(self, issue_id: str) -> Book:
         return Book(
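A quick illustration of how `get_match_index` routes the three URL forms added above (the ids and slugs are invented):

```python
import re

patterns = [
    r"https://www.marvel.com/comics/issue/\d+/.+",   # 0: issue info page, digital id is scraped
    r"https://read.marvel.com/#/book/\d+",           # 1: reader url, id is the last path segment
    r"https://www.marvel.com/comics/series/\d+/.+",  # 2: whole series
]

def match_index(url: str):
    # same loop as Source.get_match_index
    for index, pattern in enumerate(patterns):
        if re.match(pattern, url):
            return index

assert match_index("https://www.marvel.com/comics/issue/97537/example-issue") == 0
assert match_index("https://read.marvel.com/#/book/97537") == 1
assert match_index("https://www.marvel.com/comics/series/27392/example-series") == 2
```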
From d46a4fe88b1fd5739b3ac3913c6b914115676e82 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Mon, 22 May 2023 11:02:05 +0200
Subject: [PATCH 03/12] Fix authentication when not supporting cookies

---
 grawlix/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grawlix/__main__.py b/grawlix/__main__.py
index 4669b97..8a81971 100644
--- a/grawlix/__main__.py
+++ b/grawlix/__main__.py
@@ -76,7 +76,7 @@ async def authenticate(source: Source, config: Config, options):
         username, password, library = get_login(source, config, options)
         await source.login(username, password, library=library)
         source.authenticated = True
-    if source.supports_cookies:
+    elif source.supports_cookies:
         cookie_file = get_cookie_file(options)
         if cookie_file:
             source.load_cookies(cookie_file)

From f90cb13bfcfffec6ea0af004a60818cfd90eddb5 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Wed, 24 May 2023 20:52:11 +0200
Subject: [PATCH 04/12] Update README with authentication methods

---
 README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/README.md b/README.md
index eea5e1f..52d5843 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,10 @@ python3 setup.py install
 ```

 ## Authentication
+Authentication can either be done with login (username and password) or with cookies.
+Not all sources support both methods.
+
+### Login
 Some sources require authentication, which can be done either with cli arguments
 or a config file.
@@ -39,6 +43,15 @@ grawlix --username "user@example.com" --password "SuperSecretPassword"
 username = "user@example.com"
 password = "SuperSecretPassword"
 ```
+The config file should be placed at `~/.config/grawlix/grawlix.toml`.
+
+### Cookies
+Some sources can be authenticated with Netscape cookie files. I use
+[this extension](https://github.com/rotemdan/ExportCookies) to export my
+cookies from my browser.
+
+Cookies can be placed in the current directory as `cookies.txt` or be passed
+with the `--cookies` argument.
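+
+For example, to download with an exported cookie file (the URL is illustrative):
+```shell
+grawlix --cookies cookies.txt "https://www.marvel.com/comics/issue/97537/example-issue"
+```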
 ## Download books
 To download a book run:

From 2faa4b747b531a8f2969fb7080431a4a057ceb44 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Wed, 31 May 2023 19:41:47 +0200
Subject: [PATCH 05/12] Make sure error happens if no cookie file is found

---
 grawlix/__main__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/grawlix/__main__.py b/grawlix/__main__.py
index 687e399..c30ee35 100644
--- a/grawlix/__main__.py
+++ b/grawlix/__main__.py
@@ -80,6 +80,8 @@
             cookie_file = get_cookie_file(options)
             if cookie_file:
                 source.load_cookies(cookie_file)
+            else:
+                raise SourceNotAuthenticated
     else:
         raise SourceNotAuthenticated
@@ -108,6 +110,7 @@
             logging.info("")
     except GrawlixError as error:
         error.print_error()
+        exit(1)


 async def download_with_progress(book: Book, progress: Progress, template: str):

From dade9db6dafb3c842bbfef820d5bb167c1a42e43 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Thu, 1 Jun 2023 22:32:04 +0200
Subject: [PATCH 06/12] Make OnlineFile support cookies

---
 grawlix/book.py                 |  5 +++--
 grawlix/output/epub.py          | 15 +++++++++++----
 grawlix/output/output_format.py |  2 +-
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/grawlix/book.py b/grawlix/book.py
index ae49564..0cdf054 100644
--- a/grawlix/book.py
+++ b/grawlix/book.py
@@ -1,6 +1,6 @@
 from grawlix import Encryption
 from dataclasses import dataclass, field
-from typing import Optional, Union, TypeVar, Generic
+from typing import Optional, Union, TypeVar, Generic, Any

 @dataclass(slots=True)
 class Metadata:
@@ -30,6 +30,7 @@
     extension: str
     encryption: Optional[Encryption] = None
     headers: Optional[dict[str, str]] = None
+    cookies: Optional[Any] = None # TODO Change type

 @dataclass(slots=True)
 class OfflineFile:
@@ -63,8 +64,8 @@

 @dataclass(slots=True)
 class HtmlFiles:
-    cover: OnlineFile
     htmlfiles: list[HtmlFile]
+    cover: Optional[OnlineFile] = None

 BookData = Union[
     SingleFile,

diff --git a/grawlix/output/epub.py b/grawlix/output/epub.py
index 7a5d4fa..96d9c28 100644
--- a/grawlix/output/epub.py
+++ b/grawlix/output/epub.py
@@ -27,9 +27,9 @@
         file_count = len(html.htmlfiles) + 1 # Html files + cover

         async def download_cover(cover_file: OnlineFile):
-            cover_filename = f"cover.{html.cover.extension}"
+            cover_filename = f"cover.{cover_file.extension}"
             epub_cover = epub.EpubCover(file_name = cover_filename)
-            epub_cover.content = await self._download_file(html.cover)
+            epub_cover.content = await self._download_file(cover_file)
             output.add_item(epub_cover)
             epub_cover_page = epub.EpubCoverHtml(image_name = cover_filename)
             if update:
@@ -38,7 +38,12 @@

         async def download_file(index: int, file: HtmlFile):
-            response = await self._client.get(file.file.url, follow_redirects=True)
+            response = await self._client.get(
+                file.file.url,
+                headers = file.file.headers,
+                cookies = file.file.cookies,
+                follow_redirects=True
+            )
             soup = BeautifulSoup(response.text, "lxml")
             selected_element = soup.find(attrs=file.selector)
             epub_file = epub.EpubHtml(
@@ -55,7 +60,9 @@
             download_file(index, file)
             for index, file in enumerate(html.htmlfiles)
         ]
-        epub_files = await asyncio.gather(download_cover(html.cover), *tasks)
+        if html.cover:
+            tasks.append(download_cover(html.cover))
+        epub_files = await asyncio.gather(*tasks)

         # Add files to epub
         for epub_file in epub_files:

diff --git a/grawlix/output/output_format.py b/grawlix/output/output_format.py
index 1ed91e9..80f7c78 100644
--- a/grawlix/output/output_format.py
+++ b/grawlix/output/output_format.py
@@ -59,7 +59,7 @@
         :returns: Content of downloaded file
         """
         content = b""
-        async with self._client.stream("GET", file.url, headers = file.headers, follow_redirects=True) as request:
+        async with self._client.stream("GET", file.url, headers = file.headers, cookies = file.cookies, follow_redirects=True) as request:
             total_filesize = int(request.headers["Content-length"])
             async for chunk in request.aiter_bytes():
                 content += chunk

From be0aa9eec0ad19d96da842ed27062b3d3ba1eac9 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Fri, 2 Jun 2023 20:02:07 +0200
Subject: [PATCH 07/12] Add fanfiction.net source

---
 README.md                        |  1 +
 grawlix/sources/__init__.py      |  2 ++
 grawlix/sources/fanfictionnet.py | 55 ++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+)
 create mode 100644 grawlix/sources/fanfictionnet.py

diff --git a/README.md b/README.md
index 52d5843..153f98f 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ CLI ebook downloader
 ## Supported services
 grawlix currently supports downloading from the following sources:
 - [eReolen](https://ereolen.dk)
+- [fanfiction.net](https://www.fanfiction.net)
 - [Flipp](https://flipp.dk)
 - [Internet Archive](https://archive.org)
 - [Manga Plus](https://mangaplus.shueisha.co.jp)

diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py
index 7ebb447..54c4c21 100644
--- a/grawlix/sources/__init__.py
+++ b/grawlix/sources/__init__.py
@@ -2,6 +2,7 @@

 from .source import Source
 from .ereolen import Ereolen
+from .fanfictionnet import FanfictionNet
 from .flipp import Flipp
 from .internet_archive import InternetArchive
 from .mangaplus import MangaPlus
@@ -54,6 +55,7 @@
     return [
         Ereolen,
+        FanfictionNet,
         Flipp,
         InternetArchive,
         MangaPlus,

diff --git a/grawlix/sources/fanfictionnet.py b/grawlix/sources/fanfictionnet.py
new file mode 100644
index 0000000..32a8940
--- /dev/null
+++ b/grawlix/sources/fanfictionnet.py
@@ -0,0 +1,55 @@
+from .source import Source
+from grawlix.book import Book, HtmlFile, HtmlFiles, OnlineFile, Metadata
+
+from bs4 import BeautifulSoup
+
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; rv:113.0) Gecko/20100101 Firefox/113.0"
+
+class FanfictionNet(Source):
+    name: str = "fanfiction.net"
+    match = [
+        r"https://www.fanfiction.net/s/\d+/\d+.*"
+    ]
+    _authentication_methods: list[str] = [ "cookies" ]
+
+    async def download(self, url: str) -> Book:
+        book_id = self._extract_id(url)
+        response = await self._client.get(
+            f"https://www.fanfiction.net/s/{book_id}/1",
+            headers = {
+                "User-Agent": USER_AGENT
+            }
+        )
+        soup = BeautifulSoup(response.text, "lxml")
+        chapters = []
+        for index, chapter in enumerate(soup.find(id="chap_select").find_all("option")):
+            chapters.append(
+                HtmlFile(
+                    title = chapter.text,
+                    file = OnlineFile(
+                        url = f"https://www.fanfiction.net/s/{book_id}/{index+1}",
+                        extension = "html",
+                        headers = {
+                            "User-Agent": USER_AGENT,
+                        },
+                        cookies = self._client.cookies
+                    ),
+                    selector = { "id": "storytext" }
+                )
+            )
+        return Book(
+            data = HtmlFiles(htmlfiles = chapters),
+            metadata = Metadata(
+                title = soup.find("b", class_="xcontrast_txt").text,
+            )
+        )
+
+    @staticmethod
+    def _extract_id(url: str) -> str:
+        """
+        Extracts book id from url
+
+        :param url: Url of book
+        :returns: Id of book
+        """
+        return url.split("/")[4]
From f21c56873d1c7d6710bf4ba33e8fd82febf997c1 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Mon, 5 Jun 2023 23:42:22 +0200
Subject: [PATCH 08/12] Update README with new installation instructions

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 153f98f..40b6c1f 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,13 @@
 - [Webtoons](https://webtoons.com)

 ## Installation
+
+### From PyPI (recommended)
+```shell
+pip install grawlix
+```
+
+### From repo (unstable)
 ```shell
 git clone https://github.com/jo1gi/grawlix.git
 cd grawlix

From 143c2b4c61923f9ae057649212e9d3daab99372c Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Tue, 6 Jun 2023 23:47:47 +0200
Subject: [PATCH 09/12] Fix debug argument

It simply didn't work
---
 grawlix/arguments.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/grawlix/arguments.py b/grawlix/arguments.py
index 93ef23c..c4a52c9 100644
--- a/grawlix/arguments.py
+++ b/grawlix/arguments.py
@@ -61,6 +61,7 @@
     parser.add_argument(
         '--debug',
         help = "Enable debug messages",
-        dest = "debug"
+        dest = "debug",
+        action="store_true",
     )
     return parser.parse_args()

From 28c594d445e012199f97c4e211cb87db16c4ec8d Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Thu, 8 Jun 2023 03:11:32 +0200
Subject: [PATCH 10/12] Add support for Norwegian and Swedish Flipp

---
 grawlix/sources/flipp.py | 78 ++++++++++++++++++++++++++++------------
 1 file changed, 55 insertions(+), 23 deletions(-)

diff --git a/grawlix/sources/flipp.py b/grawlix/sources/flipp.py
index 2b6c9bf..9bd16a1 100644
--- a/grawlix/sources/flipp.py
+++ b/grawlix/sources/flipp.py
@@ -1,6 +1,7 @@
 from .source import Source
 from grawlix.book import Book, Metadata, ImageList, OnlineFile, Series, Result
 from grawlix.exceptions import InvalidUrl, DataNotFound
+from grawlix.logging import debug
 from grawlix.utils import get_arg_from_url
+from urllib.parse import urlparse  # needed by get_domain_extension below

 import re
 from typing import Tuple, Optional

 BASEURL = "https://reader.flipp.dk/html5/reader"

+LANGUAGE_CODE_MAPPING = {
+    "dk": "da-DK",
+    "no": "nb-NO",
+    "se": "sv-SE"
+}
+
 class Flipp(Source):
     name: str = "Flipp"
     match = [
-        r"https?://reader.flipp.dk/html5/reader/production/default.aspx\?pubname=&edid=([^/]+)",
-        r"https?://magasiner.flipp.dk/flipp/web-app/#/publications/.+"
+        r"https?://reader.flipp.(dk|no|se)/html5/reader/production/default.aspx\?pubname=&edid=([^/]+)",
+        r"https?://(magasiner|blader).flipp.(dk|no|se)/flipp/web-app/#/publications/.+"
     ]
     _authentication_methods: list[str] = []
-    _login_cache: Optional[dict] = None
+    _login_cache: dict = {}
+

     async def download(self, url: str) -> Result:
+        domain_extension = self.get_domain_extension(url)
         if re.match(self.match[0], url):
-            eid = self._get_eid(url)
-            publication_id = await self._get_series_id(eid)
-            return await self._download_book(eid, publication_id)
+            issue_id = self._extract_issue_id(url)
+            series_id = await self._get_series_id(issue_id)
+            debug(f"{series_id=}")
+            return await self._download_book(issue_id, series_id, domain_extension)
         elif re.match(self.match[1], url):
-            return await self._download_series(url)
+            return await self._download_series(url, domain_extension)
         raise InvalidUrl

-    async def download_book_from_id(self, book_id: Tuple[str, str]) -> Book:
-        series_id, issue_id = book_id
-        return await self._download_book(issue_id, series_id)
+    async def download_book_from_id(self, book_id: Tuple[str, str, str]) -> Book:
+        series_id, issue_id, language_code = book_id
+        return await self._download_book(issue_id, series_id, language_code)

-    async def _download_series(self, url: str) -> Series:
+    async def _download_series(self, url: str, language_code: str) -> Series:
         """
         Download series with book ids from Flipp
@@ -41,27 +52,27 @@
         :returns: Series object
         """
         series_id = url.split("/")[-1]
-        login_info = await self._download_login_info()
+        login_info = await self._download_login_info(language_code)
         series_metadata = self._extract_series_data(login_info, series_id)
         issues = []
         for issue in series_metadata["issues"]:
             issue_id = issue["customIssueCode"]
-            issues.append((series_id, issue_id))
+            issues.append((series_id, issue_id, language_code))
         return Series(
             title = series_metadata["name"],
             book_ids = issues
         )

-    async def _download_login_info(self) -> dict:
+    async def _download_login_info(self, language_code: str) -> dict:
         """
         Download login info from Flipp
         Will use cache if available

         :returns: Login info
         """
-        if self._login_cache:
-            return self._login_cache
+        if language_code in self._login_cache:
+            return self._login_cache[language_code]
         login_cache = await self._client.post(
             "https://flippapi.egmontservice.com/api/signin",
             headers = {
@@ -71,14 +82,14 @@
                 "email": "",
                 "password": "",
                 "token": "",
-                "languageCulture": "da-DK",
+                "languageCulture": LANGUAGE_CODE_MAPPING[language_code],
                 "appId": "",
                 "appVersion": "",
                 "uuid": "",
                 "os": ""
             }
         )
-        self._login_cache = login_cache.json()
+        self._login_cache[language_code] = login_cache.json()
         return login_cache.json()
@@ -96,7 +107,7 @@
         raise DataNotFound

-    async def _download_book(self, issue_id: str, series_id: str) -> Book:
+    async def _download_book(self, issue_id: str, series_id: str, language_code: str) -> Book:
         """
         Download book from Flipp
@@ -105,7 +116,7 @@
         :returns: Book metadata
         """
         pages = await self._get_pages(issue_id, series_id)
-        metadata = await self._get_metadata(issue_id, series_id)
+        metadata = await self._get_metadata(issue_id, series_id, language_code)
         return Book(
             data = ImageList(pages),
             metadata = Metadata(
@@ -116,7 +127,7 @@
         )

-    async def _get_metadata(self, issue_id: str, series_id: str) -> dict:
+    async def _get_metadata(self, issue_id: str, series_id: str, language_code: str) -> dict:
         """
         Download and extract issue data
@@ -124,7 +135,7 @@
         :param series_id: Series id
         :returns: Issue metadata
         """
-        login_info = await self._download_login_info()
+        login_info = await self._download_login_info(language_code)
         series_metadata = self._extract_series_data(login_info, series_id)
         for issue in series_metadata["issues"]:
             if issue["customIssueCode"] == issue_id:
                 return issue
         raise DataNotFound

-    def _get_eid(self, url: str) -> str:
+
+    @staticmethod
+    def get_domain_extension(url: str) -> str:
+        """
+        Extract domain extension from url
+
+        :param url: Url to parse
+        :returns: Domain extension of url
+        """
+        parsed_url = urlparse(url)
+        extension = parsed_url.netloc.split(".")[-1]
+        return extension
+
+
+    @staticmethod
+    def _extract_issue_id(url: str) -> str:
+        """
+        Extract issue id (edid) from url
+
+        :param url: Url to extract data from
+        :returns: Issue id in url
+        """
         return get_arg_from_url(url, "edid")
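To illustrate the new multi-domain handling, this is roughly what `get_domain_extension` plus the locale lookup in `_download_login_info` do (the reader URL is invented):

```python
from urllib.parse import urlparse

url = "https://reader.flipp.se/html5/reader/production/default.aspx?pubname=&edid=abc123"
extension = urlparse(url).netloc.split(".")[-1]
assert extension == "se"
# _download_login_info then maps the extension to the API's locale:
assert {"dk": "da-DK", "no": "nb-NO", "se": "sv-SE"}[extension] == "sv-SE"
```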
From 9e876a0cf670f9890860294b7bfc3a77b08b3a9c Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Mon, 12 Jun 2023 22:36:54 +0200
Subject: [PATCH 11/12] Add basic support for Nextory

---
 README.md                   |   1 +
 grawlix/book.py             |  11 ++
 grawlix/output/__init__.py  |   6 +-
 grawlix/output/epub.py      |  66 ++++++++++-
 grawlix/sources/__init__.py |   2 +
 grawlix/sources/nextory.py  | 227 ++++++++++++++++++++++++++++++++++++
 6 files changed, 308 insertions(+), 5 deletions(-)
 create mode 100644 grawlix/sources/nextory.py

diff --git a/README.md b/README.md
index a601e3a..2dff4d1 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ grawlix currently supports downloading from the following sources:
 - [Flipp](https://flipp.dk)
 - [Internet Archive](https://archive.org)
 - [Manga Plus](https://mangaplus.shueisha.co.jp)
+- [Nextory](https://nextory.com)
 - [Royal Road](https://www.royalroad.com)
 - [Saxo](https://saxo.com)
 - [Webtoons](https://webtoons.com)

diff --git a/grawlix/book.py b/grawlix/book.py
index ae49564..77e92ec 100644
--- a/grawlix/book.py
+++ b/grawlix/book.py
@@ -55,6 +55,16 @@
     """
     images: list[OnlineFile]

+
+@dataclass(slots=True)
+class EpubInParts:
+    """
+    Epub split up into smaller epubs
+    """
+    files: list[OnlineFile]
+    files_in_toc: dict[str, str]
+
+
 @dataclass(slots=True)
 class HtmlFile:
     title: str
@@ -67,6 +77,7 @@
     htmlfiles: list[HtmlFile]

 BookData = Union[
+    EpubInParts,
     SingleFile,
     ImageList,
     HtmlFiles

diff --git a/grawlix/output/__init__.py b/grawlix/output/__init__.py
index 35eb1d2..7243327 100644
--- a/grawlix/output/__init__.py
+++ b/grawlix/output/__init__.py
@@ -1,4 +1,4 @@
-from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles
+from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles, EpubInParts
 from grawlix.exceptions import GrawlixError, UnsupportedOutputFormat
 from grawlix.logging import info
@@ -58,9 +58,9 @@
     bookdata = book.data
     if isinstance(bookdata, SingleFile):
         extension = bookdata.file.extension
-    if isinstance(bookdata, ImageList):
+    elif isinstance(bookdata, ImageList):
         extension = "cbz"
-    if isinstance(bookdata, HtmlFiles):
+    elif isinstance(bookdata, HtmlFiles) or isinstance(bookdata, EpubInParts):
         extension = "epub"
     output_format = find_output_format(book, extension)
     return output_format()
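To make the new data type concrete, a hypothetical instance (URLs and titles invented); `files_in_toc` maps a toc entry's `src`, possibly carrying a fragment, to its display title:

```python
from grawlix.book import EpubInParts, OnlineFile

parts = EpubInParts(
    files = [
        OnlineFile(url = "https://example.com/part-1.epub", extension = "epub"),
        OnlineFile(url = "https://example.com/part-2.epub", extension = "epub"),
    ],
    files_in_toc = {
        "chapter-1.xhtml#start": "Chapter 1",
        "chapter-2.xhtml": "Chapter 2",
    },
)
```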
diff --git a/grawlix/output/epub.py b/grawlix/output/epub.py
index 7a5d4fa..7850805 100644
--- a/grawlix/output/epub.py
+++ b/grawlix/output/epub.py
@@ -1,4 +1,4 @@
-from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata
+from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata, EpubInParts
 from grawlix.exceptions import UnsupportedOutputFormat
 from .output_format import OutputFormat, Update
@@ -6,19 +6,25 @@
 import asyncio
 from bs4 import BeautifulSoup
 import os
 from ebooklib import epub
+from zipfile import ZipFile


 class Epub(OutputFormat):
     extension = "epub"
-    input_types = [SingleFile, HtmlFiles]
+    input_types = [SingleFile, HtmlFiles, EpubInParts]
+

     async def download(self, book: Book, location: str, update: Update) -> None:
         if isinstance(book.data, SingleFile):
             await self._download_single_file(book, location, update)
         elif isinstance(book.data, HtmlFiles):
             await self._download_html_files(book.data, book.metadata, location, update)
+        elif isinstance(book.data, EpubInParts):
+            await self._download_epub_in_parts(book.data, book.metadata, location, update)
         else:
             raise UnsupportedOutputFormat
+

     async def _download_html_files(self, html: HtmlFiles, metadata: Metadata, location: str, update: Update) -> None:
         output = epub.EpubBook()
         output.set_title(metadata.title)
@@ -67,3 +73,59 @@
         output.add_item(epub.EpubNcx())
         output.add_item(epub.EpubNav())
         epub.write_epub(location, output)
+
+
+    async def _download_epub_in_parts(self, data: EpubInParts, metadata: Metadata, location: str, update: Update) -> None:
+        files = data.files
+        file_count = len(files)
+        progress = 1/(file_count)
+        temporary_file_location = f"{location}.tmp"
+
+        added_files: set[str] = set()
+        def get_new_files(zipfile: ZipFile):
+            """Returns files in zipfile not already added to file"""
+            for filename in zipfile.namelist():
+                if filename in added_files or filename.endswith(".opf") or filename.endswith(".ncx"):
+                    continue
+                yield filename
+
+        output = epub.EpubBook()
+        for file in files:
+            await self._download_and_write_file(file, temporary_file_location)
+            with ZipFile(temporary_file_location, "r") as zipfile:
+                for filepath in get_new_files(zipfile):
+                    content = zipfile.read(filepath)
+                    if filepath.endswith("html"):
+                        filename = os.path.basename(filepath)
+                        is_in_toc = False
+                        title = None
+                        for key, value in data.files_in_toc.items():
+                            toc_filename = key.split("#")[0]
+                            if filename == toc_filename:
+                                title = value
+                                is_in_toc = True
+                                break
+                        epub_file = epub.EpubHtml(
+                            title = title,
+                            file_name = filepath,
+                            content = content
+                        )
+                        output.add_item(epub_file)
+                        output.spine.append(epub_file)
+                        if is_in_toc:
+                            output.toc.append(epub_file)
+                    else:
+                        epub_file = epub.EpubItem(
+                            file_name = filepath,
+                            content = content
+                        )
+                        output.add_item(epub_file)
+                    added_files.add(filepath)
+            if update:
+                update(progress)
+        os.remove(temporary_file_location)
+
+        output.add_item(epub.EpubNcx())
+        output.add_item(epub.EpubNav())
+        epub.write_epub(location, output)

diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py
index 7e1fbf4..01ec768 100644
--- a/grawlix/sources/__init__.py
+++ b/grawlix/sources/__init__.py
@@ -5,6 +5,7 @@ from .ereolen import Ereolen
 from .flipp import Flipp
 from .internet_archive import InternetArchive
 from .mangaplus import MangaPlus
+from .nextory import Nextory
 from .royal_road import RoyalRoad
 from .saxo import Saxo
 from .webtoons import Webtoons
@@ -56,6 +57,7 @@
         Flipp,
         InternetArchive,
         MangaPlus,
+        Nextory,
         RoyalRoad,
         Saxo,
         Webtoons

diff --git a/grawlix/sources/nextory.py b/grawlix/sources/nextory.py
new file mode 100644
index 0000000..5356641
--- /dev/null
+++ b/grawlix/sources/nextory.py
@@ -0,0 +1,227 @@
+from grawlix.book import Book, Metadata, OnlineFile, BookData, SingleFile, EpubInParts, Result, Series
+from grawlix.encryption import AESEncryption
+from grawlix.exceptions import InvalidUrl
+from .source import Source
+
+from typing import Optional
+import uuid
+import base64
+
+LOCALE = "en_GB"
+
+class Nextory(Source):
+    name: str = "Nextory"
+    match = [
+        r"https?://((www|catalog-\w\w).)?nextory.+"
+    ]
+    _authentication_methods = [ "login" ]
+
+
+    @staticmethod
+    def _create_device_id() -> str:
+        """Create a deterministic device id"""
+        return str(uuid.uuid3(uuid.NAMESPACE_DNS, "audiobook-dl"))
+
+
+    async def login(self, username: str, password: str, **kwargs) -> None:
+        # Set permanent headers
+        device_id = self._create_device_id()
+        self._client.headers.update(
+            {
+                "X-Application-Id": "200",
+                "X-App-Version": "5.0.0",
+                "X-Locale": LOCALE,
+                "X-Model": "Personal Computer",
+                "X-Device-Id": device_id,
+                "locale": LOCALE,
+                "device": device_id,
+                "osinfo": "Android 13",
+                "model": "Personal Computer",
+                "version": "4.34.6",
+                "appid": "200",
+            }
+        )
+        # Login for account
+        session_response = await self._client.post(
+            "https://api.nextory.com/user/v1/sessions",
+            json = {
+                "identifier": username,
+                "password": password
+            },
+        )
+        session_response = session_response.json()
+        login_token = session_response["login_token"]
+        country = session_response["country"]
+        self._client.headers.update(
+            {
+                "token": login_token,
+                "X-Login-Token": login_token,
+                "X-Country-Code": country,
+            }
+        )
+        # Login for user
+        profiles_response = await self._client.get(
+            "https://api.nextory.com/user/v1/me/profiles",
+        )
+        profiles_response = profiles_response.json()
+        profile = profiles_response["profiles"][0]
+        login_key = profile["login_key"]
+        authorize_response = await self._client.post(
+            "https://api.nextory.com/user/v1/profile/authorize",
+            json = {
+                "login_key": login_key
+            }
+        )
+        authorize_response = authorize_response.json()
+        profile_token = authorize_response["profile_token"]
+        self._client.headers.update({"X-Profile-Token": profile_token})
+
+
+    @staticmethod
+    def _find_epub_id(product_data) -> str:
+        """Find id of book format of type epub for given book"""
+        for format in product_data["formats"]:
+            if format["type"] == "epub":
+                return format["identifier"]
+        raise InvalidUrl
+
+
+    @staticmethod
+    def _extract_id_from_url(url: str) -> str:
+        """
+        Extract id of book from url. This id is not always the internal id for
+        the book.
+
+        :param url: Url to book information page
+        :return: Id in url
+        """
+        return url.split("-")[-1].replace("/", "")
+
+
+    async def download(self, url: str) -> Result:
+        url_id = self._extract_id_from_url(url)
+        if "serier" in url:
+            return await self._download_series(url_id)
+        else:
+            book_id = await self._get_book_id_from_url_id(url_id)
+            return await self._download_book(book_id)
+
+
+    async def download_book_from_id(self, book_id: str) -> Book:
+        return await self._download_book(book_id)
+
+
+    async def _download_series(self, series_id: str) -> Series:
+        """
+        Download series from Nextory
+
+        :param series_id: Id of series on Nextory
+        :returns: Series data
+        """
+        response = await self._client.get(
+            f"https://api.nextory.com/discovery/v1/series/{series_id}/products",
+            params = {
+                "content_type": "book",
+                "page": 0,
+                "per": 100,
+            }
+        )
+        series_data = response.json()
+        book_ids = []
+        for book in series_data["products"]:
+            book_id = book["id"]
+            book_ids.append(book_id)
+        return Series(
+            title = series_data["products"][0]["series"]["name"],
+            book_ids = book_ids,
+        )
+
+
+    @staticmethod
+    def _extract_series_name(product_info: dict) -> Optional[str]:
+        if "series" not in product_info:
+            return None
+        return product_info["series"]["name"]
+
+
+    async def _get_book_id_from_url_id(self, url_id: str) -> str:
+        """
+        Download book id from url id
+
+        :param url_id: Id of book from url
+        :return: Book id
+        """
+        # Not implemented yet: the response format of this endpoint still has
+        # to be mapped before the internal book id can be extracted
+        response = await self._client.get(
+            "https://api.nextory.se/api/app/product/7.5/bookinfo",
+            params = { "id": url_id },
+        )
+        raise NotImplementedError
+
+
+    async def _download_book(self, book_id: str) -> Book:
+        product_data = await self._client.get(
+            f"https://api.nextory.com/library/v1/products/{book_id}"
+        )
+        product_data = product_data.json()
+        epub_id = self._find_epub_id(product_data)
+        pages = await self._get_pages(epub_id)
+        return Book(
+            data = pages,
+            metadata = Metadata(
+                title = product_data["title"],
+                authors = [author["name"] for author in product_data["authors"]],
+                series = self._extract_series_name(product_data),
+            )
+        )
+
+
+    @staticmethod
+    def _fix_key(value: str) -> bytes:
+        """Strip the trailing character and base64 decode key"""
+        return base64.b64decode(value[:-1])
+
+
+    async def _get_pages(self, epub_id: str) -> BookData:
+        """
+        Download page information for book
+
+        :param epub_id: Id of epub file
+        :return: Page data
+        """
+        # Nextory books are for some reason split up into multiple epub files -
+        # one for each chapter file. All of these files have to be decrypted and
+        # combined afterwards. Many of the provided epub files contain the same
+        # files, and some of them contain the same file names but with variation
+        # in the content and comments that describe what would have been there
+        # if the book had been whole from the start.
+        response = await self._client.get(
+            f"https://api.nextory.com/reader/books/{epub_id}/packages/epub"
+        )
+        epub_data = response.json()
+        encryption = AESEncryption(
+            key = self._fix_key(epub_data["crypt_key"]),
+            iv = self._fix_key(epub_data["crypt_iv"])
+        )
+        files = []
+        for part in epub_data["spines"]:
+            files.append(
+                OnlineFile(
+                    url = part["spine_url"],
+                    extension = "epub",
+                    encryption = encryption
+                )
+            )
+        files_in_toc = {}
+        for item in epub_data["toc"]["childrens"]:  # Why is it "childrens"?
+            files_in_toc[item["src"]] = item["name"]
+        return EpubInParts(
+            files,
+            files_in_toc
+        )
From ef18a5a94222ce54116625629b98788e85926ad3 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Fri, 16 Jun 2023 16:30:00 +0200
Subject: [PATCH 12/12] Add more metadata to comics

---
 grawlix/book.py                      |  5 ++++
 grawlix/output/cbz.py                |  2 ++
 grawlix/output/metadata/__init__.py  |  5 ++++
 grawlix/output/metadata/comicinfo.py | 37 ++++++++++++++++++++++++++++
 grawlix/sources/marvel.py            |  8 ++++--
 5 files changed, 55 insertions(+), 2 deletions(-)
 create mode 100644 grawlix/output/metadata/__init__.py
 create mode 100644 grawlix/output/metadata/comicinfo.py

diff --git a/grawlix/book.py b/grawlix/book.py
index 8bd2caf..7de10ef 100644
--- a/grawlix/book.py
+++ b/grawlix/book.py
@@ -1,6 +1,7 @@
 from grawlix import Encryption
 from dataclasses import dataclass, field
 from typing import Optional, Union, TypeVar, Generic, Any
+from datetime import date

 @dataclass(slots=True)
 class Metadata:
@@ -11,6 +12,8 @@
     language: Optional[str] = None
     publisher: Optional[str] = None
     identifier: Optional[str] = None
+    description: Optional[str] = None
+    release_date: Optional[date] = None

     def as_dict(self) -> dict:
         return {
@@ -20,6 +23,8 @@
             "identifier": self.identifier or "UNKNOWN",
             "language": self.language or "UNKNOWN",
             "authors": "; ".join(self.authors),
+            "description": self.description or "UNKNOWN",
+            "release_date": self.release_date.isoformat() if self.release_date else "UNKNOWN",
         }

diff --git a/grawlix/output/cbz.py b/grawlix/output/cbz.py
index b374c5a..6511022 100644
--- a/grawlix/output/cbz.py
+++ b/grawlix/output/cbz.py
@@ -1,6 +1,7 @@
 from .output_format import OutputFormat, Update, Book
 from grawlix.book import ImageList, OnlineFile
 from grawlix.exceptions import UnsupportedOutputFormat
+from .metadata.comicinfo import to_comic_info

 from zipfile import ZipFile
 import asyncio
@@ -29,3 +30,4 @@
             for index, file in enumerate(images)
         ]
         await asyncio.wait(tasks)
+        zip.writestr("ComicInfo.xml", to_comic_info(book.metadata))
diff --git a/grawlix/output/metadata/__init__.py b/grawlix/output/metadata/__init__.py
new file mode 100644
index 0000000..7cd49d7
--- /dev/null
+++ b/grawlix/output/metadata/__init__.py
@@ -0,0 +1,5 @@
+from grawlix.book import Metadata
+
+from typing import Callable, Any
+
+MetadataOutput = Callable[[Metadata], Any]

diff --git a/grawlix/output/metadata/comicinfo.py b/grawlix/output/metadata/comicinfo.py
new file mode 100644
index 0000000..e4f0869
--- /dev/null
+++ b/grawlix/output/metadata/comicinfo.py
@@ -0,0 +1,37 @@
+from grawlix.book import Metadata
+
+import xml.etree.ElementTree as ET
+from typing import Any, Optional
+
+
+def add_value(element: ET.Element, name: str, value: Optional[Any]) -> None:
+    """
+    Add new tag to element
+
+    :param element: Element to add tag to
+    :param name: Name of new tag
+    :param value: Contents of new tag
+    """
+    if value:
+        subelement = ET.SubElement(element, name)
+        subelement.text = str(value)
+
+
+def to_comic_info(metadata: Metadata) -> str:
+    """
+    Output as ComicRack metadata format (ComicInfo)
+    Documentation: https://anansi-project.github.io/docs/comicinfo
+
+    :param metadata: Book metadata
+    :returns: ComicInfo xml as a string
+    """
+    root = ET.Element("ComicInfo")
+    add_value(root, "Title", metadata.title)
+    add_value(root, "Series", metadata.series)
+    add_value(root, "Summary", metadata.description)
+    add_value(root, "Publisher", metadata.publisher)
+    add_value(root, "Year", getattr(metadata.release_date, "year", None))
+    add_value(root, "Month", getattr(metadata.release_date, "month", None))
+    add_value(root, "Day", getattr(metadata.release_date, "day", None))
+    add_value(root, "Format", "Web")
+    return ET.tostring(root).decode("utf8")

diff --git a/grawlix/sources/marvel.py b/grawlix/sources/marvel.py
index d42910f..bcdbb43 100644
--- a/grawlix/sources/marvel.py
+++ b/grawlix/sources/marvel.py
@@ -1,9 +1,11 @@
 from grawlix.book import Book, Metadata, ImageList, OnlineFile, Series, Result
 from grawlix.exceptions import InvalidUrl, DataNotFound
 from .source import Source

 import re
+from datetime import date

 # Personal marvel API key
 API_KEY = "83ac0da31d3f6801f2c73c7e07ad76e8"
@@ -111,9 +113,11 @@
         issue_meta = response.json()["data"]["results"][0]["issue_meta"]
         return Metadata(
             title = issue_meta["title"],
-            series = issue_meta["series_title"],
+            series = issue_meta.get("series_title"),
+            description = issue_meta.get("description"),
             publisher = "Marvel",
-            authors = [c["full_name"] for c in issue_meta["creators"]["extended_list"]]
+            release_date = date.fromisoformat(issue_meta["release_date_digital"]) if issue_meta.get("release_date_digital") else None,
+            authors = [c["full_name"] for c in issue_meta["creators"]["extended_list"]] if "extended_list" in issue_meta["creators"] else []
         )
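For reference, a rough sketch of the ComicInfo document `to_comic_info` produces for a filled-in `Metadata` (values invented; output wrapped for readability):

```python
from datetime import date
from grawlix.book import Metadata
from grawlix.output.metadata.comicinfo import to_comic_info

meta = Metadata(
    title = "Example Issue #1",
    authors = ["A. Author"],
    series = "Example Series",
    publisher = "Marvel",
    release_date = date(2023, 6, 16),
)
print(to_comic_info(meta))
# <ComicInfo><Title>Example Issue #1</Title><Series>Example Series</Series>
# <Publisher>Marvel</Publisher><Year>2023</Year><Month>6</Month>
# <Day>16</Day><Format>Web</Format></ComicInfo>
```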