# -*- coding: utf-8 -*-

# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://archiveofourown.org/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache

# Optional scheme and "www." prefix, followed by one of the accepted
# host names: (archiveofourown|ao3).(org|com|net)
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
                r"a(?:rchiveofourown|o3)\.(?:org|com|net)")


class Ao3Extractor(Extractor):
    """Base class for ao3 extractors"""
    category = "ao3"
    root = "https://archiveofourown.org"
    categorytransfer = True
    cookies_domain = ".archiveofourown.org"
    # Cookie names handed to cookies_check() by login()
    cookies_names = ("remember_user_token",)
    request_interval = (0.5, 1.5)

    def items(self):
        """Yield one Message.Queue entry per work ID returned by works().

        Calls login() first.  Every queued URL has the form
        "<root>/works/<work_id>" and is tagged with Ao3WorkExtractor
        through the "_extractor" key of the accompanying data dict.
        """
        self.login()

        base = self.root + "/works/"
        data = {"_extractor": Ao3WorkExtractor}

        for work_id in self.works():
            yield Message.Queue, base + work_id, data

    def works(self):
        """Return the result of _pagination() for the first pattern group."""
        return self._pagination(self.groups[0])

    def login(self):
        """Update the session cookies from a login, if one is needed.

        Does nothing when cookies_check() reports that the cookies named
        in 'cookies_names' are already available, or when
        _get_auth_info() yields no username.
        """
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            return self.cookies_update(self._login_impl(username, password))

    # NOTE(review): maxage looks like 90 days expressed in seconds and
    # keyarg=1 presumably keys the cache on 'username' -- confirm against
    # the cache decorator's documentation.
    @cache(maxage=90*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting cookies.

        Returns a dict with the "remember_user_token" cookie value taken
        from the first entry of the POST response's history, plus a
        constant "user_credentials" entry.

        Raises exception.AuthenticationError when the authenticity token
        cannot be extracted from the login page, when the POST response
        has no history, or when that first history entry carries no
        "remember_user_token" cookie.
        """
        self.log.info("Logging in as %s", username)

        url = self.root + "/users/login"
        page = self.request(url).text

        # Start looking for the token at the login form's position
        pos = page.find('id="loginform"')
        token = text.extract(
            page, ' name="authenticity_token" value="', '"', pos)[0]
        if not token:
            self.log.error("Unable to extract 'authenticity_token'")
            # Stop here instead of carrying on with a missing/empty token:
            # it would otherwise be passed to text.unescape() and submitted
            # with the form.  Report it like the other login failures below.
            raise exception.AuthenticationError()

        data = {
            "authenticity_token": text.unescape(token),
            "user[login]"       : username,
            "user[password]"    : password,
            "user[remember_me]" : "1",
            "commit"            : "Log In",
        }

        response = self.request(url, method="POST", data=data)
        # An empty history is treated as a failed login
        if not response.history:
            raise exception.AuthenticationError()

        remember = response.history[0].cookies.get("remember_user_token")
        if not remember:
            raise exception.AuthenticationError()

        return {
            "remember_user_token": remember,
            "user_credentials"   : "1",
        }

    # NOTE(review): in the original text the `def` keyword that opens
    # `_pagination(self, path, needle=...)` follows at this point.  That
    # definition is cut off in this view, so it is not reproduced here.
_pagination(self, path, needle='

Adult Content Warning") for ch in text.extract_iter(cindex, ' value="', ""): cid, _, cname = ch.partition('">') chapters[cid] = text.unescape(cname) fmts = {} path = "" download = extr(' class="download"', "") for dl in text.extract_iter(download, ' href="', "') fmts[type.lower()] = path data = { "id" : text.parse_int(work_id), "rating" : text.split_html( extr('

', "

")), "warnings" : text.split_html( extr('

', "

")), "categories" : text.split_html( extr('

', "

")), "fandom" : text.split_html( extr('

', "

")), "relationships": text.split_html( extr('

', "

")), "characters" : text.split_html( extr('

', "

")), "tags" : text.split_html( extr('

', "

")), "lang" : extr('

', "

"), "date" : text.parse_datetime( extr('

', "<"), "%Y-%m-%d"), "date_completed": text.parse_datetime( extr('>Completed:

', "<"), "%Y-%m-%d"), "date_updated" : text.parse_timestamp( path.rpartition("updated_at=")[2]), "words" : text.parse_int( extr('

', "<").replace(",", "")), "chapters" : chapters, "comments" : text.parse_int( extr('

', "<").replace(",", "")), "likes" : text.parse_int( extr('

', "<").replace(",", "")), "bookmarks" : text.parse_int(text.remove_html( extr('

', "

")).replace(",", "")), "views" : text.parse_int( extr('

', "<").replace(",", "")), "title" : text.unescape( extr(' class="title heading">', "<").strip()), "author" : text.unescape(text.remove_html( extr(' class="byline heading">', ""))), "summary" : text.split_html( extr(' class="heading">Summary:', "")), } data["language"] = util.code_to_language(data["lang"]) series = data["series"] if series: extr = text.extract_from(series) data["series"] = { "prev" : extr(' class="previous" href="/works/', '"'), "index": extr(' class="position">Part ', " "), "id" : extr(' href="/series/', '"'), "name" : text.unescape(extr(">", "<")), "next" : extr(' class="next" href="/works/', '"'), } else: data["series"] = None yield Message.Directory, data for fmt in self.formats: try: url = text.urljoin(self.root, fmts[fmt]) except KeyError: self.log.warning("%s: Format '%s' not available", work_id, fmt) else: yield Message.Url, url, text.nameext_from_url(url, data) class Ao3SeriesExtractor(Ao3Extractor): """Extractor for AO3 works of a series""" subcategory = "series" pattern = BASE_PATTERN + r"(/series/(\d+))" example = "https://archiveofourown.org/series/12345" class Ao3TagExtractor(Ao3Extractor): """Extractor for AO3 works by tag""" subcategory = "tag" pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)" example = "https://archiveofourown.org/tags/TAG/works" class Ao3SearchExtractor(Ao3Extractor): """Extractor for AO3 search results""" subcategory = "search" pattern = BASE_PATTERN + r"(/works/search/?\?.+)" example = "https://archiveofourown.org/works/search?work_search[query]=air" class Ao3UserExtractor(Ao3Extractor): """Extractor for an AO3 user profile""" subcategory = "user" pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)" r"(?:/profile)?/?(?:$|\?|#)") example = "https://archiveofourown.org/users/USER" def initialize(self): pass def items(self): base = "{}/users/{}/".format(self.root, self.groups[0]) return self._dispatch_extractors(( (Ao3UserWorksExtractor , base + "works"), (Ao3UserSeriesExtractor , base + 
"series"), (Ao3UserBookmarkExtractor, base + "bookmarks"), ), ("user-works", "user-series")) class Ao3UserWorksExtractor(Ao3Extractor): """Extractor for works of an AO3 user""" subcategory = "user-works" pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" r"works(?:/?\?.+)?)") example = "https://archiveofourown.org/users/USER/works" class Ao3UserSeriesExtractor(Ao3Extractor): """Extractor for series of an AO3 user""" subcategory = "user-series" pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" r"series(?:/?\?.+)?)") example = "https://archiveofourown.org/users/USER/series" def items(self): self.login() base = self.root + "/series/" data = {"_extractor": Ao3SeriesExtractor} for series_id in self.series(): yield Message.Queue, base + series_id, data def series(self): return self._pagination(self.groups[0], '