# -*- coding: utf-8 -*-

# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://archiveofourown.org/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache

BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
                r"a(?:rchiveofourown|o3)\.(?:org|com|net)")


class Ao3Extractor(Extractor):
    """Base class for ao3 extractors"""
    category = "ao3"
    root = "https://archiveofourown.org"
    categorytransfer = True
    cookies_domain = ".archiveofourown.org"
    cookies_names = ("remember_user_token",)
    request_interval = (0.5, 1.5)

    def items(self):
        """Log in if possible, then queue one work URL per ID from works().

        Every yielded message is a ``Message.Queue`` entry whose URL is
        ``<root>/works/<work_id>`` and whose metadata delegates handling
        to ``Ao3WorkExtractor``.
        """
        self.login()

        base = self.root + "/works/"
        data = {"_extractor": Ao3WorkExtractor}

        for work_id in self.works():
            yield Message.Queue, base + work_id, data

    def works(self):
        """Return the work IDs found by paginating over the matched path.

        The path is the first capture group of the extractor's URL match.
        """
        return self._pagination(self.groups[0])

    def login(self):
        """Ensure session cookies are set, logging in when credentials exist.

        Does nothing when the cookies named in ``cookies_names`` are
        already present, or when no username is configured.
        """
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            return self.cookies_update(self._login_impl(username, password))

    # maxage is 90 * 86400, i.e. 90 days if the unit is seconds;
    # keyarg=1 presumably keys the cache on 'username' -- TODO confirm
    # against the cache decorator.
    @cache(maxage=90*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting session cookies.

        Returns a dict with the 'remember_user_token' cookie value and a
        'user_credentials' flag.

        Raises exception.AuthenticationError when the form token cannot
        be extracted, when the login POST is not redirected, or when the
        redirect response carries no 'remember_user_token' cookie.
        """
        self.log.info("Logging in as %s", username)

        url = self.root + "/users/login"
        page = self.request(url).text

        # Start the token search at the login form's position in the page
        pos = page.find('id="loginform"')
        token = text.extract(
            page, ' name="authenticity_token" value="', '"', pos)[0]
        if not token:
            # Without a token the POST below cannot succeed, and
            # unescaping a missing value would fail with an unrelated
            # error; abort with the same exception as other login failures.
            self.log.error("Unable to extract 'authenticity_token'")
            raise exception.AuthenticationError()

        data = {
            "authenticity_token": text.unescape(token),
            "user[login]"       : username,
            "user[password]"    : password,
            "user[remember_me]" : "1",
            "commit"            : "Log In",
        }

        response = self.request(url, method="POST", data=data)
        # No redirect history means the login POST was not redirected,
        # which is treated as a failed login.
        if not response.history:
            raise exception.AuthenticationError()

        # The cookie is read from the first response in the redirect chain
        remember = response.history[0].cookies.get("remember_user_token")
        if not remember:
            raise exception.AuthenticationError()

        return {
            "remember_user_token": remember,
            "user_credentials"   : "1",
        }
_pagination(self, path, needle='