[common] simplify 'user' extractors by using 'Dispatch' mixin

This commit is contained in:
Mike Fährmann
2025-05-23 21:26:13 +02:00
parent c3e8af945d
commit e199396872
16 changed files with 73 additions and 143 deletions

View File

@@ -8,7 +8,7 @@
"""Extractors for https://archiveofourown.org/""" """Extractors for https://archiveofourown.org/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
@@ -249,16 +249,12 @@ class Ao3SearchExtractor(Ao3Extractor):
example = "https://archiveofourown.org/works/search?work_search[query]=air" example = "https://archiveofourown.org/works/search?work_search[query]=air"
class Ao3UserExtractor(Ao3Extractor): class Ao3UserExtractor(Dispatch, Ao3Extractor):
"""Extractor for an AO3 user profile""" """Extractor for an AO3 user profile"""
subcategory = "user"
pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)" pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
r"(?:/profile)?/?(?:$|\?|#)") r"(?:/profile)?/?(?:$|\?|#)")
example = "https://archiveofourown.org/users/USER" example = "https://archiveofourown.org/users/USER"
def initialize(self):
pass
def items(self): def items(self):
base = "{}/users/{}/".format(self.root, self.groups[0]) base = "{}/users/{}/".format(self.root, self.groups[0])
return self._dispatch_extractors(( return self._dispatch_extractors((

View File

@@ -8,7 +8,7 @@
"""Extractors for https://bsky.app/""" """Extractors for https://bsky.app/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
@@ -210,14 +210,10 @@ class BlueskyExtractor(Extractor):
},) },)
class BlueskyUserExtractor(BlueskyExtractor): class BlueskyUserExtractor(Dispatch, BlueskyExtractor):
subcategory = "user"
pattern = USER_PATTERN + r"$" pattern = USER_PATTERN + r"$"
example = "https://bsky.app/profile/HANDLE" example = "https://bsky.app/profile/HANDLE"
def initialize(self):
pass
def items(self): def items(self):
base = "{}/profile/{}/".format(self.root, self.groups[0]) base = "{}/profile/{}/".format(self.root, self.groups[0])
default = ("posts" if self.config("quoted", False) or default = ("posts" if self.config("quoted", False) or

View File

@@ -8,7 +8,7 @@
"""Extractors for https://www.civitai.com/""" """Extractors for https://www.civitai.com/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import memcache from ..cache import memcache
import itertools import itertools
@@ -396,14 +396,10 @@ class CivitaiImagesExtractor(CivitaiExtractor):
return self.api.images(params) return self.api.images(params)
class CivitaiUserExtractor(CivitaiExtractor): class CivitaiUserExtractor(Dispatch, CivitaiExtractor):
subcategory = "user"
pattern = USER_PATTERN + r"/?(?:$|\?|#)" pattern = USER_PATTERN + r"/?(?:$|\?|#)"
example = "https://civitai.com/user/USER" example = "https://civitai.com/user/USER"
def initialize(self):
pass
def items(self): def items(self):
base = "{}/user/{}/".format(self.root, self.groups[0]) base = "{}/user/{}/".format(self.root, self.groups[0])
return self._dispatch_extractors(( return self._dispatch_extractors((

View File

@@ -616,29 +616,6 @@ class Extractor():
fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S") fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S")
return get("date-min", dmin), get("date-max", dmax) return get("date-min", dmin), get("date-max", dmax)
def _dispatch_extractors(self, extractor_data, default=()):
    """Build the queue messages for the sub-extractors selected by 'include'

    'extractor_data' is an iterable of (extractor class, URL) pairs;
    each pair is registered under its extractor's 'subcategory'.

    The 'include' config value (falling back to 'default') chooses
    which of them to queue:
      - "all" selects every registered subcategory
      - any other string is split on commas after removing spaces
      - any other truthy value is iterated as-is
      - a falsy value selects nothing

    Unknown names are reported with a warning and skipped.

    Returns an iterator over a list that starts with a
    (Message.Version, 1) entry, followed by one
    (Message.Queue, url, {"_extractor": extractor}) entry
    per selected name.
    """
    available = {}
    for entry in extractor_data:
        available[entry[0].subcategory] = entry

    selected = self.config("include", default)
    if not selected:
        selected = ()
    elif selected == "all":
        # iterating the dict yields its subcategory names
        selected = available
    elif isinstance(selected, str):
        selected = selected.replace(" ", "").split(",")

    # the list is built eagerly so that warnings are emitted
    # when this method is called, not when the result is consumed
    messages = [(Message.Version, 1)]
    for name in selected:
        try:
            extractor, url = available[name]
        except KeyError:
            self.log.warning("Invalid include '%s'", name)
            continue
        messages.append((Message.Queue, url, {"_extractor": extractor}))
    return iter(messages)
@classmethod @classmethod
def _dump(cls, obj): def _dump(cls, obj):
util.dump_json(obj, ensure_ascii=False, indent=2) util.dump_json(obj, ensure_ascii=False, indent=2)
@@ -796,6 +773,41 @@ class MangaExtractor(Extractor):
"""Return a list of all (chapter-url, metadata)-tuples""" """Return a list of all (chapter-url, metadata)-tuples"""
class Dispatch():
    """Mixin for extractors that only forward to other extractors

    Meant to be listed before the site-specific base class
    (e.g. 'class XUserExtractor(Dispatch, XExtractor)'), so that the
    attributes defined here take precedence over the site's own.
    """
    subcategory = "user"
    cookies_domain = None

    # use the plain 'Extractor' implementations
    # instead of whatever the site-specific base class defines
    finalize = Extractor.finalize
    skip = Extractor.skip

    def __iter__(self):
        """Iterate over the messages produced by items()"""
        return self.items()

    def initialize(self):
        """Intentionally do nothing"""
        pass

    def _dispatch_extractors(self, extractor_data, default=()):
        """Build the queue messages for the extractors chosen by 'include'

        'extractor_data' is an iterable of (extractor class, URL) pairs,
        each one registered under its extractor's 'subcategory'.

        The 'include' config value (falling back to 'default') may be
        "all", a comma-separated string of subcategory names, or any
        other iterable of names. A falsy value selects nothing.
        Unknown names are reported with a warning and skipped.

        Returns an iterator over a list that starts with a
        (Message.Version, 1) entry, followed by one
        (Message.Queue, url, {"_extractor": extractor}) entry
        per selected name.
        """
        by_subcategory = {}
        for entry in extractor_data:
            by_subcategory[entry[0].subcategory] = entry

        wanted = self.config("include", default) or ()
        if wanted == "all":
            # iterating the dict yields its subcategory names
            wanted = by_subcategory
        elif isinstance(wanted, str):
            wanted = wanted.replace(" ", "").split(",")

        # collected eagerly so that warnings are logged
        # at call time rather than during iteration
        queue = [(Message.Version, 1)]
        for name in wanted:
            try:
                extractor, url = by_subcategory[name]
            except KeyError:
                self.log.warning("Invalid include '%s'", name)
                continue
            queue.append((Message.Queue, url, {"_extractor": extractor}))
        return iter(queue)
class AsynchronousMixin(): class AsynchronousMixin():
"""Run info extraction in a separate thread""" """Run info extraction in a separate thread"""

View File

@@ -8,7 +8,7 @@
"""Extractors for https://www.deviantart.com/""" """Extractors for https://www.deviantart.com/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
import collections import collections
@@ -873,17 +873,11 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
.replace("\\\\", "\\") .replace("\\\\", "\\")
class DeviantartUserExtractor(DeviantartExtractor): class DeviantartUserExtractor(Dispatch, DeviantartExtractor):
"""Extractor for an artist's user profile""" """Extractor for an artist's user profile"""
subcategory = "user"
pattern = BASE_PATTERN + r"/?$" pattern = BASE_PATTERN + r"/?$"
example = "https://www.deviantart.com/USER" example = "https://www.deviantart.com/USER"
def initialize(self):
pass
skip = Extractor.skip
def items(self): def items(self):
base = "{}/{}/".format(self.root, self.user) base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors(( return self._dispatch_extractors((

View File

@@ -8,7 +8,7 @@
"""Extractors for https://www.furaffinity.net/""" """Extractors for https://www.furaffinity.net/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util from .. import text, util
BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net" BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net"
@@ -321,18 +321,11 @@ class FuraffinityPostExtractor(FuraffinityExtractor):
return (post_id,) return (post_id,)
class FuraffinityUserExtractor(FuraffinityExtractor): class FuraffinityUserExtractor(Dispatch, FuraffinityExtractor):
"""Extractor for furaffinity user profiles""" """Extractor for furaffinity user profiles"""
subcategory = "user"
cookies_domain = None
pattern = BASE_PATTERN + r"/user/([^/?#]+)" pattern = BASE_PATTERN + r"/user/([^/?#]+)"
example = "https://www.furaffinity.net/user/USER/" example = "https://www.furaffinity.net/user/USER/"
def initialize(self):
pass
skip = Extractor.skip
def items(self): def items(self):
base = "{}/{{}}/{}/".format(self.root, self.user) base = "{}/{{}}/{}/".format(self.root, self.user)
return self._dispatch_extractors(( return self._dispatch_extractors((

View File

@@ -8,7 +8,7 @@
"""Extractors for https://www.hentai-foundry.com/""" """Extractors for https://www.hentai-foundry.com/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util from .. import text, util
BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com" BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com"
@@ -192,15 +192,11 @@ class HentaifoundryExtractor(Extractor):
self.request(url, method="POST", data=data) self.request(url, method="POST", data=data)
class HentaifoundryUserExtractor(HentaifoundryExtractor): class HentaifoundryUserExtractor(Dispatch, HentaifoundryExtractor):
"""Extractor for a hentaifoundry user profile""" """Extractor for a hentaifoundry user profile"""
subcategory = "user"
pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile" pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile"
example = "https://www.hentai-foundry.com/user/USER/profile" example = "https://www.hentai-foundry.com/user/USER/profile"
def initialize(self):
pass
def items(self): def items(self):
root = self.root root = self.root
user = "/user/" + self.user user = "/user/" + self.user

View File

@@ -9,7 +9,7 @@
"""Extractors for https://www.instagram.com/""" """Extractors for https://www.instagram.com/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
import itertools import itertools
@@ -430,18 +430,11 @@ class InstagramExtractor(Extractor):
user[key] = 0 user[key] = 0
class InstagramUserExtractor(InstagramExtractor): class InstagramUserExtractor(Dispatch, InstagramExtractor):
"""Extractor for an Instagram user profile""" """Extractor for an Instagram user profile"""
subcategory = "user"
pattern = USER_PATTERN + r"/?(?:$|[?#])" pattern = USER_PATTERN + r"/?(?:$|[?#])"
example = "https://www.instagram.com/USER/" example = "https://www.instagram.com/USER/"
def initialize(self):
pass
def finalize(self):
pass
def items(self): def items(self):
base = "{}/{}/".format(self.root, self.item) base = "{}/{}/".format(self.root, self.item)
stories = "{}/stories/{}/".format(self.root, self.item) stories = "{}/stories/{}/".format(self.root, self.item)

View File

@@ -8,7 +8,7 @@
"""Extractors for https://www.newgrounds.com/""" """Extractors for https://www.newgrounds.com/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import itertools import itertools
@@ -450,15 +450,11 @@ class NewgroundsGamesExtractor(NewgroundsExtractor):
example = "https://USER.newgrounds.com/games" example = "https://USER.newgrounds.com/games"
class NewgroundsUserExtractor(NewgroundsExtractor): class NewgroundsUserExtractor(Dispatch, NewgroundsExtractor):
"""Extractor for a newgrounds user profile""" """Extractor for a newgrounds user profile"""
subcategory = "user"
pattern = USER_PATTERN + r"/?$" pattern = USER_PATTERN + r"/?$"
example = "https://USER.newgrounds.com" example = "https://USER.newgrounds.com"
def initialize(self):
pass
def items(self): def items(self):
base = self.user_root + "/" base = self.user_root + "/"
return self._dispatch_extractors(( return self._dispatch_extractors((

View File

@@ -8,7 +8,7 @@
"""Extractors for nijie instances""" """Extractors for nijie instances"""
from .common import BaseExtractor, Message, AsynchronousMixin from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
from .. import text, exception from .. import text, exception
from ..cache import cache from ..cache import cache
@@ -177,16 +177,11 @@ BASE_PATTERN = NijieExtractor.update({
}) })
class NijieUserExtractor(NijieExtractor): class NijieUserExtractor(Dispatch, NijieExtractor):
"""Extractor for nijie user profiles""" """Extractor for nijie user profiles"""
subcategory = "user"
cookies_domain = None
pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)" pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)"
example = "https://nijie.info/members.php?id=12345" example = "https://nijie.info/members.php?id=12345"
def initialize(self):
pass
def items(self): def items(self):
fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format
return self._dispatch_extractors(( return self._dispatch_extractors((

View File

@@ -8,7 +8,7 @@
"""Extractors for https://www.pixiv.net/""" """Extractors for https://www.pixiv.net/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -367,23 +367,15 @@ class PixivExtractor(Extractor):
return {} return {}
class PixivUserExtractor(PixivExtractor): class PixivUserExtractor(Dispatch, PixivExtractor):
"""Extractor for a pixiv user profile""" """Extractor for a pixiv user profile"""
subcategory = "user"
pattern = (BASE_PATTERN + r"/(?:" pattern = (BASE_PATTERN + r"/(?:"
r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id=" r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id="
r")(\d+)(?:$|[?#])") r")(\d+)(?:$|[?#])")
example = "https://www.pixiv.net/en/users/12345" example = "https://www.pixiv.net/en/users/12345"
def __init__(self, match):
PixivExtractor.__init__(self, match)
self.user_id = match.group(1)
def initialize(self):
pass
def items(self): def items(self):
base = "{}/users/{}/".format(self.root, self.user_id) base = "{}/users/{}/".format(self.root, self.groups[0])
return self._dispatch_extractors(( return self._dispatch_extractors((
(PixivAvatarExtractor , base + "avatar"), (PixivAvatarExtractor , base + "avatar"),
(PixivBackgroundExtractor , base + "background"), (PixivBackgroundExtractor , base + "background"),

View File

@@ -8,7 +8,7 @@
"""Extractors for https://www.pornhub.com/""" """Extractors for https://www.pornhub.com/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, exception from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com" BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com"
@@ -164,21 +164,13 @@ class PornhubGifExtractor(PornhubExtractor):
yield Message.Url, gif["url"], text.nameext_from_url(gif["url"], gif) yield Message.Url, gif["url"], text.nameext_from_url(gif["url"], gif)
class PornhubUserExtractor(PornhubExtractor): class PornhubUserExtractor(Dispatch, PornhubExtractor):
"""Extractor for a pornhub user""" """Extractor for a pornhub user"""
subcategory = "user"
pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$" pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$"
example = "https://www.pornhub.com/model/USER" example = "https://www.pornhub.com/model/USER"
def __init__(self, match):
PornhubExtractor.__init__(self, match)
self.user = match.group(1)
def initialize(self):
pass
def items(self): def items(self):
base = "{}/{}/".format(self.root, self.user) base = "{}/{}/".format(self.root, self.groups[0])
return self._dispatch_extractors(( return self._dispatch_extractors((
(PornhubPhotosExtractor, base + "photos"), (PornhubPhotosExtractor, base + "photos"),
(PornhubGifsExtractor , base + "gifs"), (PornhubGifsExtractor , base + "gifs"),

View File

@@ -8,7 +8,7 @@
"""Extractors for https://x.com/""" """Extractors for https://x.com/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
import itertools import itertools
@@ -577,27 +577,18 @@ class TwitterExtractor(Extractor):
return self.cookies_update(_login_impl(self, username, password)) return self.cookies_update(_login_impl(self, username, password))
class TwitterUserExtractor(TwitterExtractor): class TwitterUserExtractor(Dispatch, TwitterExtractor):
"""Extractor for a Twitter user""" """Extractor for a Twitter user"""
subcategory = "user"
pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])" pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
r"|i(?:/user/|ntent/user\?user_id=)(\d+))") r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
example = "https://x.com/USER" example = "https://x.com/USER"
def __init__(self, match):
TwitterExtractor.__init__(self, match)
user_id = match.group(2)
if user_id:
self.user = "id:" + user_id
def initialize(self):
pass
def finalize(self):
pass
def items(self): def items(self):
base = "{}/{}/".format(self.root, self.user) user, user_id = self.groups
if user_id is not None:
user = "id:" + user_id
base = "{}/{}/".format(self.root, user)
return self._dispatch_extractors(( return self._dispatch_extractors((
(TwitterInfoExtractor , base + "info"), (TwitterInfoExtractor , base + "info"),
(TwitterAvatarExtractor , base + "photo"), (TwitterAvatarExtractor , base + "photo"),

View File

@@ -8,7 +8,7 @@
"""Extractors for https://vsco.co/""" """Extractors for https://vsco.co/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util from .. import text, util
BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co" BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co"
@@ -132,15 +132,11 @@ class VscoExtractor(Extractor):
return media return media
class VscoUserExtractor(VscoExtractor): class VscoUserExtractor(Dispatch, VscoExtractor):
"""Extractor for a vsco user profile""" """Extractor for a vsco user profile"""
subcategory = "user"
pattern = USER_PATTERN + r"/?$" pattern = USER_PATTERN + r"/?$"
example = "https://vsco.co/USER" example = "https://vsco.co/USER"
def initialize(self):
pass
def items(self): def items(self):
base = "{}/{}/".format(self.root, self.user) base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors(( return self._dispatch_extractors((

View File

@@ -8,7 +8,7 @@
"""Extractors for https://wallhaven.cc/""" """Extractors for https://wallhaven.cc/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, exception from .. import text, exception
@@ -88,21 +88,13 @@ class WallhavenCollectionExtractor(WallhavenExtractor):
return {"username": self.username, "collection_id": self.collection_id} return {"username": self.username, "collection_id": self.collection_id}
class WallhavenUserExtractor(WallhavenExtractor): class WallhavenUserExtractor(Dispatch, WallhavenExtractor):
"""Extractor for a wallhaven user""" """Extractor for a wallhaven user"""
subcategory = "user"
pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/?$" pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/?$"
example = "https://wallhaven.cc/user/USER" example = "https://wallhaven.cc/user/USER"
def __init__(self, match):
WallhavenExtractor.__init__(self, match)
self.username = match.group(1)
def initialize(self):
pass
def items(self): def items(self):
base = "{}/user/{}/".format(self.root, self.username) base = "{}/user/{}/".format(self.root, self.groups[0])
return self._dispatch_extractors(( return self._dispatch_extractors((
(WallhavenUploadsExtractor , base + "uploads"), (WallhavenUploadsExtractor , base + "uploads"),
(WallhavenCollectionsExtractor, base + "favorites"), (WallhavenCollectionsExtractor, base + "favorites"),

View File

@@ -8,7 +8,7 @@
"""Extractors for https://www.weibo.com/""" """Extractors for https://www.weibo.com/"""
from .common import Extractor, Message from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import random import random
@@ -258,7 +258,7 @@ class WeiboUserExtractor(WeiboExtractor):
def items(self): def items(self):
base = "{}/u/{}?tabtype=".format(self.root, self._user_id()) base = "{}/u/{}?tabtype=".format(self.root, self._user_id())
return self._dispatch_extractors(( return Dispatch._dispatch_extractors(self, (
(WeiboHomeExtractor , base + "home"), (WeiboHomeExtractor , base + "home"),
(WeiboFeedExtractor , base + "feed"), (WeiboFeedExtractor , base + "feed"),
(WeiboVideosExtractor , base + "video"), (WeiboVideosExtractor , base + "video"),