* [facebook] add initial support * renamed extractors & subcategories * better stability, modularity & naming * added single photo extractor, warnings & retries * more metadata + extract author followups * renamed "album" mentions to "set" for consistency * cookies are now only used when necessary also added author followups for singular images * removed f-strings * added a way to continue extraction from where it left off also fixed some bugs * fixed a wrong-subcategory bug * added individual video extraction * extract audio + added ytdl option * updated setextract regex * added option to disable start warning the extractor should be ready :) * fixed description metadata bug * removed cookie "safeguard" + fixed for private profiles I have removed the cookie "safeguard" (not using cookies until they are necessary) as I've come to the conclusion that it does more harm than good. There is no way to detect whether the extractor has skipped private images that could have otherwise been extracted. Also, doing this provides little to no advantage. * fixed a few bugs regarding profile parsing * a few bugfixes Fixed some metadata attributes not decoding correctly in non-Latin languages, or not showing at all. Also improved a few patterns. * retrigger checks * Final cleanups -Added tests -Fixed video extractor giving incorrect URLs -Removed start warning -Listed supported site correctly * fixed regex * trigger checks * fixed livestream playback extraction + bugfixes I've chosen to remove the "reactions", "comments" and "views" attributes as I feel that they require additional maintenance even though nobody would ever actually use them to order files. I've also removed the "title" and "caption" video attributes because of their inconsistency across different videos. Feel free to share your thoughts. 
* fixed regex * fixed filename fallback * fixed retrying when a photo URL is not found * fixed end line * post URL fix + better naming * fix posts * fixed tests * added profile.php URL * made most of the requested changes * flake * archive: false * removed unnecessary URL extract * [facebook] update - more 'Sec-Fetch-…' headers - simplify 'text.nameext_from_url()' calls - replace 'sorted(…)[-1]' with 'max(…)' - fix '_interval_429' usage - use replacement fields in logging messages * [facebook] update URL patterns get rid of '.*' and '.*?' * added a few remaining tests --------- Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
292 lines
5.1 KiB
Python
292 lines
5.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2015-2023 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
import sys
|
|
import re
|
|
|
|
# Names of the extractor modules in this package.  They are imported
# lazily, in this exact order, via _modules_internal()/_list_classes().
# NOTE(review): order matters for URL-matching priority in find() — the
# shared base modules ("booru", "moebooru", "foolfuuka", ...) and the
# catch-all modules ("directlink", "recursive", "noop", "ytdl",
# "generic") sit at the end so site-specific patterns are tried first.
modules = [
    "2ch",
    "2chan",
    "2chen",
    "35photo",
    "3dbooru",
    "4chan",
    "4archive",
    "4chanarchives",
    "500px",
    "8chan",
    "8muses",
    "adultempire",
    "agnph",
    "ao3",
    "architizer",
    "artstation",
    "aryion",
    "batoto",
    "bbc",
    "behance",
    "bilibili",
    "blogger",
    "bluesky",
    "boosty",
    "bunkr",
    "catbox",
    "chevereto",
    "cien",
    "civitai",
    "cohost",
    "comicvine",
    "cyberdrop",
    "danbooru",
    "desktopography",
    "deviantart",
    "dynastyscans",
    "e621",
    "erome",
    "everia",
    "exhentai",
    "facebook",
    "fanbox",
    "fanleaks",
    "fantia",
    "fapello",
    "fapachi",
    "flickr",
    "furaffinity",
    "fuskator",
    "gelbooru",
    "gelbooru_v01",
    "gelbooru_v02",
    "gofile",
    "hatenablog",
    "hentai2read",
    "hentaicosplays",
    "hentaifoundry",
    "hentaifox",
    "hentaihand",
    "hentaihere",
    "hentainexus",
    "hiperdex",
    "hitomi",
    "hotleak",
    "idolcomplex",
    "imagebam",
    "imagechest",
    "imagefap",
    "imgbb",
    "imgbox",
    "imgth",
    "imgur",
    "inkbunny",
    "instagram",
    "issuu",
    "itaku",
    "itchio",
    "jschan",
    "kabeuchi",
    "keenspot",
    "kemonoparty",
    "khinsider",
    "koharu",
    "komikcast",
    "lensdump",
    "lexica",
    "lightroom",
    "livedoor",
    "luscious",
    "lynxchan",
    "mangadex",
    "mangafox",
    "mangahere",
    "mangakakalot",
    "manganelo",
    "mangapark",
    "mangaread",
    "mangasee",
    "mangoxo",
    "misskey",
    "motherless",
    "myhentaigallery",
    "myportfolio",
    "naver",
    "naverwebtoon",
    "newgrounds",
    "nhentai",
    "nijie",
    "nitter",
    "nozomi",
    "nsfwalbum",
    "paheal",
    "patreon",
    "philomena",
    "photovogue",
    "picarto",
    "piczel",
    "pillowfort",
    "pinterest",
    "pixeldrain",
    "pixiv",
    "pixnet",
    "plurk",
    "poipiku",
    "poringa",
    "pornhub",
    "pornpics",
    "postmill",
    "reactor",
    "readcomiconline",
    "reddit",
    "redgifs",
    "rule34us",
    "rule34vault",
    "rule34xyz",
    "saint",
    "sankaku",
    "sankakucomplex",
    "scrolller",
    "seiga",
    "senmanga",
    "sexcom",
    "shimmie2",
    "simplyhentai",
    "skeb",
    "slickpic",
    "slideshare",
    "smugmug",
    "soundgasm",
    "speakerdeck",
    "steamgriddb",
    "subscribestar",
    "szurubooru",
    "tapas",
    "tcbscans",
    "telegraph",
    "tmohentai",
    "toyhouse",
    "tsumino",
    "tumblr",
    "tumblrgallery",
    "twibooru",
    "twitter",
    "urlgalleries",
    "unsplash",
    "uploadir",
    "urlshortener",
    "vanillarock",
    "vichan",
    "vipergirls",
    "vk",
    "vsco",
    "wallhaven",
    "wallpapercave",
    "warosu",
    "weasyl",
    "webmshare",
    "webtoons",
    "weibo",
    "wikiart",
    "wikifeet",
    "wikimedia",
    "xhamster",
    "xvideos",
    "zerochan",
    "zzup",
    "booru",
    "moebooru",
    "foolfuuka",
    "foolslide",
    "mastodon",
    "shopify",
    "lolisafe",
    "imagehosts",
    "directlink",
    "recursive",
    "oauth",
    "noop",
    "ytdl",
    "generic",
]
|
|
|
|
|
|
def find(url):
    """Find a suitable extractor for the given URL.

    Returns an instance of the first extractor class whose compiled
    pattern matches 'url', or None when no class matches.
    """
    for extractor in _list_classes():
        match = extractor.pattern.match(url)
        if match is not None:
            return extractor(match)
    return None
|
|
|
|
|
|
def add(cls):
    """Add 'cls' to the list of available extractors.

    Compiles the class's URL pattern in place and returns the class,
    so this can also be used as a decorator.
    """
    compiled = re.compile(cls.pattern)
    cls.pattern = compiled
    _cache.append(cls)
    return cls
|
|
|
|
|
|
def add_module(module):
    """Add all extractors in 'module' to the list of available extractors.

    Compiles each class's URL pattern in place and returns the list of
    extractor classes found in 'module'.
    """
    extractor_classes = _get_classes(module)
    for extractor in extractor_classes:
        extractor.pattern = re.compile(extractor.pattern)
    _cache.extend(extractor_classes)
    return extractor_classes
|
|
|
|
|
|
def extractors():
    """Return all available extractor classes, sorted by class name."""
    all_classes = list(_list_classes())
    all_classes.sort(key=lambda cls: cls.__name__)
    return all_classes
|
|
|
|
|
|
# --------------------------------------------------------------------
|
|
# internals
|
|
|
|
|
|
def _list_classes():
    """Yield available extractor classes"""
    # First serve every class whose module has already been imported.
    yield from _cache

    # Then import the remaining modules one at a time, yielding their
    # extractor classes as they get registered.  '_module_iter' is a
    # single module-level generator, so import progress is shared
    # across calls and partially-consumed iterations.
    for module in _module_iter:
        yield from add_module(module)

    # All modules have been imported at this point: replace this
    # function with a cheap lookup that just returns the full cache,
    # so later calls skip the import machinery entirely.
    globals()["_list_classes"] = lambda : _cache
|
|
|
|
|
|
def _modules_internal():
    """Lazily import each module listed in 'modules' from this package.

    Yields one module object per name, using a relative (level-1)
    import so the modules are resolved inside this package.
    """
    env = globals()
    for module_name in modules:
        yield __import__(module_name, env, None, (), 1)
|
|
|
|
|
|
def _modules_path(path, files):
|
|
sys.path.insert(0, path)
|
|
try:
|
|
return [
|
|
__import__(name[:-3])
|
|
for name in files
|
|
if name.endswith(".py")
|
|
]
|
|
finally:
|
|
del sys.path[0]
|
|
|
|
|
|
def _get_classes(module):
|
|
"""Return a list of all extractor classes in a module"""
|
|
return [
|
|
cls for cls in module.__dict__.values() if (
|
|
hasattr(cls, "pattern") and cls.__module__ == module.__name__
|
|
)
|
|
]
|
|
|
|
|
|
# Extractor classes from every module imported so far; filled by add()
# and add_module(), served directly once all modules are loaded.
_cache = []
# Shared generator over the package's extractor modules; consumed
# incrementally by _list_classes().
_module_iter = _modules_internal()
|