Files
gallery-dl/gallery_dl/extractor/__init__.py
Luca Russo e9370b7b8a merge #5626: [facebook] add support (#470, #2612)
* [facebook] add initial support

* renamed extractors & subcategories

* better stability, modularity & naming

* added single photo extractor, warnings & retries

* more metadata + extract author followups

* renamed "album" mentions to "set" for consistency

* cookies are now only used when necessary

also added author followups for singular images

* removed f-strings

* added way to continue extraction from where it left off

also fixed some bugs

* fixed bug wrong subcategory

* added individual video extraction

* extract audio + added ytdl option

* updated setextract regex

* added option to disable start warning

the extractor should be ready :)

* fixed description metadata bug

* removed cookie "safeguard" + fixed for private profiles

I have removed the cookie "safeguard" (not using cookies until they are necessary), as I've come to the conclusion that it does more harm than good. There is no way to detect whether the extractor has skipped private images that could otherwise have been extracted. Also, keeping it provides little to no advantage.

* fixed a few bugs regarding profile parsing

* a few bugfixes

Fixed some metadata attributes not decoding correctly for non-Latin languages, or not showing at all.
Also improved a few patterns.

* retrigger checks

* Final cleanups

-Added tests
-Fixed video extractor giving incorrect URLs
-Removed start warning
-Listed supported site correctly

* fixed regex

* trigger checks

* fixed livestream playback extraction + bugfixes

I've chosen to remove the "reactions", "comments" and "views" attributes as I've felt that they require additional maintenance even though nobody would ever actually use them to order files.
I've also removed the "title" and "caption" video attributes for their inconsistency across different videos.
Feel free to share your thoughts.

* fixed regex

* fixed filename fallback

* fixed retrying when a photo url is not found

* fixed end line

* post url fix + better naming

* fix posts

* fixed tests

* added profile.php url

* made most of the requested changes

* flake

* archive: false

* removed unnecessary url extract

* [facebook] update

- more 'Sec-Fetch-…' headers
- simplify 'text.nameext_from_url()' calls
- replace 'sorted(…)[-1]' with 'max(…)'
- fix '_interval_429' usage
- use replacement fields in logging messages

* [facebook] update URL patterns

get rid of '.*' and '.*?'

* added few remaining tests

---------

Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
2024-11-26 21:49:11 +01:00

292 lines
5.1 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import sys
import re
# Names of all extractor modules in this package.  They are imported
# lazily, in this exact order, by _modules_internal() / _list_classes(),
# and find() returns the first class whose pattern matches — so list
# order is match priority.  Site-specific modules come first; shared
# base modules (booru, mastodon, shopify, ...) and catch-alls
# (directlink, recursive, oauth, noop, ytdl, generic) must stay last
# so they only match URLs no dedicated extractor claims.
modules = [
    "2ch",
    "2chan",
    "2chen",
    "35photo",
    "3dbooru",
    "4chan",
    "4archive",
    "4chanarchives",
    "500px",
    "8chan",
    "8muses",
    "adultempire",
    "agnph",
    "ao3",
    "architizer",
    "artstation",
    "aryion",
    "batoto",
    "bbc",
    "behance",
    "bilibili",
    "blogger",
    "bluesky",
    "boosty",
    "bunkr",
    "catbox",
    "chevereto",
    "cien",
    "civitai",
    "cohost",
    "comicvine",
    "cyberdrop",
    "danbooru",
    "desktopography",
    "deviantart",
    "dynastyscans",
    "e621",
    "erome",
    "everia",
    "exhentai",
    "facebook",
    "fanbox",
    "fanleaks",
    "fantia",
    "fapello",
    "fapachi",
    "flickr",
    "furaffinity",
    "fuskator",
    "gelbooru",
    "gelbooru_v01",
    "gelbooru_v02",
    "gofile",
    "hatenablog",
    "hentai2read",
    "hentaicosplays",
    "hentaifoundry",
    "hentaifox",
    "hentaihand",
    "hentaihere",
    "hentainexus",
    "hiperdex",
    "hitomi",
    "hotleak",
    "idolcomplex",
    "imagebam",
    "imagechest",
    "imagefap",
    "imgbb",
    "imgbox",
    "imgth",
    "imgur",
    "inkbunny",
    "instagram",
    "issuu",
    "itaku",
    "itchio",
    "jschan",
    "kabeuchi",
    "keenspot",
    "kemonoparty",
    "khinsider",
    "koharu",
    "komikcast",
    "lensdump",
    "lexica",
    "lightroom",
    "livedoor",
    "luscious",
    "lynxchan",
    "mangadex",
    "mangafox",
    "mangahere",
    "mangakakalot",
    "manganelo",
    "mangapark",
    "mangaread",
    "mangasee",
    "mangoxo",
    "misskey",
    "motherless",
    "myhentaigallery",
    "myportfolio",
    "naver",
    "naverwebtoon",
    "newgrounds",
    "nhentai",
    "nijie",
    "nitter",
    "nozomi",
    "nsfwalbum",
    "paheal",
    "patreon",
    "philomena",
    "photovogue",
    "picarto",
    "piczel",
    "pillowfort",
    "pinterest",
    "pixeldrain",
    "pixiv",
    "pixnet",
    "plurk",
    "poipiku",
    "poringa",
    "pornhub",
    "pornpics",
    "postmill",
    "reactor",
    "readcomiconline",
    "reddit",
    "redgifs",
    "rule34us",
    "rule34vault",
    "rule34xyz",
    "saint",
    "sankaku",
    "sankakucomplex",
    "scrolller",
    "seiga",
    "senmanga",
    "sexcom",
    "shimmie2",
    "simplyhentai",
    "skeb",
    "slickpic",
    "slideshare",
    "smugmug",
    "soundgasm",
    "speakerdeck",
    "steamgriddb",
    "subscribestar",
    "szurubooru",
    "tapas",
    "tcbscans",
    "telegraph",
    "tmohentai",
    "toyhouse",
    "tsumino",
    "tumblr",
    "tumblrgallery",
    "twibooru",
    "twitter",
    "urlgalleries",
    "unsplash",
    "uploadir",
    "urlshortener",
    "vanillarock",
    "vichan",
    "vipergirls",
    "vk",
    "vsco",
    "wallhaven",
    "wallpapercave",
    "warosu",
    "weasyl",
    "webmshare",
    "webtoons",
    "weibo",
    "wikiart",
    "wikifeet",
    "wikimedia",
    "xhamster",
    "xvideos",
    "zerochan",
    "zzup",
    "booru",
    "moebooru",
    "foolfuuka",
    "foolslide",
    "mastodon",
    "shopify",
    "lolisafe",
    "imagehosts",
    "directlink",
    "recursive",
    "oauth",
    "noop",
    "ytdl",
    "generic",
]
def find(url):
    """Return an extractor instance suitable for 'url', or None.

    Extractor classes are tried in registration order; the first one
    whose compiled URL pattern matches wins.
    """
    for extractor_cls in _list_classes():
        result = extractor_cls.pattern.match(url)
        if result is not None:
            return extractor_cls(result)
    return None
def add(cls):
    """Register a single extractor class and return it.

    Compiles the class's URL pattern in place before appending the
    class to the internal cache.
    """
    compiled = re.compile(cls.pattern)
    cls.pattern = compiled
    _cache.append(cls)
    return cls
def add_module(module):
    """Register every extractor class found in 'module'.

    Each class's URL pattern is compiled in place; the list of newly
    registered classes is returned.
    """
    found = _get_classes(module)
    for extractor_cls in found:
        extractor_cls.pattern = re.compile(extractor_cls.pattern)
    _cache.extend(found)
    return found
def extractors():
    """Return all available extractor classes, sorted by class name."""
    def _by_name(extractor_cls):
        return extractor_cls.__name__
    return sorted(_list_classes(), key=_by_name)
# --------------------------------------------------------------------
# internals
def _list_classes():
    """Yield available extractor classes"""
    # Classes from modules that were already imported come first.
    yield from _cache

    # Lazily import the remaining extractor modules; add_module()
    # registers their classes in _cache and returns them for yielding.
    for module in _module_iter:
        yield from add_module(module)

    # Reaching this point means every module has been imported, so
    # replace this generator with a cheap cache lookup for all future
    # calls.  NOTE(review): the swap only happens when iteration runs
    # to exhaustion — an early return in find() leaves the lazy
    # version installed, which is the intended behavior.
    globals()["_list_classes"] = lambda : _cache
def _modules_internal():
    """Lazily import and yield every module named in 'modules'."""
    pkg_globals = globals()
    for name in modules:
        # level=1: import relative to this package
        yield __import__(name, pkg_globals, None, (), 1)
def _modules_path(path, files):
sys.path.insert(0, path)
try:
return [
__import__(name[:-3])
for name in files
if name.endswith(".py")
]
finally:
del sys.path[0]
def _get_classes(module):
"""Return a list of all extractor classes in a module"""
return [
cls for cls in module.__dict__.values() if (
hasattr(cls, "pattern") and cls.__module__ == module.__name__
)
]
# Extractor classes whose modules have already been imported.
_cache = []
# Lazy iterator over this package's extractor modules; consumed
# incrementally by _list_classes() as URLs get matched.
_module_iter = _modules_internal()