diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index c92969b0..38b2d5a8 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -20,7 +20,7 @@ class _2chanThreadExtractor(Extractor): filename_fmt = "{tim}.{extension}" archive_fmt = "{board}_{thread}_{tim}" url_fmt = "https://{server}.2chan.net/{board}/src/{tim}.{extension}" - pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)" + pattern = r"(?:https?://)?([\w-]+)\.2chan\.net/([^/]+)/res/(\d+)" test = ("http://dec.2chan.net/70/res/4752.htm", { "url": "f49aa31340e9a3429226af24e19e01f5b819ca1f", "keyword": "44599c21b248e79692b2eb2da12699bd0ed5640a", diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 4f84d6ab..65c994dc 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -152,6 +152,7 @@ modules = [ "oauth", "test", "ytdl", + "generic", ] diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 7e7c2829..9a86cc47 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -15,7 +15,7 @@ import re BASE_PATTERN = ( r"(?:blogger:(?:https?://)?([^/]+)|" - r"(?:https?://)?([^.]+\.blogspot\.com))") + r"(?:https?://)?([\w-]+\.blogspot\.com))") class BloggerExtractor(Extractor): diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 6c5c7dfc..2bd8c6b6 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -56,7 +56,7 @@ class FlickrImageExtractor(FlickrExtractor): subcategory = "image" pattern = (r"(?:https?://)?(?:" r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/" - r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)" + r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)" r"|flic\.kr/p/([A-Za-z1-9]+))") test = ( ("https://www.flickr.com/photos/departingyyz/16089302239", { diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py new file mode 100644 index 00000000..bece9057 --- 
/dev/null +++ b/gallery_dl/extractor/generic.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + +"""Extractor for images in a generic web page.""" + +from .common import Extractor, Message +from .. import config, text +import re +import os.path + + +class GenericExtractor(Extractor): + """Extractor for images in a generic web page.""" + + category = "generic" + directory_fmt = ("{category}", "{pageurl}") + archive_fmt = "{imageurl}" + + # By default, the generic extractor is disabled + # and the "g(eneric):" prefix in url is required. + # If the extractor is enabled, make the prefix optional + pattern = r"(?ix)(?P<generic>g(?:eneric)?:)" + if config.get(("extractor", "generic"), "enabled"): + pattern += r"?" + + # The generic extractor pattern should match (almost) any valid url + # Based on: https://tools.ietf.org/html/rfc3986#appendix-B + pattern += r""" + (?P<scheme>https?://)? # optional http(s) scheme + (?P<domain>[-\w\.]+) # required domain + (?P<path>/[^?&#]*)? # optional path + (?:\?(?P<query>[^/?#]*))? # optional query + (?:\#(?P<fragment>.*))?$ # optional fragment + """ + + def __init__(self, match): + """Init.""" + Extractor.__init__(self, match) + + # Strip the "g(eneric):" prefix + # and inform about "forced" or "fallback" mode + if match.group('generic'): + self.log.info("Forcing use of generic information extractor.") + self.url = match.group(0).partition(":")[2] + else: + self.log.info("Falling back on generic information extractor.") + self.url = match.group(0) + + # Make sure we have a scheme, or use https + if match.group('scheme'): + self.scheme = match.group('scheme') + else: + self.scheme = 'https://' + self.url = self.scheme + self.url + + # Used to resolve relative image urls + self.root = self.scheme + match.group('domain') + + def items(self): + """Get page, extract metadata & images, yield them in suitable messages. 
+ + Adapted from common.GalleryExtractor.items() + + """ + page = self.request(self.url).text + data = self.metadata(page) + imgs = self.images(page) + + try: + data["count"] = len(imgs) + except TypeError: + pass + images = enumerate(imgs, 1) + + yield Message.Version, 1 + yield Message.Directory, data + + for data["num"], (url, imgdata) in images: + if imgdata: + data.update(imgdata) + if "extension" not in imgdata: + text.nameext_from_url(url, data) + else: + text.nameext_from_url(url, data) + yield Message.Url, url, data + + def metadata(self, page): + """Extract generic webpage metadata, return them in a dict.""" + data = {} + data['pageurl'] = self.url + data['title'] = text.extract(page, '<title>', "</title>")[0] or "" + data['description'] = text.extract( + page, ',