diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index ef7b3b50..da386dd6 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -317,6 +317,10 @@ "archive": "~/gallery-dl/custom-archive-file-for-TBIB.db", "filename": "{id}_{md5}.{extension}", "sleep-request": [0, 1.2] + }, + + "urlshortener": { + "tinyurl": {"root": "https://tinyurl.com"} } }, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3a1d883a..5e2e1ecc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1270,6 +1270,22 @@ Consider all sites to be NSFW unless otherwise known. + + URL Shorteners + + + Bitly + https://bit.ly/ + Links + + + + Twitter t.co + https://t.co/ + Links + + + vichan Imageboards diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3968d727..553a1104 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -153,6 +153,7 @@ modules = [ "twitter", "unsplash", "uploadir", + "urlshortener", "vanillarock", "vichan", "vk", diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py new file mode 100644 index 00000000..1a39b5be --- /dev/null +++ b/gallery_dl/extractor/urlshortener.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for general-purpose URL shorteners""" + +from .common import BaseExtractor, Message +from .. 
import exception


class UrlshortenerExtractor(BaseExtractor):
    """Base class for URL shortener extractors"""
    basecategory = "urlshortener"


# Built-in shortener services, keyed by category name.
# Every entry carries the site's 'root' URL and a regex 'pattern' fragment
# for its domain; 'headers' is optional and, when present, is passed as the
# request headers by UrlshortenerLinkExtractor.
INSTANCES = {
    "bitly": {
        "root": "https://bit.ly",
        "pattern": r"bit\.ly",
    },
    "tco": {
        # t.co sends 'http-equiv="refresh"' (200) when using browser UA
        "headers": {"User-Agent": None},
        "root": "https://t.co",
        "pattern": r"t\.co",
    },
}

# Prefix for the URL patterns of all extractors in this module
BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES)


class UrlshortenerLinkExtractor(UrlshortenerExtractor):
    """Extractor for general-purpose URL shorteners"""
    subcategory = "link"
    pattern = BASE_PATTERN + r"/([^/?&#]+)"
    test = (
        ("https://bit.ly/3cWIUgq", {
            "count": 1,
            "pattern": "^https://gumroad.com/l/storm_b1",
        }),
        ("https://t.co/bCgBY8Iv5n", {
            "count": 1,
            "pattern": "^https://twitter.com/elonmusk/status/"
                       "1421395561324896257/photo/1",
        }),
        ("https://t.co/abcdefghij", {
            "exception": exception.NotFoundError,
        }),
    )

    def __init__(self, match):
        """Store the short-link ID and any instance-specific headers

        'match' is the regex match object for 'pattern'; its last group
        is the short-link ID.
        """
        UrlshortenerExtractor.__init__(self, match)
        self.id = match.group(match.lastindex)

        # 'self.category' is not assigned in this class (presumably set by
        # the base class) and may name an instance that has no entry in
        # INSTANCES, or one without a "headers" key. Use explicit lookups
        # with a None fallback instead of a catch-all 'except Exception',
        # so that unrelated errors are not silently swallowed.
        instance = INSTANCES.get(self.category)
        self.headers = instance.get("headers") if instance else None

    def items(self):
        """Yield a single Queue message with the short URL's target

        Sends a HEAD request without following redirects and reads the
        target from the response's "location" header.
        Raises StopExtraction if that header is missing.
        """
        response = self.request(
            "{}/{}".format(self.root, self.id), headers=self.headers,
            method="HEAD", allow_redirects=False, notfound="URL")

        # Guard only the header lookup; keeping the 'yield' outside the
        # 'try' ensures a KeyError raised at the yield point is not
        # misreported as a failed resolution.
        try:
            location = response.headers["location"]
        except KeyError:
            raise exception.StopExtraction("Unable to resolve short URL")

        yield Message.Queue, location, {}
"art" : "Art", "audio" : "Audio", "doujin" : "Doujin", @@ -266,6 +268,7 @@ BASE_MAP = { "lynxchan" : "LynxChan Imageboards", "moebooru" : "Moebooru and MyImouto", "szurubooru" : "szurubooru Instances", + "urlshortener": "URL Shorteners", "vichan" : "vichan Imageboards", }