From 9e2a945013cf2a71357983093eacdd4b5bc2910f Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Wed, 29 Mar 2023 00:06:41 +0800 Subject: [PATCH 1/4] [urlshortener] add support for bit.ly & t.co --- docs/supportedsites.md | 16 ++++++++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/urlshortener.py | 59 ++++++++++++++++++++++++++++ scripts/supportedsites.py | 3 ++ 4 files changed, 79 insertions(+) create mode 100644 gallery_dl/extractor/urlshortener.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1876b045..08b39071 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1270,6 +1270,22 @@ Consider all sites to be NSFW unless otherwise known. + + URL Shorteners + + + Bitly + https://bit.ly/ + + + + + Twitter t.co + https://t.co/ + + + + vichan Imageboards diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3968d727..553a1104 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -153,6 +153,7 @@ modules = [ "twitter", "unsplash", "uploadir", + "urlshortener", "vanillarock", "vichan", "vk", diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py new file mode 100644 index 00000000..d95f182e --- /dev/null +++ b/gallery_dl/extractor/urlshortener.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractor for general-purpose URL shorteners""" + +from .common import BaseExtractor, Message +from .. import exception + + +class UrlshortenerExtractor(BaseExtractor): + """Base class for general-purpose URL shorteners""" + basecategory = "urlshortener" + test = ( + ("https://bit.ly/3cWIUgq", { + "count": 1, + "pattern": "^https://gumroad.com/l/storm_b1" + }), + ("https://t.co/bCgBY8Iv5n", { + "count": 1, + "pattern": ("^https://twitter.com/elonmusk/status/" + "1421395561324896257/photo/1") + }), + ) + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.headers = INSTANCES[self.category].get("headers") + self.url = match.group() + + def request(self, url, **kwargs): + kwargs["headers"] = self.headers + return BaseExtractor.request(self, url, **kwargs) + + def items(self): + response = self.request( + self.url, method="HEAD", allow_redirects=False, notfound="URL") + if "location" not in response.headers: + raise exception.StopExtraction("Unable to resolve short URL") + yield Message.Queue, response.headers["location"], {} + + +INSTANCES = { + "bitly": { + "root": "https://bit.ly", + "pattern": r"bit\.ly", + }, + "tco": { + # t.co sends 'http-equiv="refresh"' (200) when using browser UA + "headers": {"User-Agent": None}, + "root": "https://t.co", + "pattern": r"t\.co", + }, +} + +UrlshortenerExtractor.pattern = \ + UrlshortenerExtractor.update(INSTANCES) + r"/[^/?#&]+" diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index ff75c6c8..74100d4f 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -112,6 +112,7 @@ CATEGORY_MAP = { "subscribestar" : "SubscribeStar", "tbib" : "The Big ImageBoard", "tcbscans" : "TCB Scans", + "tco" : "Twitter t.co", "thatpervert" : "ThatPervert", "thebarchive" : "The /b/ Archive", "thecollection" : "The /co/llection", @@ -132,6 +133,7 @@ CATEGORY_MAP = { } SUBCATEGORY_MAP = { + "" : "", "art" : "Art", "audio" : "Audio", "doujin" : "Doujin", @@ -266,6 +268,7 @@ BASE_MAP = { "lynxchan" : "LynxChan Imageboards", "moebooru" : "Moebooru and MyImouto", "szurubooru" : "szurubooru Instances", + "urlshortener": "URL Shorteners", "vichan" : "vichan Imageboards", } From 71b26adb9b737717ebd79a1a513bebe52e787e8b Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Wed, 29 Mar 2023 13:36:43 +0800 Subject: [PATCH 2/4] [urlshortener] add tinyurl.com as an example --- docs/gallery-dl-example.conf | 4 ++++ gallery_dl/extractor/urlshortener.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index ef7b3b50..da386dd6 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -317,6 +317,10 @@ "archive": "~/gallery-dl/custom-archive-file-for-TBIB.db", "filename": "{id}_{md5}.{extension}", "sleep-request": [0, 1.2] + }, + + "urlshortener": { + "tinyurl": {"root": "https://tinyurl.com"} } }, diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index d95f182e..23a6df86 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -11,7 +11,7 @@ from .. import exception class UrlshortenerExtractor(BaseExtractor): - """Base class for general-purpose URL shorteners""" + """Extractor for general-purpose URL shorteners""" basecategory = "urlshortener" test = ( ("https://bit.ly/3cWIUgq", { From 875485313f216ce96d7d2a2c11e47d2d1b074a42 Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Sun, 9 Apr 2023 18:06:42 +0800 Subject: [PATCH 3/4] [urlshortener] force HTTPS --- gallery_dl/extractor/urlshortener.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index 23a6df86..3e404e86 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -28,7 +28,7 @@ class UrlshortenerExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) self.headers = INSTANCES[self.category].get("headers") - self.url = match.group() + self.id = match.group(match.lastindex) def request(self, url, **kwargs): kwargs["headers"] = self.headers @@ -36,7 +36,8 @@ class UrlshortenerExtractor(BaseExtractor): def items(self): response = self.request( - self.url, method="HEAD", allow_redirects=False, notfound="URL") + "{}/{}".format(self.root, self.id), method="HEAD", + allow_redirects=False, notfound="URL") if "location" not in response.headers: raise exception.StopExtraction("Unable to resolve short URL") yield Message.Queue, response.headers["location"], {} @@ -56,4 +57,4 @@ INSTANCES = { } UrlshortenerExtractor.pattern = \ - UrlshortenerExtractor.update(INSTANCES) + r"/[^/?#&]+" + UrlshortenerExtractor.update(INSTANCES) + r"/([^/?#&]+)" From 5e63942b374fa08c450d888d152d1a0cae430a4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 15 Apr 2023 18:06:06 +0200 Subject: [PATCH 4/4] [urlshortener] update --- docs/supportedsites.md | 4 +- gallery_dl/extractor/urlshortener.py | 73 ++++++++++++++++------------ 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 08b39071..d9f6c63a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1276,13 +1276,13 @@ Consider all sites to be NSFW unless otherwise known. Bitly https://bit.ly/ - + Links Twitter t.co https://t.co/ - + Links diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index 3e404e86..1a39b5be 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -4,43 +4,15 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractor for general-purpose URL shorteners""" +"""Extractors for general-purpose URL shorteners""" from .common import BaseExtractor, Message from .. import exception class UrlshortenerExtractor(BaseExtractor): - """Extractor for general-purpose URL shorteners""" + """Base class for URL shortener extractors""" basecategory = "urlshortener" - test = ( - ("https://bit.ly/3cWIUgq", { - "count": 1, - "pattern": "^https://gumroad.com/l/storm_b1" - }), - ("https://t.co/bCgBY8Iv5n", { - "count": 1, - "pattern": ("^https://twitter.com/elonmusk/status/" - "1421395561324896257/photo/1") - }), - ) - - def __init__(self, match): - BaseExtractor.__init__(self, match) - self.headers = INSTANCES[self.category].get("headers") - self.id = match.group(match.lastindex) - - def request(self, url, **kwargs): - kwargs["headers"] = self.headers - return BaseExtractor.request(self, url, **kwargs) - - def items(self): - response = self.request( - "{}/{}".format(self.root, self.id), method="HEAD", - allow_redirects=False, notfound="URL") - if "location" not in response.headers: - raise exception.StopExtraction("Unable to resolve short URL") - yield Message.Queue, response.headers["location"], {} INSTANCES = { @@ -56,5 +28,42 @@ INSTANCES = { }, } -UrlshortenerExtractor.pattern = \ - UrlshortenerExtractor.update(INSTANCES) + r"/([^/?#&]+)" +BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES) + + +class UrlshortenerLinkExtractor(UrlshortenerExtractor): + """Extractor for general-purpose URL shorteners""" + subcategory = "link" + pattern = BASE_PATTERN + r"/([^/?&#]+)" + test = ( + ("https://bit.ly/3cWIUgq", { + "count": 1, + "pattern": "^https://gumroad.com/l/storm_b1", + }), + ("https://t.co/bCgBY8Iv5n", { + "count": 1, + "pattern": "^https://twitter.com/elonmusk/status/" + "1421395561324896257/photo/1", + }), + ("https://t.co/abcdefghij", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + UrlshortenerExtractor.__init__(self, match) + self.id = match.group(match.lastindex) + + try: + self.headers = INSTANCES[self.category]["headers"] + except Exception: + self.headers = None + + def items(self): + response = self.request( + "{}/{}".format(self.root, self.id), headers=self.headers, + method="HEAD", allow_redirects=False, notfound="URL") + try: + yield Message.Queue, response.headers["location"], {} + except KeyError: + raise exception.StopExtraction("Unable to resolve short URL")