From 9e2a945013cf2a71357983093eacdd4b5bc2910f Mon Sep 17 00:00:00 2001
From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com>
Date: Wed, 29 Mar 2023 00:06:41 +0800
Subject: [PATCH 1/4] [urlshortener] add support for bit.ly & t.co
---
docs/supportedsites.md | 16 ++++++++
gallery_dl/extractor/__init__.py | 1 +
gallery_dl/extractor/urlshortener.py | 59 ++++++++++++++++++++++++++++
scripts/supportedsites.py | 3 ++
4 files changed, 79 insertions(+)
create mode 100644 gallery_dl/extractor/urlshortener.py
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 1876b045..08b39071 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -1270,6 +1270,22 @@ Consider all sites to be NSFW unless otherwise known.
|
+
+ | URL Shorteners |
+
+
+ | Bitly |
+ https://bit.ly/ |
+ |
+ |
+
+
+ | Twitter t.co |
+ https://t.co/ |
+ |
+ |
+
+
| vichan Imageboards |
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 3968d727..553a1104 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -153,6 +153,7 @@ modules = [
"twitter",
"unsplash",
"uploadir",
+ "urlshortener",
"vanillarock",
"vichan",
"vk",
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
new file mode 100644
index 00000000..d95f182e
--- /dev/null
+++ b/gallery_dl/extractor/urlshortener.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractor for general-purpose URL shorteners"""
+
+from .common import BaseExtractor, Message
+from .. import exception
+
+
+class UrlshortenerExtractor(BaseExtractor):
+ """Base class for general-purpose URL shorteners"""
+ basecategory = "urlshortener"
+ test = (
+ ("https://bit.ly/3cWIUgq", {
+ "count": 1,
+ "pattern": "^https://gumroad.com/l/storm_b1"
+ }),
+ ("https://t.co/bCgBY8Iv5n", {
+ "count": 1,
+ "pattern": ("^https://twitter.com/elonmusk/status/"
+ "1421395561324896257/photo/1")
+ }),
+ )
+
+ def __init__(self, match):
+ BaseExtractor.__init__(self, match)
+ self.headers = INSTANCES[self.category].get("headers")
+ self.url = match.group()
+
+ def request(self, url, **kwargs):
+ kwargs["headers"] = self.headers
+ return BaseExtractor.request(self, url, **kwargs)
+
+ def items(self):
+ response = self.request(
+ self.url, method="HEAD", allow_redirects=False, notfound="URL")
+ if "location" not in response.headers:
+ raise exception.StopExtraction("Unable to resolve short URL")
+ yield Message.Queue, response.headers["location"], {}
+
+
+INSTANCES = {
+ "bitly": {
+ "root": "https://bit.ly",
+ "pattern": r"bit\.ly",
+ },
+ "tco": {
+ # t.co sends 'http-equiv="refresh"' (200) when using browser UA
+ "headers": {"User-Agent": None},
+ "root": "https://t.co",
+ "pattern": r"t\.co",
+ },
+}
+
+UrlshortenerExtractor.pattern = \
+ UrlshortenerExtractor.update(INSTANCES) + r"/[^/?#&]+"
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index ff75c6c8..74100d4f 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -112,6 +112,7 @@ CATEGORY_MAP = {
"subscribestar" : "SubscribeStar",
"tbib" : "The Big ImageBoard",
"tcbscans" : "TCB Scans",
+ "tco" : "Twitter t.co",
"thatpervert" : "ThatPervert",
"thebarchive" : "The /b/ Archive",
"thecollection" : "The /co/llection",
@@ -132,6 +133,7 @@ CATEGORY_MAP = {
}
SUBCATEGORY_MAP = {
+ "" : "",
"art" : "Art",
"audio" : "Audio",
"doujin" : "Doujin",
@@ -266,6 +268,7 @@ BASE_MAP = {
"lynxchan" : "LynxChan Imageboards",
"moebooru" : "Moebooru and MyImouto",
"szurubooru" : "szurubooru Instances",
+ "urlshortener": "URL Shorteners",
"vichan" : "vichan Imageboards",
}
From 71b26adb9b737717ebd79a1a513bebe52e787e8b Mon Sep 17 00:00:00 2001
From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com>
Date: Wed, 29 Mar 2023 13:36:43 +0800
Subject: [PATCH 2/4] [urlshortener] add tinyurl.com as an example
---
docs/gallery-dl-example.conf | 4 ++++
gallery_dl/extractor/urlshortener.py | 2 +-
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf
index ef7b3b50..da386dd6 100644
--- a/docs/gallery-dl-example.conf
+++ b/docs/gallery-dl-example.conf
@@ -317,6 +317,10 @@
"archive": "~/gallery-dl/custom-archive-file-for-TBIB.db",
"filename": "{id}_{md5}.{extension}",
"sleep-request": [0, 1.2]
+ },
+
+ "urlshortener": {
+ "tinyurl": {"root": "https://tinyurl.com"}
}
},
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
index d95f182e..23a6df86 100644
--- a/gallery_dl/extractor/urlshortener.py
+++ b/gallery_dl/extractor/urlshortener.py
@@ -11,7 +11,7 @@ from .. import exception
class UrlshortenerExtractor(BaseExtractor):
- """Base class for general-purpose URL shorteners"""
+ """Extractor for general-purpose URL shorteners"""
basecategory = "urlshortener"
test = (
("https://bit.ly/3cWIUgq", {
From 875485313f216ce96d7d2a2c11e47d2d1b074a42 Mon Sep 17 00:00:00 2001
From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com>
Date: Sun, 9 Apr 2023 18:06:42 +0800
Subject: [PATCH 3/4] [urlshortener] force HTTPS
---
gallery_dl/extractor/urlshortener.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
index 23a6df86..3e404e86 100644
--- a/gallery_dl/extractor/urlshortener.py
+++ b/gallery_dl/extractor/urlshortener.py
@@ -28,7 +28,7 @@ class UrlshortenerExtractor(BaseExtractor):
def __init__(self, match):
BaseExtractor.__init__(self, match)
self.headers = INSTANCES[self.category].get("headers")
- self.url = match.group()
+ self.id = match.group(match.lastindex)
def request(self, url, **kwargs):
kwargs["headers"] = self.headers
@@ -36,7 +36,8 @@ class UrlshortenerExtractor(BaseExtractor):
def items(self):
response = self.request(
- self.url, method="HEAD", allow_redirects=False, notfound="URL")
+ "{}/{}".format(self.root, self.id), method="HEAD",
+ allow_redirects=False, notfound="URL")
if "location" not in response.headers:
raise exception.StopExtraction("Unable to resolve short URL")
yield Message.Queue, response.headers["location"], {}
@@ -56,4 +57,4 @@ INSTANCES = {
}
UrlshortenerExtractor.pattern = \
- UrlshortenerExtractor.update(INSTANCES) + r"/[^/?#&]+"
+ UrlshortenerExtractor.update(INSTANCES) + r"/([^/?#&]+)"
From 5e63942b374fa08c450d888d152d1a0cae430a4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sat, 15 Apr 2023 18:06:06 +0200
Subject: [PATCH 4/4] [urlshortener] update
---
docs/supportedsites.md | 4 +-
gallery_dl/extractor/urlshortener.py | 73 ++++++++++++++++------------
2 files changed, 43 insertions(+), 34 deletions(-)
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 08b39071..d9f6c63a 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -1276,13 +1276,13 @@ Consider all sites to be NSFW unless otherwise known.
| Bitly |
https://bit.ly/ |
- |
+ Links |
|
| Twitter t.co |
https://t.co/ |
- |
+ Links |
|
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
index 3e404e86..1a39b5be 100644
--- a/gallery_dl/extractor/urlshortener.py
+++ b/gallery_dl/extractor/urlshortener.py
@@ -4,43 +4,15 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractor for general-purpose URL shorteners"""
+"""Extractors for general-purpose URL shorteners"""
from .common import BaseExtractor, Message
from .. import exception
class UrlshortenerExtractor(BaseExtractor):
- """Extractor for general-purpose URL shorteners"""
+ """Base class for URL shortener extractors"""
basecategory = "urlshortener"
- test = (
- ("https://bit.ly/3cWIUgq", {
- "count": 1,
- "pattern": "^https://gumroad.com/l/storm_b1"
- }),
- ("https://t.co/bCgBY8Iv5n", {
- "count": 1,
- "pattern": ("^https://twitter.com/elonmusk/status/"
- "1421395561324896257/photo/1")
- }),
- )
-
- def __init__(self, match):
- BaseExtractor.__init__(self, match)
- self.headers = INSTANCES[self.category].get("headers")
- self.id = match.group(match.lastindex)
-
- def request(self, url, **kwargs):
- kwargs["headers"] = self.headers
- return BaseExtractor.request(self, url, **kwargs)
-
- def items(self):
- response = self.request(
- "{}/{}".format(self.root, self.id), method="HEAD",
- allow_redirects=False, notfound="URL")
- if "location" not in response.headers:
- raise exception.StopExtraction("Unable to resolve short URL")
- yield Message.Queue, response.headers["location"], {}
INSTANCES = {
@@ -56,5 +28,42 @@ INSTANCES = {
},
}
-UrlshortenerExtractor.pattern = \
- UrlshortenerExtractor.update(INSTANCES) + r"/([^/?#&]+)"
+BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES)
+
+
+class UrlshortenerLinkExtractor(UrlshortenerExtractor):
+    """Resolve a short link and queue the URL it redirects to"""
+    subcategory = "link"
+    pattern = BASE_PATTERN + r"/([^/?]+)"
+    test = (
+        ("https://bit.ly/3cWIUgq", {
+            "count": 1,
+            "pattern": "^https://gumroad.com/l/storm_b1",
+        }),
+        ("https://t.co/bCgBY8Iv5n", {
+            "count": 1,
+            "pattern": "^https://twitter.com/elonmusk/status/"
+                       "1421395561324896257/photo/1",
+        }),
+        ("https://t.co/abcdefghij", {
+            "exception": exception.NotFoundError,
+        }),
+    )
+
+    def __init__(self, match):
+        UrlshortenerExtractor.__init__(self, match)
+        self.id = match.group(match.lastindex)  # last group: the short ID
+
+        try:
+            self.headers = INSTANCES[self.category]["headers"]
+        except KeyError:  # unknown category or no "headers" entry
+            self.headers = None
+
+    def items(self):  # HEAD the short URL and queue its redirect target
+        response = self.request(
+            "{}/{}".format(self.root, self.id), headers=self.headers,
+            method="HEAD", allow_redirects=False, notfound="URL")
+        location = response.headers.get("location")
+        if not location:  # missing or empty Location: nothing to follow
+            raise exception.StopExtraction("Unable to resolve short URL")
+        yield Message.Queue, location, {}