update fallback URL handling
remove Message.Urllist; fallback URLs are now passed in a '_fallback' field of the kwdict sent with Message.Url
This commit is contained in:
@@ -40,7 +40,7 @@ class Message():
|
||||
- 2nd element is the (external) URL as a string
|
||||
- 3rd element is a dictionary containing URL-specific metadata
|
||||
|
||||
- Message.Urllist:
|
||||
- Message.Urllist: # obsolete
|
||||
- Same as Message.Url, but its 2nd element is a list of multiple URLs
|
||||
- The additional URLs serve as a fallback if the primary one fails
|
||||
"""
|
||||
@@ -51,5 +51,5 @@ class Message():
|
||||
# Headers = 4
|
||||
# Cookies = 5
|
||||
Queue = 6
|
||||
Urllist = 7
|
||||
# Urllist = 7
|
||||
Metadata = 8
|
||||
|
||||
@@ -27,7 +27,6 @@ class TwitterExtractor(Extractor):
|
||||
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
|
||||
cookiedomain = ".twitter.com"
|
||||
root = "https://twitter.com"
|
||||
sizes = (":orig", ":large", ":medium", ":small")
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self, match)
|
||||
@@ -95,9 +94,10 @@ class TwitterExtractor(Extractor):
|
||||
|
||||
elif "media_url_https" in media:
|
||||
url = media["media_url_https"]
|
||||
urls = [url + size for size in self.sizes]
|
||||
tdata["_fallback"] = [
|
||||
url + size for size in (":large", ":medium", ":small")]
|
||||
text.nameext_from_url(url, tdata)
|
||||
yield Message.Urllist, urls, tdata
|
||||
yield Message.Url, url + ":orig", tdata
|
||||
|
||||
else:
|
||||
url = media["media_url"]
|
||||
@@ -249,7 +249,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
|
||||
test = (
|
||||
("https://twitter.com/supernaturepics", {
|
||||
"range": "1-40",
|
||||
"url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
|
||||
"url": "2b7814162028fcd238da4ff4072cf6390efe40b0",
|
||||
}),
|
||||
("https://mobile.twitter.com/supernaturepics?p=i"),
|
||||
("https://www.twitter.com/id:2976459548"),
|
||||
@@ -273,7 +273,7 @@ class TwitterMediaExtractor(TwitterExtractor):
|
||||
test = (
|
||||
("https://twitter.com/supernaturepics/media", {
|
||||
"range": "1-40",
|
||||
"url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
|
||||
"url": "2b7814162028fcd238da4ff4072cf6390efe40b0",
|
||||
}),
|
||||
("https://mobile.twitter.com/supernaturepics/media#t"),
|
||||
("https://www.twitter.com/id:2976459548/media"),
|
||||
|
||||
@@ -110,12 +110,6 @@ class Job():
|
||||
if self.pred_queue(url, kwds):
|
||||
self.handle_queue(url, kwds)
|
||||
|
||||
elif msg[0] == Message.Urllist:
|
||||
_, urls, kwds = msg
|
||||
if self.pred_url(urls[0], kwds):
|
||||
self.update_kwdict(kwds)
|
||||
self.handle_urllist(urls, kwds)
|
||||
|
||||
elif msg[0] == Message.Metadata:
|
||||
self.update_kwdict(msg[1])
|
||||
self.handle_metadata(msg[1])
|
||||
@@ -130,10 +124,6 @@ class Job():
|
||||
def handle_url(self, url, kwdict):
|
||||
"""Handle Message.Url"""
|
||||
|
||||
def handle_urllist(self, urls, kwdict):
|
||||
"""Handle Message.Urllist"""
|
||||
self.handle_url(urls[0], kwdict)
|
||||
|
||||
def handle_directory(self, kwdict):
|
||||
"""Handle Message.Directory"""
|
||||
|
||||
@@ -215,7 +205,7 @@ class DownloadJob(Job):
|
||||
else:
|
||||
self.visited = set()
|
||||
|
||||
def handle_url(self, url, kwdict, fallback=None):
|
||||
def handle_url(self, url, kwdict):
|
||||
"""Download the resource specified in 'url'"""
|
||||
postprocessors = self.postprocessors
|
||||
pathfmt = self.pathfmt
|
||||
@@ -246,7 +236,7 @@ class DownloadJob(Job):
|
||||
if not self.download(url):
|
||||
|
||||
# use fallback URLs if available
|
||||
for num, url in enumerate(fallback or (), 1):
|
||||
for num, url in enumerate(kwdict.get("_fallback", ()), 1):
|
||||
util.remove_file(pathfmt.temppath)
|
||||
self.log.info("Trying fallback URL #%d", num)
|
||||
if self.download(url):
|
||||
@@ -279,12 +269,6 @@ class DownloadJob(Job):
|
||||
pp.run_after(pathfmt)
|
||||
self._skipcnt = 0
|
||||
|
||||
def handle_urllist(self, urls, kwdict):
|
||||
"""Download the resource specified in 'url'"""
|
||||
fallback = iter(urls)
|
||||
url = next(fallback)
|
||||
self.handle_url(url, kwdict, fallback)
|
||||
|
||||
def handle_directory(self, kwdict):
|
||||
"""Set and create the target directory for downloads"""
|
||||
if not self.pathfmt:
|
||||
@@ -563,15 +547,11 @@ class UrlJob(Job):
|
||||
self.handle_queue = self.handle_url
|
||||
|
||||
@staticmethod
|
||||
def handle_url(url, _):
|
||||
def handle_url(url, kwdict):
|
||||
print(url)
|
||||
|
||||
@staticmethod
|
||||
def handle_urllist(urls, _):
|
||||
prefix = ""
|
||||
for url in urls:
|
||||
print(prefix, url, sep="")
|
||||
prefix = "| "
|
||||
if "_fallback" in kwdict:
|
||||
for url in kwdict["_fallback"]:
|
||||
print("|", url)
|
||||
|
||||
def handle_queue(self, url, _):
|
||||
try:
|
||||
@@ -625,9 +605,6 @@ class DataJob(Job):
|
||||
def handle_url(self, url, kwdict):
|
||||
self.data.append((Message.Url, url, self.filter(kwdict)))
|
||||
|
||||
def handle_urllist(self, urls, kwdict):
|
||||
self.data.append((Message.Urllist, list(urls), self.filter(kwdict)))
|
||||
|
||||
def handle_directory(self, kwdict):
|
||||
self.data.append((Message.Directory, self.filter(kwdict)))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user