From 2e516a1e3e09cb8a9e36a8f6f7e41ce8d4402f5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 12 Feb 2019 10:20:21 +0100 Subject: [PATCH] store the full original URL in Extractor.url --- gallery_dl/extractor/2chan.py | 17 +++++++++-------- gallery_dl/extractor/common.py | 1 + gallery_dl/extractor/deviantart.py | 20 ++++++++++++-------- gallery_dl/extractor/directlink.py | 1 - gallery_dl/extractor/exhentai.py | 6 +++--- gallery_dl/extractor/hentaifoundry.py | 20 +++++++++++--------- gallery_dl/extractor/imagehosts.py | 19 ++++++++++--------- gallery_dl/extractor/imgth.py | 25 ++++++++++--------------- gallery_dl/extractor/photobucket.py | 2 -- gallery_dl/extractor/reactor.py | 1 - gallery_dl/extractor/recursive.py | 12 +++++------- gallery_dl/extractor/reddit.py | 4 ---- gallery_dl/extractor/simplyhentai.py | 8 ++++---- gallery_dl/extractor/xvideos.py | 14 +++++++------- 14 files changed, 72 insertions(+), 78 deletions(-) diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index 12a81aac..8df8645b 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -19,8 +19,8 @@ class FutabaThreadExtractor(Extractor): directory_fmt = ("{category}", "{board_name}", "{thread}") filename_fmt = "{tim}.{extension}" archive_fmt = "{board}_{thread}_{tim}" - urlfmt = "https://{server}.2chan.net/{board}/src/{filename}" - pattern = r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))" + url_fmt = "https://{server}.2chan.net/{board}/src/{filename}" + pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)" test = ("http://dec.2chan.net/70/res/947.htm", { "url": "c5c12b80b290e224b6758507b3bb952044f4595b", "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0", @@ -28,22 +28,23 @@ class FutabaThreadExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - url, self.server, self.board, self.thread = match.groups() - self.url = "https://" + url + ".htm" + self.server, self.board, self.thread = match.groups() def items(self): - page = self.request(self.url).text - data = self.get_metadata(page) + url = "https://{}.2chan.net/{}/res/{}.htm".format( + self.server, self.board, self.thread) + page = self.request(url).text + data = self.metadata(page) yield Message.Version, 1 yield Message.Directory, data for post in self.posts(page): if "filename" not in post: continue post.update(data) - url = self.urlfmt.format_map(post) + url = self.url_fmt.format_map(post) yield Message.Url, url, post - def get_metadata(self, page): + def metadata(self, page): """Collect metadata for extractor-job""" title = text.extract(page, "", "")[0] title, _, boardname = title.rpartition(" - ") diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 8393cc51..893d15ff 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -34,6 +34,7 @@ class Extractor(): def __init__(self, match): self.session = requests.Session() self.log = logging.getLogger(self.category) + self.url = match.string self._set_headers() self._set_cookies() self._set_proxies() diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 50ae9a5d..b5d103ae 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -312,7 +312,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor): """Extractor for single deviations""" subcategory = "deviation" archive_fmt = "{index}.{extension}" - pattern = BASE_PATTERN + r"/(?:art|journal)/[^/?&#]+-\d+" + pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)" test = ( (("https://www.deviantart.com/shimoda7/art/" "For-the-sake-of-a-memory-10073852"), { @@ -335,23 +335,22 @@ class DeviantartDeviationExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.url = match.group(0) - if not self.url.startswith("http"): - self.url = "https://" + self.url + self.path = match.group(3) def deviations(self): - response = self.request(self.url, expect=range(400, 500)) + url = "{}/{}/{}".format(self.root, self.user, self.path) + response = self.request(url, expect=range(400, 500)) deviation_id = text.extract(response.text, '//deviation/', '"')[0] if response.status_code >= 400 or not deviation_id: raise exception.NotFoundError("image") return (self.api.deviation(deviation_id),) -class DeviantartStashExtractor(DeviantartDeviationExtractor): +class DeviantartStashExtractor(DeviantartExtractor): """Extractor for sta.sh-ed deviations""" subcategory = "stash" archive_fmt = "{index}.{extension}" - pattern = r"(?:https?://)?sta\.sh/()()[a-z0-9]+" + pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)" test = ( ("https://sta.sh/022c83odnaxc", { "pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net", @@ -366,8 +365,13 @@ class DeviantartStashExtractor(DeviantartDeviationExtractor): }), ) + def __init__(self, match): + DeviantartExtractor.__init__(self, match) + self.stash_id = match.group(1) + def deviations(self): - page = self.request(self.url).text + url = "https://sta.sh/" + self.stash_id + page = self.request(url).text deviation_id = text.extract(page, '//deviation/', '"')[0] if deviation_id: diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 07e75e78..5d00d8ad 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -40,7 +40,6 @@ class DirectlinkExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.data = match.groupdict() - self.url = match.string def items(self): text.nameext_from_url(self.url, self.data) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 633faadf..c9cc4b90 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -340,7 +340,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): ExhentaiExtractor.__init__(self, match) self.params = text.parse_query(match.group(1) or "") self.params["page"] = text.parse_int(self.params.get("page")) - self.url = self.root + self.search_url = self.root def items(self): self.login() @@ -348,7 +348,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): yield Message.Version, 1 while True: - page = self.request(self.url, params=self.params).text + page = self.request(self.search_url, params=self.params).text for row in text.extract_iter(page, 'Pictures (", ")")[0] return {"user": self.user, "count": text.parse_int(count)} @@ -161,10 +161,11 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__( self, match, match.group(1), match.group(2)) - self.url = "{}/pictures/user/{}/scraps".format(self.root, self.user) + self.page_url = "{}/pictures/user/{}/scraps".format( + self.root, self.user) def get_job_metadata(self): - page = self.request(self.url + "?enterAgree=1").text + page = self.request(self.page_url + "?enterAgree=1").text count = text.extract(page, ">Scraps (", ")")[0] return {"user": self.user, "count": text.parse_int(count)} @@ -188,7 +189,8 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__( self, match, match.group(1), match.group(2)) - self.url = "{}/user/{}/faves/pictures".format(self.root, self.user) + self.page_url = "{}/user/{}/faves/pictures".format( + self.root, self.user) class HentaifoundryRecentExtractor(HentaifoundryExtractor): @@ -203,7 +205,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match, "", match.group(2)) self.date = match.group(1) - self.url = "{}/pictures/recent/{}".format(self.root, self.date) + self.page_url = "{}/pictures/recent/{}".format(self.root, self.date) def get_job_metadata(self): self.request(self.root + "/?enterAgree=1") @@ -221,7 +223,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match, "", match.group(1)) - self.url = self.root + "/pictures/popular" + self.page_url = self.root + "/pictures/popular" class HentaifoundryImageExtractor(HentaifoundryExtractor): diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index f9bc3cd2..01bdc9ee 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -27,7 +27,8 @@ class ImagehostImageExtractor(SharedConfigMixin, Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.url = ("https://" if self.https else "http://") + match.group(1) + self.page_url = "http{}://{}".format( + "s" if self.https else "", match.group(1)) self.token = match.group(2) if self.params == "simple": self.params = { @@ -47,7 +48,7 @@ class ImagehostImageExtractor(SharedConfigMixin, Extractor): def items(self): page = self.request( - self.url, + self.page_url, method=self.method, data=self.params, cookies=self.cookies, @@ -95,11 +96,11 @@ class ImxtoImageExtractor(ImagehostImageExtractor): def __init__(self, match): ImagehostImageExtractor.__init__(self, match) - if "/img-" in self.url: - self.url = self.url.replace("img.yt", "imx.to") - self.urlext = True + if "/img-" in self.page_url: + self.page_url = self.page_url.replace("img.yt", "imx.to") + self.url_ext = True else: - self.urlext = False + self.url_ext = False def get_info(self, page): url, pos = text.extract( @@ -107,7 +108,7 @@ class ImxtoImageExtractor(ImagehostImageExtractor): if not url: raise exception.NotFoundError("image") filename, pos = text.extract(page, ' title="', '"', pos) - if self.urlext and filename: + if self.url_ext and filename: filename += splitext(url)[1] return url, filename or url @@ -146,7 +147,7 @@ class ImagevenueImageExtractor(ImagehostImageExtractor): def get_info(self, page): url = text.extract(page, "SRC='", "'")[0] - return text.urljoin(self.url, url), url + return text.urljoin(self.page_url, url), url class ImagetwistImageExtractor(ImagehostImageExtractor): @@ -164,7 +165,7 @@ class ImagetwistImageExtractor(ImagehostImageExtractor): @property @memcache(maxage=3*60*60) def cookies(self): - return self.request(self.url).cookies + return self.request(self.page_url).cookies def get_info(self, page): url , pos = text.extract(page, 'center;">', '')[0] - while True: - url, pos = text.extract(page, '', '')[0] + for url in text.extract_iter(thumbs, '', pos) - if pos == -1: + if '