From 2e516a1e3e09cb8a9e36a8f6f7e41ce8d4402f5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Tue, 12 Feb 2019 10:20:21 +0100
Subject: [PATCH] store the full original URL in Extractor.url

---
 gallery_dl/extractor/2chan.py         | 17 +++++++++--------
 gallery_dl/extractor/common.py        |  1 +
 gallery_dl/extractor/deviantart.py    | 20 ++++++++++++--------
 gallery_dl/extractor/directlink.py    |  1 -
 gallery_dl/extractor/exhentai.py      |  6 +++---
 gallery_dl/extractor/hentaifoundry.py | 20 +++++++++++---------
 gallery_dl/extractor/imagehosts.py    | 19 ++++++++++---------
 gallery_dl/extractor/imgth.py         | 25 ++++++++++---------------
 gallery_dl/extractor/photobucket.py   |  2 --
 gallery_dl/extractor/reactor.py       |  1 -
 gallery_dl/extractor/recursive.py     | 12 +++++-------
 gallery_dl/extractor/reddit.py        |  4 ----
 gallery_dl/extractor/simplyhentai.py  |  8 ++++----
 gallery_dl/extractor/xvideos.py       | 14 +++++++-------
 14 files changed, 72 insertions(+), 78 deletions(-)
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
index 12a81aac..8df8645b 100644
--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -19,8 +19,8 @@ class FutabaThreadExtractor(Extractor):
     directory_fmt = ("{category}", "{board_name}", "{thread}")
     filename_fmt = "{tim}.{extension}"
     archive_fmt = "{board}_{thread}_{tim}"
-    urlfmt = "https://{server}.2chan.net/{board}/src/{filename}"
-    pattern = r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"
+    url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
+    pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
     test = ("http://dec.2chan.net/70/res/947.htm", {
         "url": "c5c12b80b290e224b6758507b3bb952044f4595b",
         "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
@@ -28,22 +28,23 @@ class FutabaThreadExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        url, self.server, self.board, self.thread = match.groups()
-        self.url = "https://" + url + ".htm"
+        self.server, self.board, self.thread = match.groups()
 
     def items(self):
-        page = self.request(self.url).text
-        data = self.get_metadata(page)
+        url = "https://{}.2chan.net/{}/res/{}.htm".format(
+            self.server, self.board, self.thread)
+        page = self.request(url).text
+        data = self.metadata(page)
         yield Message.Version, 1
         yield Message.Directory, data
         for post in self.posts(page):
             if "filename" not in post:
                 continue
             post.update(data)
-            url = self.urlfmt.format_map(post)
+            url = self.url_fmt.format_map(post)
             yield Message.Url, url, post
 
-    def get_metadata(self, page):
+    def metadata(self, page):
         """Collect metadata for extractor-job"""
         title = text.extract(page, "<title>", "</title>")[0]
         title, _, boardname = title.rpartition(" - ")
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 8393cc51..893d15ff 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -34,6 +34,7 @@ class Extractor():
     def __init__(self, match):
         self.session = requests.Session()
         self.log = logging.getLogger(self.category)
+        self.url = match.string
         self._set_headers()
         self._set_cookies()
         self._set_proxies()
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 50ae9a5d..b5d103ae 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -312,7 +312,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
     """Extractor for single deviations"""
     subcategory = "deviation"
     archive_fmt = "{index}.{extension}"
-    pattern = BASE_PATTERN + r"/(?:art|journal)/[^/?&#]+-\d+"
+    pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)"
     test = (
         (("https://www.deviantart.com/shimoda7/art/"
           "For-the-sake-of-a-memory-10073852"), {
@@ -335,23 +335,22 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
 
     def __init__(self, match):
         DeviantartExtractor.__init__(self, match)
-        self.url = match.group(0)
-        if not self.url.startswith("http"):
-            self.url = "https://" + self.url
+        self.path = match.group(3)
 
     def deviations(self):
-        response = self.request(self.url, expect=range(400, 500))
+        url = "{}/{}/{}".format(self.root, self.user, self.path)
+        response = self.request(url, expect=range(400, 500))
         deviation_id = text.extract(response.text, '//deviation/', '"')[0]
         if response.status_code >= 400 or not deviation_id:
             raise exception.NotFoundError("image")
         return (self.api.deviation(deviation_id),)
 
 
-class DeviantartStashExtractor(DeviantartDeviationExtractor):
+class DeviantartStashExtractor(DeviantartExtractor):
     """Extractor for sta.sh-ed deviations"""
     subcategory = "stash"
     archive_fmt = "{index}.{extension}"
-    pattern = r"(?:https?://)?sta\.sh/()()[a-z0-9]+"
+    pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)"
     test = (
         ("https://sta.sh/022c83odnaxc", {
             "pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net",
@@ -366,8 +365,13 @@ class DeviantartStashExtractor(DeviantartDeviationExtractor):
         }),
     )
 
+    def __init__(self, match):
+        DeviantartExtractor.__init__(self, match)
+        self.stash_id = match.group(1)
+
     def deviations(self):
-        page = self.request(self.url).text
+        url = "https://sta.sh/" + self.stash_id
+        page = self.request(url).text
         deviation_id = text.extract(page, '//deviation/', '"')[0]
 
         if deviation_id:
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 07e75e78..5d00d8ad 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -40,7 +40,6 @@ class DirectlinkExtractor(Extractor):
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.data = match.groupdict()
-        self.url = match.string
 
     def items(self):
         text.nameext_from_url(self.url, self.data)
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 633faadf..c9cc4b90 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -340,7 +340,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
         ExhentaiExtractor.__init__(self, match)
         self.params = text.parse_query(match.group(1) or "")
         self.params["page"] = text.parse_int(self.params.get("page"))
-        self.url = self.root
+        self.search_url = self.root
 
     def items(self):
         self.login()
@@ -348,7 +348,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
         yield Message.Version, 1
 
         while True:
-            page = self.request(self.url, params=self.params).text
+            page = self.request(self.search_url, params=self.params).text
 
             for row in text.extract_iter(page, '<tr class="gtr', '</tr>'):
                 yield self._parse_row(row)
@@ -397,7 +397,7 @@ class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
 
     def __init__(self, match):
         ExhentaiSearchExtractor.__init__(self, match)
-        self.url = self.root + "/favorites.php"
+        self.search_url = self.root + "/favorites.php"
 
     def init(self):
         # The first request to '/favorites.php' will return an empty list
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index fe3ae42a..40cd0e39 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -23,7 +23,7 @@ class HentaifoundryExtractor(Extractor):
 
     def __init__(self, match, user="", page=1):
         Extractor.__init__(self, match)
-        self.url = ""
+        self.page_url = ""
         self.user = user
         self.start_post = 0
         self.start_page = text.parse_int(page, 1)
@@ -55,7 +55,7 @@ class HentaifoundryExtractor(Extractor):
         num = self.start_page
 
         while True:
-            page = self.request("{}/page/{}".format(self.url, num)).text
+            page = self.request("{}/page/{}".format(self.page_url, num)).text
             yield from text.extract_iter(page, 'thumbTitle"><a href="', '"')
 
             if 'class="pager"' not in page or 'class="last hidden"' in page:
@@ -135,10 +135,10 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor):
     def __init__(self, match):
         HentaifoundryExtractor.__init__(
             self, match, match.group(1) or match.group(3), match.group(2))
-        self.url = "{}/pictures/user/{}".format(self.root, self.user)
+        self.page_url = "{}/pictures/user/{}".format(self.root, self.user)
 
     def get_job_metadata(self):
-        page = self.request(self.url + "?enterAgree=1").text
+        page = self.request(self.page_url + "?enterAgree=1").text
         count = text.extract(page, ">Pictures (", ")")[0]
         return {"user": self.user, "count": text.parse_int(count)}
 
@@ -161,10 +161,11 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor):
     def __init__(self, match):
         HentaifoundryExtractor.__init__(
             self, match, match.group(1), match.group(2))
-        self.url = "{}/pictures/user/{}/scraps".format(self.root, self.user)
+        self.page_url = "{}/pictures/user/{}/scraps".format(
+            self.root, self.user)
 
     def get_job_metadata(self):
-        page = self.request(self.url + "?enterAgree=1").text
+        page = self.request(self.page_url + "?enterAgree=1").text
         count = text.extract(page, ">Scraps (", ")")[0]
         return {"user": self.user, "count": text.parse_int(count)}
 
@@ -188,7 +189,8 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor):
     def __init__(self, match):
         HentaifoundryExtractor.__init__(
             self, match, match.group(1), match.group(2))
-        self.url = "{}/user/{}/faves/pictures".format(self.root, self.user)
+        self.page_url = "{}/user/{}/faves/pictures".format(
+            self.root, self.user)
 
 
 class HentaifoundryRecentExtractor(HentaifoundryExtractor):
@@ -203,7 +205,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor):
     def __init__(self, match):
         HentaifoundryExtractor.__init__(self, match, "", match.group(2))
         self.date = match.group(1)
-        self.url = "{}/pictures/recent/{}".format(self.root, self.date)
+        self.page_url = "{}/pictures/recent/{}".format(self.root, self.date)
 
     def get_job_metadata(self):
         self.request(self.root + "/?enterAgree=1")
@@ -221,7 +223,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor):
 
     def __init__(self, match):
         HentaifoundryExtractor.__init__(self, match, "", match.group(1))
-        self.url = self.root + "/pictures/popular"
+        self.page_url = self.root + "/pictures/popular"
 
 
 class HentaifoundryImageExtractor(HentaifoundryExtractor):
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index f9bc3cd2..01bdc9ee 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -27,7 +27,8 @@ class ImagehostImageExtractor(SharedConfigMixin, Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.url = ("https://" if self.https else "http://") + match.group(1)
+        self.page_url = "http{}://{}".format(
+            "s" if self.https else "", match.group(1))
         self.token = match.group(2)
         if self.params == "simple":
             self.params = {
@@ -47,7 +48,7 @@ class ImagehostImageExtractor(SharedConfigMixin, Extractor):
 
     def items(self):
         page = self.request(
-            self.url,
+            self.page_url,
             method=self.method,
             data=self.params,
             cookies=self.cookies,
@@ -95,11 +96,11 @@ class ImxtoImageExtractor(ImagehostImageExtractor):
 
     def __init__(self, match):
         ImagehostImageExtractor.__init__(self, match)
-        if "/img-" in self.url:
-            self.url = self.url.replace("img.yt", "imx.to")
-            self.urlext = True
+        if "/img-" in self.page_url:
+            self.page_url = self.page_url.replace("img.yt", "imx.to")
+            self.url_ext = True
         else:
-            self.urlext = False
+            self.url_ext = False
 
     def get_info(self, page):
         url, pos = text.extract(
@@ -107,7 +108,7 @@ class ImxtoImageExtractor(ImagehostImageExtractor):
         if not url:
             raise exception.NotFoundError("image")
         filename, pos = text.extract(page, ' title="', '"', pos)
-        if self.urlext and filename:
+        if self.url_ext and filename:
             filename += splitext(url)[1]
         return url, filename or url
 
@@ -146,7 +147,7 @@ class ImagevenueImageExtractor(ImagehostImageExtractor):
 
     def get_info(self, page):
         url = text.extract(page, "SRC='", "'")[0]
-        return text.urljoin(self.url, url), url
+        return text.urljoin(self.page_url, url), url
 
 
 class ImagetwistImageExtractor(ImagehostImageExtractor):
@@ -164,7 +165,7 @@ class ImagetwistImageExtractor(ImagehostImageExtractor):
     @property
     @memcache(maxage=3*60*60)
     def cookies(self):
-        return self.request(self.url).cookies
+        return self.request(self.page_url).cookies
 
     def get_info(self, page):
         url     , pos = text.extract(page, 'center;"><img src="', '"')
diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py
index fda2c267..58a0a630 100644
--- a/gallery_dl/extractor/imgth.py
+++ b/gallery_dl/extractor/imgth.py
@@ -28,34 +28,29 @@ class ImgthGalleryExtractor(Extractor):
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.gid = match.group(1)
-        self.url = "https://imgth.com/gallery/" + self.gid + "/g/page/"
+        self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/"
 
     def items(self):
-        page = self.request(self.url + "0").text
-        data = self.get_job_metadata(page)
+        page = self.request(self.url_base + "0").text
+        data = self.metadata(page)
         yield Message.Version, 1
         yield Message.Directory, data
-        for data["num"], url in enumerate(self.get_images(page), 1):
+        for data["num"], url in enumerate(self.images(page), 1):
             yield Message.Url, url, text.nameext_from_url(url, data)
 
-    def get_images(self, page):
+    def images(self, page):
         """Yield all image urls for this gallery"""
         pnum = 0
         while True:
-            pos = 0
-            page = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
-            while True:
-                url, pos = text.extract(page, '<img src="', '"', pos)
-                if not url:
-                    break
+            thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
+            for url in text.extract_iter(thumbs, '<img src="', '"'):
                 yield "https://imgth.com/images/" + url[24:]
-            pos = page.find('<li class="next">', pos)
-            if pos == -1:
+            if '<li class="next">' not in page:
                 return
             pnum += 1
-            page = self.request(self.url + str(pnum)).text
+            page = self.request(self.url_base + str(pnum)).text
 
-    def get_job_metadata(self, page):
+    def metadata(self, page):
         """Collect metadata for extractor-job"""
         return text.extract_all(page, (
             ("title", '<h1>', '</h1>'),
diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py
index c18b4026..5f82351d 100644
--- a/gallery_dl/extractor/photobucket.py
+++ b/gallery_dl/extractor/photobucket.py
@@ -50,7 +50,6 @@ class PhotobucketAlbumExtractor(Extractor):
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.album_path = ""
-        self.url = match.group(0)
         self.root = "http://" + match.group(1)
         self.session.headers["Referer"] = self.url
 
@@ -128,7 +127,6 @@ class PhotobucketImageExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.url = match.group(0)
         self.user = match.group(1) or match.group(3)
         self.media_id = match.group(2)
         self.session.headers["Referer"] = self.url
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index a7769f1a..4f4f507d 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -27,7 +27,6 @@ class ReactorExtractor(SharedConfigMixin, Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.url = match.group(0)
         self.root = "http://" + match.group(1)
         self.session.headers["Referer"] = self.root
 
diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py
index 27922797..1a793a0d 100644
--- a/gallery_dl/extractor/recursive.py
+++ b/gallery_dl/extractor/recursive.py
@@ -17,20 +17,18 @@ import re
 class RecursiveExtractor(Extractor):
     """Extractor that fetches URLs from a remote or local source"""
     category = "recursive"
-    pattern = r"r(?:ecursive)?:(.+)"
+    pattern = r"r(?:ecursive)?:"
     test = ("recursive:https://pastebin.com/raw/FLwrCYsT", {
         "url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
     })
 
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.session.mount("file://", FileAdapter())
-        self.url = match.group(1)
-
     def items(self):
         blist = self.config(
             "blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS)
-        page = self.request(self.url).text
+
+        self.session.mount("file://", FileAdapter())
+        page = self.request(self.url.partition(":")[2]).text
+
         yield Message.Version, 1
         with extractor.blacklist(blist):
             for match in re.finditer(r"https?://[^\s\"']+", page):
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 88ede387..99823440 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -143,10 +143,6 @@ class RedditImageExtractor(Extractor):
         }),
     )
 
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.url = match.group(0)
-
     def items(self):
         data = text.nameext_from_url(self.url)
         yield Message.Version, 1
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index e638a49a..90079254 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -101,11 +101,11 @@ class SimplyhentaiImageExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.url = "https://www." + match.group(1)
+        self.page_url = "https://www." + match.group(1)
         self.type = match.group(2)
 
     def items(self):
-        page = self.request(self.url).text
+        page = self.request(self.page_url).text
         url_search = 'data-src="' if self.type == "image" else '<source src="'
 
         title, pos = text.extract(page, '"og:title" content="', '"')
@@ -155,10 +155,10 @@ class SimplyhentaiVideoExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.url = "https://" + match.group(1)
+        self.page_url = "https://" + match.group(1)
 
     def items(self):
-        page = self.request(self.url).text
+        page = self.request(self.page_url).text
 
         title, pos = text.extract(page, "<title>", "</title>")
         tags , pos = text.extract(page, ">Tags</div>", "</div>", pos)
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index dd6f6294..c735a881 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -16,9 +16,10 @@ import json
 class XvideosExtractor(Extractor):
     """Base class for xvideos extractors"""
     category = "xvideos"
+    root = "https://www.xvideos.com"
 
-    def get_page(self, codes=(403, 404)):
-        response = self.request(self.url, expect=codes)
+    def get_page(self, url, codes=(403, 404)):
+        response = self.request(url, expect=codes)
         if response.status_code in codes:
             raise exception.NotFoundError(self.subcategory)
         return response.text
@@ -46,11 +47,10 @@ class XvideosGalleryExtractor(XvideosExtractor):
     def __init__(self, match):
         XvideosExtractor.__init__(self, match)
         self.user, self.gid = match.groups()
-        self.url = "https://www.xvideos.com/profiles/{}/photos/{}".format(
-            self.user, self.gid)
 
     def items(self):
-        page = self.get_page()
+        url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
+        page = self.get_page(url)
         data = self.get_metadata(page)
         imgs = self.get_images(page)
         data["count"] = len(imgs)
@@ -110,10 +110,10 @@ class XvideosUserExtractor(XvideosExtractor):
     def __init__(self, match):
         XvideosExtractor.__init__(self, match)
         self.user = match.group(1)
-        self.url = "https://www.xvideos.com/profiles/" + self.user
 
     def items(self):
-        page = self.get_page()
+        url = "{}/profiles/{}".format(self.root, self.user)
+        page = self.get_page(url)
         data = json.loads(text.extract(
             page, "xv.conf=", ";</script>")[0])["data"]