[exhentai] update data extraction code

- parse 'date' to datetime object
- use 'text.extract_from()'
2019-05-08 15:44:29 +02:00
parent 80fdb11508
commit 1f7fa9dc8e
1 changed file with 36 additions and 38 deletions
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -110,7 +110,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
               r"|/s/([\da-f]{10})/(\d+)-(\d+))")
    test = (
        ("https://exhentai.org/g/960460/4f0e369d82/", {
-            "keyword": "993bfaf68b4823084fbd0d3339564666463b1432",
+            "keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480",
            "content": "493d759de534355c9f55f8e365565b62411de146",
        }),
        ("https://exhentai.org/g/960461/4f0e369d82/", {
@@ -169,57 +169,55 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
    def get_metadata(self, page):
        """Extract gallery metadata"""
-        data, pos = text.extract_all(page, (
+        extr = text.extract_from(page)
-            ("title"       , '<h1 id="gn">', '</h1>'),
+        data = {
-            ("title_jp"    , '<h1 id="gj">', '</h1>'),
+            "gallery_id"   : self.gallery_id,
-            ("date"        , '>Posted:</td><td class="gdt2">', '</td>'),
+            "gallery_token": self.gallery_token,
-            ("parent"      , '>Parent:</td><td class="gdt2"><a href="', '"'),
+            "title"        : text.unescape(extr('<h1 id="gn">', '</h1>')),
-            ("visible"     , '>Visible:</td><td class="gdt2">', '<'),
+            "title_jp"     : text.unescape(extr('<h1 id="gj">', '</h1>')),
-            ("language"    , '>Language:</td><td class="gdt2">', ' '),
+            "date"         : text.parse_datetime(extr(
-            ("gallery_size", '>File Size:</td><td class="gdt2">', '<'),
+                '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
-            ("count"       , '>Length:</td><td class="gdt2">', ' '),
+            "parent"       : extr(
-        ))
+                '>Parent:</td><td class="gdt2"><a href="', '"'),
            "visible"      : extr(
                '>Visible:</td><td class="gdt2">', '<'),
            "language"     : extr(
                '>Language:</td><td class="gdt2">', ' '),
            "gallery_size" : text.parse_bytes(extr(
                '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
            "count"        : text.parse_int(extr(
                '>Length:</td><td class="gdt2">', ' ')),
        }
        data["lang"] = util.language_to_code(data["language"])
        data["title"] = text.unescape(data["title"])
        data["title_jp"] = text.unescape(data["title_jp"])
        data["count"] = text.parse_int(data["count"])
        data["gallery_id"] = self.gallery_id
        data["gallery_token"] = self.gallery_token
        data["gallery_size"] = text.parse_bytes(
            data["gallery_size"].rstrip("Bb"))
        data["tags"] = [
            text.unquote(tag)
-            for tag in text.extract_iter(page, 'hentai.org/tag/', '"', pos)
+            for tag in text.extract_iter(page, 'hentai.org/tag/', '"')
        ]
        return data
    def image_from_page(self, page):
        """Get image url and data from webpage"""
-        info = text.extract_all(page, (
+        pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
-            (None      , '<div id="i3"><a onclick="return load_image(', ''),
+        extr = text.extract_from(page, pos)
            ("nextkey" , "'", "'"),
            ("url"     , '<img id="img" src="', '"'),
            ("origurl" , 'hentai.org/fullimg.php', '"'),
            ("originfo", 'ownload original', '<'),
            ("startkey", 'var startkey="', '";'),
            ("showkey" , 'var showkey="', '";'),
        ))[0]
        self.key["start"] = info["startkey"]
        self.key["show"] = info["showkey"]
        self.key["next"] = info["nextkey"]
-        if self.original and info["origurl"]:
+        self.key["next"] = extr("'", "'")
-            part = text.unescape(info["origurl"])
+        iurl = extr('<img id="img" src="', '"')
-            url = self.root + "/fullimg.php" + part
+        orig = extr('hentai.org/fullimg.php', '"')
-            data = self._parse_original_info(info["originfo"])
+
        if self.original and orig:
            url = self.root + "/fullimg.php" + text.unescape(orig)
            data = self._parse_original_info(extr('ownload original', '<'))
        else:
-            url = info["url"]
+            url = iurl
            data = self._parse_image_info(url)
        data["num"] = self.image_num
-        data["image_token"] = info["startkey"]
+        data["image_token"] = self.key["start"] = extr('var startkey="', '";')
-        return url, text.nameext_from_url(info["url"], data)
+        self.key["show"] = extr('var showkey="', '";')
        return url, text.nameext_from_url(iurl, data)
    def images_from_api(self):
        """Get image url and data from api calls"""