Files
gallery-dl/gallery_dl/extractor/exhentai.py
Mike Fährmann 61fbbd2dae [exhentai] rename metadata fields to match API results (#1325)
- gallery_id    -> gid
- gallery_token -> token
- title_jp      -> title_jpn
- visible       -> expunged
- gallery_size  -> filesize
- count         -> filecount

Also changes the function of the 'metadata' option.
It is now boolean and causes extra data fields from the API to be added
instead of completely replacing the data from HTML when activated.
2021-02-26 18:21:04 +01:00

494 lines
18 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2014-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://e-hentai.org/ and https://exhentai.org/"""
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import itertools
import random
import time
import math
# URL prefix shared by all extractor patterns in this module.
# The scheme is optional; group 1 captures the host prefix in front of
# "hentai.org" ("ex", "e-" or "g.e-"), which ExhentaiExtractor.__init__
# reads as the site version.
BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
class ExhentaiExtractor(Extractor):
    """Common base for every exhentai / e-hentai extractor

    Takes care of domain selection, request throttling, login and
    detection of the "sad panda" placeholder response.
    """
    category = "exhentai"
    directory_fmt = ("{category}", "{gid} {title[:247]}")
    filename_fmt = "{gid}_{num:>04}_{image_token}_{filename}.{extension}"
    archive_fmt = "{gid}_{num}"
    cookienames = ("ipb_member_id", "ipb_pass_hash")
    cookiedomain = ".exhentai.org"
    root = "https://exhentai.org"
    # class-level flag checked by login(); shared by all instances
    LIMIT = False

    def __init__(self, match):
        """Select the target domain and read the user's options"""
        # '_cfgpath' has to exist before 'Extractor.__init__()' runs,
        # because 'self.config()' is already needed to pick the domain
        self._cfgpath = ("extractor", self.category, self.subcategory)

        site = match.group(1)
        host = self.config("domain", "auto")
        if host == "auto":
            prefix = "ex" if site == "ex" else "e-"
            host = prefix + "hentai.org"
        self.root = "https://" + host
        self.cookiedomain = "." + host

        Extractor.__init__(self, match)

        limits = self.config("limits", True)
        self.original = self.config("original", True)
        wait_min = self.config("wait-min", 3)
        wait_max = self.config("wait-max", 6)

        # an integer 'limits' value is a custom maximum; the exact type
        # check keeps booleans from being treated as such a number
        if type(limits) is int:
            self._limit_max = limits
            self.limits = True
        else:
            self._limit_max = 0
            self.limits = limits
        self._remaining = 0

        # never let the upper wait bound fall below the lower one
        self.wait_min = wait_min
        self.wait_max = wait_min if wait_max < wait_min else wait_max

        self.session.headers["Referer"] = self.root + "/"
        if site != "ex":
            self.session.cookies.set("nw", "1", domain=self.cookiedomain)

    def request(self, *args, **kwargs):
        """Send a request; raise AuthorizationError on a sad panda reply"""
        response = Extractor.request(self, *args, **kwargs)
        if not self._is_sadpanda(response):
            return response
        self.log.info("sadpanda.jpg")
        raise exception.AuthorizationError()

    def wait(self, waittime=None):
        """Sleep for a random duration

        Without 'waittime' (or with a falsy one) the duration is drawn
        from [wait_min, wait_max]; otherwise from 66% to 133% of it.
        """
        if waittime:
            lower, upper = waittime * 0.66, waittime * 1.33
        else:
            lower, upper = self.wait_min, self.wait_max
        time.sleep(random.uniform(lower, upper))

    def login(self):
        """Make sure login cookies are set, or fall back to e-hentai.org

        Raises StopExtraction when the class-wide LIMIT flag is set.
        """
        if self.LIMIT:
            raise exception.StopExtraction("Image limit reached!")
        if self._check_cookies(self.cookienames):
            return

        username, password = self._get_auth_info()
        if not username:
            # anonymous access: switch to e-hentai.org and disable
            # original-image downloads and limit checks
            self.log.info("no username given; using e-hentai.org")
            self.root = "https://e-hentai.org"
            self.original = False
            self.limits = False
            self.session.cookies["nw"] = "1"
            return

        cookies = self._login_impl(username, password)
        self._update_cookies(cookies)

    # NOTE(review): presumably caches the returned cookies for 90 days
    # keyed on 'username' (argument index 1) - confirm against '..cache'
    @cache(maxage=90*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the forum login form and return the login cookies

        Raises AuthenticationError when the response body lacks the
        'You are now logged in as:' confirmation text.
        """
        self.log.info("Logging in as %s", username)

        login_url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
        referer = {
            "Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1",
        }
        form = {
            "CookieDate": "1",
            "b": "d",
            "bt": "1-1",
            "UserName": username,
            "PassWord": password,
            "ipb_login_submit": "Login!",
        }

        response = self.request(
            login_url, method="POST", headers=referer, data=form)
        if b"You are now logged in as:" not in response.content:
            raise exception.AuthenticationError()
        return {name: response.cookies[name] for name in self.cookienames}

    @staticmethod
    def _is_sadpanda(response):
        """Tell whether 'response' is the sad panda placeholder image"""
        headers = response.headers
        if headers.get("Content-Length") != "9615":
            return False
        return "sadpanda.jpg" in headers.get("Content-Disposition", "")
class ExhentaiGalleryExtractor(ExhentaiExtractor):
    """Extractor for image galleries from exhentai.org"""
    subcategory = "gallery"
    # Two URL shapes are accepted (group 1 belongs to BASE_PATTERN):
    #   /g/<gallery id>/<gallery token>        -> groups 2 and 3
    #   /s/<image token>/<gallery id>-<num>    -> groups 4, 5 and 6
    pattern = (BASE_PATTERN +
               r"(?:/g/(\d+)/([\da-f]{10})"
               r"|/s/([\da-f]{10})/(\d+)-(\d+))")
    test = (
        ("https://exhentai.org/g/1200119/d55c44d3d0/", {
            "keyword": {
                "cost": int,
                "date": "dt:2018-03-18 20:15:00",
                "eh_category": "Non-H",
                "expunged": False,
                "favorites": "17",
                "filecount": "4",
                "filesize": 1488978,
                "gid": 1200119,
                "height": int,
                "image_token": "re:[0-9a-f]{10}",
                "lang": "jp",
                "language": "Japanese",
                "parent": "",
                "rating": r"re:\d\.\d+",
                "size": int,
                "tags": [
                    "parody:komi-san wa komyushou desu.",
                    "character:shouko komi",
                    "group:seventh lowlife",
                    "sample",
                ],
                "thumb": "https://exhentai.org/t/ce/0a/ce0a5bcb583229a9b07c0f8"
                         "3bcb1630ab1350640-624622-736-1036-jpg_250.jpg",
                "title": "C93 [Seventh_Lowlife] Komi-san ha Tokidoki Daitan de"
                         "su (Komi-san wa Komyushou desu) [Sample]",
                "title_jpn": "(C93) [Comiketjack (わ!)] 古見さんは、時々大胆"
                             "です。 (古見さんは、コミュ症です。) [見本]",
                "token": "d55c44d3d0",
                "torrentcount": "0",
                "uploader": "klorpa",
                "width": int,
            },
            "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
        }),
        ("https://exhentai.org/g/960461/4f0e369d82/", {
            "exception": exception.NotFoundError,
        }),
        ("http://exhentai.org/g/962698/7f02358e00/", {
            "exception": exception.AuthorizationError,
        }),
        ("https://exhentai.org/s/f68367b4c8/1200119-3", {
            "count": 2,
        }),
        ("https://e-hentai.org/s/f68367b4c8/1200119-3", {
            "count": 2,
        }),
        ("https://g.e-hentai.org/g/1200119/d55c44d3d0/"),
    )

    def __init__(self, match):
        """Store gallery/image identifiers taken from the matched URL"""
        ExhentaiExtractor.__init__(self, match)
        # holds the "next", "start" and "show" keys read from the first
        # image page; images_from_api() needs "next" and "show"
        self.key = {}
        self.count = 0
        # the gallery id is group 2 for /g/ URLs and group 5 for /s/ URLs
        self.gallery_id = text.parse_int(match.group(2) or match.group(5))
        self.gallery_token = match.group(3)
        self.image_token = match.group(4)
        # NOTE(review): the second argument looks like a fallback value
        # used for /g/ URLs, where group 6 is None - confirm in 'text'
        self.image_num = text.parse_int(match.group(6), 1)

    def items(self):
        """Yield one Directory message followed by a Url message per image

        Depending on the kind of URL given, either the gallery page or
        the image page is fetched first and the missing token is taken
        from it before the other page is requested.
        """
        self.login()

        if self.gallery_token:
            # /g/ URL: the gallery page provides the first image page link
            gpage = self._gallery_page()
            self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
            if not self.image_token:
                self.log.error("Failed to extract initial image token")
                self.log.debug("Page content:\n%s", gpage)
                return
            self.wait()
            ipage = self._image_page()
        else:
            # /s/ URL: the image page links back to its gallery
            ipage = self._image_page()
            part = text.extract(ipage, 'hentai.org/g/', '"')[0]
            if not part:
                self.log.error("Failed to extract gallery token")
                self.log.debug("Page content:\n%s", ipage)
                return
            # 'part' is split on "/" and its second element is the token
            self.gallery_token = part.split("/")[1]
            self.wait()
            gpage = self._gallery_page()

        data = self.get_metadata(gpage)
        self.count = text.parse_int(data["filecount"])
        yield Message.Directory, data

        # the first image comes from the already fetched image page,
        # all following ones from API calls; image_from_page() is called
        # right here and fills 'self.key' before images_from_api() runs
        images = itertools.chain(
            (self.image_from_page(ipage),), self.images_from_api())
        for url, image in images:
            data.update(image)
            if self.limits:
                self._check_limits(data)
            if "/fullimg.php" in url:
                # original-image URLs get an empty extension and an
                # additional, shorter delay
                data["extension"] = ""
                self.wait(self.wait_max / 4)
            yield Message.Url, url, data

    def get_metadata(self, page):
        """Extract gallery metadata"""
        data = self.metadata_from_page(page)
        if self.config("metadata", False):
            # API fields are merged on top of the fields parsed from HTML
            data.update(self.metadata_from_api())
            # "posted" is not set by metadata_from_page(), so it
            # presumably comes from the API result
            data["date"] = text.parse_timestamp(data["posted"])
        return data

    def metadata_from_page(self, page):
        """Build the gallery metadata dict from the gallery page's HTML"""
        # NOTE(review): 'extr' appears to search forward from where its
        # previous call stopped, so the order of the entries in the dict
        # literal below must follow the page layout - confirm in 'text'
        extr = text.extract_from(page)
        data = {
            "gid"          : self.gallery_id,
            "token"        : self.gallery_token,
            "thumb"        : extr("background:transparent url(", ")"),
            "title"        : text.unescape(extr('<h1 id="gn">', '</h1>')),
            "title_jpn"    : text.unescape(extr('<h1 id="gj">', '</h1>')),
            # "_" is a throwaway entry; its extr() call is made for its
            # position only, right before the category text is read
            "_"            : extr('<div id="gdc"><div class="cs ct', '"'),
            "eh_category"  : extr('>', '<'),
            "uploader"     : text.unquote(extr('/uploader/', '"')),
            "date"         : text.parse_datetime(extr(
                '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
            "parent"       : extr(
                '>Parent:</td><td class="gdt2"><a href="', '"'),
            # True whenever the "Visible:" cell holds anything but "Yes"
            "expunged"     : "Yes" != extr(
                '>Visible:</td><td class="gdt2">', '<'),
            "language"     : extr('>Language:</td><td class="gdt2">', ' '),
            "filesize"     : text.parse_bytes(extr(
                '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
            "filecount"    : extr('>Length:</td><td class="gdt2">', ' '),
            "favorites"    : extr('id="favcount">', ' '),
            "rating"       : extr(">Average: ", "<"),
            "torrentcount" : extr('>Torrent Download (', ')'),
        }

        data["lang"] = util.language_to_code(data["language"])
        # tags are taken from the tag links of the whole page;
        # "+" is turned into a space before unquoting
        data["tags"] = [
            text.unquote(tag.replace("+", " "))
            for tag in text.extract_iter(page, 'hentai.org/tag/', '"')
        ]

        return data

    def metadata_from_api(self):
        """Request this gallery's metadata via the 'gdata' API method

        Returns the first entry of the response's "gmetadata" list and
        raises StopExtraction if the response contains an "error" key.
        """
        url = self.root + "/api.php"
        data = {
            "method": "gdata",
            "gidlist": ((self.gallery_id, self.gallery_token),),
            "namespace": 1,
        }

        data = self.request(url, method="POST", json=data).json()
        if "error" in data:
            raise exception.StopExtraction(data["error"])

        return data["gmetadata"][0]

    def image_from_page(self, page):
        """Get image url and data from webpage"""
        # start 26 characters into the marker; raises ValueError
        # if the marker is not part of 'page'
        pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
        extr = text.extract_from(page, pos)

        # key of the following image, first thing images_from_api() uses
        self.key["next"] = extr("'", "'")
        iurl = extr('<img id="img" src="', '"')
        orig = extr('hentai.org/fullimg.php', '"')

        try:
            if self.original and orig:
                url = self.root + "/fullimg.php" + text.unescape(orig)
                data = self._parse_original_info(extr('ownload original', '<'))
            else:
                url = iurl
                data = self._parse_image_info(url)
        except IndexError:
            # 'url' is always assigned before the parsing calls,
            # so it is safe to use in the error message
            self.log.debug("Page content:\n%s", page)
            raise exception.StopExtraction(
                "Unable to parse image info for '%s'", url)

        data["num"] = self.image_num
        data["image_token"] = self.key["start"] = extr('var startkey="', '";')
        self.key["show"] = extr('var showkey="', '";')

        # filename and extension are always based on the regular image URL
        return url, text.nameext_from_url(iurl, data)

    def images_from_api(self):
        """Get image url and data from api calls"""
        api_url = self.root + "/api.php"
        nextkey = self.key["next"]
        request = {
            "method" : "showpage",
            "gid"    : self.gallery_id,
            "imgkey" : nextkey,
            "showkey": self.key["show"],
        }

        # the loop variable is the "page" entry of the request payload,
        # so every iteration asks for the next page number
        for request["page"] in range(self.image_num + 1, self.count + 1):
            self.wait()
            page = self.request(api_url, method="POST", json=request).json()
            # the key this request was made with identifies its image;
            # the key for the following image is read from the response
            imgkey = nextkey
            nextkey, pos = text.extract(page["i3"], "'", "'")
            imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
            origurl, pos = text.extract(page["i7"], '<a href="', '"')

            try:
                if self.original and origurl:
                    url = text.unescape(origurl)
                    data = self._parse_original_info(text.extract(
                        page["i7"], "ownload original", "<", pos)[0])
                else:
                    url = imgurl
                    data = self._parse_image_info(url)
            except IndexError:
                self.log.debug("Page content:\n%s", page)
                raise exception.StopExtraction(
                    "Unable to parse image info for '%s'", url)

            data["num"] = request["page"]
            data["image_token"] = imgkey
            yield url, text.nameext_from_url(imgurl, data)

            # runs once the consumer asks for the next image
            request["imgkey"] = nextkey

    def _gallery_page(self):
        """Fetch and return the gallery page's HTML

        Raises AuthorizationError for a 404 "Gallery Not Available"
        response and NotFoundError for missing keys or galleries.
        """
        url = "{}/g/{}/{}/".format(
            self.root, self.gallery_id, self.gallery_token)
        # fatal=False: error responses are inspected here
        # NOTE(review): presumably this keeps request() from raising on
        # HTTP error codes - confirm against Extractor.request()
        response = self.request(url, fatal=False)
        page = response.text

        if response.status_code == 404 and "Gallery Not Available" in page:
            raise exception.AuthorizationError()
        if page.startswith(("Key missing", "Gallery not found")):
            raise exception.NotFoundError("gallery")
        if "hentai.org/mpv/" in page:
            self.log.warning("Enabled Multi-Page Viewer is not supported")
        return page

    def _image_page(self):
        """Fetch and return the HTML of the current image page

        Raises NotFoundError if the response text starts with
        "Invalid page" or "Keep trying".
        """
        url = "{}/s/{}/{}-{}".format(
            self.root, self.image_token, self.gallery_id, self.image_num)
        page = self.request(url, fatal=False).text

        if page.startswith(("Invalid page", "Keep trying")):
            raise exception.NotFoundError("image page")
        return page

    def _check_limits(self, data):
        """Subtract the image's cost and stop once nothing remains

        Raises StopExtraction with a URL to resume from when the
        remaining amount drops to zero or below.
        """
        # refresh when nothing is known/left and on every 20th image
        if not self._remaining or data["num"] % 20 == 0:
            self._update_limits()
        self._remaining -= data["cost"]

        if self._remaining <= 0:
            # class-level flag, checked by login()
            ExhentaiExtractor.LIMIT = True
            url = "{}/s/{}/{}-{}".format(
                self.root, data["image_token"], self.gallery_id, data["num"])
            raise exception.StopExtraction(
                "Image limit reached! Continue with '%s' "
                "as URL after resetting it.", url)

    def _update_limits(self):
        """Recompute '_remaining' from the values shown on home.php"""
        url = "https://e-hentai.org/home.php"
        # pass along this domain's session cookies, except "igneous"
        cookies = {
            cookie.name: cookie.value
            for cookie in self.session.cookies
            if cookie.domain == self.cookiedomain and cookie.name != "igneous"
        }

        page = self.request(url, cookies=cookies).text
        # the first two <strong> elements are read as current and maximum
        current, pos = text.extract(page, "<strong>", "</strong>")
        maximum, pos = text.extract(page, "<strong>", "</strong>", pos)
        if self._limit_max:
            # a user-configured maximum replaces the one from the page
            maximum = self._limit_max
        self.log.debug("Image Limits: %s/%s", current, maximum)
        self._remaining = text.parse_int(maximum) - text.parse_int(current)

    @staticmethod
    def _parse_image_info(url):
        """Derive size, width and height from a regular image URL

        Uses the first "/"-separated segment (from index 4 on) that
        consists of exactly five "-"-separated fields; all three values
        are 0 if there is no such segment. The cost is always 1.
        """
        for part in url.split("/")[4:]:
            try:
                # raises ValueError unless there are exactly five fields
                _, size, width, height, _ = part.split("-")
                break
            except ValueError:
                pass
        else:
            # loop finished without 'break': no matching segment
            size = width = height = 0

        return {
            "cost"  : 1,
            "size"  : text.parse_int(size),
            "width" : text.parse_int(width),
            "height": text.parse_int(height),
        }

    @staticmethod
    def _parse_original_info(info):
        """Derive cost, size, width and height from the original's info

        'info' is split on single spaces: fields 0 and 2 are width and
        height, field 3 plus the first character of field 4 form the
        size. Raises IndexError if 'info' has too few fields.
        """
        parts = info.lstrip().split(" ")
        size = text.parse_bytes(parts[3] + parts[4][0])

        return {
            # 1 initial point + 1 per 0.1 MB
            "cost"  : 1 + math.ceil(size / 100000),
            "size"  : size,
            "width" : text.parse_int(parts[0]),
            "height": text.parse_int(parts[2]),
        }
class ExhentaiSearchExtractor(ExhentaiExtractor):
    """Extractor for the galleries listed in exhentai search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/?\?(.*)$"
    test = (
        ("https://e-hentai.org/?f_search=touhou"),
        (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0"
          "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0"
          "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), {
            "pattern": ExhentaiGalleryExtractor.pattern,
            "range": "1-30",
            "count": 30,
        }),
    )

    def __init__(self, match):
        """Parse the URL's query string into request parameters"""
        ExhentaiExtractor.__init__(self, match)

        query = text.parse_query(match.group(2))
        # always keep a numeric "page" entry; items() increments it
        query["page"] = text.parse_int(query.get("page"))

        self.params = query
        self.search_url = self.root

    def items(self):
        """Yield a Queue message for each gallery URL of every page"""
        self.login()
        yield Message.Version, 1

        data = {"_extractor": ExhentaiGalleryExtractor}

        while True:
            page = self.request(self.search_url, params=self.params).text

            # directly repeated URLs are reported only once
            previous = None
            for found in ExhentaiGalleryExtractor.pattern.finditer(page):
                url = found.group(0)
                if url != previous:
                    previous = url
                    yield Message.Queue, url, data

            # stop on the last page or when the search has no results
            # NOTE(review): 'class="ptdd">&gt;<' presumably marks a
            # disabled "next page" control - confirm against the site
            if 'class="ptdd">&gt;<' in page:
                return
            if ">No hits found</p>" in page:
                return

            self.params["page"] += 1
            self.wait()
class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
    """Extractor for the galleries in a user's exhentai favorites"""
    subcategory = "favorite"
    pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?"
    test = (
        ("https://e-hentai.org/favorites.php", {
            "count": 1,
            "pattern": r"https?://e-hentai\.org/g/1200119/d55c44d3d0"
        }),
        ("https://exhentai.org/favorites.php?favcat=1&f_search=touhou"
         "&f_apply=Search+Favorites"),
    )

    def __init__(self, match):
        """Set up like a search, but request the favorites page"""
        ExhentaiSearchExtractor.__init__(self, match)
        # same listing logic as a search, different endpoint
        self.search_url = "{}/favorites.php".format(self.root)