Python's 'ast' module and its 'NodeVisitor' class were incredibly helpful in identifying these
141 lines
4.9 KiB
Python
141 lines
4.9 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2025 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for https://thehentaiworld.com/"""
|
|
|
|
from .common import Extractor, Message
|
|
from .. import text, util
|
|
import collections
|
|
|
|
# URL prefix shared by all extractor patterns in this module.
BASE_PATTERN = (
    r"(?:https?://)?"        # optional scheme
    r"(?:www\.)?"            # optional "www." subdomain
    r"thehentaiworld\.com"   # site domain
)
|
|
|
|
|
|
class ThehentaiworldExtractor(Extractor):
    """Base class for thehentaiworld extractors

    Subclasses provide posts(), an iterable of post page URLs.
    _pagination() additionally reads self.page_start, which this base
    class does not define; subclasses using it must set that attribute.
    """
    category = "thehentaiworld"
    root = "https://thehentaiworld.com"
    filename_fmt = "{title} ({id}{num:?-//}).{extension}"
    archive_fmt = "{id}_{num}"
    request_interval = (0.5, 1.5)

    def items(self):
        """Yield Directory and Url messages for every post from posts()

        A post whose extraction raises is logged as a warning, flagged
        in self.status, and skipped, so a single broken post does not
        abort the remaining ones.
        """
        for url in self.posts():
            try:
                post = self._extract_post(url)
            except Exception as exc:
                self.status |= 1
                self.log.warning("Failed to extract post %s (%s: %s)",
                                 url, exc.__class__.__name__, exc)
                continue

            if "file_urls" in post:
                # multi-file post: one Url message per file,
                # numbered from 1 via post["num"]
                urls = post["file_urls"]
                post["count"] = len(urls)
                yield Message.Directory, "", post
                for post["num"], url in enumerate(urls, 1):
                    text.nameext_from_url(url, post)
                    yield Message.Url, url, post
            else:
                # single-file post: "num" and "count" keep the
                # defaults assigned in _extract_post()
                yield Message.Directory, "", post
                url = post["file_url"]
                text.nameext_from_url(url, post)
                yield Message.Url, url, post

    def _extract_post(self, url):
        """Fetch the post page at 'url' and return its metadata dict

        The result always has "num", "count", "title", "id", "slug",
        "date", "type", "width", "height", "votes", "score", "file_url",
        and "tags", plus one "tags_<class>" list per tag class found
        ("tags_general" for an empty class). "file_urls" is set only
        when the page has a '<a id="prev-page"' section.

        Raises IndexError if 'url' has no path component; items()
        catches this together with any other extraction error.
        """
        # NOTE(review): 'extr' presumably continues searching after its
        # previous match, so the order of the calls below matters.
        extr = text.extract_from(self.request(url).text)

        post = {
            "num"      : 0,
            "count"    : 1,
            "title"    : text.unescape(extr("<title>", "<").strip()),
            "id"       : text.parse_int(extr(" postid-", " ")),
            "slug"     : extr(" post-", '"'),
            "tags"     : extr('id="tagsHead">', "</ul>"),
            "date"     : self.parse_datetime_iso(extr("<li>Posted: ", "<")),
        }

        # The first character of the URL path selects the post type:
        # "v" -> video, "g" -> animated, "3" -> 3d cgi, else image.
        # It is taken from the path instead of a fixed string index so
        # that "http://" or "www." URLs are classified the same way.
        _, sep, remainder = url.partition("://")
        path = (remainder if sep else url).partition("/")[2]
        c = path[0]

        if c == "v":
            post["type"] = "video"
            post["width"] = post["height"] = 0
            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
            post["score"] = text.parse_float(extr("<strong>", "<"))
            post["file_url"] = extr('<source src="', '"')
        else:
            post["type"] = ("animated" if c == "g" else
                            "3d cgi"   if c == "3" else
                            "image")
            post["width"] = text.parse_int(extr("<li>Size: ", " "))
            post["height"] = text.parse_int(extr("x ", "<"))
            post["file_url"] = extr('a href="', '"')
            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
            post["score"] = text.parse_float(extr("<strong>", "<"))

        if doujin := extr('<a id="prev-page"', "</div></div><"):
            # strip the "-220x<N>" suffix from each thumbnail URL
            # (presumably yielding the full-size file URL)
            repl = text.re(r"-220x\d+\.").sub
            post["file_urls"] = [
                repl(".", thumb)
                for thumb in text.extract_iter(
                    doujin, 'class="border" src="', '"')
            ]

        # Group tag names by the CSS class of their link, then expose
        # them both as one flat list and as per-class lists.
        tags = collections.defaultdict(list)
        pattern = text.re(r'<li><a class="([^"]*)" href="[^"]*">([^<]+)')
        for tag_type, tag_name in pattern.findall(post["tags"]):
            tags[tag_type].append(tag_name)
        post["tags"] = tags_list = []
        for key, value in tags.items():
            tags_list.extend(value)
            post["tags_" + key if key else "tags_general"] = value

        return post

    def _pagination(self, endpoint):
        """Yield every href inside the thumbnail container of a listing

        Starts at page self.page_start of self.root + 'endpoint' and
        advances page by page until a page without 'class="next"'.
        """
        base = self.root + endpoint
        pnum = self.page_start

        while True:
            # page 1 (or lower) is the bare endpoint without "page/N/"
            url = base if pnum < 2 else f"{base}page/{pnum}/"
            page = self.request(url).text

            yield from text.extract_iter(text.extr(
                page, 'id="thumbContainer"', "<script"), ' href="', '"')

            if 'class="next"' not in page:
                return
            pnum += 1
|
|
|
|
|
|
class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
    """Extractor for the posts listed under a thehentaiworld tag"""
    subcategory = "tag"
    # listing size used by skip() to convert a post count into pages
    per_page = 24
    # first listing page to request and number of leading posts to drop
    page_start = 1
    post_start = 0
    directory_fmt = ("{category}", "{search_tags}")
    pattern = BASE_PATTERN + r"/tag/([^/?#]+)"
    example = "https://thehentaiworld.com/tag/TAG/"

    def posts(self):
        """Return the tag's post URLs, minus the first post_start ones"""
        tag = self.groups[0]
        self.kwdict["search_tags"] = tag
        listing = self._pagination(f"/tag/{tag}/")
        return util.advance(listing, self.post_start)

    def skip(self, num):
        """Arrange for 'num' posts to be skipped and return 'num'

        Whole pages are skipped by raising page_start; the remaining
        posts are added to post_start.
        """
        whole_pages, leftover = divmod(num, self.per_page)
        self.page_start += whole_pages
        self.post_start += leftover
        return num
|
|
|
|
|
|
class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
    """Extractor for a single thehentaiworld post page"""
    subcategory = "post"
    # group 1: the full post path, group 2: the post slug
    pattern = (
        BASE_PATTERN +
        r"(/(?:video|(?:[\w-]+-)?hentai-image)s/([^/?#]+))"
    )
    example = "https://thehentaiworld.com/hentai-images/SLUG/"

    def posts(self):
        """Return a 1-tuple with the post URL rebuilt on self.root"""
        post_path = self.groups[0]
        return (self.root + post_path + "/",)
|