[exhentai] extract tag metadata

This commit is contained in:
Mike Fährmann
2019-01-15 13:35:58 +01:00
parent 0fb98d1d79
commit 2ffc105887

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2018 Mike Fährmann # Copyright 2014-2019 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -15,6 +15,9 @@ import time
import random import random
BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
class ExhentaiExtractor(Extractor): class ExhentaiExtractor(Extractor):
"""Base class for exhentai extractors""" """Base class for exhentai extractors"""
category = "exhentai" category = "exhentai"
@@ -99,10 +102,10 @@ class ExhentaiExtractor(Extractor):
class ExhentaiGalleryExtractor(ExhentaiExtractor): class ExhentaiGalleryExtractor(ExhentaiExtractor):
"""Extractor for image galleries from exhentai.org""" """Extractor for image galleries from exhentai.org"""
subcategory = "gallery" subcategory = "gallery"
pattern = [r"(?:https?://)?(g\.e-|e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"] pattern = [BASE_PATTERN + r"/g/(\d+)/([\da-f]{10})"]
test = [ test = [
("https://exhentai.org/g/960460/4f0e369d82/", { ("https://exhentai.org/g/960460/4f0e369d82/", {
"keyword": "15b755fd3e2c710d7fd7ff112a5cdbf4333201b2", "keyword": "900b8dccd23c41a76e915a8df70ae77c4e0f52c7",
"content": "493d759de534355c9f55f8e365565b62411de146", "content": "493d759de534355c9f55f8e365565b62411de146",
}), }),
("https://exhentai.org/g/960461/4f0e369d82/", { ("https://exhentai.org/g/960461/4f0e369d82/", {
@@ -152,7 +155,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"gallery_id" : self.gid, "gallery_id" : self.gid,
"gallery_token": self.token, "gallery_token": self.token,
} }
text.extract_all(page, ( data, pos = text.extract_all(page, (
("title" , '<h1 id="gn">', '</h1>'), ("title" , '<h1 id="gn">', '</h1>'),
("title_jp" , '<h1 id="gj">', '</h1>'), ("title_jp" , '<h1 id="gj">', '</h1>'),
("date" , '>Posted:</td><td class="gdt2">', '</td>'), ("date" , '>Posted:</td><td class="gdt2">', '</td>'),
@@ -160,12 +163,17 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
("gallery_size", '>File Size:</td><td class="gdt2">', '<'), ("gallery_size", '>File Size:</td><td class="gdt2">', '<'),
("count" , '>Length:</td><td class="gdt2">', ' '), ("count" , '>Length:</td><td class="gdt2">', ' '),
), values=data) ), values=data)
data["lang"] = util.language_to_code(data["language"]) data["lang"] = util.language_to_code(data["language"])
data["title"] = text.unescape(data["title"]) data["title"] = text.unescape(data["title"])
data["title_jp"] = text.unescape(data["title_jp"]) data["title_jp"] = text.unescape(data["title_jp"])
data["count"] = text.parse_int(data["count"]) data["count"] = text.parse_int(data["count"])
data["gallery_size"] = text.parse_bytes( data["gallery_size"] = text.parse_bytes(
data["gallery_size"].rstrip("Bb")) data["gallery_size"].rstrip("Bb"))
data["tags"] = [
text.unquote(tag)
for tag in text.extract_iter(page, 'hentai.org/tag/', '"', pos)
]
return data return data
def get_images(self, page): def get_images(self, page):
@@ -258,12 +266,16 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
class ExhentaiSearchExtractor(ExhentaiExtractor): class ExhentaiSearchExtractor(ExhentaiExtractor):
"""Extractor for exhentai search results""" """Extractor for exhentai search results"""
subcategory = "search" subcategory = "search"
pattern = [r"(?:https?://)?(?:g\.e-|e-|ex)hentai\.org/?\?(.*)$"] pattern = [BASE_PATTERN + r"/?\?(.*)$"]
test = [ test = [
("https://exhentai.org/?f_search=touhou", None), ("https://exhentai.org/?f_search=touhou", None),
("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0" (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0"
"&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0" "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0"
"&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter", None), "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), {
"pattern": ExhentaiGalleryExtractor.pattern[0],
"range": "1-30",
"count": 30,
}),
] ]
def __init__(self, match): def __init__(self, match):
@@ -318,8 +330,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor): class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
"""Extractor for favorited exhentai galleries""" """Extractor for favorited exhentai galleries"""
subcategory = "favorite" subcategory = "favorite"
pattern = [r"(?:https?://)?(?:g\.e-|e-|ex)hentai\.org" pattern = [BASE_PATTERN + r"/favorites\.php(?:\?(.*))?"]
r"/favorites\.php(?:\?(.*))?"]
test = [ test = [
("https://exhentai.org/favorites.php", None), ("https://exhentai.org/favorites.php", None),
("https://exhentai.org/favorites.php?favcat=1&f_search=touhou" ("https://exhentai.org/favorites.php?favcat=1&f_search=touhou"