diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index cc0dd620..3babcbfb 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -284,6 +284,14 @@ class MangaExtractor(Extractor): """Return a list of all (chapter-url, metadata)-tuples""" +class GalleryExtractor(ChapterExtractor): + + subcategory = "gallery" + filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}" + directory_fmt = ("{category}", "{gallery_id} {title}") + archive_fmt = "{gallery_id}_{page}" + + class AsynchronousMixin(): """Run info extraction in a separate thread""" diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py index eff8f980..c7dc8f87 100644 --- a/gallery_dl/extractor/hentaifox.py +++ b/gallery_dl/extractor/hentaifox.py @@ -8,17 +8,13 @@ """Extractors for https://hentaifox.com/""" -from .common import ChapterExtractor, Extractor, Message +from .common import GalleryExtractor, Extractor, Message from .. import text -class HentaifoxGalleryExtractor(ChapterExtractor): +class HentaifoxGalleryExtractor(GalleryExtractor): """Extractor for image galleries on hentaifox.com""" category = "hentaifox" - subcategory = "gallery" - filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}" - directory_fmt = ("{category}", "{gallery_id} {title}") - archive_fmt = "{gallery_id}_{page}" pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))" test = ("https://hentaifox.com/gallery/56622/", { "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", @@ -28,7 +24,7 @@ class HentaifoxGalleryExtractor(ChapterExtractor): root = "https://hentaifox.com" def __init__(self, match): - ChapterExtractor.__init__(self, match) + GalleryExtractor.__init__(self, match) self.gallery_id = match.group(2) def metadata(self, page): diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 45e982be..4de7c938 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -8,18 +8,14 @@ """Extract images from https://hitomi.la/""" -from .common import ChapterExtractor +from .common import GalleryExtractor from .. import text, util import string -class HitomiGalleryExtractor(ChapterExtractor): +class HitomiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from hitomi.la""" category = "hitomi" - subcategory = "gallery" - directory_fmt = ("{category}", "{gallery_id} {title}") - filename_fmt = "{category}_{gallery_id}_{page:>03}_{filename}.{extension}" - archive_fmt = "{gallery_id}_{page}" pattern = r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)" test = ( ("https://hitomi.la/galleries/867789.html", { @@ -36,7 +32,7 @@ class HitomiGalleryExtractor(ChapterExtractor): def __init__(self, match): self.gid = text.parse_int(match.group(1)) url = "https://hitomi.la/galleries/{}.html".format(self.gid) - ChapterExtractor.__init__(self, match, url) + GalleryExtractor.__init__(self, match, url) def metadata(self, page): pos = page.index('

Scanlator" , ""), ("convention", "Convention", ""), ("uploader" , "Uploader" , ""), - ("score" , " :rating='" , "'"), + ("rating" , " :rating='" , "'"), ))[0] url = "{}/read/{}/01/x".format(self.root, self.gallery_id) @@ -79,16 +75,20 @@ class PururinGalleryExtractor(ChapterExtractor): self._cnt = info["total_pages"] for key in ("tags", "characters"): - data[key] = self._extract_list(data[key]) + data[key] = [ + text.unescape(item) + for item in text.extract_iter(data[key], 'title="', '"') + ] for key in ("artist", "group", "parody", "type", "collection", "language", "scanlator", "convention"): - data[key] = self._extract_one(data[key]) + data[key] = text.unescape(text.extract( + data[key], 'title="', '"')[0] or "") data["gallery_id"] = text.parse_int(self.gallery_id) data["title"] = info["title"] data["title_jp"] = info.get("j_title") or "" data["uploader"] = text.remove_html(data["uploader"]) - data["score"] = text.parse_float(data["score"]) + data["rating"] = text.parse_float(data["rating"]) data["lang"] = util.language_to_code(data["language"]) return data @@ -96,12 +96,3 @@ class PururinGalleryExtractor(ChapterExtractor): ufmt = "https://cdn.pururin.io/assets/images/data/{}/{{}}.{}".format( self.gallery_id, self._ext) return [(ufmt.format(num), None) for num in range(1, self._cnt + 1)] - - @staticmethod - def _extract_list(value): - return [text.unescape(item) - for item in text.extract_iter(value, 'title="', '"')] - - @staticmethod - def _extract_one(value): - return text.unescape(text.extract(value, 'title="', '"')[0] or "") diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index 63d95fc7..d9a8ebba 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -8,16 +8,13 @@ """Extract hentai-manga from https://www.simply-hentai.com/""" -from .common import Extractor, ChapterExtractor, Message +from .common import GalleryExtractor, Extractor, Message from .. import text, util, exception -class SimplyhentaiGalleryExtractor(ChapterExtractor): +class SimplyhentaiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from simply-hentai.com""" category = "simplyhentai" - subcategory = "gallery" - directory_fmt = ("{category}", "{gallery_id} {title}") - filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}" archive_fmt = "{image_id}" pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com" r"(?!/(?:album|gifs?|images?|series)(?:/|$))" @@ -39,7 +36,7 @@ class SimplyhentaiGalleryExtractor(ChapterExtractor): def __init__(self, match): url = "https://" + match.group(1) - ChapterExtractor.__init__(self, match, url) + GalleryExtractor.__init__(self, match, url) self.session.headers["Referer"] = url def metadata(self, page): diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index b35e5c3f..c0d5a32b 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -8,7 +8,7 @@ """Extractors for https://www.tsumino.com/""" -from .common import ChapterExtractor, Extractor, Message +from .common import GalleryExtractor, Extractor, Message from .. import text, exception from ..cache import cache @@ -40,12 +40,8 @@ class TsuminoBase(): return {".aotsumino": response.history[0].cookies[".aotsumino"]} -class TsuminoGalleryExtractor(TsuminoBase, ChapterExtractor): +class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor): """Extractor for image galleries on tsumino.com""" - subcategory = "gallery" - filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}" - directory_fmt = ("{category}", "{gallery_id} {title}") - archive_fmt = "{gallery_id}_{page}" pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com" r"/(?:Book/Info|Read/View)/(\d+)") test = ( @@ -78,7 +74,7 @@ class TsuminoGalleryExtractor(TsuminoBase, ChapterExtractor): def __init__(self, match): self.gallery_id = match.group(1) url = "{}/Book/Info/{}".format(self.root, self.gallery_id) - ChapterExtractor.__init__(self, match, url) + GalleryExtractor.__init__(self, match, url) def metadata(self, page): extr = text.extract