diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 057515c9..999bdba6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -259,6 +259,12 @@ Consider all listed sites to potentially be NSFW. Folders + + HatenaBlog + https://hatenablog.com + Archive, Individual Posts, Home Feed, Search Results + + HBrowse https://www.hbrowse.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9e33f2c3..26ce2093 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -53,6 +53,7 @@ modules = [ "gelbooru_v01", "gelbooru_v02", "gofile", + "hatenablog", "hbrowse", "hentai2read", "hentaicosplays", diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py new file mode 100644 index 00000000..792f6664 --- /dev/null +++ b/gallery_dl/extractor/hatenablog.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hatenablog.com""" + +import re +from .common import Extractor, Message +from .. import text + + +BASE_PATTERN = ( + r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?" + r"([\w-]+\.(?:hatenablog\.(?:com|jp)" + r"|hatenadiary\.com|hateblo\.jp)))" +) +QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" + + +class HatenablogExtractor(Extractor): + """Base class for HatenaBlog extractors""" + category = "hatenablog" + directory_fmt = ("{category}", "{domain}") + filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}" + archive_fmt = "{filename}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.domain = match.group(1) or match.group(2) + + def _init(self): + self._find_img = re.compile(r']+)').finditer + + def _handle_article(self, article: str): + extr = text.extract_from(article) + date = text.parse_datetime(extr('