[blogger] inherit from BaseExtractor

- support www.micmicidol.club (#4759)
This commit is contained in:
Mike Fährmann
2023-11-21 16:52:25 +01:00
parent 0fa85360a0
commit e17a48fe56
6 changed files with 234 additions and 103 deletions

View File

@@ -109,12 +109,6 @@ Consider all sites to be NSFW unless otherwise known.
<td>Collections, Galleries, User Profiles</td> <td>Collections, Galleries, User Profiles</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Blogger</td>
<td>https://www.blogger.com/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr> <tr>
<td>Bunkr</td> <td>Bunkr</td>
<td>https://bunkrr.su/</td> <td>https://bunkrr.su/</td>
@@ -998,6 +992,22 @@ Consider all sites to be NSFW unless otherwise known.
<td></td> <td></td>
</tr> </tr>
<tr>
<td colspan="4"><strong>Blogger Instances</strong></td>
</tr>
<tr>
<td>Blogspot</td>
<td>https://www.blogger.com/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr>
<td>MIC MIC IDOL</td>
<td>https://www.micmicidol.club/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr> <tr>
<td colspan="4"><strong>Chevereto Instances</strong></td> <td colspan="4"><strong>Chevereto Instances</strong></td>
</tr> </tr>

View File

@@ -8,30 +8,22 @@
"""Extractors for Blogger blogs""" """Extractors for Blogger blogs"""
from .common import Extractor, Message from .common import BaseExtractor, Message
from .. import text, util from .. import text, util
import re import re
BASE_PATTERN = (
r"(?:blogger:(?:https?://)?([^/]+)|"
r"(?:https?://)?([\w-]+\.blogspot\.com))")
class BloggerExtractor(BaseExtractor):
class BloggerExtractor(Extractor):
"""Base class for blogger extractors""" """Base class for blogger extractors"""
category = "blogger" basecategory = "blogger"
directory_fmt = ("{category}", "{blog[name]}", directory_fmt = ("blogger", "{blog[name]}",
"{post[date]:%Y-%m-%d} {post[title]}") "{post[date]:%Y-%m-%d} {post[title]}")
filename_fmt = "{num:>03}.{extension}" filename_fmt = "{num:>03}.{extension}"
archive_fmt = "{post[id]}_{num}" archive_fmt = "{post[id]}_{num}"
root = "https://www.blogger.com"
def __init__(self, match):
Extractor.__init__(self, match)
self.blog = match.group(1) or match.group(2)
def _init(self): def _init(self):
self.api = BloggerAPI(self) self.api = BloggerAPI(self)
self.blog = self.root.rpartition("/")[2]
self.videos = self.config("videos", True) self.videos = self.config("videos", True)
def items(self): def items(self):
@@ -92,6 +84,18 @@ class BloggerExtractor(Extractor):
"""Return additional metadata""" """Return additional metadata"""
BASE_PATTERN = BloggerExtractor.update({
"blogspot": {
"root": None,
"pattern": r"[\w-]+\.blogspot\.com",
},
"micmicidol": {
"root": "https://www.micmicidol.club",
"pattern": r"(?:www\.)?micmicidol\.club",
},
})
class BloggerPostExtractor(BloggerExtractor): class BloggerPostExtractor(BloggerExtractor):
"""Extractor for a single blog post""" """Extractor for a single blog post"""
subcategory = "post" subcategory = "post"
@@ -100,7 +104,7 @@ class BloggerPostExtractor(BloggerExtractor):
def __init__(self, match): def __init__(self, match):
BloggerExtractor.__init__(self, match) BloggerExtractor.__init__(self, match)
self.path = match.group(3) self.path = match.group(match.lastindex)
def posts(self, blog): def posts(self, blog):
return (self.api.post_by_path(blog["id"], self.path),) return (self.api.post_by_path(blog["id"], self.path),)
@@ -124,7 +128,7 @@ class BloggerSearchExtractor(BloggerExtractor):
def __init__(self, match): def __init__(self, match):
BloggerExtractor.__init__(self, match) BloggerExtractor.__init__(self, match)
self.query = text.unquote(match.group(3)) self.query = text.unquote(match.group(match.lastindex))
def posts(self, blog): def posts(self, blog):
return self.api.blog_search(blog["id"], self.query) return self.api.blog_search(blog["id"], self.query)
@@ -141,7 +145,7 @@ class BloggerLabelExtractor(BloggerExtractor):
def __init__(self, match): def __init__(self, match):
BloggerExtractor.__init__(self, match) BloggerExtractor.__init__(self, match)
self.label = text.unquote(match.group(3)) self.label = text.unquote(match.group(match.lastindex))
def posts(self, blog): def posts(self, blog):
return self.api.blog_posts(blog["id"], self.label) return self.api.blog_posts(blog["id"], self.label)

View File

@@ -87,6 +87,7 @@ CATEGORY_MAP = {
"mangaread" : "MangaRead", "mangaread" : "MangaRead",
"mangasee" : "MangaSee", "mangasee" : "MangaSee",
"mastodon.social": "mastodon.social", "mastodon.social": "mastodon.social",
"micmicidol" : "MIC MIC IDOL",
"myhentaigallery": "My Hentai Gallery", "myhentaigallery": "My Hentai Gallery",
"myportfolio" : "Adobe Portfolio", "myportfolio" : "Adobe Portfolio",
"naverwebtoon" : "NaverWebtoon", "naverwebtoon" : "NaverWebtoon",
@@ -292,6 +293,10 @@ BASE_MAP = {
"vichan" : "vichan Imageboards", "vichan" : "vichan Imageboards",
} }
URL_MAP = {
"blogspot": "https://www.blogger.com/",
}
_OAUTH = '<a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a>' _OAUTH = '<a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a>'
_COOKIES = '<a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a>' _COOKIES = '<a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a>'
_APIKEY_DB = \ _APIKEY_DB = \
@@ -362,7 +367,7 @@ IGNORE_LIST = (
def domain(cls): def domain(cls):
"""Return the web-domain related to an extractor class""" """Return the domain name associated with an extractor class"""
try: try:
url = sys.modules[cls.__module__].__doc__.split()[-1] url = sys.modules[cls.__module__].__doc__.split()[-1]
if url.startswith("http"): if url.startswith("http"):
@@ -429,10 +434,13 @@ def build_extractor_list():
for category, root in extr.instances: for category, root in extr.instances:
base[category].append(extr.subcategory) base[category].append(extr.subcategory)
if category not in domains: if category not in domains:
if not root and results: if not root:
# use domain from first matching test if category in URL_MAP:
test = results.category(category)[0] root = URL_MAP[category].rstrip("/")
root = test["#class"].from_url(test["#url"]).root elif results:
# use domain from first matching test
test = results.category(category)[0]
root = test["#class"].from_url(test["#url"]).root
domains[category] = root + "/" domains[category] = root + "/"
# sort subcategory lists # sort subcategory lists

View File

@@ -8,100 +8,30 @@ from gallery_dl.extractor import blogger
__tests__ = ( __tests__ = (
{
"#url" : "https://julianbphotography.blogspot.com/2010/12/moon-rise.html",
"#category": ("", "blogger", "post"),
"#class" : blogger.BloggerPostExtractor,
"#pattern" : "https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg",
"#sha1_url": "9928429fb62f712eb4de80f53625eccecc614aae",
"blog": {
"date" : "dt:2010-11-21 18:19:42",
"description": "",
"id" : "5623928067739466034",
"kind" : "blogger#blog",
"locale" : dict,
"name" : "Julian Bunker Photography",
"pages" : int,
"posts" : int,
"published" : "2010-11-21T10:19:42-08:00",
"updated" : str,
"url" : "http://julianbphotography.blogspot.com/",
},
"post": {
"author" : "Julian Bunker",
"content" : str,
"date" : "dt:2010-12-26 01:08:00",
"etag" : str,
"id" : "6955139236418998998",
"kind" : "blogger#post",
"published": "2010-12-25T17:08:00-08:00",
"replies" : "0",
"title" : "Moon Rise",
"updated" : "2011-12-06T05:21:24-08:00",
"url" : r"re:.+/2010/12/moon-rise.html$",
},
"num" : int,
"url" : str,
},
{ {
"#url" : "blogger:http://www.julianbunker.com/2010/12/moon-rise.html", "#url" : "blogger:http://www.julianbunker.com/2010/12/moon-rise.html",
"#category": ("", "blogger", "post"), "#category": ("blogger", "www.julianbunker.com", "post"),
"#class" : blogger.BloggerPostExtractor, "#class" : blogger.BloggerPostExtractor,
}, },
{
"#url" : "http://cfnmscenesinmovies.blogspot.com/2011/11/cfnm-scene-jenna-fischer-in-office.html",
"#comment" : "video (#587)",
"#category": ("", "blogger", "post"),
"#class" : blogger.BloggerPostExtractor,
"#pattern" : r"https://.+\.googlevideo\.com/videoplayback",
},
{
"#url" : "https://randomthingsthroughmyletterbox.blogspot.com/2022/01/bitter-flowers-by-gunnar-staalesen-blog.html",
"#comment" : "new image domain (#2204)",
"#category": ("", "blogger", "post"),
"#class" : blogger.BloggerPostExtractor,
"#pattern" : "https://blogger.googleusercontent.com/img/a/.+=s0$",
"#count" : 8,
},
{
"#url" : "https://julianbphotography.blogspot.com/",
"#category": ("", "blogger", "blog"),
"#class" : blogger.BloggerBlogExtractor,
"#pattern" : r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg",
"#range" : "1-25",
"#count" : 25,
},
{ {
"#url" : "blogger:https://www.kefblog.com.ng/", "#url" : "blogger:https://www.kefblog.com.ng/",
"#category": ("", "blogger", "blog"), "#category": ("blogger", "www.kefblog.com.ng", "blog"),
"#class" : blogger.BloggerBlogExtractor, "#class" : blogger.BloggerBlogExtractor,
"#range" : "1-25", "#range" : "1-25",
"#count" : 25, "#count" : 25,
}, },
{ {
"#url" : "https://julianbphotography.blogspot.com/search?q=400mm", "#url" : "blogger:http://www.julianbunker.com/search?q=400mm",
"#category": ("", "blogger", "search"), "#category": ("blogger", "www.julianbunker.com", "search"),
"#class" : blogger.BloggerSearchExtractor, "#class" : blogger.BloggerSearchExtractor,
"#count" : "< 10",
"query": "400mm",
}, },
{ {
"#url" : "https://dmmagazine.blogspot.com/search/label/D%26D", "#url" : "blogger:http://www.julianbunker.com/search/label/D%26D",
"#category": ("", "blogger", "label"), "#category": ("blogger", "www.julianbunker.com", "label"),
"#class" : blogger.BloggerLabelExtractor, "#class" : blogger.BloggerLabelExtractor,
"#range" : "1-25",
"#count" : 25,
"label": "D&D",
}, },
) )

95
test/results/blogspot.py Normal file
View File

@@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import blogger
# Expected results for Blogger blogs hosted on *.blogspot.com.
#
# Every entry names the input "#url", the extractor class that must handle
# it and the expected "#category" triple ("blogger", "blogspot", <subcategory>).
# Keys without a leading "#" describe expected metadata values; a bare type
# (str, int, dict) is presumably a type-only check by the test harness.
__tests__ = (
    # Single post: the only file and all stable metadata fields are pinned.
    {
        "#url"     : "https://julianbphotography.blogspot.com/2010/12/moon-rise.html",
        "#category": ("blogger", "blogspot", "post"),
        "#class"   : blogger.BloggerPostExtractor,
        "#urls"    : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg",

        "blog": {
            "date"       : "dt:2010-11-21 18:19:42",
            "description": "",
            "id"         : "5623928067739466034",
            "kind"       : "blogger#blog",
            "locale"     : dict,
            "name"       : "Julian Bunker Photography",
            "pages"      : int,
            "posts"      : int,
            "published"  : "2010-11-21T10:19:42-08:00",
            "updated"    : str,
            "url"        : "http://julianbphotography.blogspot.com/",
        },
        "post": {
            "author"   : "Julian Bunker",
            "content"  : str,
            "date"     : "dt:2010-12-26 01:08:00",
            "etag"     : str,
            "id"       : "6955139236418998998",
            "kind"     : "blogger#post",
            "published": "2010-12-25T17:08:00-08:00",
            "replies"  : "0",
            "title"    : "Moon Rise",
            "updated"  : "2011-12-06T05:21:24-08:00",
            "url"      : "http://julianbphotography.blogspot.com/2010/12/moon-rise.html",
        },
        "extension": "jpg",
        "filename" : "Icy-Moonrise---For-Web",
        # "#urls" lists exactly one file, so its index is pinned to 1.
        # The key must appear only once: in a dict literal a repeated key
        # silently keeps just the last value.
        "num"      : 1,
        "url"      : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg",
    },

    # Post with an embedded video.
    {
        "#url"     : "http://cfnmscenesinmovies.blogspot.com/2011/11/cfnm-scene-jenna-fischer-in-office.html",
        "#comment" : "video (#587)",
        "#category": ("blogger", "blogspot", "post"),
        "#class"   : blogger.BloggerPostExtractor,
        "#pattern" : r"https://.+\.googlevideo\.com/videoplayback",
    },

    # Post whose images are served from blogger.googleusercontent.com.
    {
        "#url"     : "https://randomthingsthroughmyletterbox.blogspot.com/2022/01/bitter-flowers-by-gunnar-staalesen-blog.html",
        "#comment" : "new image domain (#2204)",
        "#category": ("blogger", "blogspot", "post"),
        "#class"   : blogger.BloggerPostExtractor,
        "#pattern" : "https://blogger.googleusercontent.com/img/a/.+=s0$",
        "#count"   : 8,
    },

    # Whole blog.
    {
        "#url"     : "https://julianbphotography.blogspot.com/",
        "#category": ("blogger", "blogspot", "blog"),
        "#class"   : blogger.BloggerBlogExtractor,
        "#pattern" : r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg",
        "#range"   : "1-25",
        "#count"   : 25,
    },

    # Search results; the query string is exposed as "query".
    {
        "#url"     : "https://julianbphotography.blogspot.com/search?q=400mm",
        "#category": ("blogger", "blogspot", "search"),
        "#class"   : blogger.BloggerSearchExtractor,
        "#count"   : "< 10",

        "query": "400mm",
    },

    # Label listing; the percent-encoded label "D%26D" is expected decoded.
    {
        "#url"     : "https://dmmagazine.blogspot.com/search/label/D%26D",
        "#category": ("blogger", "blogspot", "label"),
        "#class"   : blogger.BloggerLabelExtractor,
        "#range"   : "1-25",
        "#count"   : 25,

        "label": "D&D",
    },
)

View File

@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import blogger
# Expected results for the "micmicidol" Blogger instance
# (https://www.micmicidol.club/).
#
# Every entry names the input "#url", the extractor class that must handle
# it and the expected "#category" triple ("blogger", "micmicidol", <subcategory>).
# Keys without a leading "#" describe expected metadata values; a bare type
# (str, int) is presumably a type-only check by the test harness.
__tests__ = (
# Single post: the one image URL and all stable blog/post metadata are pinned.
{
"#url" : "https://www.micmicidol.club/2023/11/weekly-taishu-20231113-cover.html",
"#category": ("blogger", "micmicidol", "post"),
"#class" : blogger.BloggerPostExtractor,
"#urls" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhgtpSSdrol9aKP_ztcc_mp9TUUS0U_t2DYJuGX3XCs6X5CkxIb-pM98QlxbkgJFvQj-0e6RbXNBf047qyMDZLcPJsm9dTqAn2XkTVfLhWRaxxVvIYnHYu0R0d7WsAUSFs0MDe4Sotpuqp5DQnjr45T17CXKbWtq9cR3op9dDQh3yiw2a6_HInIjLRm5io/s0/000-micmicidol.jpg",
"blog": {
"date" : "dt:2023-09-18 19:48:53",
"description": "",
"id" : "7192714164191173242",
"kind" : "blogger#blog",
"locale" : {
"country" : "TW",
"language": "zh",
"variant" : "",
},
"name" : "MIC MIC IDOL",
"pages" : int,
"posts" : int,
"published" : "2023-09-18T12:48:53-07:00",
"updated" : str,
"url" : "http://www.micmicidol.club/"
},
"post": {
"author" : "MIC MIC IDOL",
"content" : "&nbsp;",
"date" : "dt:2023-11-18 08:01:00",
"etag" : str,
"id" : "5395888649239375388",
"kind" : "blogger#post",
"labels" : [
"- Cover",
"Weekly Taishu",
"Weekly Taishu Cover",
],
"published": "2023-11-18T00:01:00-08:00",
"replies" : "0",
"title" : "Weekly Taishu 週刊大衆 2023.11.13 Cover",
"updated" : "2023-11-18T03:00:42-08:00",
"url" : "http://www.micmicidol.club/2023/11/weekly-taishu-20231113-cover.html"
},
# only one file is listed in "#urls", hence index 1
"num" : 1,
"url" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhgtpSSdrol9aKP_ztcc_mp9TUUS0U_t2DYJuGX3XCs6X5CkxIb-pM98QlxbkgJFvQj-0e6RbXNBf047qyMDZLcPJsm9dTqAn2XkTVfLhWRaxxVvIYnHYu0R0d7WsAUSFs0MDe4Sotpuqp5DQnjr45T17CXKbWtq9cR3op9dDQh3yiw2a6_HInIjLRm5io/s0/000-micmicidol.jpg",
},
# Whole blog; "#range"/"#count" presumably restrict the run to the
# first 25 results and require exactly 25 of them.
{
"#url" : "https://www.micmicidol.club/",
"#category": ("blogger", "micmicidol", "blog"),
"#class" : blogger.BloggerBlogExtractor,
"#range" : "1-25",
"#count" : 25,
},
# Search results; the value of the "q" parameter is exposed as "query".
{
"#url" : "https://www.micmicidol.club/search?q=cover",
"#category": ("blogger", "micmicidol", "search"),
"#class" : blogger.BloggerSearchExtractor,
"#range" : "1-25",
"#count" : 25,
"query" : "cover",
},
# Label listing; the percent-encoded label in the URL ("%20") is expected
# to be reported decoded in "label".
{
"#url" : "https://www.micmicidol.club/search/label/Weekly%20Taishu%20Cover",
"#category": ("blogger", "micmicidol", "label"),
"#class" : blogger.BloggerLabelExtractor,
"#range" : "1-25",
"#count" : 25,
"label" : "Weekly Taishu Cover",
},
)