From 3cb8327c60363031fd2f930fd89baf19fbe06f64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Thu, 1 Sep 2022 21:44:22 +0200
Subject: [PATCH] [zerochan] add 'metadata' option (#2861)
---
docs/configuration.rst | 12 +++++++
docs/gallery-dl.conf | 3 +-
gallery_dl/extractor/zerochan.py | 55 ++++++++++++++++++++++++--------
3 files changed, 56 insertions(+), 14 deletions(-)
diff --git a/docs/configuration.rst b/docs/configuration.rst
index 90de2983..7ab49275 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -2797,6 +2797,18 @@ Description
Location of a youtube-dl configuration file to load options from.
+extractor.zerochan.metadata
+---------------------------
+Type
+ ``bool``
+Default
+ ``false``
+Description
+ Extract additional metadata (date, md5, tags, ...)
+
+ Note: This requires 1-2 additional HTTP requests for each post.
+
+
extractor.[booru].tags
----------------------
Type
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1e485eea..6ba50f25 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -341,7 +341,8 @@
"zerochan":
{
"username": null,
- "password": null
+ "password": null,
+ "metadata": false
},
"booru":
{
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 2b5acd89..72cf4389 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -11,6 +11,8 @@
from .booru import BooruExtractor
from ..cache import cache
from .. import text, exception
+from xml.etree import ElementTree
+
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
@@ -54,7 +56,7 @@ class ZerochanExtractor(BooruExtractor):
return response.cookies
- def _parse_entry_page(self, entry_id):
+ def _parse_entry_html(self, entry_id):
url = "{}/{}".format(self.root, entry_id)
extr = text.extract_from(self.request(url).text)
@@ -66,10 +68,26 @@ class ZerochanExtractor(BooruExtractor):
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
"width" : extr('"width": "', ' '),
"height": extr('"height": "', ' '),
- "size" : extr('"contentSize": "', 'B'),
+ "size" : text.parse_bytes(extr('"contentSize": "', 'B')),
"path" : text.split_html(extr(
'class="breadcrumbs', '
'))[3::2],
- "tags" : extr('alt="Tags: ', '"').split(", ")
+ "tags" : extr('alt="Tags: Anime, ', '"').split(", ")
+ }
+
+ def _parse_entry_xml(self, entry_id):
+ url = "{}/{}?xml".format(self.root, entry_id)
+ item = ElementTree.fromstring(self.request(url).text)[0][-1]
+ # content = item[4].attrib
+
+ return {
+ # "id" : entry_id,
+ # "file_url": content["url"],
+ # "width" : content["width"],
+ # "height": content["height"],
+ # "size" : content["filesize"],
+ "name" : item[2].text,
+ "tags" : item[5].text.lstrip().split(", "),
+ "md5" : item[6].text,
}
@@ -105,6 +123,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
url = self.root + "/" + self.search_tag
params = text.parse_query(self.query)
params["p"] = text.parse_int(params.get("p"), 1)
+ metadata = self.config("metadata")
while True:
page = self.request(url, params=params).text
@@ -115,15 +134,22 @@ class ZerochanTagExtractor(ZerochanExtractor):
post = extr('