From f0321f423dcd22d0ddf090a866339edf86b33f16 Mon Sep 17 00:00:00 2001
From: enduser420 <91022934+enduser420@users.noreply.github.com>
Date: Wed, 5 Oct 2022 01:48:13 +0530
Subject: [PATCH] [2chen] Add 2chen.moe extractor (#2707)
* [2chen] Add 2chen.moe extractor
* change "==" to is
* fix for "test_unique_pattern_matches"
* fix regex pattern and group matching
* fix regex again
* [2chen] add 'reply_no' and 'hash' metadata and change 'filename_fmt'
also made an entry in supportedsites.md
* [2chen] unescape 'title'
* [2chen] partition() -> rpartition()
* [2chen] extract 'date' and 'name' metadata
* [2chen] remove 'offset' argument
* [2chen] do some changes
* [2chen] do some more changes
* [2chen] unescape 'name' and 'filename'
---
docs/supportedsites.md | 6 ++
gallery_dl/extractor/2chen.py | 99 ++++++++++++++++++++++++++++++++
gallery_dl/extractor/__init__.py | 1 +
3 files changed, 106 insertions(+)
create mode 100644 gallery_dl/extractor/2chen.py
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index e9a1a518..d288b4c0 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -13,6 +13,12 @@ Consider all sites to be NSFW unless otherwise known.
+<tr>
+    <td>2chen</td>
+    <td>https://2chen.moe/</td>
+    <td>Boards, Threads</td>
+    <td></td>
+</tr>
| 35PHOTO |
https://35photo.pro/ |
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
new file mode 100644
index 00000000..3e65fe6d
--- /dev/null
+++ b/gallery_dl/extractor/2chen.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://2chen.moe/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _2chenThreadExtractor(Extractor):
+ """Extractor for 2chen threads"""
+ # The class name starts with an underscore because a Python identifier
+ # cannot begin with a digit.
+ category = "2chen"
+ subcategory = "thread"
+ # {board}, {thread} and {title} are the keys returned by metadata().
+ directory_fmt = ("{category}", "{board}", "{thread} {title}")
+ # {time} is filled in by items() from each post's parsed date.
+ # NOTE(review): {extension} is presumably added by
+ # text.nameext_from_url() -- confirm against the text module.
+ filename_fmt = "{time} {filename}.{extension}"
+ archive_fmt = "{hash}"
+ # Prefix for the thread page URL and for every file URL built in items().
+ root = "https://2chen.moe"
+ # Optional scheme; group 1 is the board name (no '/', '?' or '#'),
+ # group 2 is the numeric thread id. __init__ unpacks both groups.
+ pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)"
+ test = (
+ ("https://2chen.moe/jp/303786", {
+ "count": ">= 10",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "{}/{}/{}".format(self.root, self.board, self.thread)
+ page = self.request(url, encoding="utf-8").text
+ data = self.metadata(page)
+ yield Message.Directory, data
+ for post in self.posts(page):
+ if not post["url"]:
+ continue
+ post.update(data)
+ post["url"] = self.root + post["url"]
+ post["time"] = text.parse_int(post["date"].timestamp())
+ yield Message.Url, post["url"], text.nameext_from_url(
+ post["filename"], post)
+
+ def metadata(self, page):
+ board, pos = text.extract(page, 'class="board">/', '/<')
+ title = text.extract(page, "", "
", pos)[0]
+ return {
+ "board" : board,
+ "thread": self.thread,
+ "title" : text.unescape(title),
+ }
+
+ def posts(self, page):
+ """Return iterable with relevant posts"""
+ # map() is lazy: self.parse runs on each fragment only as the caller
+ # iterates. Fragments start right after 'class="glass media'.
+ # NOTE(review): the end delimiter below is an empty string -- it looks
+ # like a closing tag was stripped from this copy; confirm against the
+ # upstream file before relying on it.
+ return map(self.parse, text.extract_iter(
+ page, 'class="glass media', ''))
+
+ def parse(self, post):
+ extr = text.extract_from(post)
+ return {
+ "name" : text.unescape(extr("", "")),
+ "date" : text.parse_datetime(
+ extr("