# -*- coding: utf-8 -*-

# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://ci-en.net/"""

from .common import Extractor, Message
from .. import text, util

BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"

# NOTE(review): the HTML/element markers passed to text.extr() and
# text.extract_iter() below ('</div>', '</figure>', '</script>', '<p>',
# '<vue-file-player', '<vue-image-gallery', ...) were restored after having
# been stripped from this file -- confirm them against the site's markup.


class CienExtractor(Extractor):
    """Base class for ci-en extractors"""
    category = "cien"
    root = "https://ci-en.net"

    def __init__(self, match):
        # Use the scheme and domain of the matched URL as root, so that
        # follow-up requests go to the same host the user specified.
        self.root = text.root_from_url(match.group(0))
        Extractor.__init__(self, match)

    def _pagination_articles(self, url, params):
        """Yield a queue message for each article linked from a listing

        Requests 'url' repeatedly, starting at params["page"] (1 if absent
        or not a valid integer), and stops at the first page that does not
        contain a rel="next" link. 'params' is modified in place.
        """
        data = {"_extractor": CienArticleExtractor}
        params["page"] = text.parse_int(params.get("page"), 1)

        while True:
            page = self.request(url, params=params).text

            for card in text.extract_iter(
                    page, ' class="c-cardCase-item', '</div>'):
                article_url = text.extr(card, ' href="', '"')
                yield Message.Queue, article_url, data

            if ' rel="next"' not in page:
                return
            params["page"] += 1


class CienArticleExtractor(CienExtractor):
    """Extractor for the files of a single ci-en article"""
    subcategory = "article"
    filename_fmt = "{num:>02} {filename}.{extension}"
    directory_fmt = ("{category}", "{author[name]}", "{post_id} {name}")
    archive_fmt = "{post_id}_{num}"
    pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)"
    example = "https://ci-en.net/creator/123/article/12345"

    def items(self):
        """Yield one directory message and one URL message per file"""
        url = "{}/creator/{}/article/{}".format(
            self.root, self.groups[0], self.groups[1])
        page = self.request(url, notfound="article").text

        # Article metadata comes from the first entry
        # of the JSON-LD list embedded in the page.
        post = util.json_loads(text.extr(
            page, '<script type="application/ld+json">', '</script>'))[0]

        # Fall back to the complete page
        # when the metadata has no (or an empty) article body.
        files = self._extract_files(post.get("articleBody") or page)

        post["post_id"] = text.parse_int(self.groups[1])
        post["count"] = len(files)
        post["date"] = text.parse_datetime(post["datePublished"])

        # Drop these entries independently of each other;
        # a missing "publisher" must not keep "sameAs" from being removed.
        post.pop("publisher", None)
        post.pop("sameAs", None)

        yield Message.Directory, post
        for post["num"], file in enumerate(files, 1):
            post.update(file)
            if "extension" not in file:
                # no name information from the markup;
                # derive filename and extension from the URL
                text.nameext_from_url(file["url"], post)
            yield Message.Url, file["url"], post

    def _extract_files(self, page):
        """Return a list of file dicts for all files found in 'page'

        Every dict has a "url" and a "type" entry, where "type" is one of
        "image", "video", or "attachment". Images additionally carry
        "width" and "height"; videos and attachments carry the entries
        produced by text.nameext_from_url() for their file name.
        """
        files = []

        for image in text.extract_iter(
                page, 'class="file-player-image"', "</figure>"):
            # 'data-size' is split at "x" into width and height
            size = text.extr(image, ' data-size="', '"')
            w, _, h = size.partition("x")

            files.append({
                "url"   : text.extr(image, ' data-raw="', '"'),
                "width" : text.parse_int(w),
                "height": text.parse_int(h),
                "type"  : "image",
            })

        for video in text.extract_iter(
                page, "<vue-file-player", "</vue-file-player>"):
            path = text.extr(video, ' base-path="', '"')
            name = text.extr(video, ' file-name="', '"')
            auth = text.extr(video, ' auth-key="', '"')

            file = text.nameext_from_url(name)
            file["url"] = "{}video-web.mp4?{}".format(path, auth)
            file["type"] = "video"
            files.append(file)

        for download in text.extract_iter(
                page, 'class="downloadBlock', "</div>"):
            name = text.extr(download, "<p>", "<")

            # Drop the last space-separated token of the label (presumably
            # a size indicator -- TODO confirm); keep the whole label when
            # it contains no space instead of ending up with an empty name.
            file = text.nameext_from_url(name.rpartition(" ")[0] or name)
            file["url"] = text.extr(download, ' href="', '"')
            file["type"] = "attachment"
            files.append(file)

        return files

    def _extract_galleries(self, page):
        """Unfinished: request gallery image data for galleries in 'page'

        The API responses are currently discarded
        and the returned list is always empty.
        """
        # TODO
        files = []

        for gallery in text.extract_iter(
                page, "<vue-image-gallery", "</vue-image-gallery>"):

            url = "https://ci-en.dlsite.com/api/creator/gallery/images"
            params = {
                "hash"      : text.extr(gallery, ' hash="', '"'),
                "gallery_id": text.extr(gallery, ' gallery-id="', '"'),
                "time"      : text.extr(gallery, ' time="', '"'),
            }
            self.request(url, params=params)

        return files


class CienCreatorExtractor(CienExtractor):
    """Extractor for the articles of a ci-en creator"""
    subcategory = "creator"
    pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
    example = "https://ci-en.net/creator/123"

    def items(self):
        """Queue all articles from the creator's article listing"""
        url = "{}/creator/{}/article".format(self.root, self.groups[0])
        # keep the query parameters of the input URL, but force list mode
        params = text.parse_query(self.groups[1])
        params["mode"] = "list"
        return self._pagination_articles(url, params)


class CienRecentExtractor(CienExtractor):
    """Extractor for the articles listed on /mypage/recent"""
    subcategory = "recent"
    pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?"
    example = "https://ci-en.net/mypage/recent"

    def items(self):
        """Queue all articles from the 'recent' listing"""
        url = self.root + "/mypage/recent"
        params = text.parse_query(self.groups[0])
        return self._pagination_articles(url, params)


class CienFollowingExtractor(CienExtractor):
    """Extractor for the creators listed on /mypage/subscription"""
    subcategory = "following"
    pattern = BASE_PATTERN + r"/mypage/subscription(/following)?"
    example = "https://ci-en.net/mypage/subscription"

    def items(self):
        """Yield a queue message for each creator on the subscription page"""
        # append "/following" only if it was part of the input URL
        url = self.root + "/mypage/subscription" + (self.groups[0] or "")
        page = self.request(url).text
        data = {"_extractor": CienCreatorExtractor}

        for subscription in text.extract_iter(
                page, 'class="c-grid-subscriptionInfo', '</figure>'):
            url = text.extr(subscription, ' href="', '"')
            yield Message.Queue, url, data