# -*- coding: utf-8 -*- # Copyright 2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://www.subscribestar.com/""" from .common import Extractor, Message from .. import text import datetime import json BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)" class SubscribestarExtractor(Extractor): """Base class for subscribestar extractors""" category = "subscribestar" root = "https://www.subscribestar.com" directory_fmt = ("{category}", "{author_name}") filename_fmt = "{post_id}_{id}.{extension}" archive_fmt = "{id}" def __init__(self, match): tld, self.item = match.groups() if tld == "adult": self.root = "https://subscribestar.adult" self.subcategory += "-adult" Extractor.__init__(self, match) self.metadata = self.config("metadata", False) self._year = " " + str(datetime.date.today().year) def items(self): for post_html in self.posts(): media = self._media_from_post(post_html) if not media: continue data = self._data_from_post(post_html) yield Message.Directory, data for item in media: item.update(data) url = item["url"] yield Message.Url, url, text.nameext_from_url(url, item) def posts(self): """Yield HTML content of all relevant posts""" @staticmethod def _media_from_post(html): gallery = text.extract(html, 'data-gallery="', '"')[0] if gallery: return [ item for item in json.loads(text.unescape(gallery)) if "/previews/" not in item["url"] ] return () def _data_from_post(self, html): extr = text.extract_from(html) data = { "post_id" : text.parse_int(extr('data-id="', '"')), "author_id" : text.parse_int(extr('data-user-id="', '"')), "author_name": text.unescape(extr('href="/', '"')), "author_nick": text.unescape(extr('>', '<')), "content" : (extr( '