gallery-dl/gallery_dl/extractor/bcy.py

# -*- coding: utf-8 -*-

# Copyright 2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://bcy.net/"""

from .common import Extractor, Message
from .. import text
import json
import re


class BcyExtractor(Extractor):
    """Base class for bcy extractors

    items() calls self.posts(), which is not defined here and has to be
    provided by subclasses; it is iterated for post IDs. Each post's detail
    page is then loaded and one Url message is emitted per image.
    """
    category = "bcy"
    directory_fmt = ("{category}", "{user[id]} {user[name]}")
    filename_fmt = "{post[id]} {id}.{extension}"
    archive_fmt = "{post[id]}_{id}"
    root = "https://bcy.net"

    def __init__(self, match):
        """Store the first capture group of 'match' as self.item_id"""
        Extractor.__init__(self, match)
        self.item_id = match.group(1)

    def items(self):
        """Yield a Directory message per post and Url messages per image

        The same metadata dict is reused and updated in place for every
        image of a post ('num', 'id', 'width', 'height', 'filter', ...).
        """
        # rewrites a "p<N>-bcy.byteimg.com/img/banciyuan" URL prefix to 'iroot'
        sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
        iroot = "https://img-bcy-qn.pstatp.com"
        noop = self.config("noop")

        for post_id in self.posts():
            post = self._parse_post(post_id)
            if not post:
                # _parse_post() returns None for posts it cannot use
                continue

            yield Message.Directory, post
            # 'num' is the 1-based position of the image inside its post
            for post["num"], image in enumerate(post["_multi"], 1):
                post["id"] = image["mid"]
                post["width"] = image["w"]
                post["height"] = image["h"]

                # keep only the part of 'path' in front of the first "~"
                url = image["path"].partition("~")[0]
                text.nameext_from_url(url, post)

                if post["extension"]:
                    # 'path' has a usable file extension:
                    # download it from 'iroot'
                    if not url.startswith(iroot):
                        url = sub(iroot, url)
                    post["filter"] = ""
                    yield Message.Url, url, post

                else:
                    # no file extension in 'path': fall back to 'origin'
                    post["filter"] = "watermark"
                    yield Message.Url, image["origin"], post

                    if noop:
                        # additionally offer 'original_path'
                        # when the "noop" option is set
                        post["extension"] = ""
                        post["filter"] = "noop"
                        yield Message.Url, image["original_path"], post

    def _parse_post(self, post_id):
        """Load the detail page of 'post_id' and return its metadata

        Returns a dict with 'user', 'post', and '_multi' (the post's image
        list) entries, or None if the response status is >= 400, the page
        contains no embedded JSON payload, or the post's 'multi' is empty.
        """
        url = "{}/item/detail/{}".format(self.root, post_id)
        response = self.request(url)
        # NOTE(review): this check only has an effect if self.request()
        # returns error responses instead of raising - confirm
        if response.status_code >= 400:
            return None

        # post data is embedded as the string argument of JSON.parse("...")
        payload = text.extract(response.text, 'JSON.parse("', '");')[0]
        if not payload:
            # nothing to parse: skip this post like the other unusable
            # cases instead of failing in .replace() / json.loads()
            return None

        data = json.loads(
            payload
            # undo the string-literal escaping of slashes and double quotes
            .replace('\\\\u002F', '/')
            .replace('\\"', '"')
        )["detail"]

        post = data["post_data"]
        if not post["multi"]:
            # post without images
            return None
        user = data["detail_user"]

        return {
            "user": {
                "id"     : user["uid"],
                "name"   : user["uname"],
                "avatar" : user["avatar"],
            },
            "post": {
                "id"     : text.parse_int(post["item_id"]),
                "tags"   : [t["tag_name"] for t in post["post_tags"]],
                "date"   : text.parse_timestamp(post["ctime"]),
                "parody" : text.parse_unicode_escapes(post["work"]),
                "content": post["plain"],
                "likes"  : post["like_count"],
                "shares" : post["share_count"],
                "replies": post["reply_count"],
            },
            "_multi": post["multi"],
        }


class BcyUserExtractor(BcyExtractor):
    """Extractor for the posts on a user's timeline"""
    subcategory = "user"
    pattern = r"(?:https?://)?bcy\.net/u/(\d+)"
    test = ("https://bcy.net/u/1933712", {
        "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/\w+.jpg",
        "count": ">= 25",
    })

    def posts(self):
        """Yield the item IDs of all timeline entries with a non-empty 'multi'

        Pages through the 'selfPosts' API endpoint, using the 'since' value
        of the last entry of a page as cursor for the next request, and
        stops at the first page without entries.
        """
        endpoint = self.root + "/apiv3/user/selfPosts"
        query = {
            "uid": self.item_id,
            "since": None,
            #  "_signature": None,
        }

        while True:
            page = self.request(endpoint, params=query).json()

            # 'last' stays None if this page has no entries at all
            last = None
            for last in page["data"]["items"]:
                detail = last["item_detail"]
                if detail["multi"]:
                    yield detail["item_id"]

            if not last:
                return
            # continue after the last entry of the current page
            query["since"] = last["since"]


class BcyPostExtractor(BcyExtractor):
    """Extractor for a single post"""
    subcategory = "post"
    pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)"
    test = ("https://bcy.net/item/detail/6355835481002893070", {
        "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3",
        "count": 1,
        "keyword": {
            "user": {
                "id"     : 1933712,
                "name"   : "wukloo",
                "avatar" : str,
            },
            "post": {
                "id"     : 6355835481002893070,
                "tags"   : list,
                "date"   : "dt:2016-11-22 08:47:46",
                "parody" : "东方PROJECT",
                "content": "re:根据微博的建议稍微做了点修改",
                "likes"  : int,
                "shares" : int,
                "replies": int,
            },
            "id": 8330182,
            "num": 1,
            "width" : 3000,
            "height": 1687,
            "filename": "712e0780b09011e696f973c3d1568337",
            "extension": "jpg",
        },
    })

    def posts(self):
        """Return a 1-tuple containing only self.item_id"""
        post_id = self.item_id
        return (post_id,)