[weibo] add 'user' and 'status' extractors

2019-02-16 22:56:04 +01:00
parent f8782c05f2
commit 19860655a3
3 changed files with 139 additions and 1 deletions
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -77,7 +77,7 @@ Rule 34              https://rule34.xxx/                 Pools, Posts, Tag-Searc
 Safebooru            https://safebooru.org/              Pools, Posts, Tag-Searches
 Sankaku Channel      https://chan.sankakucomplex.com/    Pools, Posts, Tag-Searches                         Optional
 Sea Otter Scans      https://reader.seaotterscans.com/   Chapters, Manga
-Sen Manga            http://raw.senmanga.com/            Chapters
+Sen Manga            https://raw.senmanga.com/           Chapters
 Sense-Scans          http://sensescans.com/reader/       Chapters, Manga
 Simply Hentai        https://www.simply-hentai.com/      Galleries, individual Images, Videos
 SlideShare           https://www.slideshare.net/         Presentations
@@ -88,6 +88,7 @@ Tumblr               https://www.tumblr.com/             Images from Users, Like
 Twitter              https://twitter.com/                Media Timelines, Timelines, Tweets
 Wallhaven            https://alpha.wallhaven.cc/         individual Images, Search Results                  Optional
 Warosu               https://warosu.org/                 Threads
 Weibo                https://www.weibo.com/              Images from Users, Images from Statuses
 World Three          http://www.slide.world-three.org/   Chapters, Manga
 XVideos              https://www.xvideos.com/            Images from Users, Galleries
 Yandere              https://yande.re/                   Pools, Popular Images, Posts, Tag-Searches
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -78,6 +78,7 @@ modules = [
    "twitter",
    "wallhaven",
    "warosu",
    "weibo",
    "yandere",
    "xvideos",
    "yuki",
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -0,0 +1,136 @@
 # -*- coding: utf-8 -*-
 # Copyright 2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extractors for https://www.weibo.com/"""
 from .common import Extractor, Message
 from .. import text
 import json
 class WeiboExtractor(Extractor):
    """Base class for weibo extractors

    Subclasses supply the 'status' objects through statuses();
    items() turns the images and videos found in them into messages.
    """
    category = "weibo"
    directory_fmt = ("{category}", "{user[screen_name]}")
    filename_fmt = "{status[id]}_{num:>02}.{extension}"
    archive_fmt = "{status[id]}_{num}"
    root = "https://m.weibo.cn"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # When enabled, media of nested 'retweeted_status' objects
        # is extracted as well (config option 'retweets').
        self.retweets = self.config("retweets", True)

    def items(self):
        """Yield messages for every image and video of all statuses

        Message.Version and Message.Directory are emitted once, right
        before the files of the first status. 'num' counts the files of
        one top-level status and keeps counting through its chain of
        retweeted statuses, so that every file gets a distinct value for
        'filename_fmt' and 'archive_fmt'.
        """
        first = True

        for status in self.statuses():
            if first:
                yield Message.Version, 1
                yield Message.Directory, status
                first = False

            # 'obj' walks from the status itself down its retweet chain;
            # the metadata of each file always refers to the outer status
            obj = status
            num = 1

            while True:
                if "pics" in obj:
                    for image in obj["pics"]:
                        pid = image["pid"]
                        if "large" in image:
                            # prefer the 'large' variant when one is listed
                            image = image["large"]
                        # guard against a missing or falsy 'geo' entry and
                        # fall back to 0, the value used for videos below
                        geo = image.get("geo") or {}
                        data = text.nameext_from_url(image["url"], {
                            "num": num,
                            "pid": pid,
                            "width": text.parse_int(geo.get("width", 0)),
                            "height": text.parse_int(geo.get("height", 0)),
                            "status": status,
                        })
                        yield Message.Url, image["url"], data
                        num += 1

                if "page_info" in obj and "media_info" in obj["page_info"]:
                    info = obj["page_info"]["media_info"]
                    url = info.get("stream_url_hd") or info["stream_url"]
                    data = text.nameext_from_url(url, {
                        "num": num,
                        "url": url,
                        "width": 0,
                        "height": 0,
                        "status": status,
                    })
                    yield Message.Url, url, data
                    # a video occupies a file number as well; without this
                    # the next file of a retweeted status would reuse it
                    num += 1

                if self.retweets and "retweeted_status" in obj:
                    obj = obj["retweeted_status"]
                else:
                    break

    def statuses(self):
        """Returns an iterable containing all relevant 'status' objects"""
 class WeiboUserExtractor(WeiboExtractor):
    """Extractor for all images of a user on weibo.cn"""
    subcategory = "user"
    pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
               r"/(?:u|p(?:rofile)?)/(\d+)")
    test = (
        ("https://m.weibo.cn/u/2314621010", {
            "range": "1-30",
        }),
        ("https://m.weibo.cn/profile/2314621010"),
        ("https://m.weibo.cn/p/2304132314621010_-_WEIBO_SECOND_PROFILE_WEIBO"),
        ("https://www.weibo.com/p/1003062314621010/home"),
    )

    def __init__(self, match):
        super().__init__(match)
        # numeric ID captured from the URL path
        self.user_id = match.group(1)

    def statuses(self):
        """Yield the 'mblog' object of every card, page after page"""
        endpoint = self.root + "/api/container/getIndex"
        # only the last 10 characters of the captured ID are used
        query = {
            "page": 1,
            "containerid": "107603" + self.user_id[-10:],
        }

        while True:
            response = self.request(endpoint, params=query)
            cards = response.json()["data"]["cards"]

            yield from (card["mblog"] for card in cards if "mblog" in card)

            # a page with fewer than 5 cards ends the pagination
            if len(cards) < 5:
                return
            query["page"] += 1
 class WeiboStatusExtractor(WeiboExtractor):
    """Extractor for images from a status on weibo.cn"""
    subcategory = "status"
    pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
               r"/(?:detail|status)/(\d+)")
    test = (
        ("https://m.weibo.cn/detail/4323047042991618", {
            "pattern": r"https://wx\d+.sinaimg.cn/large/\w+.jpg",
        }),
        ("https://m.weibo.cn/detail/4339748116375525", {
            "pattern": r"http://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_hd",
        }),
        ("https://m.weibo.cn/status/4339748116375525"),
    )

    def __init__(self, match):
        super().__init__(match)
        # numeric ID captured from the URL path
        self.status_id = match.group(1)

    def statuses(self):
        """Return a 1-tuple holding the 'status' object of the detail page"""
        detail_url = self.root + "/detail/" + self.status_id
        html = self.request(detail_url).text

        # the JSON sits between these two markers in the page source
        raw = text.extract(
            html, " var $render_data = [", "][0] || {};")[0]
        render_data = json.loads(raw)
        return (render_data["status"],)