[pixiv] rewrite

- same functionality, better(?) code quality, easier to extend

- added test for the user-tag functionality

- removed the 'artist-id', 'artist-name' and 'artist-nick'
  keywords, which can be replaced with 'user[id]', 'user[name]'
  and 'user[account]' respectively
This commit is contained in:
Mike Fährmann
2017-06-04 16:33:36 +02:00
parent 338f79147f
commit e365f1d799

View File

@@ -14,44 +14,30 @@ from ..cache import cache
import re import re
class PixivUserExtractor(Extractor): class PixivExtractor(Extractor):
"""Extractor for works of a pixiv-user""" """Base class for pixiv extractors"""
category = "pixiv" category = "pixiv"
subcategory = "user" directory_fmt = ["{category}", "{user[id]} {user[account]}"]
directory_fmt = ["{category}", "{artist-id}-{artist-nick}"] filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}"
filename_fmt = "{category}_{artist-id}_{id}{num}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/"
r"member(?:_illust)?\.php\?id=(\d+)(?:&tag=(.*))?"]
test = [
("http://www.pixiv.net/member_illust.php?id=173530", {
"url": "852c31ad83b6840bacbce824d85f2a997889efb7",
}),
("http://www.pixiv.net/member_illust.php?id=173531", {
"exception": exception.NotFoundError,
}),
]
member_url = "https://www.pixiv.net/member_illust.php"
illust_url = "https://www.pixiv.net/member_illust.php?mode=medium" illust_url = "https://www.pixiv.net/member_illust.php?mode=medium"
def __init__(self, match): def __init__(self):
Extractor.__init__(self) Extractor.__init__(self)
self.artist_id = match.group(1)
if (len(match.groups()) > 2):
self.tag = match.group(2)
else:
self.tag = None
self.api = PixivAPI(self) self.api = PixivAPI(self)
self.api_call = self.api.user_works self.user_id = -1
self.load_ugoira = self.config("ugoira", True) self.load_ugoira = self.config("ugoira", True)
def items(self): def items(self):
metadata = self.get_job_metadata() metadata = self.get_metadata()
yield Message.Version, 1 yield Message.Version, 1
yield Message.Headers, self.session.headers yield Message.Headers, self.session.headers
yield Message.Cookies, self.session.cookies yield Message.Cookies, self.session.cookies
yield Message.Directory, metadata yield Message.Directory, metadata
for work in self.get_works(): for work in self.works():
work = self.prepare_work(work)
pos = work["extension"].rfind("?", -18) pos = work["extension"].rfind("?", -18)
if pos != -1: if pos != -1:
timestamp = work["extension"][pos:] timestamp = work["extension"][pos:]
@@ -88,27 +74,13 @@ class PixivUserExtractor(Extractor):
) )
yield Message.Url, url, work yield Message.Url, url, work
def get_works(self): def works(self):
"""Yield all work-items for a pixiv-member""" """Return all work-items for a pixiv-member"""
pagenum = 1 return []
while True:
data = self.api_call(self.artist_id, pagenum)
for work in data["response"]:
if self.tag is None or \
self.tag.lower() in [x.lower() for x in work["tags"]]:
yield self.prepare_work(work)
pinfo = data["pagination"]
if pinfo["current"] == pinfo["pages"]:
return
pagenum = pinfo["next"]
def prepare_work(self, work): def prepare_work(self, work):
"""Prepare a work-dictionary with additional keywords""" """Prepare a work-dictionary with additional keywords"""
user = work["user"]
url = work["image_urls"]["large"] url = work["image_urls"]["large"]
work["artist-id"] = user["id"]
work["artist-name"] = user["name"]
work["artist-nick"] = user["account"]
work["num"] = "" work["num"] = ""
work["url"] = url work["url"] = url
work["extension"] = url[url.rfind(".")+1:] work["extension"] = url[url.rfind(".")+1:]
@@ -122,7 +94,7 @@ class PixivUserExtractor(Extractor):
).text ).text
# parse page # parse page
frames, _ = text.extract(page, ',"frames":[', ']') frames = text.extract(page, ',"frames":[', ']')[0]
# build url # build url
url = re.sub( url = re.sub(
@@ -136,20 +108,46 @@ class PixivUserExtractor(Extractor):
r'\{"file":"([^"]+)","delay":(\d+)\},?', r'\{"file":"([^"]+)","delay":(\d+)\},?',
r'\1 \2\n', frames r'\1 \2\n', frames
) )
return url, framelist return url, framelist
def get_job_metadata(self, user=None): def get_metadata(self, user=None):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
if not user: if not user:
user = self.api.user(self.artist_id)["response"][0] user = self.api.user(self.user_id)[0]
return { return {"user": user}
"artist-id": user["id"],
"artist-name": user["name"],
"artist-nick": user["account"],
}
class PixivWorkExtractor(PixivUserExtractor): class PixivUserExtractor(PixivExtractor):
"""Extractor for works of a pixiv-user"""
subcategory = "user"
pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/"
r"member(?:_illust)?\.php\?id=(\d+)(?:.*&tag=([^&#]+))?"]
test = [
("http://www.pixiv.net/member_illust.php?id=173530", {
"url": "852c31ad83b6840bacbce824d85f2a997889efb7",
}),
("https://www.pixiv.net/member_illust.php?id=173530&tag=HITMAN", {
"url": "3ecb4970dd91ce1de0a9449671b42db5e3fe2b08",
}),
("http://www.pixiv.net/member_illust.php?id=173531", {
"exception": exception.NotFoundError,
}),
]
def __init__(self, match):
    """Set up the extractor from a regex match on a user URL

    The match is expected to provide exactly two groups: the user id
    and an optional tag. The tag is stored lower-cased, or as None if
    the URL did not contain one.
    """
    PixivExtractor.__init__(self)
    user_id, raw_tag = match.groups()
    self.user_id = user_id
    if raw_tag:
        self.tag = raw_tag.lower()
    else:
        self.tag = None
def works(self):
    """Yield the works of the selected user, optionally filtered by tag

    Works are taken from self.api.user_works(self.user_id). Without a
    tag filter every work is yielded; otherwise only those works are
    yielded whose lower-cased tags contain self.tag.
    """
    wanted = self.tag
    for item in self.api.user_works(self.user_id):
        if not wanted:
            yield item
            continue
        lowered = [name.lower() for name in item["tags"]]
        if wanted in lowered:
            yield item
class PixivWorkExtractor(PixivExtractor):
"""Extractor for a single pixiv work/illustration""" """Extractor for a single pixiv work/illustration"""
subcategory = "work" subcategory = "work"
pattern = [(r"(?:https?://)?(?:www\.)?pixiv\.net/member(?:_illust)?\.php" pattern = [(r"(?:https?://)?(?:www\.)?pixiv\.net/member(?:_illust)?\.php"
@@ -178,35 +176,37 @@ class PixivWorkExtractor(PixivUserExtractor):
] ]
def __init__(self, match): def __init__(self, match):
PixivUserExtractor.__init__(self, match) PixivExtractor.__init__(self)
self.illust_id = match.group(1) self.illust_id = match.group(1)
self.load_ugoira = True self.load_ugoira = True
self.work = None self.work = None
def get_works(self): def works(self):
return (self.prepare_work(self.work),) return (self.work,)
def get_job_metadata(self, user=None): def get_metadata(self, user=None):
"""Collect metadata for extractor-job""" self.work = self.api.work(self.illust_id)[0]
self.work = self.api.work(self.illust_id)["response"][0] return PixivExtractor.get_metadata(self, self.work["user"])
return PixivUserExtractor.get_job_metadata(self, self.work["user"])
class PixivFavoriteExtractor(PixivUserExtractor): class PixivFavoriteExtractor(PixivExtractor):
"""Extractor for all favorites/bookmarks of a pixiv-user""" """Extractor for all favorites/bookmarks of a pixiv-user"""
subcategory = "favorite" subcategory = "favorite"
directory_fmt = ["{category}", "bookmarks", "{artist-id}-{artist-nick}"] directory_fmt = ["{category}", "bookmarks", "{user[id]} {user[account]}"]
pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/bookmark\.php\?id=(\d+)"] pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/bookmark\.php\?id=(\d+)"]
test = [("http://www.pixiv.net/bookmark.php?id=173530", { test = [("http://www.pixiv.net/bookmark.php?id=173530", {
"url": "e717eb511500f2fa3497aaee796a468ecf685cc4", "url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
})] })]
def __init__(self, match): def __init__(self, match):
PixivUserExtractor.__init__(self, match) PixivExtractor.__init__(self)
self.api_call = self.api.user_favorite_works self.user_id = match.group(1)
def works(self):
    """Return the favorite works of the user given by self.user_id"""
    api = self.api
    favorites = api.user_favorite_works(self.user_id)
    return favorites
def prepare_work(self, work): def prepare_work(self, work):
return PixivUserExtractor.prepare_work(self, work["work"]) return PixivExtractor.prepare_work(self, work["work"])
class PixivBookmarkExtractor(PixivFavoriteExtractor): class PixivBookmarkExtractor(PixivFavoriteExtractor):
@@ -215,18 +215,11 @@ class PixivBookmarkExtractor(PixivFavoriteExtractor):
pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/bookmark\.php()$"] pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/bookmark\.php()$"]
test = [] test = []
def __init__(self, match): def get_metadata(self, user=None):
PixivFavoriteExtractor.__init__(self, match)
self.api.login() self.api.login()
self.artist_id = self.api.user_id user = self.api.user_info
self.user_id = user["id"]
return PixivExtractor.get_metadata(self, user)
def require_login(func):
    """Decorator: auto-login before api-calls

    The returned wrapper calls ``self.login()`` and then forwards all
    positional and keyword arguments to the decorated method, returning
    its result. The decorated method's name and docstring are kept.
    """
    import functools

    @functools.wraps(func)
    def wrap(self, *args, **kwargs):
        # keyword arguments must be passed through as well; otherwise
        # optional parameters can only be given positionally
        self.login()
        return func(self, *args, **kwargs)
    return wrap
class PixivAPI(): class PixivAPI():
@@ -242,6 +235,7 @@ class PixivAPI():
self.log = extractor.log self.log = extractor.log
self.username = extractor.config("username") self.username = extractor.config("username")
self.password = extractor.config("password") self.password = extractor.config("password")
self.user_info = None
self.session.headers.update({ self.session.headers.update({
"Referer": "https://www.pixiv.net/", "Referer": "https://www.pixiv.net/",
'App-OS': 'ios', 'App-OS': 'ios',
@@ -249,63 +243,35 @@ class PixivAPI():
'App-Version': '6.7.1', 'App-Version': '6.7.1',
'User-Agent': 'PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)', 'User-Agent': 'PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)',
}) })
self.user_id = -1
@require_login
def user(self, user_id): def user(self, user_id):
"""Query information about a pixiv user""" """Query information about a pixiv user"""
response = self.session.get( endpoint = "users/" + user_id
"https://public-api.secure.pixiv.net/v1/users/" return self._call(endpoint, {})["response"]
"{user}.json".format(user=user_id)
)
return self._parse(response)
@require_login
def work(self, illust_id): def work(self, illust_id):
"""Query information about a single pixiv work/illustration""" """Query information about a single pixiv work/illustration"""
params = { endpoint = "works/" + illust_id
"image_sizes": "large", params = {"image_sizes": "large"}
} return self._call(endpoint, params)["response"]
response = self.session.get(
"https://public-api.secure.pixiv.net/v1/works/"
"{illust}.json".format(illust=illust_id), params=params
)
return self._parse(response)
@require_login def user_works(self, user_id):
def user_works(self, user_id, page, per_page=20):
"""Query information about the works of a pixiv user""" """Query information about the works of a pixiv user"""
params = { endpoint = "users/{user}/works".format(user=user_id)
"page": page, params = {"image_sizes": "large"}
"per_page": per_page, return self._pagination(endpoint, params)
"image_sizes": "large",
}
response = self.session.get(
"https://public-api.secure.pixiv.net/v1/users/"
"{user}/works.json".format(user=user_id), params=params
)
return self._parse(response)
@require_login def user_favorite_works(self, user_id):
def user_favorite_works(self, user_id, page, per_page=20): """Query information about the favorite works of a pixiv user"""
"""Query information about the favorites works of a pixiv user""" endpoint = "users/{user}/favorite_works".format(user=user_id)
params = { params = {"image_sizes": "large", "include_stats": False}
"page": page, return self._pagination(endpoint, params)
"per_page": per_page,
"include_stats": False,
"image_sizes": "large",
}
response = self.session.get(
"https://public-api.secure.pixiv.net/v1/users/"
"{user}/favorite_works.json".format(user=user_id), params=params
)
return self._parse(response)
def login(self): def login(self):
"""Login and gain a Pixiv Public-API access token""" """Login and gain a Pixiv Public-API access token"""
self.user_id, auth_header = self._login_impl( self.user_info, access_token = self._login_impl(
self.username, self.password) self.username, self.password)
self.session.headers["Authorization"] = auth_header self.session.headers["Authorization"] = access_token
@cache(maxage=50*60, keyarg=1) @cache(maxage=50*60, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
@@ -317,27 +283,39 @@ class PixivAPI():
"grant_type": "password", "grant_type": "password",
"client_id": "bYGKuGVw91e0NMfPGp44euvGt59s", "client_id": "bYGKuGVw91e0NMfPGp44euvGt59s",
"client_secret": "HP3RmkgAmEGro0gn1x9ioawQE8WMfvLXDz3ZqxpK", "client_secret": "HP3RmkgAmEGro0gn1x9ioawQE8WMfvLXDz3ZqxpK",
'get_secure_url': 1, "get_secure_url": 1,
} }
response = self.session.post( response = self.session.post(
"https://oauth.secure.pixiv.net/auth/token", data=data "https://oauth.secure.pixiv.net/auth/token", data=data
) )
if response.status_code not in (200, 301, 302): if response.status_code != 200:
raise exception.AuthenticationError() raise exception.AuthenticationError()
try: try:
response = self._parse(response)["response"] response = response.json()["response"]
token = response["access_token"] token = response["access_token"]
user = response["user"]["id"] user = response["user"]
except: except KeyError:
raise Exception("Get access_token error! Response: %s" % (token)) raise Exception("Get token error! Response: %s" % (response))
return user, "Bearer " + token return user, "Bearer " + token
@staticmethod def _call(self, endpoint, params, _empty=[None]):
def _parse(response, empty=[None]): url = "https://public-api.secure.pixiv.net/v1/" + endpoint + ".json"
"""Parse a Pixiv Public-API response"""
data = response.json() self.login()
data = self.session.get(url, params=params).json()
status = data.get("status") status = data.get("status")
response = data.get("response", empty) response = data.get("response", _empty)
if status == "failure" or response == empty: if status == "failure" or response == _empty:
raise exception.NotFoundError() raise exception.NotFoundError()
return data return data
def _pagination(self, endpoint, params):
while True:
data = self._call(endpoint, params)
yield from data["response"]
pinfo = data["pagination"]
if pinfo["current"] == pinfo["pages"]:
return
params["page"] = pinfo["next"]