replace extractor.request() 'expect' argument

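with two keyword arguments:
- 'fatal': when set to False, 4xx responses (other than 429 and 430) are returned to the caller; defaults to True
- 'notfound': when set, raise NotFoundError with the given value on a 404 response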
This commit is contained in:
Mike Fährmann
2019-07-04 23:45:26 +02:00
parent 2ff73873f0
commit fdec59f8e2
16 changed files with 51 additions and 84 deletions

View File

@@ -79,9 +79,7 @@ class ArtstationExtractor(Extractor):
def get_user_info(self, username): def get_user_info(self, username):
"""Return metadata for a specific user""" """Return metadata for a specific user"""
url = "{}/users/{}/quick.json".format(self.root, username.lower()) url = "{}/users/{}/quick.json".format(self.root, username.lower())
response = self.request(url, expect=(404,)) response = self.request(url, notfound="user")
if response.status_code == 404:
raise exception.NotFoundError("user")
return response.json() return response.json()
def _pagination(self, url, params=None): def _pagination(self, url, params=None):

View File

@@ -66,8 +66,8 @@ class Extractor():
return config.interpolate( return config.interpolate(
("extractor", self.category, self.subcategory, key), default) ("extractor", self.category, self.subcategory, key), default)
def request(self, url, method="GET", *, session=None, def request(self, url, method="GET", *, session=None, retries=None,
encoding=None, expect=(), retries=None, **kwargs): encoding=None, fatal=True, notfound=None, **kwargs):
tries = 1 tries = 1
retries = self._retries if retries is None else retries retries = self._retries if retries is None else retries
session = self.session if session is None else session session = self.session if session is None else session
@@ -86,10 +86,13 @@ class Extractor():
raise exception.HttpError(exc) raise exception.HttpError(exc)
else: else:
code = response.status_code code = response.status_code
if 200 <= code < 400 or code in expect: if 200 <= code < 400 or not fatal and \
(400 <= code < 429 or 431 <= code < 500):
if encoding: if encoding:
response.encoding = encoding response.encoding = encoding
return response return response
if notfound and code == 404:
raise exception.NotFoundError(notfound)
if cloudflare.is_challenge(response): if cloudflare.is_challenge(response):
self.log.info("Solving Cloudflare challenge") self.log.info("Solving Cloudflare challenge")
url, domain, cookies = cloudflare.solve_challenge( url, domain, cookies = cloudflare.solve_challenge(
@@ -98,7 +101,7 @@ class Extractor():
continue continue
msg = "{}: {} for url: {}".format(code, response.reason, url) msg = "{}: {} for url: {}".format(code, response.reason, url)
if code < 500 and code != 429: if code < 500 and code != 429 and code != 430:
break break
self.log.debug("%s (%s/%s)", msg, tries, retries+1) self.log.debug("%s (%s/%s)", msg, tries, retries+1)

View File

@@ -416,7 +416,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
def deviations(self): def deviations(self):
url = "{}/{}/{}".format(self.root, self.user, self.path) url = "{}/{}/{}".format(self.root, self.user, self.path)
response = self._html_request(url, expect=range(400, 500)) response = self._html_request(url, fatal=False)
deviation_id = text.extract(response.text, '//deviation/', '"')[0] deviation_id = text.extract(response.text, '//deviation/', '"')[0]
if response.status_code >= 400 or not deviation_id: if response.status_code >= 400 or not deviation_id:
raise exception.NotFoundError("image") raise exception.NotFoundError("image")
@@ -767,7 +767,7 @@ class DeviantartAPI():
def user_profile(self, username): def user_profile(self, username):
"""Get user profile information""" """Get user profile information"""
endpoint = "user/profile/" + username endpoint = "user/profile/" + username
return self._call(endpoint, expect_error=True) return self._call(endpoint, fatal=False)
def authenticate(self, refresh_token): def authenticate(self, refresh_token):
"""Authenticate the application by requesting an access token""" """Authenticate the application by requesting an access token"""
@@ -797,7 +797,7 @@ class DeviantartAPI():
_refresh_token_cache.update(refresh_token, data["refresh_token"]) _refresh_token_cache.update(refresh_token, data["refresh_token"])
return "Bearer " + data["access_token"] return "Bearer " + data["access_token"]
def _call(self, endpoint, params=None, expect_error=False, public=True): def _call(self, endpoint, params=None, fatal=True, public=True):
"""Call an API endpoint""" """Call an API endpoint"""
url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
while True: while True:
@@ -806,11 +806,7 @@ class DeviantartAPI():
self.authenticate(None if public else self.refresh_token) self.authenticate(None if public else self.refresh_token)
response = self.extractor.request( response = self.extractor.request(
url, url, headers=self.headers, params=params, fatal=False)
params=params,
headers=self.headers,
expect=range(400, 500),
)
data = response.json() data = response.json()
status = response.status_code status = response.status_code
@@ -818,7 +814,7 @@ class DeviantartAPI():
if self.delay > self.delay_min: if self.delay > self.delay_min:
self.delay -= 1 self.delay -= 1
return data return data
if expect_error: if not fatal:
return None return None
if data.get("error_description") == "User not found.": if data.get("error_description") == "User not found.":
raise exception.NotFoundError("user or group") raise exception.NotFoundError("user or group")

View File

@@ -259,7 +259,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def _gallery_page(self): def _gallery_page(self):
url = "{}/g/{}/{}/".format( url = "{}/g/{}/{}/".format(
self.root, self.gallery_id, self.gallery_token) self.root, self.gallery_id, self.gallery_token)
response = self.request(url, expect=range(400, 500)) response = self.request(url, fatal=False)
page = response.text page = response.text
if response.status_code == 404 and "Gallery Not Available" in page: if response.status_code == 404 and "Gallery Not Available" in page:
@@ -271,7 +271,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def _image_page(self): def _image_page(self):
url = "{}/s/{}/{}-{}".format( url = "{}/s/{}/{}-{}".format(
self.root, self.image_token, self.gallery_id, self.image_num) self.root, self.image_token, self.gallery_id, self.image_num)
page = self.request(url, expect=range(400, 500)).text page = self.request(url, fatal=False).text
if page.startswith(("Invalid page", "Keep trying")): if page.startswith(("Invalid page", "Keep trying")):
raise exception.NotFoundError("image page") raise exception.NotFoundError("image page")

View File

@@ -16,16 +16,15 @@ import json
class ImgurExtractor(Extractor): class ImgurExtractor(Extractor):
"""Base class for imgur extractors""" """Base class for imgur extractors"""
category = "imgur" category = "imgur"
root = "https://imgur.com"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.item_id = match.group(1) self.item_id = match.group(1)
self.mp4 = self.config("mp4", True) self.mp4 = self.config("mp4", True)
def _get_data(self, urlpart): def _get_data(self, path):
response = self.request("https://imgur.com/" + urlpart, expect=(404,)) response = self.request(self.root + path, notfound=self.subcategory)
if response.status_code == 404:
raise exception.NotFoundError(self.subcategory)
data = text.extract(response.text, "image : ", ",\n")[0] data = text.extract(response.text, "image : ", ",\n")[0]
return self._clean(json.loads(data)) return self._clean(json.loads(data))
@@ -102,7 +101,7 @@ class ImgurImageExtractor(ImgurExtractor):
) )
def items(self): def items(self):
image = self._get_data(self.item_id) image = self._get_data("/" + self.item_id)
url = self._prepare(image) url = self._prepare(image)
yield Message.Version, 1 yield Message.Version, 1
@@ -165,13 +164,13 @@ class ImgurAlbumExtractor(ImgurExtractor):
) )
def items(self): def items(self):
album = self._get_data("a/" + self.item_id + "/all") album = self._get_data("/a/" + self.item_id + "/all")
images = album["album_images"]["images"] images = album["album_images"]["images"]
del album["album_images"] del album["album_images"]
if int(album["num_images"]) > len(images): if int(album["num_images"]) > len(images):
url = ("https://imgur.com/ajaxalbums/getimages/" + url = "{}/ajaxalbums/getimages/{}/hit.json".format(
self.item_id + "/hit.json") self.root, self.item_id)
images = self.request(url).json()["data"]["images"] images = self.request(url).json()["data"]["images"]
yield Message.Version, 1 yield Message.Version, 1

View File

@@ -106,13 +106,8 @@ class NijieExtractor(AsynchronousMixin, Extractor):
params = {"id": self.user_id, "p": 1} params = {"id": self.user_id, "p": 1}
while True: while True:
response = self.request(url, params=params, expect=(404,)) page = self.request(url, params=params, notfound="artist").text
if response.status_code == 404: yield from text.extract_iter(page, 'illust_id="', '"')
raise exception.NotFoundError("artist")
page = response.text
ids = list(text.extract_iter(page, ' illust_id="', '"'))
yield from ids
if '<a rel="next"' not in page: if '<a rel="next"' not in page:
return return
@@ -190,10 +185,8 @@ class NijieImageExtractor(NijieExtractor):
self.page = "" self.page = ""
def get_job_metadata(self): def get_job_metadata(self):
response = self.request(self.view_url + self.image_id, expect=(404,)) self.page = self.request(
if response.status_code == 404: self.view_url + self.image_id, notfound="image").text
raise exception.NotFoundError("image")
self.page = response.text
self.user_id = text.extract( self.user_id = text.extract(
self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0] self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0]
return NijieExtractor.get_job_metadata(self) return NijieExtractor.get_job_metadata(self)

View File

@@ -228,14 +228,14 @@ class PinterestAPI():
params = {"data": json.dumps({"options": options}), "source_url": ""} params = {"data": json.dumps({"options": options}), "source_url": ""}
response = self.extractor.request( response = self.extractor.request(
url, params=params, headers=self.HEADERS, expect=range(400, 500)) url, params=params, headers=self.HEADERS, fatal=False)
try: try:
data = response.json() data = response.json()
except ValueError: except ValueError:
data = {} data = {}
if 200 <= response.status_code < 400 and not response.history: if response.status_code < 400 and not response.history:
return data return data
if response.status_code == 404 or response.history: if response.status_code == 404 or response.history:

View File

@@ -143,9 +143,7 @@ class PixivMeExtractor(PixivExtractor):
def items(self): def items(self):
url = "https://pixiv.me/" + self.account url = "https://pixiv.me/" + self.account
response = self.request( response = self.request(
url, method="HEAD", allow_redirects=False, expect=(404,)) url, method="HEAD", allow_redirects=False, notfound="user")
if response.status_code == 404:
raise exception.NotFoundError("user")
yield Message.Version, 1 yield Message.Version, 1
yield Message.Queue, response.headers["Location"], {} yield Message.Queue, response.headers["Location"], {}
@@ -445,7 +443,7 @@ class PixivAppAPI():
data["password"] = password data["password"] = password
response = self.extractor.request( response = self.extractor.request(
url, method="POST", data=data, expect=(400,)) url, method="POST", data=data, fatal=False)
if response.status_code >= 400: if response.status_code >= 400:
raise exception.AuthenticationError() raise exception.AuthenticationError()
@@ -491,10 +489,9 @@ class PixivAppAPI():
url = "https://app-api.pixiv.net/" + endpoint url = "https://app-api.pixiv.net/" + endpoint
self.login() self.login()
response = self.extractor.request( response = self.extractor.request(url, params=params, fatal=False)
url, params=params, expect=range(400, 500))
if 200 <= response.status_code < 400: if response.status_code < 400:
return response.json() return response.json()
if response.status_code == 404: if response.status_code == 404:
raise exception.NotFoundError() raise exception.NotFoundError()

View File

@@ -235,8 +235,7 @@ class RedditAPI():
url = "https://oauth.reddit.com" + endpoint url = "https://oauth.reddit.com" + endpoint
params["raw_json"] = 1 params["raw_json"] = 1
self.authenticate() self.authenticate()
response = self.extractor.request( response = self.extractor.request(url, params=params, fatal=False)
url, params=params, expect=range(400, 500))
remaining = response.headers.get("x-ratelimit-remaining") remaining = response.headers.get("x-ratelimit-remaining")
if remaining and float(remaining) < 2: if remaining and float(remaining) < 2:
wait = int(response.headers["x-ratelimit-reset"]) wait = int(response.headers["x-ratelimit-reset"])

View File

@@ -110,8 +110,8 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor):
yield Message.Version, 1 yield Message.Version, 1
while True: while True:
url = "{}/{}/page/{}/".format(self.root, self.path, pnum) url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
response = self.request(url, expect=(404,)) response = self.request(url, fatal=False)
if response.status_code == 404: if response.status_code >= 400:
return return
for url in text.extract_iter(response.text, 'data-direct="', '"'): for url in text.extract_iter(response.text, 'data-direct="', '"'):
if url != last: if url != last:

View File

@@ -43,9 +43,7 @@ class SeigaExtractor(Extractor):
"""Get url for an image with id 'image_id'""" """Get url for an image with id 'image_id'"""
url = "{}/image/source/{}".format(self.root, image_id) url = "{}/image/source/{}".format(self.root, image_id)
response = self.request( response = self.request(
url, method="HEAD", allow_redirects=False, expect=(404,)) url, method="HEAD", allow_redirects=False, notfound="image")
if response.status_code == 404:
raise exception.NotFoundError("image")
return response.headers["Location"].replace("/o/", "/priv/", 1) return response.headers["Location"].replace("/o/", "/priv/", 1)
def login(self): def login(self):

View File

@@ -49,10 +49,10 @@ class SexcomExtractor(Extractor):
return return
url = text.urljoin(self.root, url) url = text.urljoin(self.root, url)
def _parse_pin(self, url, expect=range(400, 429)): def _parse_pin(self, url):
response = self.request(url, expect=expect) response = self.request(url, fatal=False)
if response.status_code >= 400: if response.status_code >= 400:
self.log.warning("Unable to fetch %s (%s: %s)", self.log.warning('Unable to fetch %s ("%s: %s")',
url, response.status_code, response.reason) url, response.status_code, response.reason)
return None return None
extr = text.extract_from(response.text) extr = text.extract_from(response.text)

View File

@@ -10,7 +10,6 @@
from .common import Extractor, Message, SharedConfigMixin, generate_extractors from .common import Extractor, Message, SharedConfigMixin, generate_extractors
from .. import text from .. import text
import time
import re import re
@@ -24,19 +23,9 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.item_url = self.root + match.group(1) self.item_url = self.root + match.group(1)
def request(self, url, method="GET", expect=range(400, 500), **kwargs): def request(self, url, **kwargs):
tries = 0 kwargs["retries"] = float("inf")
kwargs["expect"] = expect return Extractor.request(self, url, **kwargs)
while True:
response = Extractor.request(self, url, method, **kwargs)
if response.status_code not in (429, 430):
return response
tries += 1
waittime = 2 ** (tries + 2)
self.log.warning(
"HTTP status %s: %s - Waiting for %d seconds",
response.status_code, response.reason, waittime)
time.sleep(waittime)
def items(self): def items(self):
data = self.metadata() data = self.metadata()
@@ -45,9 +34,10 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
headers = {"X-Requested-With": "XMLHttpRequest"} headers = {"X-Requested-With": "XMLHttpRequest"}
for url in self.products(): for url in self.products():
response = self.request(url + ".json", headers=headers) response = self.request(
url + ".json", headers=headers, fatal=False)
if response.status_code >= 400: if response.status_code >= 400:
self.log.warning('Skipping %s ("%d: %s")', self.log.warning('Skipping %s ("%s: %s")',
url, response.status_code, response.reason) url, response.status_code, response.reason)
continue continue
product = response.json()["product"] product = response.json()["product"]

View File

@@ -107,9 +107,9 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
def images(self, page): def images(self, page):
url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id) url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
headers = {"Referer": self.chapter_url} headers = {"Referer": self.chapter_url}
response = self.request(url, headers=headers, expect=(404,)) response = self.request(url, headers=headers, fatal=False)
if response.status_code == 404: if response.status_code >= 400:
url = "{}/Read/View/{}".format(self.root, self.gallery_id) url = "{}/Read/View/{}".format(self.root, self.gallery_id)
self.log.error( self.log.error(
"Failed to get gallery JSON data. Visit '%s' in a browser " "Failed to get gallery JSON data. Visit '%s' in a browser "

View File

@@ -18,12 +18,6 @@ class XvideosExtractor(Extractor):
category = "xvideos" category = "xvideos"
root = "https://www.xvideos.com" root = "https://www.xvideos.com"
def get_page(self, url, codes=(403, 404)):
response = self.request(url, expect=codes)
if response.status_code in codes:
raise exception.NotFoundError(self.subcategory)
return response.text
class XvideosGalleryExtractor(XvideosExtractor): class XvideosGalleryExtractor(XvideosExtractor):
"""Extractor for user profile galleries from xvideos.com""" """Extractor for user profile galleries from xvideos.com"""
@@ -50,7 +44,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
def items(self): def items(self):
url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid) url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
page = self.get_page(url) page = self.request(url, notfound=self.subcategory).text
data = self.get_metadata(page) data = self.get_metadata(page)
imgs = self.get_images(page) imgs = self.get_images(page)
data["count"] = len(imgs) data["count"] = len(imgs)
@@ -113,7 +107,7 @@ class XvideosUserExtractor(XvideosExtractor):
def items(self): def items(self):
url = "{}/profiles/{}".format(self.root, self.user) url = "{}/profiles/{}".format(self.root, self.user)
page = self.get_page(url) page = self.request(url, notfound=self.subcategory).text
data = json.loads(text.extract( data = json.loads(text.extract(
page, "xv.conf=", ";</script>")[0])["data"] page, "xv.conf=", ";</script>")[0])["data"]

View File

@@ -126,7 +126,7 @@ class OAuth1API():
self.session = extractor.session self.session = extractor.session
self.api_key = api_key self.api_key = api_key
def request(self, url, method="GET", *, expect=range(400, 500), **kwargs): def request(self, url, method="GET", **kwargs):
kwargs["expect"] = expect kwargs["fatal"] = False
kwargs["session"] = self.session kwargs["session"] = self.session
return self.extractor.request(url, method, **kwargs) return self.extractor.request(url, method, **kwargs)