replace extractor.request() 'expect' argument with:
- 'fatal': when False, return 4xx responses (except 429 and 430) instead of raising
- 'notfound': raise NotFoundError with the given name on a 404 response
This commit is contained in:
@@ -79,9 +79,7 @@ class ArtstationExtractor(Extractor):
|
||||
def get_user_info(self, username):
|
||||
"""Return metadata for a specific user"""
|
||||
url = "{}/users/{}/quick.json".format(self.root, username.lower())
|
||||
response = self.request(url, expect=(404,))
|
||||
if response.status_code == 404:
|
||||
raise exception.NotFoundError("user")
|
||||
response = self.request(url, notfound="user")
|
||||
return response.json()
|
||||
|
||||
def _pagination(self, url, params=None):
|
||||
|
||||
@@ -66,8 +66,8 @@ class Extractor():
|
||||
return config.interpolate(
|
||||
("extractor", self.category, self.subcategory, key), default)
|
||||
|
||||
def request(self, url, method="GET", *, session=None,
|
||||
encoding=None, expect=(), retries=None, **kwargs):
|
||||
def request(self, url, method="GET", *, session=None, retries=None,
|
||||
encoding=None, fatal=True, notfound=None, **kwargs):
|
||||
tries = 1
|
||||
retries = self._retries if retries is None else retries
|
||||
session = self.session if session is None else session
|
||||
@@ -86,10 +86,13 @@ class Extractor():
|
||||
raise exception.HttpError(exc)
|
||||
else:
|
||||
code = response.status_code
|
||||
if 200 <= code < 400 or code in expect:
|
||||
if 200 <= code < 400 or not fatal and \
|
||||
(400 <= code < 429 or 431 <= code < 500):
|
||||
if encoding:
|
||||
response.encoding = encoding
|
||||
return response
|
||||
if notfound and code == 404:
|
||||
raise exception.NotFoundError(notfound)
|
||||
if cloudflare.is_challenge(response):
|
||||
self.log.info("Solving Cloudflare challenge")
|
||||
url, domain, cookies = cloudflare.solve_challenge(
|
||||
@@ -98,7 +101,7 @@ class Extractor():
|
||||
continue
|
||||
|
||||
msg = "{}: {} for url: {}".format(code, response.reason, url)
|
||||
if code < 500 and code != 429:
|
||||
if code < 500 and code != 429 and code != 430:
|
||||
break
|
||||
|
||||
self.log.debug("%s (%s/%s)", msg, tries, retries+1)
|
||||
|
||||
@@ -416,7 +416,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
|
||||
|
||||
def deviations(self):
|
||||
url = "{}/{}/{}".format(self.root, self.user, self.path)
|
||||
response = self._html_request(url, expect=range(400, 500))
|
||||
response = self._html_request(url, fatal=False)
|
||||
deviation_id = text.extract(response.text, '//deviation/', '"')[0]
|
||||
if response.status_code >= 400 or not deviation_id:
|
||||
raise exception.NotFoundError("image")
|
||||
@@ -767,7 +767,7 @@ class DeviantartAPI():
|
||||
def user_profile(self, username):
|
||||
"""Get user profile information"""
|
||||
endpoint = "user/profile/" + username
|
||||
return self._call(endpoint, expect_error=True)
|
||||
return self._call(endpoint, fatal=False)
|
||||
|
||||
def authenticate(self, refresh_token):
|
||||
"""Authenticate the application by requesting an access token"""
|
||||
@@ -797,7 +797,7 @@ class DeviantartAPI():
|
||||
_refresh_token_cache.update(refresh_token, data["refresh_token"])
|
||||
return "Bearer " + data["access_token"]
|
||||
|
||||
def _call(self, endpoint, params=None, expect_error=False, public=True):
|
||||
def _call(self, endpoint, params=None, fatal=True, public=True):
|
||||
"""Call an API endpoint"""
|
||||
url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
|
||||
while True:
|
||||
@@ -806,11 +806,7 @@ class DeviantartAPI():
|
||||
|
||||
self.authenticate(None if public else self.refresh_token)
|
||||
response = self.extractor.request(
|
||||
url,
|
||||
params=params,
|
||||
headers=self.headers,
|
||||
expect=range(400, 500),
|
||||
)
|
||||
url, headers=self.headers, params=params, fatal=False)
|
||||
data = response.json()
|
||||
status = response.status_code
|
||||
|
||||
@@ -818,7 +814,7 @@ class DeviantartAPI():
|
||||
if self.delay > self.delay_min:
|
||||
self.delay -= 1
|
||||
return data
|
||||
if expect_error:
|
||||
if not fatal:
|
||||
return None
|
||||
if data.get("error_description") == "User not found.":
|
||||
raise exception.NotFoundError("user or group")
|
||||
|
||||
@@ -259,7 +259,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
|
||||
def _gallery_page(self):
|
||||
url = "{}/g/{}/{}/".format(
|
||||
self.root, self.gallery_id, self.gallery_token)
|
||||
response = self.request(url, expect=range(400, 500))
|
||||
response = self.request(url, fatal=False)
|
||||
page = response.text
|
||||
|
||||
if response.status_code == 404 and "Gallery Not Available" in page:
|
||||
@@ -271,7 +271,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
|
||||
def _image_page(self):
|
||||
url = "{}/s/{}/{}-{}".format(
|
||||
self.root, self.image_token, self.gallery_id, self.image_num)
|
||||
page = self.request(url, expect=range(400, 500)).text
|
||||
page = self.request(url, fatal=False).text
|
||||
|
||||
if page.startswith(("Invalid page", "Keep trying")):
|
||||
raise exception.NotFoundError("image page")
|
||||
|
||||
@@ -16,16 +16,15 @@ import json
|
||||
class ImgurExtractor(Extractor):
|
||||
"""Base class for imgur extractors"""
|
||||
category = "imgur"
|
||||
root = "https://imgur.com"
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self, match)
|
||||
self.item_id = match.group(1)
|
||||
self.mp4 = self.config("mp4", True)
|
||||
|
||||
def _get_data(self, urlpart):
|
||||
response = self.request("https://imgur.com/" + urlpart, expect=(404,))
|
||||
if response.status_code == 404:
|
||||
raise exception.NotFoundError(self.subcategory)
|
||||
def _get_data(self, path):
|
||||
response = self.request(self.root + path, notfound=self.subcategory)
|
||||
data = text.extract(response.text, "image : ", ",\n")[0]
|
||||
return self._clean(json.loads(data))
|
||||
|
||||
@@ -102,7 +101,7 @@ class ImgurImageExtractor(ImgurExtractor):
|
||||
)
|
||||
|
||||
def items(self):
|
||||
image = self._get_data(self.item_id)
|
||||
image = self._get_data("/" + self.item_id)
|
||||
url = self._prepare(image)
|
||||
|
||||
yield Message.Version, 1
|
||||
@@ -165,13 +164,13 @@ class ImgurAlbumExtractor(ImgurExtractor):
|
||||
)
|
||||
|
||||
def items(self):
|
||||
album = self._get_data("a/" + self.item_id + "/all")
|
||||
album = self._get_data("/a/" + self.item_id + "/all")
|
||||
images = album["album_images"]["images"]
|
||||
del album["album_images"]
|
||||
|
||||
if int(album["num_images"]) > len(images):
|
||||
url = ("https://imgur.com/ajaxalbums/getimages/" +
|
||||
self.item_id + "/hit.json")
|
||||
url = "{}/ajaxalbums/getimages/{}/hit.json".format(
|
||||
self.root, self.item_id)
|
||||
images = self.request(url).json()["data"]["images"]
|
||||
|
||||
yield Message.Version, 1
|
||||
|
||||
@@ -106,13 +106,8 @@ class NijieExtractor(AsynchronousMixin, Extractor):
|
||||
params = {"id": self.user_id, "p": 1}
|
||||
|
||||
while True:
|
||||
response = self.request(url, params=params, expect=(404,))
|
||||
if response.status_code == 404:
|
||||
raise exception.NotFoundError("artist")
|
||||
|
||||
page = response.text
|
||||
ids = list(text.extract_iter(page, ' illust_id="', '"'))
|
||||
yield from ids
|
||||
page = self.request(url, params=params, notfound="artist").text
|
||||
yield from text.extract_iter(page, 'illust_id="', '"')
|
||||
|
||||
if '<a rel="next"' not in page:
|
||||
return
|
||||
@@ -190,10 +185,8 @@ class NijieImageExtractor(NijieExtractor):
|
||||
self.page = ""
|
||||
|
||||
def get_job_metadata(self):
|
||||
response = self.request(self.view_url + self.image_id, expect=(404,))
|
||||
if response.status_code == 404:
|
||||
raise exception.NotFoundError("image")
|
||||
self.page = response.text
|
||||
self.page = self.request(
|
||||
self.view_url + self.image_id, notfound="image").text
|
||||
self.user_id = text.extract(
|
||||
self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0]
|
||||
return NijieExtractor.get_job_metadata(self)
|
||||
|
||||
@@ -228,14 +228,14 @@ class PinterestAPI():
|
||||
params = {"data": json.dumps({"options": options}), "source_url": ""}
|
||||
|
||||
response = self.extractor.request(
|
||||
url, params=params, headers=self.HEADERS, expect=range(400, 500))
|
||||
url, params=params, headers=self.HEADERS, fatal=False)
|
||||
|
||||
try:
|
||||
data = response.json()
|
||||
except ValueError:
|
||||
data = {}
|
||||
|
||||
if 200 <= response.status_code < 400 and not response.history:
|
||||
if response.status_code < 400 and not response.history:
|
||||
return data
|
||||
|
||||
if response.status_code == 404 or response.history:
|
||||
|
||||
@@ -143,9 +143,7 @@ class PixivMeExtractor(PixivExtractor):
|
||||
def items(self):
|
||||
url = "https://pixiv.me/" + self.account
|
||||
response = self.request(
|
||||
url, method="HEAD", allow_redirects=False, expect=(404,))
|
||||
if response.status_code == 404:
|
||||
raise exception.NotFoundError("user")
|
||||
url, method="HEAD", allow_redirects=False, notfound="user")
|
||||
yield Message.Version, 1
|
||||
yield Message.Queue, response.headers["Location"], {}
|
||||
|
||||
@@ -445,7 +443,7 @@ class PixivAppAPI():
|
||||
data["password"] = password
|
||||
|
||||
response = self.extractor.request(
|
||||
url, method="POST", data=data, expect=(400,))
|
||||
url, method="POST", data=data, fatal=False)
|
||||
if response.status_code >= 400:
|
||||
raise exception.AuthenticationError()
|
||||
|
||||
@@ -491,10 +489,9 @@ class PixivAppAPI():
|
||||
url = "https://app-api.pixiv.net/" + endpoint
|
||||
|
||||
self.login()
|
||||
response = self.extractor.request(
|
||||
url, params=params, expect=range(400, 500))
|
||||
response = self.extractor.request(url, params=params, fatal=False)
|
||||
|
||||
if 200 <= response.status_code < 400:
|
||||
if response.status_code < 400:
|
||||
return response.json()
|
||||
if response.status_code == 404:
|
||||
raise exception.NotFoundError()
|
||||
|
||||
@@ -235,8 +235,7 @@ class RedditAPI():
|
||||
url = "https://oauth.reddit.com" + endpoint
|
||||
params["raw_json"] = 1
|
||||
self.authenticate()
|
||||
response = self.extractor.request(
|
||||
url, params=params, expect=range(400, 500))
|
||||
response = self.extractor.request(url, params=params, fatal=False)
|
||||
remaining = response.headers.get("x-ratelimit-remaining")
|
||||
if remaining and float(remaining) < 2:
|
||||
wait = int(response.headers["x-ratelimit-reset"])
|
||||
|
||||
@@ -110,8 +110,8 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor):
|
||||
yield Message.Version, 1
|
||||
while True:
|
||||
url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
|
||||
response = self.request(url, expect=(404,))
|
||||
if response.status_code == 404:
|
||||
response = self.request(url, fatal=False)
|
||||
if response.status_code >= 400:
|
||||
return
|
||||
for url in text.extract_iter(response.text, 'data-direct="', '"'):
|
||||
if url != last:
|
||||
|
||||
@@ -43,9 +43,7 @@ class SeigaExtractor(Extractor):
|
||||
"""Get url for an image with id 'image_id'"""
|
||||
url = "{}/image/source/{}".format(self.root, image_id)
|
||||
response = self.request(
|
||||
url, method="HEAD", allow_redirects=False, expect=(404,))
|
||||
if response.status_code == 404:
|
||||
raise exception.NotFoundError("image")
|
||||
url, method="HEAD", allow_redirects=False, notfound="image")
|
||||
return response.headers["Location"].replace("/o/", "/priv/", 1)
|
||||
|
||||
def login(self):
|
||||
|
||||
@@ -49,10 +49,10 @@ class SexcomExtractor(Extractor):
|
||||
return
|
||||
url = text.urljoin(self.root, url)
|
||||
|
||||
def _parse_pin(self, url, expect=range(400, 429)):
|
||||
response = self.request(url, expect=expect)
|
||||
def _parse_pin(self, url):
|
||||
response = self.request(url, fatal=False)
|
||||
if response.status_code >= 400:
|
||||
self.log.warning("Unable to fetch %s (%s: %s)",
|
||||
self.log.warning('Unable to fetch %s ("%s: %s")',
|
||||
url, response.status_code, response.reason)
|
||||
return None
|
||||
extr = text.extract_from(response.text)
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
|
||||
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
|
||||
from .. import text
|
||||
import time
|
||||
import re
|
||||
|
||||
|
||||
@@ -24,19 +23,9 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
|
||||
Extractor.__init__(self, match)
|
||||
self.item_url = self.root + match.group(1)
|
||||
|
||||
def request(self, url, method="GET", expect=range(400, 500), **kwargs):
|
||||
tries = 0
|
||||
kwargs["expect"] = expect
|
||||
while True:
|
||||
response = Extractor.request(self, url, method, **kwargs)
|
||||
if response.status_code not in (429, 430):
|
||||
return response
|
||||
tries += 1
|
||||
waittime = 2 ** (tries + 2)
|
||||
self.log.warning(
|
||||
"HTTP status %s: %s - Waiting for %d seconds",
|
||||
response.status_code, response.reason, waittime)
|
||||
time.sleep(waittime)
|
||||
def request(self, url, **kwargs):
|
||||
kwargs["retries"] = float("inf")
|
||||
return Extractor.request(self, url, **kwargs)
|
||||
|
||||
def items(self):
|
||||
data = self.metadata()
|
||||
@@ -45,9 +34,10 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
|
||||
|
||||
headers = {"X-Requested-With": "XMLHttpRequest"}
|
||||
for url in self.products():
|
||||
response = self.request(url + ".json", headers=headers)
|
||||
response = self.request(
|
||||
url + ".json", headers=headers, fatal=False)
|
||||
if response.status_code >= 400:
|
||||
self.log.warning('Skipping %s ("%d: %s")',
|
||||
self.log.warning('Skipping %s ("%s: %s")',
|
||||
url, response.status_code, response.reason)
|
||||
continue
|
||||
product = response.json()["product"]
|
||||
|
||||
@@ -107,9 +107,9 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
|
||||
def images(self, page):
|
||||
url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
|
||||
headers = {"Referer": self.chapter_url}
|
||||
response = self.request(url, headers=headers, expect=(404,))
|
||||
response = self.request(url, headers=headers, fatal=False)
|
||||
|
||||
if response.status_code == 404:
|
||||
if response.status_code >= 400:
|
||||
url = "{}/Read/View/{}".format(self.root, self.gallery_id)
|
||||
self.log.error(
|
||||
"Failed to get gallery JSON data. Visit '%s' in a browser "
|
||||
|
||||
@@ -18,12 +18,6 @@ class XvideosExtractor(Extractor):
|
||||
category = "xvideos"
|
||||
root = "https://www.xvideos.com"
|
||||
|
||||
def get_page(self, url, codes=(403, 404)):
|
||||
response = self.request(url, expect=codes)
|
||||
if response.status_code in codes:
|
||||
raise exception.NotFoundError(self.subcategory)
|
||||
return response.text
|
||||
|
||||
|
||||
class XvideosGalleryExtractor(XvideosExtractor):
|
||||
"""Extractor for user profile galleries from xvideos.com"""
|
||||
@@ -50,7 +44,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
|
||||
|
||||
def items(self):
|
||||
url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
|
||||
page = self.get_page(url)
|
||||
page = self.request(url, notfound=self.subcategory).text
|
||||
data = self.get_metadata(page)
|
||||
imgs = self.get_images(page)
|
||||
data["count"] = len(imgs)
|
||||
@@ -113,7 +107,7 @@ class XvideosUserExtractor(XvideosExtractor):
|
||||
|
||||
def items(self):
|
||||
url = "{}/profiles/{}".format(self.root, self.user)
|
||||
page = self.get_page(url)
|
||||
page = self.request(url, notfound=self.subcategory).text
|
||||
data = json.loads(text.extract(
|
||||
page, "xv.conf=", ";</script>")[0])["data"]
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ class OAuth1API():
|
||||
self.session = extractor.session
|
||||
self.api_key = api_key
|
||||
|
||||
def request(self, url, method="GET", *, expect=range(400, 500), **kwargs):
|
||||
kwargs["expect"] = expect
|
||||
def request(self, url, method="GET", **kwargs):
|
||||
kwargs["fatal"] = False
|
||||
kwargs["session"] = self.session
|
||||
return self.extractor.request(url, method, **kwargs)
|
||||
|
||||
Reference in New Issue
Block a user