store the full original URL in Extractor.url
This commit is contained in:
@@ -19,8 +19,8 @@ class FutabaThreadExtractor(Extractor):
|
|||||||
directory_fmt = ("{category}", "{board_name}", "{thread}")
|
directory_fmt = ("{category}", "{board_name}", "{thread}")
|
||||||
filename_fmt = "{tim}.{extension}"
|
filename_fmt = "{tim}.{extension}"
|
||||||
archive_fmt = "{board}_{thread}_{tim}"
|
archive_fmt = "{board}_{thread}_{tim}"
|
||||||
urlfmt = "https://{server}.2chan.net/{board}/src/{filename}"
|
url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
|
||||||
pattern = r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"
|
pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
|
||||||
test = ("http://dec.2chan.net/70/res/947.htm", {
|
test = ("http://dec.2chan.net/70/res/947.htm", {
|
||||||
"url": "c5c12b80b290e224b6758507b3bb952044f4595b",
|
"url": "c5c12b80b290e224b6758507b3bb952044f4595b",
|
||||||
"keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
|
"keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
|
||||||
@@ -28,22 +28,23 @@ class FutabaThreadExtractor(Extractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
url, self.server, self.board, self.thread = match.groups()
|
self.server, self.board, self.thread = match.groups()
|
||||||
self.url = "https://" + url + ".htm"
|
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
page = self.request(self.url).text
|
url = "https://{}.2chan.net/{}/res/{}.htm".format(
|
||||||
data = self.get_metadata(page)
|
self.server, self.board, self.thread)
|
||||||
|
page = self.request(url).text
|
||||||
|
data = self.metadata(page)
|
||||||
yield Message.Version, 1
|
yield Message.Version, 1
|
||||||
yield Message.Directory, data
|
yield Message.Directory, data
|
||||||
for post in self.posts(page):
|
for post in self.posts(page):
|
||||||
if "filename" not in post:
|
if "filename" not in post:
|
||||||
continue
|
continue
|
||||||
post.update(data)
|
post.update(data)
|
||||||
url = self.urlfmt.format_map(post)
|
url = self.url_fmt.format_map(post)
|
||||||
yield Message.Url, url, post
|
yield Message.Url, url, post
|
||||||
|
|
||||||
def get_metadata(self, page):
|
def metadata(self, page):
|
||||||
"""Collect metadata for extractor-job"""
|
"""Collect metadata for extractor-job"""
|
||||||
title = text.extract(page, "<title>", "</title>")[0]
|
title = text.extract(page, "<title>", "</title>")[0]
|
||||||
title, _, boardname = title.rpartition(" - ")
|
title, _, boardname = title.rpartition(" - ")
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ class Extractor():
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
self.log = logging.getLogger(self.category)
|
self.log = logging.getLogger(self.category)
|
||||||
|
self.url = match.string
|
||||||
self._set_headers()
|
self._set_headers()
|
||||||
self._set_cookies()
|
self._set_cookies()
|
||||||
self._set_proxies()
|
self._set_proxies()
|
||||||
|
|||||||
@@ -312,7 +312,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
|
|||||||
"""Extractor for single deviations"""
|
"""Extractor for single deviations"""
|
||||||
subcategory = "deviation"
|
subcategory = "deviation"
|
||||||
archive_fmt = "{index}.{extension}"
|
archive_fmt = "{index}.{extension}"
|
||||||
pattern = BASE_PATTERN + r"/(?:art|journal)/[^/?&#]+-\d+"
|
pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)"
|
||||||
test = (
|
test = (
|
||||||
(("https://www.deviantart.com/shimoda7/art/"
|
(("https://www.deviantart.com/shimoda7/art/"
|
||||||
"For-the-sake-of-a-memory-10073852"), {
|
"For-the-sake-of-a-memory-10073852"), {
|
||||||
@@ -335,23 +335,22 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
DeviantartExtractor.__init__(self, match)
|
DeviantartExtractor.__init__(self, match)
|
||||||
self.url = match.group(0)
|
self.path = match.group(3)
|
||||||
if not self.url.startswith("http"):
|
|
||||||
self.url = "https://" + self.url
|
|
||||||
|
|
||||||
def deviations(self):
|
def deviations(self):
|
||||||
response = self.request(self.url, expect=range(400, 500))
|
url = "{}/{}/{}".format(self.root, self.user, self.path)
|
||||||
|
response = self.request(url, expect=range(400, 500))
|
||||||
deviation_id = text.extract(response.text, '//deviation/', '"')[0]
|
deviation_id = text.extract(response.text, '//deviation/', '"')[0]
|
||||||
if response.status_code >= 400 or not deviation_id:
|
if response.status_code >= 400 or not deviation_id:
|
||||||
raise exception.NotFoundError("image")
|
raise exception.NotFoundError("image")
|
||||||
return (self.api.deviation(deviation_id),)
|
return (self.api.deviation(deviation_id),)
|
||||||
|
|
||||||
|
|
||||||
class DeviantartStashExtractor(DeviantartDeviationExtractor):
|
class DeviantartStashExtractor(DeviantartExtractor):
|
||||||
"""Extractor for sta.sh-ed deviations"""
|
"""Extractor for sta.sh-ed deviations"""
|
||||||
subcategory = "stash"
|
subcategory = "stash"
|
||||||
archive_fmt = "{index}.{extension}"
|
archive_fmt = "{index}.{extension}"
|
||||||
pattern = r"(?:https?://)?sta\.sh/()()[a-z0-9]+"
|
pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)"
|
||||||
test = (
|
test = (
|
||||||
("https://sta.sh/022c83odnaxc", {
|
("https://sta.sh/022c83odnaxc", {
|
||||||
"pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net",
|
"pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net",
|
||||||
@@ -366,8 +365,13 @@ class DeviantartStashExtractor(DeviantartDeviationExtractor):
|
|||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def __init__(self, match):
|
||||||
|
DeviantartExtractor.__init__(self, match)
|
||||||
|
self.stash_id = match.group(1)
|
||||||
|
|
||||||
def deviations(self):
|
def deviations(self):
|
||||||
page = self.request(self.url).text
|
url = "https://sta.sh/" + self.stash_id
|
||||||
|
page = self.request(url).text
|
||||||
deviation_id = text.extract(page, '//deviation/', '"')[0]
|
deviation_id = text.extract(page, '//deviation/', '"')[0]
|
||||||
|
|
||||||
if deviation_id:
|
if deviation_id:
|
||||||
|
|||||||
@@ -40,7 +40,6 @@ class DirectlinkExtractor(Extractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.data = match.groupdict()
|
self.data = match.groupdict()
|
||||||
self.url = match.string
|
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
text.nameext_from_url(self.url, self.data)
|
text.nameext_from_url(self.url, self.data)
|
||||||
|
|||||||
@@ -340,7 +340,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
|
|||||||
ExhentaiExtractor.__init__(self, match)
|
ExhentaiExtractor.__init__(self, match)
|
||||||
self.params = text.parse_query(match.group(1) or "")
|
self.params = text.parse_query(match.group(1) or "")
|
||||||
self.params["page"] = text.parse_int(self.params.get("page"))
|
self.params["page"] = text.parse_int(self.params.get("page"))
|
||||||
self.url = self.root
|
self.search_url = self.root
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
self.login()
|
self.login()
|
||||||
@@ -348,7 +348,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
|
|||||||
yield Message.Version, 1
|
yield Message.Version, 1
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
page = self.request(self.url, params=self.params).text
|
page = self.request(self.search_url, params=self.params).text
|
||||||
|
|
||||||
for row in text.extract_iter(page, '<tr class="gtr', '</tr>'):
|
for row in text.extract_iter(page, '<tr class="gtr', '</tr>'):
|
||||||
yield self._parse_row(row)
|
yield self._parse_row(row)
|
||||||
@@ -397,7 +397,7 @@ class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
ExhentaiSearchExtractor.__init__(self, match)
|
ExhentaiSearchExtractor.__init__(self, match)
|
||||||
self.url = self.root + "/favorites.php"
|
self.search_url = self.root + "/favorites.php"
|
||||||
|
|
||||||
def init(self):
|
def init(self):
|
||||||
# The first request to '/favorites.php' will return an empty list
|
# The first request to '/favorites.php' will return an empty list
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ class HentaifoundryExtractor(Extractor):
|
|||||||
|
|
||||||
def __init__(self, match, user="", page=1):
|
def __init__(self, match, user="", page=1):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.url = ""
|
self.page_url = ""
|
||||||
self.user = user
|
self.user = user
|
||||||
self.start_post = 0
|
self.start_post = 0
|
||||||
self.start_page = text.parse_int(page, 1)
|
self.start_page = text.parse_int(page, 1)
|
||||||
@@ -55,7 +55,7 @@ class HentaifoundryExtractor(Extractor):
|
|||||||
num = self.start_page
|
num = self.start_page
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
page = self.request("{}/page/{}".format(self.url, num)).text
|
page = self.request("{}/page/{}".format(self.page_url, num)).text
|
||||||
yield from text.extract_iter(page, 'thumbTitle"><a href="', '"')
|
yield from text.extract_iter(page, 'thumbTitle"><a href="', '"')
|
||||||
|
|
||||||
if 'class="pager"' not in page or 'class="last hidden"' in page:
|
if 'class="pager"' not in page or 'class="last hidden"' in page:
|
||||||
@@ -135,10 +135,10 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
HentaifoundryExtractor.__init__(
|
HentaifoundryExtractor.__init__(
|
||||||
self, match, match.group(1) or match.group(3), match.group(2))
|
self, match, match.group(1) or match.group(3), match.group(2))
|
||||||
self.url = "{}/pictures/user/{}".format(self.root, self.user)
|
self.page_url = "{}/pictures/user/{}".format(self.root, self.user)
|
||||||
|
|
||||||
def get_job_metadata(self):
|
def get_job_metadata(self):
|
||||||
page = self.request(self.url + "?enterAgree=1").text
|
page = self.request(self.page_url + "?enterAgree=1").text
|
||||||
count = text.extract(page, ">Pictures (", ")")[0]
|
count = text.extract(page, ">Pictures (", ")")[0]
|
||||||
return {"user": self.user, "count": text.parse_int(count)}
|
return {"user": self.user, "count": text.parse_int(count)}
|
||||||
|
|
||||||
@@ -161,10 +161,11 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
HentaifoundryExtractor.__init__(
|
HentaifoundryExtractor.__init__(
|
||||||
self, match, match.group(1), match.group(2))
|
self, match, match.group(1), match.group(2))
|
||||||
self.url = "{}/pictures/user/{}/scraps".format(self.root, self.user)
|
self.page_url = "{}/pictures/user/{}/scraps".format(
|
||||||
|
self.root, self.user)
|
||||||
|
|
||||||
def get_job_metadata(self):
|
def get_job_metadata(self):
|
||||||
page = self.request(self.url + "?enterAgree=1").text
|
page = self.request(self.page_url + "?enterAgree=1").text
|
||||||
count = text.extract(page, ">Scraps (", ")")[0]
|
count = text.extract(page, ">Scraps (", ")")[0]
|
||||||
return {"user": self.user, "count": text.parse_int(count)}
|
return {"user": self.user, "count": text.parse_int(count)}
|
||||||
|
|
||||||
@@ -188,7 +189,8 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
HentaifoundryExtractor.__init__(
|
HentaifoundryExtractor.__init__(
|
||||||
self, match, match.group(1), match.group(2))
|
self, match, match.group(1), match.group(2))
|
||||||
self.url = "{}/user/{}/faves/pictures".format(self.root, self.user)
|
self.page_url = "{}/user/{}/faves/pictures".format(
|
||||||
|
self.root, self.user)
|
||||||
|
|
||||||
|
|
||||||
class HentaifoundryRecentExtractor(HentaifoundryExtractor):
|
class HentaifoundryRecentExtractor(HentaifoundryExtractor):
|
||||||
@@ -203,7 +205,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
HentaifoundryExtractor.__init__(self, match, "", match.group(2))
|
HentaifoundryExtractor.__init__(self, match, "", match.group(2))
|
||||||
self.date = match.group(1)
|
self.date = match.group(1)
|
||||||
self.url = "{}/pictures/recent/{}".format(self.root, self.date)
|
self.page_url = "{}/pictures/recent/{}".format(self.root, self.date)
|
||||||
|
|
||||||
def get_job_metadata(self):
|
def get_job_metadata(self):
|
||||||
self.request(self.root + "/?enterAgree=1")
|
self.request(self.root + "/?enterAgree=1")
|
||||||
@@ -221,7 +223,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
HentaifoundryExtractor.__init__(self, match, "", match.group(1))
|
HentaifoundryExtractor.__init__(self, match, "", match.group(1))
|
||||||
self.url = self.root + "/pictures/popular"
|
self.page_url = self.root + "/pictures/popular"
|
||||||
|
|
||||||
|
|
||||||
class HentaifoundryImageExtractor(HentaifoundryExtractor):
|
class HentaifoundryImageExtractor(HentaifoundryExtractor):
|
||||||
|
|||||||
@@ -27,7 +27,8 @@ class ImagehostImageExtractor(SharedConfigMixin, Extractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.url = ("https://" if self.https else "http://") + match.group(1)
|
self.page_url = "http{}://{}".format(
|
||||||
|
"s" if self.https else "", match.group(1))
|
||||||
self.token = match.group(2)
|
self.token = match.group(2)
|
||||||
if self.params == "simple":
|
if self.params == "simple":
|
||||||
self.params = {
|
self.params = {
|
||||||
@@ -47,7 +48,7 @@ class ImagehostImageExtractor(SharedConfigMixin, Extractor):
|
|||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
page = self.request(
|
page = self.request(
|
||||||
self.url,
|
self.page_url,
|
||||||
method=self.method,
|
method=self.method,
|
||||||
data=self.params,
|
data=self.params,
|
||||||
cookies=self.cookies,
|
cookies=self.cookies,
|
||||||
@@ -95,11 +96,11 @@ class ImxtoImageExtractor(ImagehostImageExtractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
ImagehostImageExtractor.__init__(self, match)
|
ImagehostImageExtractor.__init__(self, match)
|
||||||
if "/img-" in self.url:
|
if "/img-" in self.page_url:
|
||||||
self.url = self.url.replace("img.yt", "imx.to")
|
self.page_url = self.page_url.replace("img.yt", "imx.to")
|
||||||
self.urlext = True
|
self.url_ext = True
|
||||||
else:
|
else:
|
||||||
self.urlext = False
|
self.url_ext = False
|
||||||
|
|
||||||
def get_info(self, page):
|
def get_info(self, page):
|
||||||
url, pos = text.extract(
|
url, pos = text.extract(
|
||||||
@@ -107,7 +108,7 @@ class ImxtoImageExtractor(ImagehostImageExtractor):
|
|||||||
if not url:
|
if not url:
|
||||||
raise exception.NotFoundError("image")
|
raise exception.NotFoundError("image")
|
||||||
filename, pos = text.extract(page, ' title="', '"', pos)
|
filename, pos = text.extract(page, ' title="', '"', pos)
|
||||||
if self.urlext and filename:
|
if self.url_ext and filename:
|
||||||
filename += splitext(url)[1]
|
filename += splitext(url)[1]
|
||||||
return url, filename or url
|
return url, filename or url
|
||||||
|
|
||||||
@@ -146,7 +147,7 @@ class ImagevenueImageExtractor(ImagehostImageExtractor):
|
|||||||
|
|
||||||
def get_info(self, page):
|
def get_info(self, page):
|
||||||
url = text.extract(page, "SRC='", "'")[0]
|
url = text.extract(page, "SRC='", "'")[0]
|
||||||
return text.urljoin(self.url, url), url
|
return text.urljoin(self.page_url, url), url
|
||||||
|
|
||||||
|
|
||||||
class ImagetwistImageExtractor(ImagehostImageExtractor):
|
class ImagetwistImageExtractor(ImagehostImageExtractor):
|
||||||
@@ -164,7 +165,7 @@ class ImagetwistImageExtractor(ImagehostImageExtractor):
|
|||||||
@property
|
@property
|
||||||
@memcache(maxage=3*60*60)
|
@memcache(maxage=3*60*60)
|
||||||
def cookies(self):
|
def cookies(self):
|
||||||
return self.request(self.url).cookies
|
return self.request(self.page_url).cookies
|
||||||
|
|
||||||
def get_info(self, page):
|
def get_info(self, page):
|
||||||
url , pos = text.extract(page, 'center;"><img src="', '"')
|
url , pos = text.extract(page, 'center;"><img src="', '"')
|
||||||
|
|||||||
@@ -28,34 +28,29 @@ class ImgthGalleryExtractor(Extractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.gid = match.group(1)
|
self.gid = match.group(1)
|
||||||
self.url = "https://imgth.com/gallery/" + self.gid + "/g/page/"
|
self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/"
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
page = self.request(self.url + "0").text
|
page = self.request(self.url_base + "0").text
|
||||||
data = self.get_job_metadata(page)
|
data = self.metadata(page)
|
||||||
yield Message.Version, 1
|
yield Message.Version, 1
|
||||||
yield Message.Directory, data
|
yield Message.Directory, data
|
||||||
for data["num"], url in enumerate(self.get_images(page), 1):
|
for data["num"], url in enumerate(self.images(page), 1):
|
||||||
yield Message.Url, url, text.nameext_from_url(url, data)
|
yield Message.Url, url, text.nameext_from_url(url, data)
|
||||||
|
|
||||||
def get_images(self, page):
|
def images(self, page):
|
||||||
"""Yield all image urls for this gallery"""
|
"""Yield all image urls for this gallery"""
|
||||||
pnum = 0
|
pnum = 0
|
||||||
while True:
|
while True:
|
||||||
pos = 0
|
thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
|
||||||
page = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
|
for url in text.extract_iter(thumbs, '<img src="', '"'):
|
||||||
while True:
|
|
||||||
url, pos = text.extract(page, '<img src="', '"', pos)
|
|
||||||
if not url:
|
|
||||||
break
|
|
||||||
yield "https://imgth.com/images/" + url[24:]
|
yield "https://imgth.com/images/" + url[24:]
|
||||||
pos = page.find('<li class="next">', pos)
|
if '<li class="next">' not in page:
|
||||||
if pos == -1:
|
|
||||||
return
|
return
|
||||||
pnum += 1
|
pnum += 1
|
||||||
page = self.request(self.url + str(pnum)).text
|
page = self.request(self.url_base + str(pnum)).text
|
||||||
|
|
||||||
def get_job_metadata(self, page):
|
def metadata(self, page):
|
||||||
"""Collect metadata for extractor-job"""
|
"""Collect metadata for extractor-job"""
|
||||||
return text.extract_all(page, (
|
return text.extract_all(page, (
|
||||||
("title", '<h1>', '</h1>'),
|
("title", '<h1>', '</h1>'),
|
||||||
|
|||||||
@@ -50,7 +50,6 @@ class PhotobucketAlbumExtractor(Extractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.album_path = ""
|
self.album_path = ""
|
||||||
self.url = match.group(0)
|
|
||||||
self.root = "http://" + match.group(1)
|
self.root = "http://" + match.group(1)
|
||||||
self.session.headers["Referer"] = self.url
|
self.session.headers["Referer"] = self.url
|
||||||
|
|
||||||
@@ -128,7 +127,6 @@ class PhotobucketImageExtractor(Extractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.url = match.group(0)
|
|
||||||
self.user = match.group(1) or match.group(3)
|
self.user = match.group(1) or match.group(3)
|
||||||
self.media_id = match.group(2)
|
self.media_id = match.group(2)
|
||||||
self.session.headers["Referer"] = self.url
|
self.session.headers["Referer"] = self.url
|
||||||
|
|||||||
@@ -27,7 +27,6 @@ class ReactorExtractor(SharedConfigMixin, Extractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.url = match.group(0)
|
|
||||||
self.root = "http://" + match.group(1)
|
self.root = "http://" + match.group(1)
|
||||||
self.session.headers["Referer"] = self.root
|
self.session.headers["Referer"] = self.root
|
||||||
|
|
||||||
|
|||||||
@@ -17,20 +17,18 @@ import re
|
|||||||
class RecursiveExtractor(Extractor):
|
class RecursiveExtractor(Extractor):
|
||||||
"""Extractor that fetches URLs from a remote or local source"""
|
"""Extractor that fetches URLs from a remote or local source"""
|
||||||
category = "recursive"
|
category = "recursive"
|
||||||
pattern = r"r(?:ecursive)?:(.+)"
|
pattern = r"r(?:ecursive)?:"
|
||||||
test = ("recursive:https://pastebin.com/raw/FLwrCYsT", {
|
test = ("recursive:https://pastebin.com/raw/FLwrCYsT", {
|
||||||
"url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
|
"url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
|
||||||
})
|
})
|
||||||
|
|
||||||
def __init__(self, match):
|
|
||||||
Extractor.__init__(self, match)
|
|
||||||
self.session.mount("file://", FileAdapter())
|
|
||||||
self.url = match.group(1)
|
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
blist = self.config(
|
blist = self.config(
|
||||||
"blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS)
|
"blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS)
|
||||||
page = self.request(self.url).text
|
|
||||||
|
self.session.mount("file://", FileAdapter())
|
||||||
|
page = self.request(self.url.partition(":")[2]).text
|
||||||
|
|
||||||
yield Message.Version, 1
|
yield Message.Version, 1
|
||||||
with extractor.blacklist(blist):
|
with extractor.blacklist(blist):
|
||||||
for match in re.finditer(r"https?://[^\s\"']+", page):
|
for match in re.finditer(r"https?://[^\s\"']+", page):
|
||||||
|
|||||||
@@ -143,10 +143,6 @@ class RedditImageExtractor(Extractor):
|
|||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
|
|
||||||
def __init__(self, match):
|
|
||||||
Extractor.__init__(self, match)
|
|
||||||
self.url = match.group(0)
|
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
data = text.nameext_from_url(self.url)
|
data = text.nameext_from_url(self.url)
|
||||||
yield Message.Version, 1
|
yield Message.Version, 1
|
||||||
|
|||||||
@@ -101,11 +101,11 @@ class SimplyhentaiImageExtractor(Extractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.url = "https://www." + match.group(1)
|
self.page_url = "https://www." + match.group(1)
|
||||||
self.type = match.group(2)
|
self.type = match.group(2)
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
page = self.request(self.url).text
|
page = self.request(self.page_url).text
|
||||||
url_search = 'data-src="' if self.type == "image" else '<source src="'
|
url_search = 'data-src="' if self.type == "image" else '<source src="'
|
||||||
|
|
||||||
title, pos = text.extract(page, '"og:title" content="', '"')
|
title, pos = text.extract(page, '"og:title" content="', '"')
|
||||||
@@ -155,10 +155,10 @@ class SimplyhentaiVideoExtractor(Extractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.url = "https://" + match.group(1)
|
self.page_url = "https://" + match.group(1)
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
page = self.request(self.url).text
|
page = self.request(self.page_url).text
|
||||||
|
|
||||||
title, pos = text.extract(page, "<title>", "</title>")
|
title, pos = text.extract(page, "<title>", "</title>")
|
||||||
tags , pos = text.extract(page, ">Tags</div>", "</div>", pos)
|
tags , pos = text.extract(page, ">Tags</div>", "</div>", pos)
|
||||||
|
|||||||
@@ -16,9 +16,10 @@ import json
|
|||||||
class XvideosExtractor(Extractor):
|
class XvideosExtractor(Extractor):
|
||||||
"""Base class for xvideos extractors"""
|
"""Base class for xvideos extractors"""
|
||||||
category = "xvideos"
|
category = "xvideos"
|
||||||
|
root = "https://www.xvideos.com"
|
||||||
|
|
||||||
def get_page(self, codes=(403, 404)):
|
def get_page(self, url, codes=(403, 404)):
|
||||||
response = self.request(self.url, expect=codes)
|
response = self.request(url, expect=codes)
|
||||||
if response.status_code in codes:
|
if response.status_code in codes:
|
||||||
raise exception.NotFoundError(self.subcategory)
|
raise exception.NotFoundError(self.subcategory)
|
||||||
return response.text
|
return response.text
|
||||||
@@ -46,11 +47,10 @@ class XvideosGalleryExtractor(XvideosExtractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
XvideosExtractor.__init__(self, match)
|
XvideosExtractor.__init__(self, match)
|
||||||
self.user, self.gid = match.groups()
|
self.user, self.gid = match.groups()
|
||||||
self.url = "https://www.xvideos.com/profiles/{}/photos/{}".format(
|
|
||||||
self.user, self.gid)
|
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
page = self.get_page()
|
url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
|
||||||
|
page = self.get_page(url)
|
||||||
data = self.get_metadata(page)
|
data = self.get_metadata(page)
|
||||||
imgs = self.get_images(page)
|
imgs = self.get_images(page)
|
||||||
data["count"] = len(imgs)
|
data["count"] = len(imgs)
|
||||||
@@ -110,10 +110,10 @@ class XvideosUserExtractor(XvideosExtractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
XvideosExtractor.__init__(self, match)
|
XvideosExtractor.__init__(self, match)
|
||||||
self.user = match.group(1)
|
self.user = match.group(1)
|
||||||
self.url = "https://www.xvideos.com/profiles/" + self.user
|
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
page = self.get_page()
|
url = "{}/profiles/{}".format(self.root, self.user)
|
||||||
|
page = self.get_page(url)
|
||||||
data = json.loads(text.extract(
|
data = json.loads(text.extract(
|
||||||
page, "xv.conf=", ";</script>")[0])["data"]
|
page, "xv.conf=", ";</script>")[0])["data"]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user