store the full original URL in Extractor.url

This commit is contained in:
Mike Fährmann
2019-02-12 10:20:21 +01:00
parent 580baef72c
commit 2e516a1e3e
14 changed files with 72 additions and 78 deletions

View File

@@ -17,20 +17,18 @@ import re
class RecursiveExtractor(Extractor):
"""Extractor that fetches URLs from a remote or local source"""
category = "recursive"
pattern = r"r(?:ecursive)?:(.+)"
pattern = r"r(?:ecursive)?:"
test = ("recursive:https://pastebin.com/raw/FLwrCYsT", {
"url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
})
def __init__(self, match):
Extractor.__init__(self, match)
self.session.mount("file://", FileAdapter())
self.url = match.group(1)
def items(self):
blist = self.config(
"blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS)
page = self.request(self.url).text
self.session.mount("file://", FileAdapter())
page = self.request(self.url.partition(":")[2]).text
yield Message.Version, 1
with extractor.blacklist(blist):
for match in re.finditer(r"https?://[^\s\"']+", page):