remove 'extractor.blacklist' context manager
This commit is contained in:
@@ -140,7 +140,7 @@ def find(url):
     """Find a suitable extractor for the given URL"""
     for cls in _list_classes():
         match = cls.pattern.match(url)
-        if match and cls not in _blacklist:
+        if match:
             return cls(match)
     return None

@@ -169,26 +169,10 @@ def extractors():
     )


-class blacklist():
-    """Context Manager to blacklist extractor modules"""
-
-    def __init__(self, categories, extractors=None):
-        self.extractors = extractors or []
-        for cls in _list_classes():
-            if cls.category in categories:
-                self.extractors.append(cls)
-
-    def __enter__(self):
-        _blacklist.update(self.extractors)
-
-    def __exit__(self, etype, value, traceback):
-        _blacklist.clear()
-
-
 # --------------------------------------------------------------------
 # internals

 _cache = []
-_blacklist = set()
 _module_iter = iter(modules)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,7 @@
 """Extractors for https://www.plurk.com/"""

 from .common import Extractor, Message
-from .. import text, extractor, exception
+from .. import text, exception
 import datetime
 import time
 import json

@@ -23,12 +23,9 @@ class PlurkExtractor(Extractor):

     def items(self):
         urls = self._urls_ex if self.config("comments", False) else self._urls

         yield Message.Version, 1
-        with extractor.blacklist(("plurk",)):
-            for plurk in self.plurks():
-                for url in urls(plurk):
-                    yield Message.Queue, url, plurk
+        for plurk in self.plurks():
+            for url in urls(plurk):
+                yield Message.Queue, url, plurk

     def plurks(self):
         """Return an iterable with all relevant 'plurk' objects"""

@@ -9,7 +9,6 @@
 """Recursive extractor"""

 from .common import Extractor, Message
-from .. import extractor, util
+from .. import util
 import requests
 import re

@@ -23,17 +22,12 @@ class RecursiveExtractor(Extractor):
         })

     def items(self):
         blist = self.config(
             "blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS)

         self.session.mount("file://", FileAdapter())
         page = self.request(self.url.partition(":")[2]).text
         del self.session.adapters["file://"]

         yield Message.Version, 1
-        with extractor.blacklist(blist):
-            for match in re.finditer(r"https?://[^\s\"']+", page):
-                yield Message.Queue, match.group(0), {}
+        for match in re.finditer(r"https?://[^\s\"']+", page):
+            yield Message.Queue, match.group(0), {}


 class FileAdapter(requests.adapters.BaseAdapter):

@@ -9,7 +9,7 @@
 """Extract images from https://www.tumblr.com/"""

 from .common import Extractor, Message
-from .. import text, oauth, extractor, exception
+from .. import text, oauth, exception
 from datetime import datetime, timedelta
 import re

@@ -128,12 +128,9 @@ class TumblrExtractor(Extractor):

             if self.external:  # external links
                 post["extension"] = None
-                with extractor.blacklist(("tumblr",)):
-                    for key in ("permalink_url", "url"):
-                        url = post.get(key)
-                        if url:
-                            yield Message.Queue, url, post
-                            break
+                url = post.get("permalink_url") or post.get("url")
+                if url:
+                    yield Message.Queue, url, post

     def posts(self):
         """Return an iterable containing all relevant posts"""

Reference in New Issue
Block a user