[architizer] add 'firm' extractor (#1369)

This commit is contained in:
Mike Fährmann
2021-03-19 01:31:34 +01:00
parent 204523611c
commit 58b93635ee
2 changed files with 29 additions and 2 deletions

View File

@@ -8,7 +8,7 @@
"""Extractors for https://architizer.com/"""
from .common import GalleryExtractor
from .common import GalleryExtractor, Extractor, Message
from .. import text
@@ -72,3 +72,30 @@ class ArchitizerProjectExtractor(GalleryExtractor):
for url in text.extract_iter(
page, "property='og:image:secure_url' content='", "?")
]
class ArchitizerFirmExtractor(Extractor):
    """Extractor for all projects of a firm"""
    category = "architizer"
    subcategory = "firm"
    root = "https://architizer.com"
    # The single capture group is the firm's URL slug
    pattern = r"(?:https?://)?architizer\.com/firms/([^/?#]+)"
    test = ("https://architizer.com/firms/olson-kundig/", {
        "pattern": ArchitizerProjectExtractor.pattern,
        "count": ">= 90",
    })

    def __init__(self, match):
        """Store the firm slug captured by 'pattern' as 'self.firm'"""
        Extractor.__init__(self, match)
        self.firm = match.group(1)

    def items(self):
        """Yield one Message.Queue entry per project link on the firm page

        Each yielded tuple is (Message.Queue, project URL, data), where
        'data' names ArchitizerProjectExtractor under the '_extractor' key.
        """
        # NOTE(review): 'requesting_merlin=pages' presumably selects a
        # page variant that lists the firm's projects -- confirm
        firm_url = "{}/firms/{}/?requesting_merlin=pages".format(
            self.root, self.firm)
        page = self.request(firm_url).text
        data = {"_extractor": ArchitizerProjectExtractor}

        for project in text.extract_iter(page, '<a href="/projects/', '"'):
            # hrefs of the form '/projects/q/...' are skipped
            # NOTE(review): presumably search/query links -- confirm
            if project.startswith("q/"):
                continue
            project_url = "{}/projects/{}".format(self.root, project)
            yield Message.Queue, project_url, data