From 976ccb267fc9146d0465aa68d0c0f98e6253d47f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2019 17:20:24 +0100 Subject: [PATCH] [myportfolio] combine gallery and user extractors A URL alone isn't good enough to distinguish between a gallery and a gallery-listing, so the new extractor decides what to do based on the page's content. --- docs/supportedsites.rst | 2 +- gallery_dl/extractor/myportfolio.py | 83 +++++++++++------------------ test/test_results.py | 1 - 3 files changed, 33 insertions(+), 53 deletions(-) diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index b53cac52..f9dcd2bc 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -7,7 +7,7 @@ Site URL Capabilities 4chan https://www.4chan.org/ Threads 4plebs https://archive.4plebs.org/ Threads 8chan https://8ch.net/ Threads -Adobe Portfolio https://www.myportfolio.com/ Images from Users, Galleries +Adobe Portfolio https://www.myportfolio.com/ Galleries arch.b4k.co https://arch.b4k.co/ Threads Archive of Sins https://archiveofsins.com/ Threads Archived.Moe https://archived.moe/ Threads diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index baf07cbf..089d8c9a 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -12,11 +12,6 @@ from .common import Extractor, Message from ..
import text -BASE_PATTERN = ( - r"(?:myportfolio:(?:https?://)?([^/]+)|" - r"(?:https?://)?([^.]+\.myportfolio\.com))") - - class MyportfolioGalleryExtractor(Extractor): """Extractor for an image gallery on www.myportfolio.com""" category = "myportfolio" @@ -24,38 +19,56 @@ class MyportfolioGalleryExtractor(Extractor): directory_fmt = ("{category}", "{user}", "{title}") filename_fmt = "{num:>02}.{extension}" archive_fmt = "{user}_{filename}" - pattern = BASE_PATTERN + r"/(?!projects/?$)([^/?&#]+)" + pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|" + r"(?:https?://)?([^.]+\.myportfolio\.com))" + r"(/[^/?&#]+)?") test = ( ("https://hannahcosgrove.myportfolio.com/chloe", { "url": "d5cf993a05439a9d8a99590aa61e14e5ac8d0cd0", "keyword": "89b055a6ce833ba4f060ab1f97f086e58ce8bbd1", }), + ("https://hannahcosgrove.myportfolio.com/lfw", { + "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$", + "count": ">= 8", + }), ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", { "count": 3, }), + ("myportfolio:https://tooco.com.ar/", { + "count": ">= 40", + }), ) def __init__(self, match): Extractor.__init__(self, match) - self.domain = match.group(1) or match.group(2) - self.gallery = match.group(3) + domain1, domain2, self.path = match.groups() + self.domain = domain1 or domain2 + self.prefix = "myportfolio:" if domain1 else "" def items(self): - url = "https://{}/{}".format(self.domain, self.gallery) + yield Message.Version, 1 + url = "https://" + self.domain + (self.path or "") page = self.request(url).text - data = self.get_metadata(page) - imgs = self.get_images(page) - data["count"] = len(imgs) + projects = text.extract( + page, '
- ", but both # <user> and <title> can contain a "-" as well, so we get the title # from somewhere else and cut that amount from the og:title content @@ -77,38 +90,6 @@ class MyportfolioGalleryExtractor(Extractor): } @staticmethod - def get_images(page): + def images(page): """Extract and return a list of all image-urls""" return list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) - - -class MyportfolioUserExtractor(Extractor): - """Extractor for a user's galleries on www.myportfolio.com""" - category = "myportfolio" - subcategory = "user" - pattern = BASE_PATTERN + r"/?$" - test = ( - ("https://hannahcosgrove.myportfolio.com/", { - "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$", - "count": ">= 23", - }), - ("myportfolio:https://tooco.com.ar/", { - "count": ">= 40", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.domain = match.group(1) or match.group(2) - self.prefix = "myportfolio:" if match.group(1) else "" - - def items(self): - url = "https://" + self.domain - page = self.request(url).text - main = text.extract(page, "<main>", "</main>")[0] - data = {"_extractor": MyportfolioGalleryExtractor} - - yield Message.Version, 1 - for path in text.extract_iter(main, ' href="', '"'): - if path and path[0] == "/": - yield Message.Queue, self.prefix + url + path, data diff --git a/test/test_results.py b/test/test_results.py index a9375dce..b0fef212 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -28,7 +28,6 @@ BROKEN = { "fallenangels", "komikcast", "mangapark", - "myportfolio", "seaotterscans", }