[myportfolio] combine gallery and user extractors

A URL alone isn't enough to distinguish between a gallery and a gallery listing, so the new extractor decides what to do based on the page's content.
2019-03-06 17:20:24 +01:00
parent efd104e45e
commit 976ccb267f
3 changed files with 33 additions and 53 deletions
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -7,7 +7,7 @@ Site                 URL                                 Capabilities
 4chan                https://www.4chan.org/              Threads
 4plebs               https://archive.4plebs.org/         Threads
 8chan                https://8ch.net/                    Threads
-Adobe Portfolio      https://www.myportfolio.com/        Images from Users, Galleries
+Adobe Portfolio      https://www.myportfolio.com/        Galleries
 arch.b4k.co          https://arch.b4k.co/                Threads
 Archive of Sins      https://archiveofsins.com/          Threads
 Archived.Moe         https://archived.moe/               Threads
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -12,11 +12,6 @@ from .common import Extractor, Message
 from .. import text
 BASE_PATTERN = (
    r"(?:myportfolio:(?:https?://)?([^/]+)|"
    r"(?:https?://)?([^.]+\.myportfolio\.com))")
 class MyportfolioGalleryExtractor(Extractor):
    """Extractor for an image gallery on www.myportfolio.com"""
    category = "myportfolio"
@@ -24,38 +19,56 @@ class MyportfolioGalleryExtractor(Extractor):
    directory_fmt = ("{category}", "{user}", "{title}")
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{user}_{filename}"
-    pattern = BASE_PATTERN + r"/(?!projects/?$)([^/?&#]+)"
+    pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
               r"(?:https?://)?([^.]+\.myportfolio\.com))"
               r"(/[^/?&#]+)?")
    test = (
        ("https://hannahcosgrove.myportfolio.com/chloe", {
            "url": "d5cf993a05439a9d8a99590aa61e14e5ac8d0cd0",
            "keyword": "89b055a6ce833ba4f060ab1f97f086e58ce8bbd1",
        }),
        ("https://hannahcosgrove.myportfolio.com/lfw", {
            "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$",
            "count": ">= 8",
        }),
        ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", {
            "count": 3,
        }),
        ("myportfolio:https://tooco.com.ar/", {
            "count": ">= 40",
        }),
    )
    def __init__(self, match):
        Extractor.__init__(self, match)
-        self.domain = match.group(1) or match.group(2)
+        domain1, domain2, self.path = match.groups()
-        self.gallery = match.group(3)
+        self.domain = domain1 or domain2
        self.prefix = "myportfolio:" if domain1 else ""
    def items(self):
-        url = "https://{}/{}".format(self.domain, self.gallery)
+        yield Message.Version, 1
        url = "https://" + self.domain + (self.path or "")
        page = self.request(url).text
-        data = self.get_metadata(page)
+        projects = text.extract(
-        imgs = self.get_images(page)
+            page, '<section class="project-covers', '</section>')[0]
        data["count"] = len(imgs)
-        yield Message.Version, 1
+        if projects:
-        yield Message.Directory, data
+            data = {"_extractor": MyportfolioGalleryExtractor}
-        for data["num"], url in enumerate(imgs, 1):
+            base = self.prefix + "https://" + self.domain
-            yield Message.Url, url, text.nameext_from_url(url, data)
+            for path in text.extract_iter(projects, ' href="', '"'):
                yield Message.Queue, base + path, data
        else:
            data = self.metadata(page)
            imgs = self.images(page)
            data["count"] = len(imgs)
            yield Message.Directory, data
            for data["num"], url in enumerate(imgs, 1):
                yield Message.Url, url, text.nameext_from_url(url, data)
    @staticmethod
-    def get_metadata(page):
+    def metadata(page):
-        """Collect metadata for extractor-job"""
+        """Collect general image metadata"""
        # og:title contains data as "<user> - <title>", but both
        # <user> and <title> can contain a "-" as well, so we get the title
        # from somewhere else and cut that amount from the og:title content
@@ -77,38 +90,6 @@ class MyportfolioGalleryExtractor(Extractor):
        }
    @staticmethod
-    def get_images(page):
+    def images(page):
        """Extract and return a list of all image-urls"""
        return list(text.extract_iter(page, 'js-lightbox" data-src="', '"'))
 class MyportfolioUserExtractor(Extractor):
    """Extractor for a user's galleries on www.myportfolio.com"""
    category = "myportfolio"
    subcategory = "user"
    pattern = BASE_PATTERN + r"/?$"
    test = (
        ("https://hannahcosgrove.myportfolio.com/", {
            "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$",
            "count": ">= 23",
        }),
        ("myportfolio:https://tooco.com.ar/", {
            "count": ">= 40",
        }),
    )
    def __init__(self, match):
        Extractor.__init__(self, match)
        self.domain = match.group(1) or match.group(2)
        self.prefix = "myportfolio:" if match.group(1) else ""
    def items(self):
        url = "https://" + self.domain
        page = self.request(url).text
        main = text.extract(page, "<main>", "</main>")[0]
        data = {"_extractor": MyportfolioGalleryExtractor}
        yield Message.Version, 1
        for path in text.extract_iter(main, ' href="', '"'):
            if path and path[0] == "/":
                yield Message.Queue, self.prefix + url + path, data
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -28,7 +28,6 @@ BROKEN = {
    "fallenangels",
    "komikcast",
    "mangapark",
    "myportfolio",
    "seaotterscans",
 }