From 976ccb267fc9146d0465aa68d0c0f98e6253d47f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Wed, 6 Mar 2019 17:20:24 +0100
Subject: [PATCH] [myportfolio] combine gallery and user extractors

A URL alone isn't good enough to distinguish between a gallery and a
gallery listing, so the new extractor decides what to do based on the
page's content.
---
 docs/supportedsites.rst             |  2 +-
 gallery_dl/extractor/myportfolio.py | 83 +++++++++++------------------
 test/test_results.py                |  1 -
 3 files changed, 33 insertions(+), 53 deletions(-)
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
index b53cac52..f9dcd2bc 100644
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -7,7 +7,7 @@ Site                 URL                                 Capabilities
 4chan                https://www.4chan.org/              Threads
 4plebs               https://archive.4plebs.org/         Threads
 8chan                https://8ch.net/                    Threads
-Adobe Portfolio      https://www.myportfolio.com/        Images from Users, Galleries
+Adobe Portfolio      https://www.myportfolio.com/        Galleries
 arch.b4k.co          https://arch.b4k.co/                Threads
 Archive of Sins      https://archiveofsins.com/          Threads
 Archived.Moe         https://archived.moe/               Threads
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index baf07cbf..089d8c9a 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -12,11 +12,6 @@ from .common import Extractor, Message
 from .. import text
 
 
-BASE_PATTERN = (
-    r"(?:myportfolio:(?:https?://)?([^/]+)|"
-    r"(?:https?://)?([^.]+\.myportfolio\.com))")
-
-
 class MyportfolioGalleryExtractor(Extractor):
     """Extractor for an image gallery on www.myportfolio.com"""
     category = "myportfolio"
@@ -24,38 +19,56 @@ class MyportfolioGalleryExtractor(Extractor):
     directory_fmt = ("{category}", "{user}", "{title}")
     filename_fmt = "{num:>02}.{extension}"
     archive_fmt = "{user}_{filename}"
-    pattern = BASE_PATTERN + r"/(?!projects/?$)([^/?&#]+)"
+    pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
+               r"(?:https?://)?([^.]+\.myportfolio\.com))"
+               r"(/[^/?&#]+)?")
     test = (
         ("https://hannahcosgrove.myportfolio.com/chloe", {
             "url": "d5cf993a05439a9d8a99590aa61e14e5ac8d0cd0",
             "keyword": "89b055a6ce833ba4f060ab1f97f086e58ce8bbd1",
         }),
+        ("https://hannahcosgrove.myportfolio.com/lfw", {
+            "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$",
+            "count": ">= 8",
+        }),
         ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", {
             "count": 3,
         }),
+        ("myportfolio:https://tooco.com.ar/", {
+            "count": ">= 40",
+        }),
     )
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.domain = match.group(1) or match.group(2)
-        self.gallery = match.group(3)
+        domain1, domain2, self.path = match.groups()
+        self.domain = domain1 or domain2
+        self.prefix = "myportfolio:" if domain1 else ""
 
     def items(self):
-        url = "https://{}/{}".format(self.domain, self.gallery)
+        yield Message.Version, 1
+        url = "https://" + self.domain + (self.path or "")
         page = self.request(url).text
 
-        data = self.get_metadata(page)
-        imgs = self.get_images(page)
-        data["count"] = len(imgs)
+        projects = text.extract(
+            page, '<section class="project-covers', '</section>')[0]
 
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["num"], url in enumerate(imgs, 1):
-            yield Message.Url, url, text.nameext_from_url(url, data)
+        if projects:
+            data = {"_extractor": MyportfolioGalleryExtractor}
+            base = self.prefix + "https://" + self.domain
+            for path in text.extract_iter(projects, ' href="', '"'):
+                yield Message.Queue, base + path, data
+        else:
+            data = self.metadata(page)
+            imgs = self.images(page)
+            data["count"] = len(imgs)
+            yield Message.Directory, data
+            for data["num"], url in enumerate(imgs, 1):
+                yield Message.Url, url, text.nameext_from_url(url, data)
 
     @staticmethod
-    def get_metadata(page):
-        """Collect metadata for extractor-job"""
+    def metadata(page):
+        """Collect general image metadata"""
         # og:title contains data as "<user> - <title>", but both
         # <user> and <title> can contain a "-" as well, so we get the title
         # from somewhere else and cut that amount from the og:title content
@@ -77,38 +90,6 @@ class MyportfolioGalleryExtractor(Extractor):
         }
 
     @staticmethod
-    def get_images(page):
+    def images(page):
         """Extract and return a list of all image-urls"""
         return list(text.extract_iter(page, 'js-lightbox" data-src="', '"'))
-
-
-class MyportfolioUserExtractor(Extractor):
-    """Extractor for a user's galleries on www.myportfolio.com"""
-    category = "myportfolio"
-    subcategory = "user"
-    pattern = BASE_PATTERN + r"/?$"
-    test = (
-        ("https://hannahcosgrove.myportfolio.com/", {
-            "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$",
-            "count": ">= 23",
-        }),
-        ("myportfolio:https://tooco.com.ar/", {
-            "count": ">= 40",
-        }),
-    )
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.domain = match.group(1) or match.group(2)
-        self.prefix = "myportfolio:" if match.group(1) else ""
-
-    def items(self):
-        url = "https://" + self.domain
-        page = self.request(url).text
-        main = text.extract(page, "<main>", "</main>")[0]
-        data = {"_extractor": MyportfolioGalleryExtractor}
-
-        yield Message.Version, 1
-        for path in text.extract_iter(main, ' href="', '"'):
-            if path and path[0] == "/":
-                yield Message.Queue, self.prefix + url + path, data
diff --git a/test/test_results.py b/test/test_results.py
index a9375dce..b0fef212 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -28,7 +28,6 @@ BROKEN = {
     "fallenangels",
     "komikcast",
     "mangapark",
-    "myportfolio",
     "seaotterscans",
 }