diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 4c48d738..6963ce2a 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -13,12 +13,15 @@ from .. import text, exception from ..cache import cache +BASE_PATTERN = r"(?:https?://)?(?:www\.)?nijie\.info" + + class NijieExtractor(AsynchronousMixin, Extractor): """Base class for nijie extractors""" category = "nijie" directory_fmt = ("{category}", "{user_id}") - filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}" - archive_fmt = "{image_id}_{index}" + filename_fmt = "{category}_{artist_id}_{image_id}_p{num:>02}.{extension}" + archive_fmt = "{image_id}_{num}" cookiedomain = "nijie.info" cookienames = ("nemail", "nlogin") root = "https://nijie.info" @@ -32,56 +35,64 @@ class NijieExtractor(AsynchronousMixin, Extractor): def items(self): self.login() - data = self.get_job_metadata() - + metadata = self.metadata() yield Message.Version, 1 - yield Message.Directory, data - for image_id in self.get_image_ids(): - for image_url, image_data in self.get_image_data(image_id): - image_data.update(data) - if not image_data["extension"]: - image_data["extension"] = "jpg" - yield Message.Url, image_url, image_data + for image_id in self.image_ids(): - def get_job_metadata(self): + response = self.request(self.view_url + image_id, fatal=False) + if response.status_code >= 400: + continue + page = response.text + + data = self._extract_data(page) + data.update(metadata) + data["image_id"] = text.parse_int(image_id) + yield Message.Directory, data + + for image in self._extract_images(page): + image.update(data) + if not image["extension"]: + image["extension"] = "jpg" + yield Message.Url, image["url"], image + + def metadata(self): """Collect metadata for extractor-job""" return {"user_id": text.parse_int(self.user_id)} - def get_image_ids(self): + def image_ids(self): """Collect all relevant image-ids""" + return () - def get_image_data(self, image_id): - """Get URL and metadata for images specified by 'image_id'""" - page = self.request(self.view_url + image_id).text - return self.extract_image_data(page, image_id) + @staticmethod + def _extract_data(page): + """Extract image metadata from 'page'""" + extr = text.extract_from(page) + keywords = text.unescape(extr( + 'name="keywords" content="', '"')).split(",") + return { + "title" : keywords[0].strip(), + "description": text.unescape(extr( + '"og:description" content="', '"')), + "date" : text.parse_datetime(extr( + '"datePublished": "', '"')[:-4] + "+0900", + "%a %d %b %Y %I:%M:%S %p%z"), + "artist_id" : text.parse_int(extr( + '"sameAs": "https://nijie.info/members.php?id=', '"')), + "artist_name": keywords[1], + "tags" : keywords[2:-1], + } - def extract_image_data(self, page, image_id): - """Get URL and metadata for images from 'page'""" - title, pos = text.extract( - page, '= 18", }) - def get_image_ids(self): + def image_ids(self): return self._pagination("members_dojin") @@ -151,30 +173,28 @@ class NijieFavoriteExtractor(NijieExtractor): """Extractor for all favorites/bookmarks of a nijie-user""" subcategory = "favorite" directory_fmt = ("{category}", "bookmarks", "{user_id}") - archive_fmt = "f_{user_id}_{image_id}_{index}" - pattern = (r"(?:https?://)?(?:www\.)?nijie\.info" - r"/user_like_illust_view\.php\?id=(\d+)") + archive_fmt = "f_{user_id}_{image_id}_{num}" + pattern = BASE_PATTERN + r"/user_like_illust_view\.php\?id=(\d+)" test = ("https://nijie.info/user_like_illust_view.php?id=44", { "count": ">= 16", }) - def get_image_ids(self): + def image_ids(self): return self._pagination("user_like_illust_view") class NijieImageExtractor(NijieExtractor): """Extractor for a work/image from nijie.info""" subcategory = "image" - pattern = (r"(?:https?://)?(?:www\.)?nijie\.info" - r"/view(?:_popup)?\.php\?id=(\d+)") + pattern = BASE_PATTERN + r"/view(?:_popup)?\.php\?id=(\d+)" test = ( ("https://nijie.info/view.php?id=70720", { "url": "5497f897311397dafa188521258624346a0af2a3", - "keyword": "408393d010307c76d52cbd0a4368d6d357805aea", + "keyword": "7b7cfc89fa59652a100f94d4b765130d93281c66", "content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6", }), ("https://nijie.info/view.php?id=70724", { - "exception": exception.NotFoundError, + "count": 0, }), ("https://nijie.info/view_popup.php?id=70720"), ) @@ -182,17 +202,15 @@ class NijieImageExtractor(NijieExtractor): def __init__(self, match): NijieExtractor.__init__(self, match) self.image_id = match.group(1) - self.page = "" - def get_job_metadata(self): - self.page = self.request( - self.view_url + self.image_id, notfound="image").text - self.user_id = text.extract( - self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0] - return NijieExtractor.get_job_metadata(self) + def metadata(self): + return {} - def get_image_ids(self): + def image_ids(self): return (self.image_id,) - def get_image_data(self, _): - return self.extract_image_data(self.page, self.image_id) + @staticmethod + def _extract_data(page): + data = NijieExtractor._extract_data(page) + data["user_id"] = data["artist_id"] + return data