From e9cc7194974ad6f8cda64088e268ec0d844d552d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 25 Sep 2020 23:43:11 +0200 Subject: [PATCH] [weasyl] update and simplify - simplify 'pattern' regexps - parse 'posted_at' as 'date' - use unaltered 'title' ({title!l:R /_/} to lowercase and replace spaces) --- docs/supportedsites.rst | 7 +- gallery_dl/extractor/weasyl.py | 165 +++++++++++++++------------------ scripts/supportedsites.py | 4 + 3 files changed, 84 insertions(+), 92 deletions(-) diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 8234186f..0b0cf0ca 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -129,7 +129,7 @@ Twitter https://twitter.com/ |twitter-C| VSCO https://vsco.co/ Collections, individual Images, User Profiles Wallhaven https://wallhaven.cc/ individual Images, Search Results Optional (`API Key `__) Warosu https://warosu.org/ Threads -Weasyl https://www.weasyl.com/ |weasyl-C| +Weasyl https://www.weasyl.com/ Folders, Journals, Submissions Webtoon https://www.webtoons.com/ Comics, Episodes Weibo https://www.weibo.com/ Images from Statuses, User Profiles WikiArt.org https://www.wikiart.org/ Artists, Artist Listings, Artworks @@ -155,14 +155,13 @@ Turboimagehost https://www.turboimagehost.com/ individual Images .. |deviantart-C| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals, Popular Images, Scraps, Sta.sh, User Profiles .. |flickr-C| replace:: Albums, Favorites, Galleries, Groups, individual Images, Search Results, User Profiles .. |furaffinity-C| replace:: Favorites, Galleries, Posts, Scraps, Search Results, User Profiles -.. |hentaifoundry-C| replace:: Favorites, individual Images, Popular Images, Recent Images, Scraps, User Profiles +.. |hentaifoundry-C| replace:: Favorites, Galleries, individual Images, Popular Images, Recent Images, Scraps, User Profiles .. |imgur-C| replace:: Albums, Favorites, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles .. |instagram-C| replace:: Channels, individual Images, Saved Posts, Stories, Tag Searches, User Profiles .. |newgrounds-C| replace:: Art, Audio, Favorites, individual Images, Media Files, Movies, User Profiles -.. |nijie-C| replace:: Doujin, Favorites, individual Images, User Profiles +.. |nijie-C| replace:: Doujin, Favorites, Illustrations, individual Images, User Profiles .. |pixiv-C| replace:: Favorites, Follows, pixiv.me Links, Rankings, Search Results, User Profiles, individual Images .. |reddit-C| replace:: individual Images, Submissions, Subreddits, User Profiles .. |smugmug-C| replace:: Albums, individual Images, Images from Users and Folders .. |twitter-C| replace:: Bookmarks, Likes, Media Timelines, Search Results, Timelines, Tweets -.. |weasyl-C| replace:: Folders, Journals, Journals, Submissions, Submissions .. |yuki-S| replace:: yuki.la 4chan archive diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index 0fb5b2a0..e61cb6f4 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -8,7 +8,6 @@ from .common import Extractor, Message from .. import text -import re BASE_PATTERN = r"(?:https://)?(?:www\.)?weasyl.com/" @@ -16,19 +15,18 @@ BASE_PATTERN = r"(?:https://)?(?:www\.)?weasyl.com/" class WeasylExtractor(Extractor): category = "weasyl" directory_fmt = ("{category}", "{owner_login}") - filename_fmt = "{submitid}_{title}.{extension}" + filename_fmt = "{submitid} {title}.{extension}" archive_fmt = "{submitid}" root = "https://www.weasyl.com" - def __init__(self, match): - Extractor.__init__(self, match) - @staticmethod def populate_submission(data): # Some submissions don't have content and can be skipped if "submission" in data["media"]: data["url"] = data["media"]["submission"][0]["url"] - data["extension"] = text.ext_from_url(data["url"]) + data["date"] = text.parse_datetime( + data["posted_at"][:19], "%Y-%m-%dT%H:%M:%S") + text.nameext_from_url(data["url"], data) return True return False @@ -36,100 +34,110 @@ class WeasylExtractor(Extractor): return self.request( "{}/api/submissions/{}/view".format(self.root, submitid)).json() - def retrieve_journal(self, id): + def retrieve_journal(self, journalid): data = self.request( - "{}/api/journals/{}/view".format(self.root, id)).json() + "{}/api/journals/{}/view".format(self.root, journalid)).json() data["extension"] = "html" data["html"] = "text:" + data["content"] + data["date"] = text.parse_datetime(data["posted_at"]) return data - def submissions(self): - nextid = 0 - while nextid is not None: - url = "{}/api/users/{}/gallery?nextid={}".format( - self.root, self.owner_login, nextid - ) - folderid = self.folderid if hasattr(self, "folderid") else None - if folderid: - url += "&folderid={}".format(self.folderid) - json = self.request(url).json() - for data in json["submissions"]: - if self.populate_submission(data): - data["folderid"] = folderid + def submissions(self, owner_login, folderid=None): + url = "{}/api/users/{}/gallery".format(self.root, owner_login) + params = { + "nextid" : None, + "folderid": folderid, + } + + while True: + data = self.request(url, params=params).json() + for submission in data["submissions"]: + if self.populate_submission(submission): + submission["folderid"] = folderid # Do any submissions have more than one url? If so # a urllist of the submission array urls would work. - yield Message.Url, data["url"], data - nextid = json["nextid"] + yield Message.Url, submission["url"], submission + if not data["nextid"]: + return + params["nextid"] = data["nextid"] class WeasylSubmissionExtractor(WeasylExtractor): subcategory = "submission" - pattern = (BASE_PATTERN + - r"(?:~[\w-]+/submissions|submission)/(\d+)/?([\w-]+)?") + pattern = BASE_PATTERN + r"(?:~[\w-]+/submissions|submission)/(\d+)" test = ( - "https://www.weasyl.com/submission/2031/a-wesley", { - "keyword": { - "url": "https://cdn.weasyl.com/~fiz/submissions/2031/41ebc1c29" + ("https://www.weasyl.com/~fiz/submissions/2031/a-wesley", { + "pattern": "https://cdn.weasyl.com/~fiz/submissions/2031/41ebc1c29" "40be928532785dfbf35c37622664d2fbb8114c3b063df969562fc5" "1/fiz-a-wesley.png", - } - } + "keyword": { + "comments" : int, + "date" : "dt:2012-04-20 00:38:04", + "description" : "

(flex)

", + "favorites" : int, + "folder_name" : "Wesley Stuff", + "folderid" : 2081, + "friends_only": False, + "owner" : "Fiz", + "owner_login" : "fiz", + "rating" : "general", + "submitid" : 2031, + "subtype" : "visual", + "tags" : list, + "title" : "A Wesley!", + "type" : "submission", + "views" : int, + }, + }), + ("https://www.weasyl.com/submission/2031/a-wesley"), ) def __init__(self, match): WeasylExtractor.__init__(self, match) - self.submitid = int(match.group(1)) - if len(match.groups()) == 3: - self.title = match.group(2) + self.submitid = match.group(1) def items(self): - yield Message.Version, 1 data = self.request_submission(self.submitid) - yield Message.Directory, data if self.populate_submission(data): + yield Message.Directory, data yield Message.Url, data["url"], data class WeasylSubmissionsExtractor(WeasylExtractor): subcategory = "submissions" - pattern = BASE_PATTERN + r"(?:~([\w-]+)/?|submissions/([\w-]+))$" + pattern = BASE_PATTERN + r"(?:~|submissions/)([\w-]+)/?$" test = ( ("https://www.weasyl.com/~tanidareal", { "count": ">= 200" }), - ("https://www.weasyl.com/submissions/tanidareal", { - "count": ">= 200" - }) + ("https://www.weasyl.com/submissions/tanidareal"), ) def __init__(self, match): WeasylExtractor.__init__(self, match) - self.owner_login = match.group(1) if match.group(1) else match.group(2) + self.owner_login = match.group(1) def items(self): yield Message.Version, 1 yield Message.Directory, {"owner_login": self.owner_login} - yield from self.submissions() + yield from self.submissions(self.owner_login) class WeasylFolderExtractor(WeasylExtractor): subcategory = "folder" directory_fmt = ("{category}", "{owner_login}", "{folder_name}") pattern = BASE_PATTERN + r"submissions/([\w-]+)\?folderid=(\d+)" - test = ( - "https://www.weasyl.com/submissions/tanidareal?folderid=7403", { - "count": ">= 12" - } - ) + test = ("https://www.weasyl.com/submissions/tanidareal?folderid=7403", { + "count": ">= 12" + }) def __init__(self, match): WeasylExtractor.__init__(self, match) - self.owner_login = match.group(1) - self.folderid = int(match.group(2)) + self.owner_login, self.folderid = match.groups() def items(self): yield Message.Version, 1 - iter = self.submissions() + iter = self.submissions(self.owner_login, self.folderid) # Folder names are only on single submission api calls msg, url, data = next(iter) details = self.request_submission(data["submitid"]) @@ -140,53 +148,37 @@ class WeasylFolderExtractor(WeasylExtractor): class WeasylJournalExtractor(WeasylExtractor): subcategory = "journal" - filename_fmt = "{journalid}_{title}.{extension}" + filename_fmt = "{journalid} {title}.{extension}" archive_fmt = "{journalid}" - pattern = BASE_PATTERN + r"journal/(\d+)/?([\w-]+)?" - test = ( - ("https://www.weasyl.com/journal/17647", { - "keyword": { - "content": - "

javascript:alert(42);

No more of that!

", - "title": "bbcode", - } - }), - ("https://www.weasyl.com/journal/17647/bbcode", { - "keyword": { - "content": - "

javascript:alert(42);

No more of that!

", - "title": "bbcode", - } - }) - ) + pattern = BASE_PATTERN + r"journal/(\d+)" + test = ("https://www.weasyl.com/journal/17647/bbcode", { + "keyword": { + "title" : "BBCode", + "date" : "dt:2013-09-19 23:11:23", + "content": "

javascript:alert(42);

" + "

No more of that!

", + }, + }) def __init__(self, match): WeasylExtractor.__init__(self, match) - self.journalid = int(match.group(1)) - if match.group(2): - self.title = match.group(2) + self.journalid = match.group(1) def items(self): - yield Message.Version, 1 data = self.retrieve_journal(self.journalid) - if hasattr(self, "title"): - data["title"] = self.title - else: - data["title"] = data["title"].lower() + yield Message.Version, 1 yield Message.Directory, data yield Message.Url, data["html"], data class WeasylJournalsExtractor(WeasylExtractor): subcategory = "journals" - filename_fmt = "{journalid}_{title}.{extension}" + filename_fmt = "{journalid} {title}.{extension}" archive_fmt = "{journalid}" pattern = BASE_PATTERN + r"journals/([\w-]+)" - test = ( - "https://www.weasyl.com/journals/charmander", { - "count": ">= 2", - } - ) + test = ("https://www.weasyl.com/journals/charmander", { + "count": ">= 2", + }) def __init__(self, match): WeasylExtractor.__init__(self, match) @@ -195,12 +187,9 @@ class WeasylJournalsExtractor(WeasylExtractor): def items(self): yield Message.Version, 1 yield Message.Directory, {"owner_login": self.owner_login} - response = self.request("{}/journals/{}".format( - self.root, self.owner_login - )) - for journal in re.finditer(r'"/journal/(\d+)/([\w-]+)"', - response.text): - data = self.retrieve_journal(int(journal.group(1))) - data["title"] = journal.group(2) + url = "{}/journals/{}".format(self.root, self.owner_login) + page = self.request(url).text + for journalid in text.extract_iter(page, 'href="/journal/', '/'): + data = self.retrieve_journal(journalid) yield Message.Url, data["html"], data diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 2a23c210..57be351d 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -143,6 +143,10 @@ SUBCATEGORY_MAP = { "wikiart": { "artists": "Artist Listings", }, + "weasyl": { + "journals" : "", + "submissions": "", + }, } _OAUTH = " (`OAuth `__)"