diff --git a/CHANGELOG.md b/CHANGELOG.md index 0765ef0d..ae0aead7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# Unreleased +- Fixed extraction of `mangadex` manga with more than 100 chapters (#84) + ## 1.3.5 - 2018-05-04 - Added support for: - `smugmug` - https://www.smugmug.com/ diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index aa117cc0..41641052 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -167,7 +167,7 @@ class DeviantartGalleryExtractor(DeviantartExtractor): subcategory = "gallery" archive_fmt = "g_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"] test = [ ("http://shimoda7.deviantart.com/gallery/", { @@ -197,7 +197,7 @@ class DeviantartFolderExtractor(DeviantartExtractor): subcategory = "folder" directory_fmt = ["{category}", "{folder[owner]}", "{folder[title]}"] archive_fmt = "F_{folder[uuid]}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/gallery/(\d+)/([^/?&#]+)"] test = [ ("http://shimoda7.deviantart.com/gallery/722019/Miscellaneous", { @@ -232,8 +232,8 @@ class DeviantartDeviationExtractor(DeviantartExtractor): """Extractor for single deviations""" subcategory = "deviation" archive_fmt = "{index}.{extension}" - pattern = [(r"(?:https?://)?([^.]+\.deviantart\.com/" - r"(?:art|journal)/[^/?&#]+-\d+)"), + pattern = [(r"(?:https?://)?(?!www\.)([\w-]+\.deviantart\.com" + r"/(?:art|journal)/[^/?&#]+-\d+)"), (r"(?:https?://)?(sta\.sh/[a-z0-9]+)")] test = [ (("http://shimoda7.deviantart.com/art/" @@ -276,7 +276,7 @@ class DeviantartFavoriteExtractor(DeviantartExtractor): subcategory = "favorite" directory_fmt = ["{category}", "{username}", "Favourites"] archive_fmt = "f_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/favourites/?(?:\?catpath=/)?$"] test = [ ("http://h3813067.deviantart.com/favourites/", { @@ -304,7 +304,7 @@ class DeviantartCollectionExtractor(DeviantartExtractor): directory_fmt = ["{category}", "{collection[owner]}", "Favourites", "{collection[title]}"] archive_fmt = "C_{collection[uuid]}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/favourites/(\d+)/([^/?&#]+)"] test = [(("https://pencilshadings.deviantart.com" "/favourites/70595441/3D-Favorites"), { @@ -334,7 +334,7 @@ class DeviantartJournalExtractor(DeviantartExtractor): subcategory = "journal" directory_fmt = ["{category}", "{username}", "Journal"] archive_fmt = "j_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/(?:journal|blog)/?(?:\?catpath=/)?$"] test = [ ("https://angrywhitewanker.deviantart.com/journal/", { @@ -348,6 +348,50 @@ class DeviantartJournalExtractor(DeviantartExtractor): return self.api.browse_user_journals(self.user, self.offset) +class DeviantartPopularExtractor(DeviantartExtractor): + """Extractor for popular deviations""" + subcategory = "popular" + directory_fmt = ["{category}", "Popular", + "{popular[range]}", "{popular[search]}"] + archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}" + pattern = [r"(?:https?://)?www\.deviantart\.com" + r"((?:/\w+)*)/(?:popular-([^/?&#]+))?/?(?:\?([^#]*))?"] + test = [ + ("https://www.deviantart.com/popular-8-hours/?q=tree+house", { + "options": (("original", False),), + }), + ("https://www.deviantart.com/artisan/popular-all-time/?q=tree", None), + ("https://www.deviantart.com/?q=tree", None), + ("https://www.deviantart.com/", None), + ] + + def __init__(self, match): + DeviantartExtractor.__init__(self) + self.search_term = self.time_range = self.category_path = None + + path, trange, query = match.groups() + if path: + self.category_path = path.lstrip("/") + if trange: + self.time_range = trange.replace("-", "").replace("hours", "hr") + if query: + self.search_term = text.parse_query(query).get("q") + + self.popular = { + "search": self.search_term or "", + "range": trange or "24-hours", + "path": self.category_path, + } + + def deviations(self): + return self.api.browse_popular( + self.search_term, self.time_range, self.category_path, self.offset) + + def prepare(self, deviation): + DeviantartExtractor.prepare(self, deviation) + deviation["popular"] = self.popular + + class DeviantartAPI(): """Minimal interface for the deviantart API""" CLIENT_ID = "5388" @@ -368,6 +412,15 @@ class DeviantartAPI(): self.client_secret = extractor.config( "client-secret", self.CLIENT_SECRET) + def browse_popular(self, query=None, timerange=None, + category_path=None, offset=0): + """Yield popular deviations""" + endpoint = "browse/popular" + params = {"q": query, "offset": offset, "limit": 120, + "timerange": timerange, "category_path": category_path, + "mature_content": self.mature} + return self._pagination(endpoint, params) + def browse_user_journals(self, username, offset=0): """Yield all journal entries of a specific user""" endpoint = "browse/user/journals" diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 90164d24..7e88eead 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -8,91 +8,103 @@ """Extract images from http://www.imagebam.com/""" -from .common import Extractor, AsynchronousExtractor, Message +from .common import Extractor, Message from .. import text -class ImagebamGalleryExtractor(AsynchronousExtractor): - """Extractor for image galleries from imagebam.com""" +class ImagebamExtractor(Extractor): + """Base class for imagebam extractors""" category = "imagebam" - subcategory = "gallery" - directory_fmt = ["{category}", "{title} - {gallery_key}"] - filename_fmt = "{num:>03}-{name}.{extension}" - archive_fmt = "{gallery_key}_{image_id}" - pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"] - test = [(("http://www.imagebam.com/" - "gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), { - "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", - "keyword": "2541078f61ce50714715e21757176dd69126f804", - "content": "596e6bfa157f2c7169805d50075c2986549973a8", - })] root = "http://www.imagebam.com" + def get_image_data(self, page_url, data): + """Fill 'data' and return image URL""" + page = self.request(page_url).text + image_url = text.extract(page, 'property="og:image" content="', '"')[0] + data["extension"] = image_url.rpartition(".")[2] + data["image_key"] = page_url.rpartition("/")[2] + data["image_id"] = data["image_key"][6:] + return image_url + + +class ImagebamGalleryExtractor(ImagebamExtractor): + """Extractor for image galleries from imagebam.com""" + subcategory = "gallery" + directory_fmt = ["{category}", "{title} - {gallery_key}"] + filename_fmt = "{num:>03}-{image_key}.{extension}" + archive_fmt = "{gallery_key}_{image_key}" + pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)"] + test = [ + ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { + "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", + "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a", + "content": "596e6bfa157f2c7169805d50075c2986549973a8", + }), + ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", { + "url": "7d54178cecddfd46025cc9759f5b675fbb8f65af", + "keyword": "7d7db9664061132be50aa0d98e9602e98eb581ce", + }), + ] + def __init__(self, match): - AsynchronousExtractor.__init__(self) - self.gkey = match.group(1) + ImagebamExtractor.__init__(self) + self.gallery_key = match.group(1) def items(self): - data, url = self.get_job_metadata() + url = "{}/gallery/{}".format(self.root, self.gallery_key) + page = text.extract( + self.request(url).text, "
", "
")[0] + + data = self.get_metadata(page) + imgs = self.get_image_pages(page) + data["count"] = len(imgs) + data["gallery_key"] = self.gallery_key + yield Message.Version, 1 yield Message.Directory, data - data["num"] = 0 - for image_url, image_id in self.get_images(url): - data["image_id"] = image_id - data["num"] += 1 - text.nameext_from_url(image_url, data) - yield Message.Url, image_url, data.copy() + for data["num"], page_url in enumerate(imgs, 1): + image_url = self.get_image_data(page_url, data) + yield Message.Url, image_url, data - def get_job_metadata(self): - """Collect metadata for extractor-job""" - url = self.root + "/gallery/" + self.gkey - page = self.request(url, encoding="utf-8").text - data, pos = text.extract_all(page, ( - (None , " ", " <"), - ("count" , "'>", " images"), - ), values={"gallery_key": self.gkey}) - url, pos = text.extract( - page, " ", " ", ""), + ("description", ":#FCFCFC;'>", ""), + ))[0] - def get_images(self, url): - """Yield all image-urls and -ids for a gallery""" - done = False - while not done: - page = self.request(self.root + url).text - pos = text.extract( - page, 'class="btn btn-default" title="Next">', '' - )[1] - if pos == 0: - done = True - else: - url, pos = text.extract(page, ' href="', '"', pos-70) - image_id , pos = text.extract(page, 'class="image" id="', '"', pos) - image_url, pos = text.extract(page, 'src="', '"', pos) - yield image_url, image_id + @staticmethod + def get_image_pages(page): + """Return a list of all image pages""" + return list(text.extract_iter(page, "= 100", + }), ] scheme = "https" per_page = 100 @@ -165,4 +168,4 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor): return results num += 1 - page = self.request("{}/_/{}/".format(self.url, num)).text + page = self.request("{}/_/chapters/{}/".format(self.url, num)).text diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index a244cf9e..1583e03c 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -72,7 +72,7 @@ class PinterestBoardExtractor(PinterestExtractor): "url": "85911dfca313f3f7f48c2aa0bc684f539d1d80a6", }), ("https://www.pinterest.com/g1952848/test/", { - "exception": exception.NotFoundError, + "exception": exception.GalleryDLException, }), ] @@ -161,7 +161,11 @@ class PinterestAPI(): response = self.extractor.request( url, params=params, headers=self.HEADERS, fatal=False) - data = response.json() + + try: + data = response.json() + except ValueError: + data = {} if 200 <= response.status_code < 400 and not response.history: return data @@ -169,6 +173,7 @@ class PinterestAPI(): if response.status_code == 404 or response.history: raise exception.NotFoundError(self.extractor.subcategory) self.extractor.log.error("API request failed") + self.extractor.log.debug("%s", response.text) raise exception.StopExtraction() def _pagination(self, resource, options): diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 55f15f1d..6794b165 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -72,12 +72,15 @@ class RedditExtractor(Extractor): class RedditSubredditExtractor(RedditExtractor): """Extractor for images from subreddits on reddit.com""" subcategory = "subreddit" - pattern = [r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/([^/?&#]+)" + pattern = [r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?&#]+)" r"(/[a-z]+)?/?" r"(?:\?.*?(?:\bt=([a-z]+))?)?$"] test = [ ("https://www.reddit.com/r/lavaporn/", None), ("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month", None), + ("https://old.reddit.com/r/lavaporn/", None), + ("https://np.reddit.com/r/lavaporn/", None), + ("https://m.reddit.com/r/lavaporn/", None), ] def __init__(self, match): @@ -94,7 +97,7 @@ class RedditSubmissionExtractor(RedditExtractor): """Extractor for images from a submission on reddit.com""" subcategory = "submission" pattern = [(r"(?:https?://)?(?:" - r"(?:m\.|www\.)?reddit\.com/r/[^/]+/comments|" + r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|" r"redd\.it" r")/([a-z0-9]+)")] test = [ @@ -102,6 +105,8 @@ class RedditSubmissionExtractor(RedditExtractor): "pattern": r"https?://i\.imgur\.com/AaAUCgy\.jpg", "count": 1, }), + ("https://old.reddit.com/r/lavaporn/comments/2a00np/", None), + ("https://np.reddit.com/r/lavaporn/comments/2a00np/", None), ("https://m.reddit.com/r/lavaporn/comments/2a00np/", None), ("https://redd.it/2a00np/", None), ] diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index 99d936a6..15b77b63 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -36,7 +36,7 @@ class XvideosGalleryExtractor(XvideosExtractor): (("https://www.xvideos.com/profiles" "/pervertedcouple/photos/751031/random_stuff"), { "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7", - "keyword": "71d64a9b2ba7015850d3aed3fbcae1e7e0481515", + "keyword": "750d462802d56eead0fe06c4f453419396d2f944", }), ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", { "exception": exception.NotFoundError, diff --git a/test/test_results.py b/test/test_results.py index 2e13ebea..90fc1b96 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -16,11 +16,13 @@ from gallery_dl import extractor, job, config, exception # these don't work on travis-ci TRAVIS_SKIP = { "exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie", - "archivedmoe", "archiveofsins", "thebarchive", "sankaku", "idolcomplex", + "archivedmoe", "archiveofsins", "thebarchive", "fireden", + "sankaku", "idolcomplex", } # temporary issues, etc. BROKEN = { + "pixiv", # /users//favorite_works API endpoint is gone }