diff --git a/CHANGELOG.md b/CHANGELOG.md index 0765ef0d..ae0aead7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# Unreleased +- Fixed extraction of `mangadex` manga with more than 100 chapters (#84) + ## 1.3.5 - 2018-05-04 - Added support for: - `smugmug` - https://www.smugmug.com/ diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index aa117cc0..41641052 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -167,7 +167,7 @@ class DeviantartGalleryExtractor(DeviantartExtractor): subcategory = "gallery" archive_fmt = "g_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"] test = [ ("http://shimoda7.deviantart.com/gallery/", { @@ -197,7 +197,7 @@ class DeviantartFolderExtractor(DeviantartExtractor): subcategory = "folder" directory_fmt = ["{category}", "{folder[owner]}", "{folder[title]}"] archive_fmt = "F_{folder[uuid]}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/gallery/(\d+)/([^/?]+)"] test = [ ("http://shimoda7.deviantart.com/gallery/722019/Miscellaneous", { @@ -232,8 +232,8 @@ class DeviantartDeviationExtractor(DeviantartExtractor): """Extractor for single deviations""" subcategory = "deviation" archive_fmt = "{index}.{extension}" - pattern = [(r"(?:https?://)?([^.]+\.deviantart\.com/" - r"(?:art|journal)/[^/?]+-\d+)"), + pattern = [(r"(?:https?://)?(?!www\.)([\w-]+\.deviantart\.com" + r"/(?:art|journal)/[^/?]+-\d+)"), (r"(?:https?://)?(sta\.sh/[a-z0-9]+)")] test = [ (("http://shimoda7.deviantart.com/art/" @@ -276,7 +276,7 @@ class DeviantartFavoriteExtractor(DeviantartExtractor): subcategory = "favorite" directory_fmt = ["{category}", "{username}", "Favourites"] archive_fmt = "f_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/favourites/?(?:\?catpath=/)?$"] test = [ ("http://h3813067.deviantart.com/favourites/", { @@ -304,7 +304,7 @@ class DeviantartCollectionExtractor(DeviantartExtractor): directory_fmt = ["{category}", "{collection[owner]}", "Favourites", "{collection[title]}"] archive_fmt = "C_{collection[uuid]}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/favourites/(\d+)/([^/?]+)"] test = [(("https://pencilshadings.deviantart.com" "/favourites/70595441/3D-Favorites"), { @@ -334,7 +334,7 @@ class DeviantartJournalExtractor(DeviantartExtractor): subcategory = "journal" directory_fmt = ["{category}", "{username}", "Journal"] archive_fmt = "j_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/(?:journal|blog)/?(?:\?catpath=/)?$"] test = [ ("https://angrywhitewanker.deviantart.com/journal/", { @@ -348,6 +348,50 @@ class DeviantartJournalExtractor(DeviantartExtractor): return self.api.browse_user_journals(self.user, self.offset) +class DeviantartPopularExtractor(DeviantartExtractor): + """Extractor for popular deviations""" + subcategory = "popular" + directory_fmt = ["{category}", "Popular", + "{popular[range]}", "{popular[search]}"] + archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}" + pattern = [r"(?:https?://)?www\.deviantart\.com" + r"((?:/\w+)*)/(?:popular-([^/?]+))?/?(?:\?([^#]*))?"] + test = [ + ("https://www.deviantart.com/popular-8-hours/?q=tree+house", { + "options": (("original", False),), + }), + ("https://www.deviantart.com/artisan/popular-all-time/?q=tree", None), + ("https://www.deviantart.com/?q=tree", None), + ("https://www.deviantart.com/", None), + ] + + def __init__(self, match): + DeviantartExtractor.__init__(self) + self.search_term = self.time_range = self.category_path = None + + path, trange, query = match.groups() + if path: + self.category_path = path.lstrip("/") + if trange: + self.time_range = trange.replace("-", "").replace("hours", "hr") + if query: + self.search_term = text.parse_query(query).get("q") + + self.popular = { + "search": self.search_term or "", + "range": trange or "24-hours", + "path": self.category_path, + } + + def deviations(self): + return self.api.browse_popular( + self.search_term, self.time_range, self.category_path, self.offset) + + def prepare(self, deviation): + DeviantartExtractor.prepare(self, deviation) + deviation["popular"] = self.popular + + class DeviantartAPI(): """Minimal interface for the deviantart API""" CLIENT_ID = "5388" @@ -368,6 +412,15 @@ class DeviantartAPI(): self.client_secret = extractor.config( "client-secret", self.CLIENT_SECRET) + def browse_popular(self, query=None, timerange=None, + category_path=None, offset=0): + """Yield popular deviations""" + endpoint = "browse/popular" + params = {"q": query, "offset": offset, "limit": 120, + "timerange": timerange, "category_path": category_path, + "mature_content": self.mature} + return self._pagination(endpoint, params) + def browse_user_journals(self, username, offset=0): """Yield all journal entries of a specific user""" endpoint = "browse/user/journals" diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 90164d24..7e88eead 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -8,91 +8,103 @@ """Extract images from http://www.imagebam.com/""" -from .common import Extractor, AsynchronousExtractor, Message +from .common import Extractor, Message from .. import text -class ImagebamGalleryExtractor(AsynchronousExtractor): - """Extractor for image galleries from imagebam.com""" +class ImagebamExtractor(Extractor): + """Base class for imagebam extractors""" category = "imagebam" - subcategory = "gallery" - directory_fmt = ["{category}", "{title} - {gallery_key}"] - filename_fmt = "{num:>03}-{name}.{extension}" - archive_fmt = "{gallery_key}_{image_id}" - pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"] - test = [(("http://www.imagebam.com/" - "gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), { - "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", - "keyword": "2541078f61ce50714715e21757176dd69126f804", - "content": "596e6bfa157f2c7169805d50075c2986549973a8", - })] root = "http://www.imagebam.com" + def get_image_data(self, page_url, data): + """Fill 'data' and return image URL""" + page = self.request(page_url).text + image_url = text.extract(page, 'property="og:image" content="', '"')[0] + data["extension"] = image_url.rpartition(".")[2] + data["image_key"] = page_url.rpartition("/")[2] + data["image_id"] = data["image_key"][6:] + return image_url + + +class ImagebamGalleryExtractor(ImagebamExtractor): + """Extractor for image galleries from imagebam.com""" + subcategory = "gallery" + directory_fmt = ["{category}", "{title} - {gallery_key}"] + filename_fmt = "{num:>03}-{image_key}.{extension}" + archive_fmt = "{gallery_key}_{image_key}" + pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)"] + test = [ + ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { + "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", + "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a", + "content": "596e6bfa157f2c7169805d50075c2986549973a8", + }), + ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", { + "url": "7d54178cecddfd46025cc9759f5b675fbb8f65af", + "keyword": "7d7db9664061132be50aa0d98e9602e98eb581ce", + }), + ] + def __init__(self, match): - AsynchronousExtractor.__init__(self) - self.gkey = match.group(1) + ImagebamExtractor.__init__(self) + self.gallery_key = match.group(1) def items(self): - data, url = self.get_job_metadata() + url = "{}/gallery/{}".format(self.root, self.gallery_key) + page = text.extract( + self.request(url).text, "
")[0] + + data = self.get_metadata(page) + imgs = self.get_image_pages(page) + data["count"] = len(imgs) + data["gallery_key"] = self.gallery_key + yield Message.Version, 1 yield Message.Directory, data - data["num"] = 0 - for image_url, image_id in self.get_images(url): - data["image_id"] = image_id - data["num"] += 1 - text.nameext_from_url(image_url, data) - yield Message.Url, image_url, data.copy() + for data["num"], page_url in enumerate(imgs, 1): + image_url = self.get_image_data(page_url, data) + yield Message.Url, image_url, data - def get_job_metadata(self): - """Collect metadata for extractor-job""" - url = self.root + "/gallery/" + self.gkey - page = self.request(url, encoding="utf-8").text - data, pos = text.extract_all(page, ( - (None , "