diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index ee6fd5c0..d0e59ad6 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -71,7 +71,7 @@ class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin, r"/post/popular_(?Pby_(?:day|week|month)|recent)" r"(?:\?(?P[^#]*))?") test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", { - "url": "a447e115fdab60c25ab71c4fdb1b9f509bc23f99", + "url": "c70268dce441a9ccc3383c244ec15edb059f494f", "count": 20, }) diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 251b2205..f532cac8 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -24,21 +24,21 @@ class DynastyscansBase(): def _parse_image_page(self, image_id): url = "{}/images/{}".format(self.root, image_id) - page = self.request(url).text + extr = text.extract_from(self.request(url).text) - date, pos = text.extract(page, "class='create_at'>", "") - tags, pos = text.extract(page, "class='tags'>", "", pos) - src , pos = text.extract(page, "class='btn-group'>", "", pos) - url , pos = text.extract(page, ' src="', '"', pos) + date = extr("class='create_at'>", "") + tags = extr("class='tags'>", "") + src = extr("class='btn-group'>", "") + url = extr(' src="', '"') src = text.extract(src, 'href="', '"')[0] if "Source<" in src else "" return { - "url": self.root + url, + "url" : self.root + url, "image_id": text.parse_int(image_id), - "tags": text.split_html(text.unescape(tags)), - "date": text.remove_html(date), - "source": text.unescape(src), + "tags" : text.split_html(text.unescape(tags)), + "date" : text.remove_html(date), + "source" : text.unescape(src), } @@ -59,28 +59,26 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): ) def metadata(self, page): - info , pos = text.extract(page, "

", "") - author, pos = text.extract(page, " by ", "", pos) - group , pos = text.extract(page, '"icon-print"> ', '', pos) - date , pos = text.extract(page, '"icon-calendar"> ', '<', pos) - + extr = text.extract_from(page) match = re.match( (r"(?:]*>)?([^<]+)(?:)?" # manga name r"(?: ch(\d+)([^:<]*))?" # chapter info r"(?:: (.+))?"), # title - info + extr("

", ""), ) + author = extr(" by ", "") + group = extr('"icon-print"> ', '') return { - "manga": text.unescape(match.group(1)), - "chapter": text.parse_int(match.group(2)), + "manga" : text.unescape(match.group(1)), + "chapter" : text.parse_int(match.group(2)), "chapter_minor": match.group(3) or "", - "title": text.unescape(match.group(4) or ""), - "author": text.remove_html(author), - "group": (text.remove_html(group) or - text.extract(group, ' alt="', '"')[0] or ""), - "date": date, - "lang": "en", + "title" : text.unescape(match.group(4) or ""), + "author" : text.remove_html(author), + "group" : (text.remove_html(group) or + text.extract(group, ' alt="', '"')[0] or ""), + "date" : extr('"icon-calendar"> ', '<'), + "lang" : "en", "language": "English", } diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py index 896b7754..cb131612 100644 --- a/gallery_dl/extractor/fallenangels.py +++ b/gallery_dl/extractor/fallenangels.py @@ -39,17 +39,16 @@ class FallenangelsChapterExtractor(ChapterExtractor): ChapterExtractor.__init__(self, match, url) def metadata(self, page): + extr = text.extract_from(page) lang = "vi" if self.version == "truyen" else "en" - data = { - "chapter": self.chapter, + return { + "manga" : extr('name="description" content="', ' Chapter '), + "title" : extr(': ', ' - Page 1'), + "chapter" : self.chapter, "chapter_minor": self.minor or "", - "lang": lang, + "lang" : lang, "language": util.code_to_language(lang), } - return text.extract_all(page, ( - ("manga", 'name="description" content="', ' Chapter '), - ("title", ': ', ' - Page 1'), - ), values=data)[0] @staticmethod def images(page): @@ -83,26 +82,24 @@ class FallenangelsMangaExtractor(MangaExtractor): MangaExtractor.__init__(self, match, url) def chapters(self, page): - language = util.code_to_language(self.lang) + extr = text.extract_from(page) results = [] - pos = 0 - while True: - test, pos = text.extract(page, '
  • ', '<', pos) - title , pos = text.extract(page, '', '', pos) + language = util.code_to_language(self.lang) + while extr('
  • ', '<') + title = extr('', '') - manga, _, chapter = chapter.rpartition(" ") + manga, _, chapter = cha.rpartition(" ") chapter, dot, minor = chapter.partition(".") results.append((url, { - "manga": manga, - "title": text.unescape(title), - "volume": text.parse_int(volume), - "chapter": text.parse_int(chapter), + "manga" : manga, + "title" : text.unescape(title), + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(chapter), "chapter_minor": dot + minor, - "lang": self.lang, + "lang" : self.lang, "language": language, })) + return results diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 92de190a..14baa366 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -38,6 +38,7 @@ class FoolslideBase(SharedConfigMixin): data["volume"] = text.parse_int(info[2]) data["chapter"] = text.parse_int(info[3]) data["chapter_minor"] = "." + info[4] if len(info) >= 5 else "" + data["title"] = data["chapter_string"].partition(":")[2].strip() return data @@ -74,14 +75,11 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): yield Message.Url, url, data def metadata(self, page): - _ , pos = text.extract(page, '

    ', '') - manga , pos = text.extract(page, 'title="', '"', pos) - chapter, pos = text.extract(page, 'title="', '"', pos) - chapter = text.unescape(chapter) + extr = text.extract_from(page) + extr('

    ', '') return self.parse_chapter_url(self.chapter_url, { - "manga": text.unescape(manga).strip(), - "title": chapter.partition(":")[2].strip(), - "chapter_string": chapter, + "manga" : text.unescape(extr('title="', '"')).strip(), + "chapter_string": text.unescape(extr('title="', '"')), }) def images(self, page): @@ -101,25 +99,20 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): pattern_fmt = r"(/series/[^/?&#]+)" def chapters(self, page): - manga , pos = text.extract(page, '

    ', '

    ') - author, pos = text.extract(page, 'Author: ', 'Artist: ', '', '
  • ')).strip() + author = extr('Author: ', 'Artist: ', '", "

    ") - data = text.extract_all(page, ( - ("parody" , ">Parodies:" , ""), - ("characters", ">Characters:", ""), - ("tags" , ">Tags:" , ""), - ("artist" , ">Artists:" , ""), - ("group" , ">Groups:" , ""), - ("type" , ">Category:" , ""), - ), pos)[0] + def metadata(self, page, split=text.split_html): + extr = text.extract_from(page) - for key, value in data.items(): - data[key] = text.split_html(value)[::2] - data["gallery_id"] = text.parse_int(self.gallery_id) - data["title"] = text.unescape(title) - data["type"] = data["type"][0] if data["type"] else "" - data["language"] = "English" - data["lang"] = "en" - return data + return { + "gallery_id": text.parse_int(self.gallery_id), + "title" : text.unescape(extr("

    ", "

    ")), + "parody" : split(extr(">Parodies:" , ""))[::2], + "characters": split(extr(">Characters:", ""))[::2], + "tags" : split(extr(">Tags:" , ""))[::2], + "artist" : split(extr(">Artists:" , ""))[::2], + "group" : split(extr(">Groups:" , ""))[::2], + "type" : text.remove_html(extr(">Category:", "")), + "language" : "English", + "lang" : "en", + } def images(self, page): return [ diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py index f0ecfd47..64af9d86 100644 --- a/gallery_dl/extractor/livedoor.py +++ b/gallery_dl/extractor/livedoor.py @@ -38,18 +38,14 @@ class LivedoorExtractor(Extractor): """Return an iterable with post objects""" def _load(self, data, body): - pid , pos = text.extract(data, "id : '" , "'") - title, pos = text.extract(data, "title : '", "'", pos) - cat1 , pos = text.extract(data, "name:'" , "'", pos) - cat2 , pos = text.extract(data, "name:'" , "'", pos) - date , pos = text.extract(data, "date : '" , "'", pos) - tags , pos = text.extract(body, '
    ', '') + extr = text.extract_from(data) + tags = text.extract(body, '
    ', '')[0] return { - "id" : text.parse_int(pid), - "title" : title, - "date" : date, - "categories": [cat1, cat2], + "id" : text.parse_int(extr("id : '", "'")), + "title" : extr("title : '", "'"), + "categories": [extr("name:'", "'"), extr("name:'", "'")], + "date" : extr("date : '", "'"), "tags" : text.split_html(tags), "user" : self.user, "body" : body, @@ -108,22 +104,15 @@ class LivedoorBlogExtractor(LivedoorExtractor): url = "{}/{}".format(self.root, self.user) while url: - page = self.request(url).text - pos = 0 - + extr = text.extract_from(self.request(url).text) while True: - data, pos = text.extract(page, '.articles.push(', ');', pos) + data = extr('.articles.push(', ');') if not data: break - body, pos = text.extract( - page, - '
    ', - '', - pos, - ) + body = extr('
    ', + '') yield self._load(data, body) - - url = text.extract(page, '