use 'text.extract_from()' in a few places

This commit is contained in:
Mike Fährmann
2019-04-19 23:02:29 +02:00
parent 21a7e395a7
commit f2cf1c1d73
10 changed files with 116 additions and 162 deletions

View File

@@ -71,7 +71,7 @@ class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin,
r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)" r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
r"(?:\?(?P<query>[^#]*))?") r"(?:\?(?P<query>[^#]*))?")
test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", { test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", {
"url": "a447e115fdab60c25ab71c4fdb1b9f509bc23f99", "url": "c70268dce441a9ccc3383c244ec15edb059f494f",
"count": 20, "count": 20,
}) })

View File

@@ -24,21 +24,21 @@ class DynastyscansBase():
def _parse_image_page(self, image_id): def _parse_image_page(self, image_id):
url = "{}/images/{}".format(self.root, image_id) url = "{}/images/{}".format(self.root, image_id)
page = self.request(url).text extr = text.extract_from(self.request(url).text)
date, pos = text.extract(page, "class='create_at'>", "</span>") date = extr("class='create_at'>", "</span>")
tags, pos = text.extract(page, "class='tags'>", "</span>", pos) tags = extr("class='tags'>", "</span>")
src , pos = text.extract(page, "class='btn-group'>", "</div>", pos) src = extr("class='btn-group'>", "</div>")
url , pos = text.extract(page, ' src="', '"', pos) url = extr(' src="', '"')
src = text.extract(src, 'href="', '"')[0] if "Source<" in src else "" src = text.extract(src, 'href="', '"')[0] if "Source<" in src else ""
return { return {
"url": self.root + url, "url" : self.root + url,
"image_id": text.parse_int(image_id), "image_id": text.parse_int(image_id),
"tags": text.split_html(text.unescape(tags)), "tags" : text.split_html(text.unescape(tags)),
"date": text.remove_html(date), "date" : text.remove_html(date),
"source": text.unescape(src), "source" : text.unescape(src),
} }
@@ -59,28 +59,26 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
) )
def metadata(self, page): def metadata(self, page):
info , pos = text.extract(page, "<h3 id='chapter-title'><b>", "</b>") extr = text.extract_from(page)
author, pos = text.extract(page, " by ", "</a>", pos)
group , pos = text.extract(page, '"icon-print"></i> ', '</span>', pos)
date , pos = text.extract(page, '"icon-calendar"></i> ', '<', pos)
match = re.match( match = re.match(
(r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name (r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
r"(?: ch(\d+)([^:<]*))?" # chapter info r"(?: ch(\d+)([^:<]*))?" # chapter info
r"(?:: (.+))?"), # title r"(?:: (.+))?"), # title
info extr("<h3 id='chapter-title'><b>", "</b>"),
) )
author = extr(" by ", "</a>")
group = extr('"icon-print"></i> ', '</span>')
return { return {
"manga": text.unescape(match.group(1)), "manga" : text.unescape(match.group(1)),
"chapter": text.parse_int(match.group(2)), "chapter" : text.parse_int(match.group(2)),
"chapter_minor": match.group(3) or "", "chapter_minor": match.group(3) or "",
"title": text.unescape(match.group(4) or ""), "title" : text.unescape(match.group(4) or ""),
"author": text.remove_html(author), "author" : text.remove_html(author),
"group": (text.remove_html(group) or "group" : (text.remove_html(group) or
text.extract(group, ' alt="', '"')[0] or ""), text.extract(group, ' alt="', '"')[0] or ""),
"date": date, "date" : extr('"icon-calendar"></i> ', '<'),
"lang": "en", "lang" : "en",
"language": "English", "language": "English",
} }

View File

@@ -39,17 +39,16 @@ class FallenangelsChapterExtractor(ChapterExtractor):
ChapterExtractor.__init__(self, match, url) ChapterExtractor.__init__(self, match, url)
def metadata(self, page): def metadata(self, page):
extr = text.extract_from(page)
lang = "vi" if self.version == "truyen" else "en" lang = "vi" if self.version == "truyen" else "en"
data = { return {
"chapter": self.chapter, "manga" : extr('name="description" content="', ' Chapter '),
"title" : extr(': ', ' - Page 1'),
"chapter" : self.chapter,
"chapter_minor": self.minor or "", "chapter_minor": self.minor or "",
"lang": lang, "lang" : lang,
"language": util.code_to_language(lang), "language": util.code_to_language(lang),
} }
return text.extract_all(page, (
("manga", 'name="description" content="', ' Chapter '),
("title", ': ', ' - Page 1'),
), values=data)[0]
@staticmethod @staticmethod
def images(page): def images(page):
@@ -83,26 +82,24 @@ class FallenangelsMangaExtractor(MangaExtractor):
MangaExtractor.__init__(self, match, url) MangaExtractor.__init__(self, match, url)
def chapters(self, page): def chapters(self, page):
language = util.code_to_language(self.lang) extr = text.extract_from(page)
results = [] results = []
pos = 0 language = util.code_to_language(self.lang)
while True: while extr('<li style="', '"'):
test, pos = text.extract(page, '<li style="', '', pos) vol = extr('class="volume-', '"')
if test is None: url = extr('href="', '"')
return results cha = extr('>', '<')
volume , pos = text.extract(page, 'class="volume-', '"', pos) title = extr('<em>', '</em>')
url , pos = text.extract(page, 'href="', '"', pos)
chapter, pos = text.extract(page, '>', '<', pos)
title , pos = text.extract(page, '<em>', '</em>', pos)
manga, _, chapter = chapter.rpartition(" ") manga, _, chapter = cha.rpartition(" ")
chapter, dot, minor = chapter.partition(".") chapter, dot, minor = chapter.partition(".")
results.append((url, { results.append((url, {
"manga": manga, "manga" : manga,
"title": text.unescape(title), "title" : text.unescape(title),
"volume": text.parse_int(volume), "volume" : text.parse_int(vol),
"chapter": text.parse_int(chapter), "chapter" : text.parse_int(chapter),
"chapter_minor": dot + minor, "chapter_minor": dot + minor,
"lang": self.lang, "lang" : self.lang,
"language": language, "language": language,
})) }))
return results

View File

@@ -38,6 +38,7 @@ class FoolslideBase(SharedConfigMixin):
data["volume"] = text.parse_int(info[2]) data["volume"] = text.parse_int(info[2])
data["chapter"] = text.parse_int(info[3]) data["chapter"] = text.parse_int(info[3])
data["chapter_minor"] = "." + info[4] if len(info) >= 5 else "" data["chapter_minor"] = "." + info[4] if len(info) >= 5 else ""
data["title"] = data["chapter_string"].partition(":")[2].strip()
return data return data
@@ -74,14 +75,11 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
yield Message.Url, url, data yield Message.Url, url, data
def metadata(self, page): def metadata(self, page):
_ , pos = text.extract(page, '<h1 class="tbtitle dnone">', '') extr = text.extract_from(page)
manga , pos = text.extract(page, 'title="', '"', pos) extr('<h1 class="tbtitle dnone">', '')
chapter, pos = text.extract(page, 'title="', '"', pos)
chapter = text.unescape(chapter)
return self.parse_chapter_url(self.chapter_url, { return self.parse_chapter_url(self.chapter_url, {
"manga": text.unescape(manga).strip(), "manga" : text.unescape(extr('title="', '"')).strip(),
"title": chapter.partition(":")[2].strip(), "chapter_string": text.unescape(extr('title="', '"')),
"chapter_string": chapter,
}) })
def images(self, page): def images(self, page):
@@ -101,25 +99,20 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
pattern_fmt = r"(/series/[^/?&#]+)" pattern_fmt = r"(/series/[^/?&#]+)"
def chapters(self, page): def chapters(self, page):
manga , pos = text.extract(page, '<h1 class="title">', '</h1>') extr = text.extract_from(page)
author, pos = text.extract(page, '<b>Author</b>: ', '<br', pos) manga = text.unescape(extr('<h1 class="title">', '</h1>')).strip()
artist, pos = text.extract(page, '<b>Artist</b>: ', '<br', pos) author = extr('<b>Author</b>: ', '<br')
manga = text.unescape(manga).strip() artist = extr('<b>Artist</b>: ', '<br')
results = [] results = []
while True: while True:
url, pos = text.extract( url = extr('<div class="title"><a href="', '"')
page, '<div class="title"><a href="', '"', pos)
if not url: if not url:
return results return results
chapter, pos = text.extract(page, 'title="', '"', pos)
group , pos = text.extract(page, 'title="', '"', pos)
results.append((url, self.parse_chapter_url(url, { results.append((url, self.parse_chapter_url(url, {
"manga": manga, "author": author, "artist": artist, "manga": manga, "author": author, "artist": artist,
"group": group, "chapter_string": chapter, "chapter_string": extr('title="', '"'),
"title": chapter.partition(": ")[2] or "", "group" : extr('title="', '"'),
}))) })))
@@ -166,7 +159,7 @@ EXTRACTORS = {
"test-manga": "test-manga":
("https://reader.kireicake.com/series/wonderland/", { ("https://reader.kireicake.com/series/wonderland/", {
"url": "d067b649af1cc88fa8c8b698fde04a10909fd169", "url": "d067b649af1cc88fa8c8b698fde04a10909fd169",
"keyword": "99caa336a9d48e27e3b8e56a0a1e6faf9fc13a51", "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb",
}), }),
}, },
"powermanga": { "powermanga": {
@@ -214,7 +207,7 @@ EXTRACTORS = {
"test-manga": "test-manga":
("http://sensescans.com/reader/series/hakkenden/", { ("http://sensescans.com/reader/series/hakkenden/", {
"url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2", "url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2",
"keyword": "122cf92c32e6428c50f56ffaf29d06b96750ed71", "keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23",
}), }),
}, },
"worldthree": { "worldthree": {

View File

@@ -21,7 +21,7 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)") r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", { test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
"url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2", "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
"keyword": "3688ddd3f3077c93eaa8021477ef66d18dc6c159", "keyword": "7182d262810faa692827c947d2f360dfcb8d5e43",
}) })
root = "https://hentai.cafe" root = "https://hentai.cafe"
@@ -51,7 +51,7 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
# foolslide URL # foolslide URL
("https://hentai.cafe/manga/series/saitom-box/", { ("https://hentai.cafe/manga/series/saitom-box/", {
"url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
"keyword": "46012b857eb1a1394bc55c0efe7aa4e7f704d10d", "keyword": "f0ece32d958f889d8229ed4052716d398a0a875c",
}), }),
) )
root = "https://hentai.cafe" root = "https://hentai.cafe"

View File

@@ -31,25 +31,21 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
GalleryExtractor.__init__(self, match) GalleryExtractor.__init__(self, match)
self.gallery_id = match.group(2) self.gallery_id = match.group(2)
def metadata(self, page): def metadata(self, page, split=text.split_html):
title, pos = text.extract(page, "<h1>", "</h1>") extr = text.extract_from(page)
data = text.extract_all(page, (
("parody" , ">Parodies:" , "</a></span>"),
("characters", ">Characters:", "</a></span>"),
("tags" , ">Tags:" , "</a></span>"),
("artist" , ">Artists:" , "</a></span>"),
("group" , ">Groups:" , "</a></span>"),
("type" , ">Category:" , "</a></span>"),
), pos)[0]
for key, value in data.items(): return {
data[key] = text.split_html(value)[::2] "gallery_id": text.parse_int(self.gallery_id),
data["gallery_id"] = text.parse_int(self.gallery_id) "title" : text.unescape(extr("<h1>", "</h1>")),
data["title"] = text.unescape(title) "parody" : split(extr(">Parodies:" , "</a></span>"))[::2],
data["type"] = data["type"][0] if data["type"] else "" "characters": split(extr(">Characters:", "</a></span>"))[::2],
data["language"] = "English" "tags" : split(extr(">Tags:" , "</a></span>"))[::2],
data["lang"] = "en" "artist" : split(extr(">Artists:" , "</a></span>"))[::2],
return data "group" : split(extr(">Groups:" , "</a></span>"))[::2],
"type" : text.remove_html(extr(">Category:", "</a></span>")),
"language" : "English",
"lang" : "en",
}
def images(self, page): def images(self, page):
return [ return [

View File

@@ -38,18 +38,14 @@ class LivedoorExtractor(Extractor):
"""Return an iterable with post objects""" """Return an iterable with post objects"""
def _load(self, data, body): def _load(self, data, body):
pid , pos = text.extract(data, "id : '" , "'") extr = text.extract_from(data)
title, pos = text.extract(data, "title : '", "'", pos) tags = text.extract(body, '</dt><dd>', '</dl>')[0]
cat1 , pos = text.extract(data, "name:'" , "'", pos)
cat2 , pos = text.extract(data, "name:'" , "'", pos)
date , pos = text.extract(data, "date : '" , "'", pos)
tags , pos = text.extract(body, '</dt><dd>', '</dl>')
return { return {
"id" : text.parse_int(pid), "id" : text.parse_int(extr("id : '", "'")),
"title" : title, "title" : extr("title : '", "'"),
"date" : date, "categories": [extr("name:'", "'"), extr("name:'", "'")],
"categories": [cat1, cat2], "date" : extr("date : '", "'"),
"tags" : text.split_html(tags), "tags" : text.split_html(tags),
"user" : self.user, "user" : self.user,
"body" : body, "body" : body,
@@ -108,22 +104,15 @@ class LivedoorBlogExtractor(LivedoorExtractor):
url = "{}/{}".format(self.root, self.user) url = "{}/{}".format(self.root, self.user)
while url: while url:
page = self.request(url).text extr = text.extract_from(self.request(url).text)
pos = 0
while True: while True:
data, pos = text.extract(page, '.articles.push(', ');', pos) data = extr('.articles.push(', ');')
if not data: if not data:
break break
body, pos = text.extract( body = extr('<div class="article-body-inner">',
page, '<!-- articleBody End -->')
'<div class="article-body-inner">',
'<!-- articleBody End -->',
pos,
)
yield self._load(data, body) yield self._load(data, body)
url = extr('<a rel="next" href="', '"')
url = text.extract(page, '<a rel="next" href="', '"', pos)[0]
class LivedoorPostExtractor(LivedoorExtractor): class LivedoorPostExtractor(LivedoorExtractor):
@@ -148,13 +137,8 @@ class LivedoorPostExtractor(LivedoorExtractor):
def posts(self): def posts(self):
url = "{}/{}/archives/{}.html".format( url = "{}/{}/archives/{}.html".format(
self.root, self.user, self.post_id) self.root, self.user, self.post_id)
page = self.request(url).text extr = text.extract_from(self.request(url).text)
data = extr('articles :', '</script>')
data, pos = text.extract(page, 'articles :', '</script>') body = extr('<div class="article-body-inner">',
body, pos = text.extract( '<!-- articleBody End -->')
page,
'<div class="article-body-inner">',
'<!-- articleBody End -->',
pos,
)
return (self._load(data, body),) return (self._load(data, body),)

View File

@@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
test = ( test = (
("https://luscious.net/albums/okinami-no-koigokoro_277031/", { ("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
"url": "7e4984a271a1072ac6483e4228a045895aff86f3", "url": "7e4984a271a1072ac6483e4228a045895aff86f3",
"keyword": "1b1e8981afa250a0181e31b15ce3cbaa37c00856", "keyword": "f9c34e1a5b0c1f119e9f644c99933ecf7d7dbfd2",
"content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
}), }),
("https://luscious.net/albums/virgin-killer-sweater_282582/", { ("https://luscious.net/albums/virgin-killer-sweater_282582/", {

View File

@@ -77,42 +77,31 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
GalleryExtractor.__init__(self, match, url) GalleryExtractor.__init__(self, match, url)
def metadata(self, page): def metadata(self, page):
extr = text.extract extr = text.extract_from(page)
title, pos = extr(page, '"og:title" content="', '"') title = extr('"og:title" content="', '"')
thumb, pos = extr(page, '"og:image" content="', '"', pos)
title_en, _, title_jp = text.unescape(title).partition("/") title_en, _, title_jp = text.unescape(title).partition("/")
title_en = title_en.strip() title_en = title_en.strip()
title_jp = title_jp.strip() title_jp = title_jp.strip()
uploader , pos = extr(page, 'id="Uploader">' , '</div>', pos)
date , pos = extr(page, 'id="Uploaded">' , '</div>', pos)
rating , pos = extr(page, 'id="Rating">' , '</div>', pos)
gtype , pos = extr(page, 'id="Category">' , '</div>', pos)
collection, pos = extr(page, 'id="Collection">', '</div>', pos)
group , pos = extr(page, 'id="Group">' , '</div>', pos)
artist , pos = extr(page, 'id="Artist">' , '</div>', pos)
parody , pos = extr(page, 'id="Parody">' , '</div>', pos)
character , pos = extr(page, 'id="Character">' , '</div>', pos)
tags , pos = extr(page, 'id="Tag">' , '</div>', pos)
return { return {
"gallery_id": text.parse_int(self.gallery_id), "gallery_id": text.parse_int(self.gallery_id),
"title": title_en or title_jp, "title" : title_en or title_jp,
"title_en": title_en, "title_en" : title_en,
"title_jp": title_jp, "title_jp" : title_jp,
"thumbnail": thumb, "thumbnail" : extr('"og:image" content="', '"'),
"uploader": text.remove_html(uploader), "uploader" : text.remove_html(extr('id="Uploader">', '</div>')),
"date": date.strip(), "date" : extr('id="Uploaded">', '</div>').strip(),
"rating": text.parse_float(rating.partition(" ")[0]), "rating" : text.parse_float(extr(
"type": text.remove_html(gtype), 'id="Rating">', '</div>').partition(" ")[0]),
"collection": text.remove_html(collection), "type" : text.remove_html(extr('id="Category">' , '</div>')),
"group": text.split_html(group), "collection": text.remove_html(extr('id="Collection">', '</div>')),
"artist": text.split_html(artist), "group" : text.split_html(extr('id="Group">' , '</div>')),
"parody": text.split_html(parody), "artist" : text.split_html(extr('id="Artist">' , '</div>')),
"characters": text.split_html(character), "parody" : text.split_html(extr('id="Parody">' , '</div>')),
"tags": text.split_html(tags), "characters": text.split_html(extr('id="Character">' , '</div>')),
"language": "English", "tags" : text.split_html(extr('id="Tag">' , '</div>')),
"lang": "en", "language" : "English",
"lang" : "en",
} }
def images(self, page): def images(self, page):

View File

@@ -87,18 +87,15 @@ class TwitterExtractor(Extractor):
@staticmethod @staticmethod
def _data_from_tweet(tweet): def _data_from_tweet(tweet):
data = text.extract_all(tweet, ( extr = text.extract_from(tweet)
("tweet_id" , 'data-tweet-id="' , '"'), return {
("retweet_id", 'data-retweet-id="' , '"'), "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
("retweeter" , 'data-retweeter="' , '"'), "retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
("user" , 'data-screen-name="', '"'), "retweeter" : extr('data-retweeter="' , '"'),
("username" , 'data-name="' , '"'), "user" : extr('data-screen-name="', '"'),
("user_id" , 'data-user-id="' , '"'), "username" : extr('data-name="' , '"'),
))[0] "user_id" : text.parse_int(extr('data-user-id="' , '"')),
for key in ("tweet_id", "retweet_id", "user_id"): }
data[key] = text.parse_int(data[key])
data["retweeter"] = data["retweeter"] or ""
return data
def _tweets_from_api(self, url): def _tweets_from_api(self, url):
params = { params = {