use 'text.extract_from()' in a few places

This commit is contained in:
Mike Fährmann
2019-04-19 23:02:29 +02:00
parent 21a7e395a7
commit f2cf1c1d73
10 changed files with 116 additions and 162 deletions

View File

@@ -71,7 +71,7 @@ class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin,
r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
r"(?:\?(?P<query>[^#]*))?")
test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", {
"url": "a447e115fdab60c25ab71c4fdb1b9f509bc23f99",
"url": "c70268dce441a9ccc3383c244ec15edb059f494f",
"count": 20,
})

View File

@@ -24,21 +24,21 @@ class DynastyscansBase():
def _parse_image_page(self, image_id):
url = "{}/images/{}".format(self.root, image_id)
page = self.request(url).text
extr = text.extract_from(self.request(url).text)
date, pos = text.extract(page, "class='create_at'>", "</span>")
tags, pos = text.extract(page, "class='tags'>", "</span>", pos)
src , pos = text.extract(page, "class='btn-group'>", "</div>", pos)
url , pos = text.extract(page, ' src="', '"', pos)
date = extr("class='create_at'>", "</span>")
tags = extr("class='tags'>", "</span>")
src = extr("class='btn-group'>", "</div>")
url = extr(' src="', '"')
src = text.extract(src, 'href="', '"')[0] if "Source<" in src else ""
return {
"url": self.root + url,
"url" : self.root + url,
"image_id": text.parse_int(image_id),
"tags": text.split_html(text.unescape(tags)),
"date": text.remove_html(date),
"source": text.unescape(src),
"tags" : text.split_html(text.unescape(tags)),
"date" : text.remove_html(date),
"source" : text.unescape(src),
}
@@ -59,28 +59,26 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
)
def metadata(self, page):
info , pos = text.extract(page, "<h3 id='chapter-title'><b>", "</b>")
author, pos = text.extract(page, " by ", "</a>", pos)
group , pos = text.extract(page, '"icon-print"></i> ', '</span>', pos)
date , pos = text.extract(page, '"icon-calendar"></i> ', '<', pos)
extr = text.extract_from(page)
match = re.match(
(r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
r"(?: ch(\d+)([^:<]*))?" # chapter info
r"(?:: (.+))?"), # title
info
extr("<h3 id='chapter-title'><b>", "</b>"),
)
author = extr(" by ", "</a>")
group = extr('"icon-print"></i> ', '</span>')
return {
"manga": text.unescape(match.group(1)),
"chapter": text.parse_int(match.group(2)),
"manga" : text.unescape(match.group(1)),
"chapter" : text.parse_int(match.group(2)),
"chapter_minor": match.group(3) or "",
"title": text.unescape(match.group(4) or ""),
"author": text.remove_html(author),
"group": (text.remove_html(group) or
text.extract(group, ' alt="', '"')[0] or ""),
"date": date,
"lang": "en",
"title" : text.unescape(match.group(4) or ""),
"author" : text.remove_html(author),
"group" : (text.remove_html(group) or
text.extract(group, ' alt="', '"')[0] or ""),
"date" : extr('"icon-calendar"></i> ', '<'),
"lang" : "en",
"language": "English",
}

View File

@@ -39,17 +39,16 @@ class FallenangelsChapterExtractor(ChapterExtractor):
ChapterExtractor.__init__(self, match, url)
def metadata(self, page):
extr = text.extract_from(page)
lang = "vi" if self.version == "truyen" else "en"
data = {
"chapter": self.chapter,
return {
"manga" : extr('name="description" content="', ' Chapter '),
"title" : extr(': ', ' - Page 1'),
"chapter" : self.chapter,
"chapter_minor": self.minor or "",
"lang": lang,
"lang" : lang,
"language": util.code_to_language(lang),
}
return text.extract_all(page, (
("manga", 'name="description" content="', ' Chapter '),
("title", ': ', ' - Page 1'),
), values=data)[0]
@staticmethod
def images(page):
@@ -83,26 +82,24 @@ class FallenangelsMangaExtractor(MangaExtractor):
MangaExtractor.__init__(self, match, url)
def chapters(self, page):
language = util.code_to_language(self.lang)
extr = text.extract_from(page)
results = []
pos = 0
while True:
test, pos = text.extract(page, '<li style="', '', pos)
if test is None:
return results
volume , pos = text.extract(page, 'class="volume-', '"', pos)
url , pos = text.extract(page, 'href="', '"', pos)
chapter, pos = text.extract(page, '>', '<', pos)
title , pos = text.extract(page, '<em>', '</em>', pos)
language = util.code_to_language(self.lang)
while extr('<li style="', '"'):
vol = extr('class="volume-', '"')
url = extr('href="', '"')
cha = extr('>', '<')
title = extr('<em>', '</em>')
manga, _, chapter = chapter.rpartition(" ")
manga, _, chapter = cha.rpartition(" ")
chapter, dot, minor = chapter.partition(".")
results.append((url, {
"manga": manga,
"title": text.unescape(title),
"volume": text.parse_int(volume),
"chapter": text.parse_int(chapter),
"manga" : manga,
"title" : text.unescape(title),
"volume" : text.parse_int(vol),
"chapter" : text.parse_int(chapter),
"chapter_minor": dot + minor,
"lang": self.lang,
"lang" : self.lang,
"language": language,
}))
return results

View File

@@ -38,6 +38,7 @@ class FoolslideBase(SharedConfigMixin):
data["volume"] = text.parse_int(info[2])
data["chapter"] = text.parse_int(info[3])
data["chapter_minor"] = "." + info[4] if len(info) >= 5 else ""
data["title"] = data["chapter_string"].partition(":")[2].strip()
return data
@@ -74,14 +75,11 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
yield Message.Url, url, data
def metadata(self, page):
_ , pos = text.extract(page, '<h1 class="tbtitle dnone">', '')
manga , pos = text.extract(page, 'title="', '"', pos)
chapter, pos = text.extract(page, 'title="', '"', pos)
chapter = text.unescape(chapter)
extr = text.extract_from(page)
extr('<h1 class="tbtitle dnone">', '')
return self.parse_chapter_url(self.chapter_url, {
"manga": text.unescape(manga).strip(),
"title": chapter.partition(":")[2].strip(),
"chapter_string": chapter,
"manga" : text.unescape(extr('title="', '"')).strip(),
"chapter_string": text.unescape(extr('title="', '"')),
})
def images(self, page):
@@ -101,25 +99,20 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
pattern_fmt = r"(/series/[^/?&#]+)"
def chapters(self, page):
manga , pos = text.extract(page, '<h1 class="title">', '</h1>')
author, pos = text.extract(page, '<b>Author</b>: ', '<br', pos)
artist, pos = text.extract(page, '<b>Artist</b>: ', '<br', pos)
manga = text.unescape(manga).strip()
extr = text.extract_from(page)
manga = text.unescape(extr('<h1 class="title">', '</h1>')).strip()
author = extr('<b>Author</b>: ', '<br')
artist = extr('<b>Artist</b>: ', '<br')
results = []
while True:
url, pos = text.extract(
page, '<div class="title"><a href="', '"', pos)
url = extr('<div class="title"><a href="', '"')
if not url:
return results
chapter, pos = text.extract(page, 'title="', '"', pos)
group , pos = text.extract(page, 'title="', '"', pos)
results.append((url, self.parse_chapter_url(url, {
"manga": manga, "author": author, "artist": artist,
"group": group, "chapter_string": chapter,
"title": chapter.partition(": ")[2] or "",
"chapter_string": extr('title="', '"'),
"group" : extr('title="', '"'),
})))
@@ -166,7 +159,7 @@ EXTRACTORS = {
"test-manga":
("https://reader.kireicake.com/series/wonderland/", {
"url": "d067b649af1cc88fa8c8b698fde04a10909fd169",
"keyword": "99caa336a9d48e27e3b8e56a0a1e6faf9fc13a51",
"keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb",
}),
},
"powermanga": {
@@ -214,7 +207,7 @@ EXTRACTORS = {
"test-manga":
("http://sensescans.com/reader/series/hakkenden/", {
"url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2",
"keyword": "122cf92c32e6428c50f56ffaf29d06b96750ed71",
"keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23",
}),
},
"worldthree": {

View File

@@ -21,7 +21,7 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
"url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
"keyword": "3688ddd3f3077c93eaa8021477ef66d18dc6c159",
"keyword": "7182d262810faa692827c947d2f360dfcb8d5e43",
})
root = "https://hentai.cafe"
@@ -51,7 +51,7 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
# foolslide URL
("https://hentai.cafe/manga/series/saitom-box/", {
"url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
"keyword": "46012b857eb1a1394bc55c0efe7aa4e7f704d10d",
"keyword": "f0ece32d958f889d8229ed4052716d398a0a875c",
}),
)
root = "https://hentai.cafe"

View File

@@ -31,25 +31,21 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
GalleryExtractor.__init__(self, match)
self.gallery_id = match.group(2)
def metadata(self, page):
title, pos = text.extract(page, "<h1>", "</h1>")
data = text.extract_all(page, (
("parody" , ">Parodies:" , "</a></span>"),
("characters", ">Characters:", "</a></span>"),
("tags" , ">Tags:" , "</a></span>"),
("artist" , ">Artists:" , "</a></span>"),
("group" , ">Groups:" , "</a></span>"),
("type" , ">Category:" , "</a></span>"),
), pos)[0]
def metadata(self, page, split=text.split_html):
extr = text.extract_from(page)
for key, value in data.items():
data[key] = text.split_html(value)[::2]
data["gallery_id"] = text.parse_int(self.gallery_id)
data["title"] = text.unescape(title)
data["type"] = data["type"][0] if data["type"] else ""
data["language"] = "English"
data["lang"] = "en"
return data
return {
"gallery_id": text.parse_int(self.gallery_id),
"title" : text.unescape(extr("<h1>", "</h1>")),
"parody" : split(extr(">Parodies:" , "</a></span>"))[::2],
"characters": split(extr(">Characters:", "</a></span>"))[::2],
"tags" : split(extr(">Tags:" , "</a></span>"))[::2],
"artist" : split(extr(">Artists:" , "</a></span>"))[::2],
"group" : split(extr(">Groups:" , "</a></span>"))[::2],
"type" : text.remove_html(extr(">Category:", "</a></span>")),
"language" : "English",
"lang" : "en",
}
def images(self, page):
return [

View File

@@ -38,18 +38,14 @@ class LivedoorExtractor(Extractor):
"""Return an iterable with post objects"""
def _load(self, data, body):
pid , pos = text.extract(data, "id : '" , "'")
title, pos = text.extract(data, "title : '", "'", pos)
cat1 , pos = text.extract(data, "name:'" , "'", pos)
cat2 , pos = text.extract(data, "name:'" , "'", pos)
date , pos = text.extract(data, "date : '" , "'", pos)
tags , pos = text.extract(body, '</dt><dd>', '</dl>')
extr = text.extract_from(data)
tags = text.extract(body, '</dt><dd>', '</dl>')[0]
return {
"id" : text.parse_int(pid),
"title" : title,
"date" : date,
"categories": [cat1, cat2],
"id" : text.parse_int(extr("id : '", "'")),
"title" : extr("title : '", "'"),
"categories": [extr("name:'", "'"), extr("name:'", "'")],
"date" : extr("date : '", "'"),
"tags" : text.split_html(tags),
"user" : self.user,
"body" : body,
@@ -108,22 +104,15 @@ class LivedoorBlogExtractor(LivedoorExtractor):
url = "{}/{}".format(self.root, self.user)
while url:
page = self.request(url).text
pos = 0
extr = text.extract_from(self.request(url).text)
while True:
data, pos = text.extract(page, '.articles.push(', ');', pos)
data = extr('.articles.push(', ');')
if not data:
break
body, pos = text.extract(
page,
'<div class="article-body-inner">',
'<!-- articleBody End -->',
pos,
)
body = extr('<div class="article-body-inner">',
'<!-- articleBody End -->')
yield self._load(data, body)
url = text.extract(page, '<a rel="next" href="', '"', pos)[0]
url = extr('<a rel="next" href="', '"')
class LivedoorPostExtractor(LivedoorExtractor):
@@ -148,13 +137,8 @@ class LivedoorPostExtractor(LivedoorExtractor):
def posts(self):
url = "{}/{}/archives/{}.html".format(
self.root, self.user, self.post_id)
page = self.request(url).text
data, pos = text.extract(page, 'articles :', '</script>')
body, pos = text.extract(
page,
'<div class="article-body-inner">',
'<!-- articleBody End -->',
pos,
)
extr = text.extract_from(self.request(url).text)
data = extr('articles :', '</script>')
body = extr('<div class="article-body-inner">',
'<!-- articleBody End -->')
return (self._load(data, body),)

View File

@@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
test = (
("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
"url": "7e4984a271a1072ac6483e4228a045895aff86f3",
"keyword": "1b1e8981afa250a0181e31b15ce3cbaa37c00856",
"keyword": "f9c34e1a5b0c1f119e9f644c99933ecf7d7dbfd2",
"content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
}),
("https://luscious.net/albums/virgin-killer-sweater_282582/", {

View File

@@ -77,42 +77,31 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
extr = text.extract
title, pos = extr(page, '"og:title" content="', '"')
thumb, pos = extr(page, '"og:image" content="', '"', pos)
extr = text.extract_from(page)
title = extr('"og:title" content="', '"')
title_en, _, title_jp = text.unescape(title).partition("/")
title_en = title_en.strip()
title_jp = title_jp.strip()
uploader , pos = extr(page, 'id="Uploader">' , '</div>', pos)
date , pos = extr(page, 'id="Uploaded">' , '</div>', pos)
rating , pos = extr(page, 'id="Rating">' , '</div>', pos)
gtype , pos = extr(page, 'id="Category">' , '</div>', pos)
collection, pos = extr(page, 'id="Collection">', '</div>', pos)
group , pos = extr(page, 'id="Group">' , '</div>', pos)
artist , pos = extr(page, 'id="Artist">' , '</div>', pos)
parody , pos = extr(page, 'id="Parody">' , '</div>', pos)
character , pos = extr(page, 'id="Character">' , '</div>', pos)
tags , pos = extr(page, 'id="Tag">' , '</div>', pos)
return {
"gallery_id": text.parse_int(self.gallery_id),
"title": title_en or title_jp,
"title_en": title_en,
"title_jp": title_jp,
"thumbnail": thumb,
"uploader": text.remove_html(uploader),
"date": date.strip(),
"rating": text.parse_float(rating.partition(" ")[0]),
"type": text.remove_html(gtype),
"collection": text.remove_html(collection),
"group": text.split_html(group),
"artist": text.split_html(artist),
"parody": text.split_html(parody),
"characters": text.split_html(character),
"tags": text.split_html(tags),
"language": "English",
"lang": "en",
"title" : title_en or title_jp,
"title_en" : title_en,
"title_jp" : title_jp,
"thumbnail" : extr('"og:image" content="', '"'),
"uploader" : text.remove_html(extr('id="Uploader">', '</div>')),
"date" : extr('id="Uploaded">', '</div>').strip(),
"rating" : text.parse_float(extr(
'id="Rating">', '</div>').partition(" ")[0]),
"type" : text.remove_html(extr('id="Category">' , '</div>')),
"collection": text.remove_html(extr('id="Collection">', '</div>')),
"group" : text.split_html(extr('id="Group">' , '</div>')),
"artist" : text.split_html(extr('id="Artist">' , '</div>')),
"parody" : text.split_html(extr('id="Parody">' , '</div>')),
"characters": text.split_html(extr('id="Character">' , '</div>')),
"tags" : text.split_html(extr('id="Tag">' , '</div>')),
"language" : "English",
"lang" : "en",
}
def images(self, page):

View File

@@ -87,18 +87,15 @@ class TwitterExtractor(Extractor):
@staticmethod
def _data_from_tweet(tweet):
data = text.extract_all(tweet, (
("tweet_id" , 'data-tweet-id="' , '"'),
("retweet_id", 'data-retweet-id="' , '"'),
("retweeter" , 'data-retweeter="' , '"'),
("user" , 'data-screen-name="', '"'),
("username" , 'data-name="' , '"'),
("user_id" , 'data-user-id="' , '"'),
))[0]
for key in ("tweet_id", "retweet_id", "user_id"):
data[key] = text.parse_int(data[key])
data["retweeter"] = data["retweeter"] or ""
return data
extr = text.extract_from(tweet)
return {
"tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
"retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
"retweeter" : extr('data-retweeter="' , '"'),
"user" : extr('data-screen-name="', '"'),
"username" : extr('data-name="' , '"'),
"user_id" : text.parse_int(extr('data-user-id="' , '"')),
}
def _tweets_from_api(self, url):
params = {