use 'text.extract_from()' in a few places
This commit is contained in:
@@ -71,7 +71,7 @@ class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin,
|
||||
r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
|
||||
r"(?:\?(?P<query>[^#]*))?")
|
||||
test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", {
|
||||
"url": "a447e115fdab60c25ab71c4fdb1b9f509bc23f99",
|
||||
"url": "c70268dce441a9ccc3383c244ec15edb059f494f",
|
||||
"count": 20,
|
||||
})
|
||||
|
||||
|
||||
@@ -24,21 +24,21 @@ class DynastyscansBase():
|
||||
|
||||
def _parse_image_page(self, image_id):
|
||||
url = "{}/images/{}".format(self.root, image_id)
|
||||
page = self.request(url).text
|
||||
extr = text.extract_from(self.request(url).text)
|
||||
|
||||
date, pos = text.extract(page, "class='create_at'>", "</span>")
|
||||
tags, pos = text.extract(page, "class='tags'>", "</span>", pos)
|
||||
src , pos = text.extract(page, "class='btn-group'>", "</div>", pos)
|
||||
url , pos = text.extract(page, ' src="', '"', pos)
|
||||
date = extr("class='create_at'>", "</span>")
|
||||
tags = extr("class='tags'>", "</span>")
|
||||
src = extr("class='btn-group'>", "</div>")
|
||||
url = extr(' src="', '"')
|
||||
|
||||
src = text.extract(src, 'href="', '"')[0] if "Source<" in src else ""
|
||||
|
||||
return {
|
||||
"url": self.root + url,
|
||||
"url" : self.root + url,
|
||||
"image_id": text.parse_int(image_id),
|
||||
"tags": text.split_html(text.unescape(tags)),
|
||||
"date": text.remove_html(date),
|
||||
"source": text.unescape(src),
|
||||
"tags" : text.split_html(text.unescape(tags)),
|
||||
"date" : text.remove_html(date),
|
||||
"source" : text.unescape(src),
|
||||
}
|
||||
|
||||
|
||||
@@ -59,28 +59,26 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
|
||||
)
|
||||
|
||||
def metadata(self, page):
|
||||
info , pos = text.extract(page, "<h3 id='chapter-title'><b>", "</b>")
|
||||
author, pos = text.extract(page, " by ", "</a>", pos)
|
||||
group , pos = text.extract(page, '"icon-print"></i> ', '</span>', pos)
|
||||
date , pos = text.extract(page, '"icon-calendar"></i> ', '<', pos)
|
||||
|
||||
extr = text.extract_from(page)
|
||||
match = re.match(
|
||||
(r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
|
||||
r"(?: ch(\d+)([^:<]*))?" # chapter info
|
||||
r"(?:: (.+))?"), # title
|
||||
info
|
||||
extr("<h3 id='chapter-title'><b>", "</b>"),
|
||||
)
|
||||
author = extr(" by ", "</a>")
|
||||
group = extr('"icon-print"></i> ', '</span>')
|
||||
|
||||
return {
|
||||
"manga": text.unescape(match.group(1)),
|
||||
"chapter": text.parse_int(match.group(2)),
|
||||
"manga" : text.unescape(match.group(1)),
|
||||
"chapter" : text.parse_int(match.group(2)),
|
||||
"chapter_minor": match.group(3) or "",
|
||||
"title": text.unescape(match.group(4) or ""),
|
||||
"author": text.remove_html(author),
|
||||
"group": (text.remove_html(group) or
|
||||
text.extract(group, ' alt="', '"')[0] or ""),
|
||||
"date": date,
|
||||
"lang": "en",
|
||||
"title" : text.unescape(match.group(4) or ""),
|
||||
"author" : text.remove_html(author),
|
||||
"group" : (text.remove_html(group) or
|
||||
text.extract(group, ' alt="', '"')[0] or ""),
|
||||
"date" : extr('"icon-calendar"></i> ', '<'),
|
||||
"lang" : "en",
|
||||
"language": "English",
|
||||
}
|
||||
|
||||
|
||||
@@ -39,17 +39,16 @@ class FallenangelsChapterExtractor(ChapterExtractor):
|
||||
ChapterExtractor.__init__(self, match, url)
|
||||
|
||||
def metadata(self, page):
|
||||
extr = text.extract_from(page)
|
||||
lang = "vi" if self.version == "truyen" else "en"
|
||||
data = {
|
||||
"chapter": self.chapter,
|
||||
return {
|
||||
"manga" : extr('name="description" content="', ' Chapter '),
|
||||
"title" : extr(': ', ' - Page 1'),
|
||||
"chapter" : self.chapter,
|
||||
"chapter_minor": self.minor or "",
|
||||
"lang": lang,
|
||||
"lang" : lang,
|
||||
"language": util.code_to_language(lang),
|
||||
}
|
||||
return text.extract_all(page, (
|
||||
("manga", 'name="description" content="', ' Chapter '),
|
||||
("title", ': ', ' - Page 1'),
|
||||
), values=data)[0]
|
||||
|
||||
@staticmethod
|
||||
def images(page):
|
||||
@@ -83,26 +82,24 @@ class FallenangelsMangaExtractor(MangaExtractor):
|
||||
MangaExtractor.__init__(self, match, url)
|
||||
|
||||
def chapters(self, page):
|
||||
language = util.code_to_language(self.lang)
|
||||
extr = text.extract_from(page)
|
||||
results = []
|
||||
pos = 0
|
||||
while True:
|
||||
test, pos = text.extract(page, '<li style="', '', pos)
|
||||
if test is None:
|
||||
return results
|
||||
volume , pos = text.extract(page, 'class="volume-', '"', pos)
|
||||
url , pos = text.extract(page, 'href="', '"', pos)
|
||||
chapter, pos = text.extract(page, '>', '<', pos)
|
||||
title , pos = text.extract(page, '<em>', '</em>', pos)
|
||||
language = util.code_to_language(self.lang)
|
||||
while extr('<li style="', '"'):
|
||||
vol = extr('class="volume-', '"')
|
||||
url = extr('href="', '"')
|
||||
cha = extr('>', '<')
|
||||
title = extr('<em>', '</em>')
|
||||
|
||||
manga, _, chapter = chapter.rpartition(" ")
|
||||
manga, _, chapter = cha.rpartition(" ")
|
||||
chapter, dot, minor = chapter.partition(".")
|
||||
results.append((url, {
|
||||
"manga": manga,
|
||||
"title": text.unescape(title),
|
||||
"volume": text.parse_int(volume),
|
||||
"chapter": text.parse_int(chapter),
|
||||
"manga" : manga,
|
||||
"title" : text.unescape(title),
|
||||
"volume" : text.parse_int(vol),
|
||||
"chapter" : text.parse_int(chapter),
|
||||
"chapter_minor": dot + minor,
|
||||
"lang": self.lang,
|
||||
"lang" : self.lang,
|
||||
"language": language,
|
||||
}))
|
||||
return results
|
||||
|
||||
@@ -38,6 +38,7 @@ class FoolslideBase(SharedConfigMixin):
|
||||
data["volume"] = text.parse_int(info[2])
|
||||
data["chapter"] = text.parse_int(info[3])
|
||||
data["chapter_minor"] = "." + info[4] if len(info) >= 5 else ""
|
||||
data["title"] = data["chapter_string"].partition(":")[2].strip()
|
||||
return data
|
||||
|
||||
|
||||
@@ -74,14 +75,11 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
|
||||
yield Message.Url, url, data
|
||||
|
||||
def metadata(self, page):
|
||||
_ , pos = text.extract(page, '<h1 class="tbtitle dnone">', '')
|
||||
manga , pos = text.extract(page, 'title="', '"', pos)
|
||||
chapter, pos = text.extract(page, 'title="', '"', pos)
|
||||
chapter = text.unescape(chapter)
|
||||
extr = text.extract_from(page)
|
||||
extr('<h1 class="tbtitle dnone">', '')
|
||||
return self.parse_chapter_url(self.chapter_url, {
|
||||
"manga": text.unescape(manga).strip(),
|
||||
"title": chapter.partition(":")[2].strip(),
|
||||
"chapter_string": chapter,
|
||||
"manga" : text.unescape(extr('title="', '"')).strip(),
|
||||
"chapter_string": text.unescape(extr('title="', '"')),
|
||||
})
|
||||
|
||||
def images(self, page):
|
||||
@@ -101,25 +99,20 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
|
||||
pattern_fmt = r"(/series/[^/?&#]+)"
|
||||
|
||||
def chapters(self, page):
|
||||
manga , pos = text.extract(page, '<h1 class="title">', '</h1>')
|
||||
author, pos = text.extract(page, '<b>Author</b>: ', '<br', pos)
|
||||
artist, pos = text.extract(page, '<b>Artist</b>: ', '<br', pos)
|
||||
manga = text.unescape(manga).strip()
|
||||
extr = text.extract_from(page)
|
||||
manga = text.unescape(extr('<h1 class="title">', '</h1>')).strip()
|
||||
author = extr('<b>Author</b>: ', '<br')
|
||||
artist = extr('<b>Artist</b>: ', '<br')
|
||||
|
||||
results = []
|
||||
while True:
|
||||
url, pos = text.extract(
|
||||
page, '<div class="title"><a href="', '"', pos)
|
||||
url = extr('<div class="title"><a href="', '"')
|
||||
if not url:
|
||||
return results
|
||||
|
||||
chapter, pos = text.extract(page, 'title="', '"', pos)
|
||||
group , pos = text.extract(page, 'title="', '"', pos)
|
||||
|
||||
results.append((url, self.parse_chapter_url(url, {
|
||||
"manga": manga, "author": author, "artist": artist,
|
||||
"group": group, "chapter_string": chapter,
|
||||
"title": chapter.partition(": ")[2] or "",
|
||||
"chapter_string": extr('title="', '"'),
|
||||
"group" : extr('title="', '"'),
|
||||
})))
|
||||
|
||||
|
||||
@@ -166,7 +159,7 @@ EXTRACTORS = {
|
||||
"test-manga":
|
||||
("https://reader.kireicake.com/series/wonderland/", {
|
||||
"url": "d067b649af1cc88fa8c8b698fde04a10909fd169",
|
||||
"keyword": "99caa336a9d48e27e3b8e56a0a1e6faf9fc13a51",
|
||||
"keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb",
|
||||
}),
|
||||
},
|
||||
"powermanga": {
|
||||
@@ -214,7 +207,7 @@ EXTRACTORS = {
|
||||
"test-manga":
|
||||
("http://sensescans.com/reader/series/hakkenden/", {
|
||||
"url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2",
|
||||
"keyword": "122cf92c32e6428c50f56ffaf29d06b96750ed71",
|
||||
"keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23",
|
||||
}),
|
||||
},
|
||||
"worldthree": {
|
||||
|
||||
@@ -21,7 +21,7 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
|
||||
r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
|
||||
test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
|
||||
"url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
|
||||
"keyword": "3688ddd3f3077c93eaa8021477ef66d18dc6c159",
|
||||
"keyword": "7182d262810faa692827c947d2f360dfcb8d5e43",
|
||||
})
|
||||
root = "https://hentai.cafe"
|
||||
|
||||
@@ -51,7 +51,7 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
|
||||
# foolslide URL
|
||||
("https://hentai.cafe/manga/series/saitom-box/", {
|
||||
"url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
|
||||
"keyword": "46012b857eb1a1394bc55c0efe7aa4e7f704d10d",
|
||||
"keyword": "f0ece32d958f889d8229ed4052716d398a0a875c",
|
||||
}),
|
||||
)
|
||||
root = "https://hentai.cafe"
|
||||
|
||||
@@ -31,25 +31,21 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
|
||||
GalleryExtractor.__init__(self, match)
|
||||
self.gallery_id = match.group(2)
|
||||
|
||||
def metadata(self, page):
|
||||
title, pos = text.extract(page, "<h1>", "</h1>")
|
||||
data = text.extract_all(page, (
|
||||
("parody" , ">Parodies:" , "</a></span>"),
|
||||
("characters", ">Characters:", "</a></span>"),
|
||||
("tags" , ">Tags:" , "</a></span>"),
|
||||
("artist" , ">Artists:" , "</a></span>"),
|
||||
("group" , ">Groups:" , "</a></span>"),
|
||||
("type" , ">Category:" , "</a></span>"),
|
||||
), pos)[0]
|
||||
def metadata(self, page, split=text.split_html):
|
||||
extr = text.extract_from(page)
|
||||
|
||||
for key, value in data.items():
|
||||
data[key] = text.split_html(value)[::2]
|
||||
data["gallery_id"] = text.parse_int(self.gallery_id)
|
||||
data["title"] = text.unescape(title)
|
||||
data["type"] = data["type"][0] if data["type"] else ""
|
||||
data["language"] = "English"
|
||||
data["lang"] = "en"
|
||||
return data
|
||||
return {
|
||||
"gallery_id": text.parse_int(self.gallery_id),
|
||||
"title" : text.unescape(extr("<h1>", "</h1>")),
|
||||
"parody" : split(extr(">Parodies:" , "</a></span>"))[::2],
|
||||
"characters": split(extr(">Characters:", "</a></span>"))[::2],
|
||||
"tags" : split(extr(">Tags:" , "</a></span>"))[::2],
|
||||
"artist" : split(extr(">Artists:" , "</a></span>"))[::2],
|
||||
"group" : split(extr(">Groups:" , "</a></span>"))[::2],
|
||||
"type" : text.remove_html(extr(">Category:", "</a></span>")),
|
||||
"language" : "English",
|
||||
"lang" : "en",
|
||||
}
|
||||
|
||||
def images(self, page):
|
||||
return [
|
||||
|
||||
@@ -38,18 +38,14 @@ class LivedoorExtractor(Extractor):
|
||||
"""Return an iterable with post objects"""
|
||||
|
||||
def _load(self, data, body):
|
||||
pid , pos = text.extract(data, "id : '" , "'")
|
||||
title, pos = text.extract(data, "title : '", "'", pos)
|
||||
cat1 , pos = text.extract(data, "name:'" , "'", pos)
|
||||
cat2 , pos = text.extract(data, "name:'" , "'", pos)
|
||||
date , pos = text.extract(data, "date : '" , "'", pos)
|
||||
tags , pos = text.extract(body, '</dt><dd>', '</dl>')
|
||||
extr = text.extract_from(data)
|
||||
tags = text.extract(body, '</dt><dd>', '</dl>')[0]
|
||||
|
||||
return {
|
||||
"id" : text.parse_int(pid),
|
||||
"title" : title,
|
||||
"date" : date,
|
||||
"categories": [cat1, cat2],
|
||||
"id" : text.parse_int(extr("id : '", "'")),
|
||||
"title" : extr("title : '", "'"),
|
||||
"categories": [extr("name:'", "'"), extr("name:'", "'")],
|
||||
"date" : extr("date : '", "'"),
|
||||
"tags" : text.split_html(tags),
|
||||
"user" : self.user,
|
||||
"body" : body,
|
||||
@@ -108,22 +104,15 @@ class LivedoorBlogExtractor(LivedoorExtractor):
|
||||
url = "{}/{}".format(self.root, self.user)
|
||||
|
||||
while url:
|
||||
page = self.request(url).text
|
||||
pos = 0
|
||||
|
||||
extr = text.extract_from(self.request(url).text)
|
||||
while True:
|
||||
data, pos = text.extract(page, '.articles.push(', ');', pos)
|
||||
data = extr('.articles.push(', ');')
|
||||
if not data:
|
||||
break
|
||||
body, pos = text.extract(
|
||||
page,
|
||||
'<div class="article-body-inner">',
|
||||
'<!-- articleBody End -->',
|
||||
pos,
|
||||
)
|
||||
body = extr('<div class="article-body-inner">',
|
||||
'<!-- articleBody End -->')
|
||||
yield self._load(data, body)
|
||||
|
||||
url = text.extract(page, '<a rel="next" href="', '"', pos)[0]
|
||||
url = extr('<a rel="next" href="', '"')
|
||||
|
||||
|
||||
class LivedoorPostExtractor(LivedoorExtractor):
|
||||
@@ -148,13 +137,8 @@ class LivedoorPostExtractor(LivedoorExtractor):
|
||||
def posts(self):
|
||||
url = "{}/{}/archives/{}.html".format(
|
||||
self.root, self.user, self.post_id)
|
||||
page = self.request(url).text
|
||||
|
||||
data, pos = text.extract(page, 'articles :', '</script>')
|
||||
body, pos = text.extract(
|
||||
page,
|
||||
'<div class="article-body-inner">',
|
||||
'<!-- articleBody End -->',
|
||||
pos,
|
||||
)
|
||||
extr = text.extract_from(self.request(url).text)
|
||||
data = extr('articles :', '</script>')
|
||||
body = extr('<div class="article-body-inner">',
|
||||
'<!-- articleBody End -->')
|
||||
return (self._load(data, body),)
|
||||
|
||||
@@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
|
||||
test = (
|
||||
("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
|
||||
"url": "7e4984a271a1072ac6483e4228a045895aff86f3",
|
||||
"keyword": "1b1e8981afa250a0181e31b15ce3cbaa37c00856",
|
||||
"keyword": "f9c34e1a5b0c1f119e9f644c99933ecf7d7dbfd2",
|
||||
"content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
|
||||
}),
|
||||
("https://luscious.net/albums/virgin-killer-sweater_282582/", {
|
||||
|
||||
@@ -77,42 +77,31 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
|
||||
GalleryExtractor.__init__(self, match, url)
|
||||
|
||||
def metadata(self, page):
|
||||
extr = text.extract
|
||||
title, pos = extr(page, '"og:title" content="', '"')
|
||||
thumb, pos = extr(page, '"og:image" content="', '"', pos)
|
||||
extr = text.extract_from(page)
|
||||
title = extr('"og:title" content="', '"')
|
||||
title_en, _, title_jp = text.unescape(title).partition("/")
|
||||
title_en = title_en.strip()
|
||||
title_jp = title_jp.strip()
|
||||
|
||||
uploader , pos = extr(page, 'id="Uploader">' , '</div>', pos)
|
||||
date , pos = extr(page, 'id="Uploaded">' , '</div>', pos)
|
||||
rating , pos = extr(page, 'id="Rating">' , '</div>', pos)
|
||||
gtype , pos = extr(page, 'id="Category">' , '</div>', pos)
|
||||
collection, pos = extr(page, 'id="Collection">', '</div>', pos)
|
||||
group , pos = extr(page, 'id="Group">' , '</div>', pos)
|
||||
artist , pos = extr(page, 'id="Artist">' , '</div>', pos)
|
||||
parody , pos = extr(page, 'id="Parody">' , '</div>', pos)
|
||||
character , pos = extr(page, 'id="Character">' , '</div>', pos)
|
||||
tags , pos = extr(page, 'id="Tag">' , '</div>', pos)
|
||||
|
||||
return {
|
||||
"gallery_id": text.parse_int(self.gallery_id),
|
||||
"title": title_en or title_jp,
|
||||
"title_en": title_en,
|
||||
"title_jp": title_jp,
|
||||
"thumbnail": thumb,
|
||||
"uploader": text.remove_html(uploader),
|
||||
"date": date.strip(),
|
||||
"rating": text.parse_float(rating.partition(" ")[0]),
|
||||
"type": text.remove_html(gtype),
|
||||
"collection": text.remove_html(collection),
|
||||
"group": text.split_html(group),
|
||||
"artist": text.split_html(artist),
|
||||
"parody": text.split_html(parody),
|
||||
"characters": text.split_html(character),
|
||||
"tags": text.split_html(tags),
|
||||
"language": "English",
|
||||
"lang": "en",
|
||||
"title" : title_en or title_jp,
|
||||
"title_en" : title_en,
|
||||
"title_jp" : title_jp,
|
||||
"thumbnail" : extr('"og:image" content="', '"'),
|
||||
"uploader" : text.remove_html(extr('id="Uploader">', '</div>')),
|
||||
"date" : extr('id="Uploaded">', '</div>').strip(),
|
||||
"rating" : text.parse_float(extr(
|
||||
'id="Rating">', '</div>').partition(" ")[0]),
|
||||
"type" : text.remove_html(extr('id="Category">' , '</div>')),
|
||||
"collection": text.remove_html(extr('id="Collection">', '</div>')),
|
||||
"group" : text.split_html(extr('id="Group">' , '</div>')),
|
||||
"artist" : text.split_html(extr('id="Artist">' , '</div>')),
|
||||
"parody" : text.split_html(extr('id="Parody">' , '</div>')),
|
||||
"characters": text.split_html(extr('id="Character">' , '</div>')),
|
||||
"tags" : text.split_html(extr('id="Tag">' , '</div>')),
|
||||
"language" : "English",
|
||||
"lang" : "en",
|
||||
}
|
||||
|
||||
def images(self, page):
|
||||
|
||||
@@ -87,18 +87,15 @@ class TwitterExtractor(Extractor):
|
||||
|
||||
@staticmethod
|
||||
def _data_from_tweet(tweet):
|
||||
data = text.extract_all(tweet, (
|
||||
("tweet_id" , 'data-tweet-id="' , '"'),
|
||||
("retweet_id", 'data-retweet-id="' , '"'),
|
||||
("retweeter" , 'data-retweeter="' , '"'),
|
||||
("user" , 'data-screen-name="', '"'),
|
||||
("username" , 'data-name="' , '"'),
|
||||
("user_id" , 'data-user-id="' , '"'),
|
||||
))[0]
|
||||
for key in ("tweet_id", "retweet_id", "user_id"):
|
||||
data[key] = text.parse_int(data[key])
|
||||
data["retweeter"] = data["retweeter"] or ""
|
||||
return data
|
||||
extr = text.extract_from(tweet)
|
||||
return {
|
||||
"tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
|
||||
"retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
|
||||
"retweeter" : extr('data-retweeter="' , '"'),
|
||||
"user" : extr('data-screen-name="', '"'),
|
||||
"username" : extr('data-name="' , '"'),
|
||||
"user_id" : text.parse_int(extr('data-user-id="' , '"')),
|
||||
}
|
||||
|
||||
def _tweets_from_api(self, url):
|
||||
params = {
|
||||
|
||||
Reference in New Issue
Block a user