[shimmie2] autodetect single or double quotes

This commit is contained in:
Mike Fährmann
2023-12-22 15:25:28 +01:00
parent 2a60645095
commit 7cd0211cc9

View File

@@ -35,8 +35,6 @@ class Shimmie2Extractor(BaseExtractor):
if self.category == "giantessbooru":
self.posts = self._posts_giantessbooru
elif self.category == "rule34hentai":
self.posts = self._posts_rule34hentai
def items(self):
data = self.metadata()
@@ -66,6 +64,13 @@ class Shimmie2Extractor(BaseExtractor):
"""Return an iterable containing data of all relevant posts"""
return ()
def _quote_type(self, page):
"""Return quoting character used in 'page' (' or ")"""
try:
return page[page.index("<link rel=")+10]
except Exception:
return "'"
INSTANCES = {
"loudbooru": {
@@ -127,21 +132,23 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
if init:
init = False
has_mime = ("data-mime='" in page)
has_pid = ("data-post-id='" in page)
quote = self._quote_type(page)
has_mime = (" data-mime=" in page)
has_pid = (" data-post-id=" in page)
while True:
if has_mime:
mime = extr("data-mime='", "'")
mime = extr(" data-mime="+quote, quote)
if has_pid:
pid = extr("data-post-id='", "'")
pid = extr(" data-post-id="+quote, quote)
else:
pid = extr("href='/post/view/", "?")
pid = extr(" href='/post/view/", quote)
if not pid:
break
tags, dimensions, size = extr("title='", "'").split(" // ")
tags, dimensions, size = extr(
"title="+quote, quote).split(" // ")[:3]
width, _, height = dimensions.partition("x")
md5 = extr("/_thumbs/", "/")
@@ -193,56 +200,6 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
if not extr('/{}">{}<'.format(pnum, pnum), ">"):
return
def _posts_rule34hentai(self):
pnum = text.parse_int(self.page, 1)
file_url_fmt = self.file_url_fmt.format
init = True
mime = ""
while True:
url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
page = self.request(url).text
extr = text.extract_from(page)
if init:
init = False
has_mime = ("data-mime=\"" in page)
has_pid = ("data-post-id=\"" in page)
while True:
if has_mime:
mime = extr("data-mime=\"", "\"")
if has_pid:
pid = extr("data-post-id=\"", "\"")
else:
pid = extr("href='/post/view/", "?")
if not pid:
break
tags, dimensions, size, ext = extr(
"title=\"", "\"").split(" // ")
width, _, height = dimensions.partition("x")
md5 = extr("/_thumbs/", "/")
yield {
"file_url": file_url_fmt(
self.root, md5, pid, text.quote(tags),
mime.rpartition("/")[2] if mime else "jpg"),
"id": pid,
"md5": md5,
"tags": tags,
"width": width,
"height": height,
"size": text.parse_bytes(size[:-1]),
}
pnum += 1
if not extr(">Next<", ">"):
if not extr("/{}'>{}<".format(pnum, pnum), ">"):
return
class Shimmie2PostExtractor(Shimmie2Extractor):
"""Extractor for single shimmie2 posts"""
@@ -256,15 +213,17 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
def posts(self):
url = "{}/post/view/{}".format(self.root, self.post_id)
extr = text.extract_from(self.request(url).text)
page = self.request(url).text
extr = text.extract_from(page)
quote = self._quote_type(page)
post = {
"id" : self.post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : extr("/_thumbs/", "/"),
"file_url": self.root + (
extr("id='main_image' src='", "'") or
extr("<source src='", "'")).lstrip("."),
extr("id={0}main_image{0} src={0}".format(quote), quote) or
extr("<source src="+quote, quote)).lstrip("."),
"width" : extr("data-width=", " ").strip("\"'"),
"height" : extr("data-height=", ">").partition(
" ")[0].strip("\"'"),
@@ -290,25 +249,3 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
"height" : 0,
"size" : 0,
},)
def _posts_rule34hentai(self):
url = "{}/post/view/{}".format(self.root, self.post_id)
extr = text.extract_from(self.request(url).text)
post = {
"id" : self.post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : extr("/_thumbs/", "/"),
"file_url": self.root + (
extr('id="main_image" src="', '"') or
extr('<source src="', '"')).lstrip("."),
"width" : extr("data-width=", " ").strip("\"'"),
"height" : extr("data-height=", ">").partition(
" ")[0].strip("\"'"),
"size" : 0,
}
if not post["md5"]:
post["md5"] = text.extr(post["file_url"], "/_images/", "/")
return (post,)