[erome] restructure extractor hierarchy (#7804)
This commit is contained in:
@@ -22,62 +22,20 @@ class EromeExtractor(Extractor):
|
||||
filename_fmt = "{album_id} {title} {num:>02}.{extension}"
|
||||
archive_fmt = "{album_id}_{num}"
|
||||
root = "https://www.erome.com"
|
||||
_cookies = True
|
||||
|
||||
def items(self):
|
||||
self.__cookies = True
|
||||
base = f"{self.root}/a/"
|
||||
data = {"_extractor": EromeAlbumExtractor}
|
||||
for album_id in self.albums():
|
||||
url = f"{self.root}/a/{album_id}"
|
||||
|
||||
try:
|
||||
page = self.request(url).text
|
||||
except exception.HttpError as exc:
|
||||
self.log.warning(
|
||||
"Unable to fetch album '%s' (%s)", album_id, exc)
|
||||
continue
|
||||
|
||||
title, pos = text.extract(
|
||||
page, 'property="og:title" content="', '"')
|
||||
pos = page.index('<div class="user-profile', pos)
|
||||
user, pos = text.extract(
|
||||
page, 'href="https://www.erome.com/', '"', pos)
|
||||
tags, pos = text.extract(
|
||||
page, '<p class="mt-10"', '</p>', pos)
|
||||
|
||||
urls = []
|
||||
date = None
|
||||
groups = page.split('<div class="media-group"')
|
||||
for group in util.advance(groups, 1):
|
||||
url = (text.extr(group, '<source src="', '"') or
|
||||
text.extr(group, 'data-src="', '"'))
|
||||
if url:
|
||||
urls.append(url)
|
||||
if not date:
|
||||
ts = text.extr(group, '?v=', '"')
|
||||
if len(ts) > 1:
|
||||
date = text.parse_timestamp(ts)
|
||||
|
||||
data = {
|
||||
"album_id": album_id,
|
||||
"title" : text.unescape(title),
|
||||
"user" : text.unquote(user),
|
||||
"count" : len(urls),
|
||||
"date" : date,
|
||||
"tags" : ([t.replace("+", " ")
|
||||
for t in text.extract_iter(tags, "?q=", '"')]
|
||||
if tags else ()),
|
||||
"_http_headers": {"Referer": url},
|
||||
}
|
||||
|
||||
yield Message.Directory, data
|
||||
for data["num"], url in enumerate(urls, 1):
|
||||
yield Message.Url, url, text.nameext_from_url(url, data)
|
||||
yield Message.Queue, f"{base}{album_id}", data
|
||||
|
||||
def albums(self):
|
||||
return ()
|
||||
|
||||
def request(self, url, **kwargs):
|
||||
if self.__cookies:
|
||||
self.__cookies = False
|
||||
if self._cookies:
|
||||
self._cookies = False
|
||||
self.cookies.update(_cookie_cache())
|
||||
|
||||
for _ in range(5):
|
||||
@@ -106,8 +64,52 @@ class EromeAlbumExtractor(EromeExtractor):
|
||||
pattern = BASE_PATTERN + r"/a/(\w+)"
|
||||
example = "https://www.erome.com/a/ID"
|
||||
|
||||
def albums(self):
|
||||
return (self.groups[0],)
|
||||
def items(self):
|
||||
album_id = self.groups[0]
|
||||
url = f"{self.root}/a/{album_id}"
|
||||
|
||||
try:
|
||||
page = self.request(url).text
|
||||
except exception.HttpError as exc:
|
||||
raise exception.AbortExtraction(
|
||||
f"{album_id}: Unable to fetch album page ({exc})")
|
||||
|
||||
title, pos = text.extract(
|
||||
page, 'property="og:title" content="', '"')
|
||||
pos = page.index('<div class="user-profile', pos)
|
||||
user, pos = text.extract(
|
||||
page, 'href="https://www.erome.com/', '"', pos)
|
||||
tags, pos = text.extract(
|
||||
page, '<p class="mt-10"', '</p>', pos)
|
||||
|
||||
urls = []
|
||||
date = None
|
||||
groups = page.split('<div class="media-group"')
|
||||
for group in util.advance(groups, 1):
|
||||
url = (text.extr(group, '<source src="', '"') or
|
||||
text.extr(group, 'data-src="', '"'))
|
||||
if url:
|
||||
urls.append(url)
|
||||
if not date:
|
||||
ts = text.extr(group, '?v=', '"')
|
||||
if len(ts) > 1:
|
||||
date = text.parse_timestamp(ts)
|
||||
|
||||
data = {
|
||||
"album_id": album_id,
|
||||
"title" : text.unescape(title),
|
||||
"user" : text.unquote(user),
|
||||
"count" : len(urls),
|
||||
"date" : date,
|
||||
"tags" : ([t.replace("+", " ")
|
||||
for t in text.extract_iter(tags, "?q=", '"')]
|
||||
if tags else ()),
|
||||
"_http_headers": {"Referer": url},
|
||||
}
|
||||
|
||||
yield Message.Directory, data
|
||||
for data["num"], url in enumerate(urls, 1):
|
||||
yield Message.Url, url, text.nameext_from_url(url, data)
|
||||
|
||||
|
||||
class EromeUserExtractor(EromeExtractor):
|
||||
|
||||
@@ -42,6 +42,7 @@ __tests__ = (
|
||||
"#url" : "https://www.erome.com/yYgWBZw8o8qsMzM",
|
||||
"#category": ("", "erome", "user"),
|
||||
"#class" : erome.EromeUserExtractor,
|
||||
"#pattern" : erome.EromeAlbumExtractor.pattern,
|
||||
"#range" : "1-25",
|
||||
"#count" : 25,
|
||||
},
|
||||
@@ -50,6 +51,7 @@ __tests__ = (
|
||||
"#url" : "https://www.erome.com/search?q=cute",
|
||||
"#category": ("", "erome", "search"),
|
||||
"#class" : erome.EromeSearchExtractor,
|
||||
"#pattern" : erome.EromeAlbumExtractor.pattern,
|
||||
"#range" : "1-25",
|
||||
"#count" : 25,
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user