[chevereto:album] extract 'album_…' metadata (#8604)
This commit is contained in:
@@ -21,10 +21,12 @@ class CheveretoExtractor(BaseExtractor):
|
|||||||
def _init(self):
|
def _init(self):
|
||||||
self.path = self.groups[-1]
|
self.path = self.groups[-1]
|
||||||
|
|
||||||
def _pagination(self, url):
|
def _pagination(self, url, callback=None):
|
||||||
while True:
|
page = self.request(url).text
|
||||||
page = self.request(url).text
|
if callback is not None:
|
||||||
|
callback(page)
|
||||||
|
|
||||||
|
while True:
|
||||||
for item in text.extract_iter(
|
for item in text.extract_iter(
|
||||||
page, '<div class="list-item-image ', 'image-container'):
|
page, '<div class="list-item-image ', 'image-container'):
|
||||||
yield text.urljoin(self.root, text.extr(
|
yield text.urljoin(self.root, text.extr(
|
||||||
@@ -35,6 +37,7 @@ class CheveretoExtractor(BaseExtractor):
|
|||||||
return
|
return
|
||||||
if url[0] == "/":
|
if url[0] == "/":
|
||||||
url = self.root + url
|
url = self.root + url
|
||||||
|
page = self.request(url).text
|
||||||
|
|
||||||
|
|
||||||
BASE_PATTERN = CheveretoExtractor.update({
|
BASE_PATTERN = CheveretoExtractor.update({
|
||||||
@@ -155,10 +158,21 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
|
|||||||
albums = (url,)
|
albums = (url,)
|
||||||
|
|
||||||
for album in albums:
|
for album in albums:
|
||||||
for item_url in self._pagination(album):
|
for item_url in self._pagination(
|
||||||
|
album, self._extract_metadata_album):
|
||||||
data = data_video if "/video/" in item_url else data_image
|
data = data_video if "/video/" in item_url else data_image
|
||||||
yield Message.Queue, item_url, data
|
yield Message.Queue, item_url, data
|
||||||
|
|
||||||
|
def _extract_metadata_album(self, page):
|
||||||
|
url, pos = text.extract(
|
||||||
|
page, 'property="og:url" content="', '"')
|
||||||
|
|
||||||
|
kwdict = self.kwdict
|
||||||
|
kwdict["album_slug"], _, kwdict["album_id"] = \
|
||||||
|
url[url.rfind("/")+1:].rpartition(".")
|
||||||
|
kwdict["album"] = text.unescape(text.extract(
|
||||||
|
page, 'property="og:title" content="', '"', pos)[0])
|
||||||
|
|
||||||
|
|
||||||
class CheveretoCategoryExtractor(CheveretoExtractor):
|
class CheveretoCategoryExtractor(CheveretoExtractor):
|
||||||
"""Extractor for chevereto galleries"""
|
"""Extractor for chevereto galleries"""
|
||||||
|
|||||||
@@ -118,6 +118,10 @@ __tests__ = (
|
|||||||
"#category": ("chevereto", "jpgfish", "album"),
|
"#category": ("chevereto", "jpgfish", "album"),
|
||||||
"#class" : chevereto.CheveretoAlbumExtractor,
|
"#class" : chevereto.CheveretoAlbumExtractor,
|
||||||
"#count" : 2,
|
"#count" : 2,
|
||||||
|
|
||||||
|
"album" : "funny meme album",
|
||||||
|
"album_id" : "CDilP",
|
||||||
|
"album_slug": "funny-meme-album",
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -125,6 +129,10 @@ __tests__ = (
|
|||||||
"#category": ("chevereto", "jpgfish", "album"),
|
"#category": ("chevereto", "jpgfish", "album"),
|
||||||
"#class" : chevereto.CheveretoAlbumExtractor,
|
"#class" : chevereto.CheveretoAlbumExtractor,
|
||||||
"#count" : 114,
|
"#count" : 114,
|
||||||
|
|
||||||
|
"album" : "Gunggingnsk OF",
|
||||||
|
"album_id" : "N9OOI",
|
||||||
|
"album_slug": "gunggingnsk",
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -132,6 +140,10 @@ __tests__ = (
|
|||||||
"#category": ("chevereto", "jpgfish", "album"),
|
"#category": ("chevereto", "jpgfish", "album"),
|
||||||
"#class" : chevereto.CheveretoAlbumExtractor,
|
"#class" : chevereto.CheveretoAlbumExtractor,
|
||||||
"#count" : 100,
|
"#count" : 100,
|
||||||
|
|
||||||
|
"album" : "101-200",
|
||||||
|
"album_id" : "aNJ6A",
|
||||||
|
"album_slug": "101-200",
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -139,6 +151,10 @@ __tests__ = (
|
|||||||
"#category": ("chevereto", "jpgfish", "album"),
|
"#category": ("chevereto", "jpgfish", "album"),
|
||||||
"#class" : chevereto.CheveretoAlbumExtractor,
|
"#class" : chevereto.CheveretoAlbumExtractor,
|
||||||
"#count" : 606,
|
"#count" : 606,
|
||||||
|
|
||||||
|
"album" : "re:([12345]0)?1-[123456]00",
|
||||||
|
"album_id" : str,
|
||||||
|
"album_slug": str,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user