From fe19e233f342f03103e9666596c22fdb257da564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 13 Dec 2019 19:03:56 +0100 Subject: [PATCH] [xvideos] improve MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - derive from GalleryExtractor - match '…-channels' URLs - "better" metadata structure --- gallery_dl/extractor/xvideos.py | 124 ++++++++++++++++---------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index e253b7f6..80a3614a 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -6,86 +6,91 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.xvideos.com/""" +"""Extractors for https://www.xvideos.com/""" -from .common import Extractor, Message -from .. import text, exception +from .common import GalleryExtractor, Extractor, Message +from .. import text import json -class XvideosExtractor(Extractor): +class XvideosBase(): """Base class for xvideos extractors""" category = "xvideos" root = "https://www.xvideos.com" -class XvideosGalleryExtractor(XvideosExtractor): - """Extractor for user profile galleries from xvideos.com""" +class XvideosGalleryExtractor(XvideosBase, GalleryExtractor): + """Extractor for user profile galleries on xvideos.com""" subcategory = "gallery" - directory_fmt = ("{category}", "{user[name]}", "{title}") - filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" - archive_fmt = "{gallery_id}_{num}" + directory_fmt = ("{category}", "{user[name]}", + "{gallery[id]} {gallery[title]}") + filename_fmt = "{category}_{gallery[id]}_{num:>03}.{extension}" + archive_fmt = "{gallery[id]}_{num}" pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" - r"/profiles/([^/?&#]+)/photos/(\d+)") + r"/(?:profiles|amateur-channels|model-channels)" + r"/([^/?&#]+)/photos/(\d+)") test = ( - (("https://www.xvideos.com/profiles" - "/pervertedcouple/photos/751031/random_stuff"), { + ("https://www.xvideos.com/profiles/pervertedcouple/photos/751031", { "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7", - "keyword": "65979d63a69576cf692b41d5fbbd995cc40a51b9", - }), - ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", { - "exception": exception.NotFoundError, + "keyword": { + "gallery": { + "id" : 751031, + "title": "Random Stuff", + "tags" : list, + }, + "user": { + "id" : 20245371, + "name" : "pervertedcouple", + "display" : "Pervertedcouple", + "sex" : "Woman", + "description": str, + }, + }, }), + ("https://www.xvideos.com/amateur-channels/pervertedcouple/photos/12"), + ("https://www.xvideos.com/model-channels/pervertedcouple/photos/12"), ) def __init__(self, match): - XvideosExtractor.__init__(self, match) - self.user, self.gid = match.groups() + self.user, self.gallery_id = match.groups() + url = "{}/profiles/{}/photos/{}".format( + self.root, self.user, self.gallery_id) + GalleryExtractor.__init__(self, match, url) - def items(self): - url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid) - page = self.request(url, notfound=self.subcategory).text - data = self.get_metadata(page) - imgs = self.get_images(page) - data["count"] = len(imgs) - yield Message.Version, 1 - yield Message.Directory, data - for url in imgs: - data["num"] = text.parse_int(url.rsplit("_", 2)[1]) - data["extension"] = url.rpartition(".")[2] - yield Message.Url, url, data - - def get_metadata(self, page): - """Collect metadata for extractor-job""" - data = text.extract_all(page, ( - ("userid" , '"id_user":', ','), - ("display", '"display":"', '"'), - ("title" , '"title":"', '"'), - ("descr" , '', ''), - ("tags" , 'Tagged:', '<'), - ))[0] + def metadata(self, page): + extr = text.extract_from(page) + user = { + "id" : text.parse_int(extr('"id_user":', ',')), + "display": extr('"display":"', '"'), + "sex" : extr('"sex":"', '"'), + "name" : self.user, + } + title = extr('"title":"', '"') + user["description"] = extr( + '', '').strip() + tags = extr('Tagged:', '<').strip() return { - "user": { - "id": text.parse_int(data["userid"]), - "name": self.user, - "display": data["display"], - "description": data["descr"].strip(), + "user": user, + "gallery": { + "id" : text.parse_int(self.gallery_id), + "title": text.unescape(title), + "tags" : text.unescape(tags).split(", ") if tags else [], }, - "tags": text.unescape(data["tags"] or "").strip().split(", "), - "title": text.unescape(data["title"]), - "gallery_id": text.parse_int(self.gid), } @staticmethod - def get_images(page): + def images(page): """Return a list of all image urls for this gallery""" - return list(text.extract_iter( - page, '