[motherless] add 'media' and 'gallery' extractors
(#2074, #4413, #6221)
This commit is contained in:
@@ -607,6 +607,12 @@ Consider all listed sites to potentially be NSFW.
|
||||
<td>Albums, Channels</td>
|
||||
<td>Supported</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Motherless</td>
|
||||
<td>https://motherless.com/</td>
|
||||
<td>Galleries, Media Files</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>My Hentai Gallery</td>
|
||||
<td>https://myhentaigallery.com/</td>
|
||||
|
||||
167
gallery_dl/extractor/motherless.py
Normal file
167
gallery_dl/extractor/motherless.py
Normal file
@@ -0,0 +1,167 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2024 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extractors for https://motherless.com/"""
|
||||
|
||||
from .common import Extractor, Message
|
||||
from .. import text, util
|
||||
from ..cache import memcache
|
||||
from datetime import timedelta
|
||||
|
||||
# Regex prefix shared by the extractor patterns in this module:
# an optional http(s) scheme followed by the bare "motherless.com" host.
BASE_PATTERN = r"(?:https?://)?motherless\.com"
|
||||
|
||||
|
||||
class MotherlessExtractor(Extractor):
    """Base class for motherless extractors"""
    # Site identity shared by every subclass; 'root' is the prefix the
    # subclasses use when building request URLs.
    root = "https://motherless.com"
    category = "motherless"

    # Default format strings, keyed on per-file metadata fields.
    # The gallery extractor replaces 'archive_fmt' with its own.
    archive_fmt = "{id}"
    filename_fmt = "{id} {title}.{extension}"
|
||||
|
||||
|
||||
class MotherlessMediaExtractor(MotherlessExtractor):
    """Extractor for a single image/video from motherless.com"""
    subcategory = "media"
    # Group 1 is the path after the host: an optional container prefix
    # ("g/<group>/" or "G[I|V]<gallery id>/") followed by the media code.
    # The "(?!G)" lookahead stops a bare gallery URL ("/G...") from
    # being taken for a media code.
    pattern = (BASE_PATTERN +
               r"/((?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?"
               r"(?!G)[A-Z0-9]+)")
    example = "https://motherless.com/ABC123"

    def items(self):
        """Yield the directory and download messages for the media file."""
        file = self._extract_media(self.groups[0])
        url = file["url"]
        yield Message.Directory, file
        yield Message.Url, url, text.nameext_from_url(url, file)

    def _extract_media(self, path):
        """Fetch the media page for 'path' and scrape its metadata.

        'path' is the URL path matched by the class pattern, i.e. the
        media code with an optional group or gallery prefix.

        Returns a dict with the keys 'id', 'type', 'group', 'url', 'tags',
        'title', 'views', 'favorites', 'date' and 'uploader'. When the
        path carries a "G..." gallery prefix, 'gallery_id' and
        'gallery_title' are added as well.
        """
        url = self.root + "/" + path
        page = self.request(url).text
        extr = text.extract_from(page)

        # Split "<prefix>/<media code>"; 'path' is "" without a prefix.
        path, _, media_id = path.rpartition("/")

        # NOTE: keep the field order as is. The lookups are written in
        # page order, and 'extr' presumably continues from the end of its
        # previous match (confirm in text.extract_from). The three
        # consecutive 'class="count">' lookups rely on that.
        data = {
            "id"   : media_id,
            "type" : extr("__mediatype = '", "'"),
            "group": extr("__group = '", "'"),
            "url"  : extr("__fileurl = '", "'"),
            "tags" : [
                text.unescape(tag)
                for tag in text.extract_iter(
                    extr('class="media-meta-tags">', "</div>"), ">#", "<")
            ],
            "title": text.unescape(extr("<h1>", "<")),
            "views": text.parse_int(extr(
                'class="count">', " ").replace(",", "")),
            "favorites": text.parse_int(extr(
                'class="count">', " ").replace(",", "")),
            "date" : self._parse_datetime(extr('class="count">', "<")),
            "uploader": text.unescape(extr('class="username">', "<").strip()),
        }

        if path and path[0] == "G":
            gallery_id = path[1:]
            # The pattern also accepts "GI<id>/" and "GV<id>/" prefixes.
            # Drop that type marker so 'gallery_id' matches the ID that
            # MotherlessGalleryExtractor reports for the same gallery.
            if len(gallery_id) > 1 and gallery_id[0] in "IV":
                gallery_id = gallery_id[1:]
            data["gallery_id"] = gallery_id
            data["gallery_title"] = self._extract_gallery_title(
                page, gallery_id)

        return data

    def _parse_datetime(self, dt):
        """Convert the upload-date text of a media page into a datetime.

        Text without " ago" is parsed as an absolute date in
        "%d %b %Y" form. Relative text is read as "<N><unit> ago", where
        unit "h" means hours and any other unit is treated as days; the
        offset is subtracted from the current UTC time and the result is
        truncated to midnight.
        """
        if " ago" not in dt:
            return text.parse_datetime(dt, "%d %b %Y")

        # dt[-5] is the unit character in front of " ago";
        # everything before it is the numeric amount.
        value = text.parse_int(dt[:-5])
        delta = timedelta(0, value*3600) if dt[-5] == "h" else timedelta(value)
        # Zero the microseconds too, otherwise the sub-second part of the
        # current time leaks into the "midnight" value.
        return (util.datetime_utcnow() - delta).replace(
            hour=0, minute=0, second=0, microsecond=0)

    # keyarg=2 presumably keys the cache on 'gallery_id' (positional
    # index 2, counting 'self'); confirm against cache.memcache.
    @memcache(keyarg=2)
    def _extract_gallery_title(self, page, gallery_id):
        """Return the title of gallery 'gallery_id' as found in 'page'.

        Looks for a "From the gallery:" label inside the content-title
        heading first, then for the 'title' attribute following a link
        to "/G<gallery_id>". Returns "" when neither is present.
        """
        title = text.extr(
            text.extr(page, '<h1 class="content-title">', "</h1>"),
            "From the gallery:", "<")
        if title:
            return text.unescape(title.strip())

        pos = page.find(' href="/G' + gallery_id + '"')
        if pos >= 0:
            return text.unescape(text.extract(
                page, ' title="', '"', pos)[0])

        return ""
|
||||
|
||||
|
||||
class MotherlessGalleryExtractor(MotherlessExtractor):
    """Extractor for a motherless.com gallery"""
    subcategory = "gallery"
    directory_fmt = ("{category}", "{uploader}",
                     "{gallery_id} {gallery_title}")
    archive_fmt = "{gallery_id}_{id}"
    # Group 1: optional type marker after the leading "G";
    # group 2: the gallery ID.
    pattern = BASE_PATTERN + r"/G([IVG])?([A-Z0-9]+)/?$"
    example = "https://motherless.com/GABC123"

    def items(self):
        """Yield messages for every file of the gallery.

        A URL without a type marker is not scraped directly. It is
        queued twice instead, once as "/GI<id>" and once as "/GV<id>",
        and each queued URL is handled by this extractor again.
        """
        media_type, gid = self.groups

        if not media_type:
            data = {"_extractor": MotherlessGalleryExtractor}
            yield Message.Queue, self.root + "/GI" + gid, data
            yield Message.Queue, self.root + "/GV" + gid, data
            return

        url = "{}/G{}{}".format(self.root, media_type, gid)
        page = self.request(url).text
        data = self._extract_gallery_data(page)

        for num, thumb in enumerate(self._pagination(page), 1):
            file = self._parse_thumb_data(thumb)
            file.update(data)
            file["num"] = num
            url = file["url"]
            yield Message.Directory, file
            yield Message.Url, url, text.nameext_from_url(url, file)

    def _pagination(self, page):
        """Yield the thumbnail HTML snippets of 'page' and all later pages.

        Following pages are located through the '<link rel="next">'
        element; iteration stops at the first page without one.
        """
        while True:
            for thumb in text.extract_iter(
                    page, 'class="thumb-container', "</div>"):
                yield thumb

            url = text.extr(page, '<link rel="next" href="', '"')
            if not url:
                return
            page = self.request(text.unescape(url)).text

    def _extract_gallery_data(self, page):
        """Scrape the gallery-wide metadata from the first gallery page.

        Returns a dict with 'gallery_id', 'gallery_title', 'uploader'
        and 'count'.
        """
        extr = text.extract_from(page)
        # NOTE: keep the field order as is. The lookups are written in
        # page order, and 'extr' presumably continues from the end of its
        # previous match (confirm in text.extract_from).
        return {
            "gallery_id": self.groups[-1],
            # <title> text up to its last " | " separator
            "gallery_title": text.unescape(extr(
                "<title>", "<").rpartition(" | ")[0]),
            "uploader": text.remove_html(extr(
                'class="gallery-member-username">', "</")),
            # number between the last "(" and the ")" that closes the
            # active-tab text, with thousands separators removed
            "count": text.parse_int(
                extr('<span class="active">', ")")
                .rpartition("(")[2].replace(",", "")),
        }

    def _parse_thumb_data(self, thumb):
        """Build the metadata dict for one thumbnail snippet.

        Returns a dict with 'id', 'type', 'thumbnail', 'title' and
        'url', where 'url' is derived from the thumbnail URL.
        """
        extr = text.extract_from(thumb)
        data = {
            "id"       : extr('data-codename="', '"'),
            "type"     : extr('data-mediatype="', '"'),
            "thumbnail": extr('class="static" src="', '"'),
            # The title comes from an HTML attribute, so decode its
            # entities like the media extractor does for its text fields.
            "title"    : text.unescape(extr(' alt="', '"')),
        }

        media_type = data["type"]
        # Swapping "thumb" for the media type turns e.g.
        # ".../cdn5-thumbs.../thumbs/X" into ".../cdn5-images.../images/X".
        url = data["thumbnail"].replace("thumb", media_type)
        if media_type == "video":
            # Videos get "<id>.mp4" as file name instead of the
            # thumbnail's own file name.
            url = "{}/{}.mp4".format(url.rpartition("/")[0], data["id"])
        data["url"] = url

        return data
|
||||
127
test/results/motherless.py
Normal file
127
test/results/motherless.py
Normal file
@@ -0,0 +1,127 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
from gallery_dl.extractor import motherless
|
||||
|
||||
|
||||
# Expected results for the motherless extractors. Each entry pairs an input
# "#url" with the extractor class that should handle it; the remaining keys
# list the URLs and metadata values that extractor is expected to produce.
__tests__ = (
{
    "#url"  : "https://motherless.com/B0168DB",
    "#class": motherless.MotherlessMediaExtractor,
    "#urls" : "https://cdn5-images.motherlessmedia.com/images/B0168DB.jpg",
    "#sha1_content": "10629fc5dd7a9623af7dd57f1a322d0f24ac9acc",

    "date"     : "dt:2013-03-29 00:00:00",
    "extension": "jpg",
    "favorites": range(0, 10),
    "filename" : "B0168DB",
    "group"    : "",
    "id"       : "B0168DB",
    "tags"     : [
        "Lady J",
        "outdoor",
        "closeup. face"
    ],
    "title"    : "388652199_d6fc8a9515_o.jpg",
    "type"     : "image",
    "uploader" : "anonymous",
    "url"      : "https://cdn5-images.motherlessmedia.com/images/B0168DB.jpg",
    "views"    : range(90, 200),

},

{
    "#url"  : "https://motherless.com/G43D8704/F0C07D3",
    "#class": motherless.MotherlessMediaExtractor,
    "#urls" : "https://cdn5-images.motherlessmedia.com/images/F0C07D3.jpg",

    "date"         : "dt:2014-08-13 00:00:00",
    "extension"    : "jpg",
    "favorites"    : range(100, 200),
    "filename"     : "F0C07D3",
    "gallery_id"   : "43D8704",
    "gallery_title": "SpeechLess",
    "group"        : "",
    "id"           : "F0C07D3",
    "tags"         : [],
    "title"        : "Spunky Angels Amy Black Dress",
    "type"         : "image",
    "uploader"     : "jonesyjonesy",
    "url"          : "https://cdn5-images.motherlessmedia.com/images/F0C07D3.jpg",
    "views"        : range(14000, 20000),
},

{
    "#url"  : "https://motherless.com/g/classic_porn/19D6C80",
    "#class": motherless.MotherlessMediaExtractor,
    "#urls" : "https://cdn5-images.motherlessmedia.com/images/19D6C80.gif",

    "date"     : "dt:2021-05-11 00:00:00",
    "extension": "gif",
    "favorites": range(10, 50),
    "filename" : "19D6C80",
    "group"    : "classic_porn",
    "id"       : "19D6C80",
    "tags"     : [],
    "title"    : "Kaffee 1",
    "type"     : "image",
    "uploader" : "KurtRitter",
    "url"      : "https://cdn5-images.motherlessmedia.com/images/19D6C80.gif",
    "views"    : range(150000, 300000),
},

{
    "#url"  : "https://motherless.com/G43D8704",
    "#class": motherless.MotherlessGalleryExtractor,
    "#urls": (
        "https://motherless.com/GI43D8704",
        "https://motherless.com/GV43D8704",
    ),
},

{
    "#url"  : "https://motherless.com/GI43D8704",
    "#class": motherless.MotherlessGalleryExtractor,
    "#pattern": r"https://cdn5-images\.motherlessmedia\.com/images/\w+\.(jpg|png|gif)",
    "#range"  : "1-100",
    "#count"  : 100,

    "count"        : 6503,
    "extension"    : {"jpg", "png", "gif"},
    "filename"     : str,
    "gallery_id"   : "43D8704",
    "gallery_title": "SpeechLess",
    "id"           : str,
    "num"          : int,
    "thumbnail"    : r"re:https://cdn5-thumbs\.motherlessmedia\.com/thumbs/\w+\.\w+",
    "title"        : str,
    "type"         : "image",
    "uploader"     : "gaylobe",
    "url"          : r"re:https://cdn5-images\.motherlessmedia\.com/images/\w+\.(jpg|png|gif)",
},

{
    "#url"  : "https://motherless.com/GV43D8704",
    "#class": motherless.MotherlessGalleryExtractor,
    # Host dots are escaped so they match a literal "." only,
    # like in the image patterns of the previous entry.
    "#pattern": r"https://cdn5-videos\.motherlessmedia\.com/videos/\w+\.mp4",
    "#range"  : "1-100",
    "#count"  : 100,

    "count"        : 869,
    "extension"    : "mp4",
    "filename"     : str,
    "gallery_id"   : "43D8704",
    "gallery_title": "SpeechLess",
    "id"           : str,
    "num"          : int,
    "thumbnail"    : r"re:https://cdn5-thumbs\.motherlessmedia\.com/thumbs/[\w-]+\.\w+",
    "title"        : str,
    "type"         : "video",
    "uploader"     : "gaylobe",
    "url"          : r"re:https://cdn5-videos\.motherlessmedia\.com/videos/\w+\.mp4",
},

)
|
||||
Reference in New Issue
Block a user