Merge branch 'master' into da-extra-stash

This commit is contained in:
ClosedPort22
2022-12-10 21:34:37 +08:00
committed by GitHub
33 changed files with 283 additions and 285 deletions

View File

@@ -146,6 +146,7 @@ class _35photoTagExtractor(_35photoExtractor):
test = ("https://35photo.pro/tags/landscape/", {
"range": "1-25",
"count": 25,
"archive": False,
})
def __init__(self, match):

View File

@@ -92,8 +92,8 @@ class _8chanThreadExtractor(_8chanExtractor):
"uniquePosters": 9,
"usesCustomCss": True,
"usesCustomJs": False,
"wsPort": 8880,
"wssPort": 2087,
"?wsPort": 8880,
"?wssPort": 2087,
},
}),
("https://8chan.se/vhs/res/4.html"),

View File

@@ -74,7 +74,6 @@ modules = [
"keenspot",
"kemonoparty",
"khinsider",
"kissgoddess",
"kohlchan",
"komikcast",
"lightroom",

View File

@@ -915,20 +915,6 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
DeviantartStashExtractor.pattern),
"count": 2,
}),
# video
("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", {
"pattern": r"https://wixmp-.+wixmp.com/v/mp4/.+\.720p\.\w+.mp4",
"keyword": {
"filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5",
"extension": "mp4",
"target": {
"duration": 306,
"filesize": 19367585,
"quality": "720p",
"src": str,
},
}
}),
# journal
("https://www.deviantart.com/shimoda7/journal/ARTility-583755752", {
"url": "d34b2c9f873423e665a1b8ced20fcb75951694a3",

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2021 Mike Fährmann
# Copyright 2021-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -92,16 +92,29 @@ class EromeAlbumExtractor(EromeExtractor):
"""Extractor for albums on erome.com"""
subcategory = "album"
pattern = BASE_PATTERN + r"/a/(\w+)"
test = ("https://www.erome.com/a/TyFMI7ik", {
"pattern": r"https://s\d+\.erome\.com/\d+/TyFMI7ik/\w+",
"count": 9,
"keyword": {
"album_id": "TyFMI7ik",
"num": int,
"title": "Ryan Ryans",
"user": "xanub",
},
})
test = (
("https://www.erome.com/a/NQgdlWvk", {
"pattern": r"https://v\d+\.erome\.com/\d+"
r"/NQgdlWvk/j7jlzmYB_480p\.mp4",
"count": 1,
"keyword": {
"album_id": "NQgdlWvk",
"num": 1,
"title": "porn",
"user": "yYgWBZw8o8qsMzM",
},
}),
("https://www.erome.com/a/TdbZ4ogi", {
"pattern": r"https://s\d+\.erome\.com/\d+/TdbZ4ogi/\w+",
"count": 6,
"keyword": {
"album_id": "TdbZ4ogi",
"num": int,
"title": "82e78cfbb461ad87198f927fcb1fda9a1efac9ff.",
"user": "yYgWBZw8o8qsMzM",
},
}),
)
def albums(self):
return (self.item,)
@@ -110,7 +123,7 @@ class EromeAlbumExtractor(EromeExtractor):
class EromeUserExtractor(EromeExtractor):
subcategory = "user"
pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)"
test = ("https://www.erome.com/xanub", {
test = ("https://www.erome.com/yYgWBZw8o8qsMzM", {
"range": "1-25",
"count": 25,
})

View File

@@ -117,9 +117,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
r"|/s/([\da-f]{10})/(\d+)-(\d+))")
test = (
("https://exhentai.org/g/1200119/d55c44d3d0/", {
"options": (("original", False),),
"keyword": {
"cost": int,
"date": "dt:2018-03-18 20:15:00",
"date": "dt:2018-03-18 20:14:00",
"eh_category": "Non-H",
"expunged": False,
"favorites": r"re:^[12]\d$",
@@ -150,7 +151,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"uploader": "klorpa",
"width": int,
},
"content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
"content": ("2c68cff8a7ca540a78c36fdbf5fbae0260484f87",
"e9891a4c017ed0bb734cd1efba5cd03f594d31ff"),
}),
("https://exhentai.org/g/960461/4f0e369d82/", {
"exception": exception.NotFoundError,
@@ -159,9 +161,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"exception": exception.AuthorizationError,
}),
("https://exhentai.org/s/f68367b4c8/1200119-3", {
"options": (("original", False),),
"count": 2,
}),
("https://e-hentai.org/s/f68367b4c8/1200119-3", {
"options": (("original", False),),
"count": 2,
}),
("https://g.e-hentai.org/g/1200119/d55c44d3d0/"),
@@ -516,7 +520,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
data["gallery_token"] = gallery.group(3)
yield Message.Queue, url + "/", data
next_url = text.extr(page, 'nexturl = "', '"', None)
next_url = text.extr(page, 'nexturl="', '"', None)
if next_url is not None:
if not next_url:
return

View File

@@ -39,10 +39,6 @@ class FoolslideExtractor(BaseExtractor):
BASE_PATTERN = FoolslideExtractor.update({
"kireicake": {
"root": "https://reader.kireicake.com",
"pattern": r"reader\.kireicake\.com",
},
"powermanga": {
"root": "https://read.powermanga.org",
"pattern": r"read(?:er)?\.powermanga\.org",
@@ -64,10 +60,6 @@ class FoolslideChapterExtractor(FoolslideExtractor):
archive_fmt = "{id}"
pattern = BASE_PATTERN + r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
test = (
("https://reader.kireicake.com/read/wonderland/en/1/1/", {
"url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e",
"keyword": "9f80947920a325e33aea7f5cd69ea669171903b6",
}),
(("https://read.powermanga.org"
"/read/one_piece_digital_colour_comics/en/0/75/"), {
"url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384",
@@ -123,10 +115,6 @@ class FoolslideMangaExtractor(FoolslideExtractor):
categorytransfer = True
pattern = BASE_PATTERN + r"(/series/[^/?#]+)"
test = (
("https://reader.kireicake.com/series/wonderland/", {
"url": "d067b649af1cc88fa8c8b698fde04a10909fd169",
"keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb",
}),
(("https://read.powermanga.org"
"/series/one_piece_digital_colour_comics/"), {
"count": ">= 1",

View File

@@ -174,7 +174,8 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
test = (
("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
"content": "5c6ae9ee13e6d4bc9cb8bdce224c84e67fbfa36c",
"content": ("5c6ae9ee13e6d4bc9cb8bdce224c84e67fbfa36c",
"622e80be3f496672c44aab5c47fbc6941c61bc79"),
"pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
"count": 2,
}),

View File

@@ -200,7 +200,7 @@ class ImagetwistImageExtractor(ImagehostImageExtractor):
return self.request(self.page_url).cookies
def get_info(self, page):
url , pos = text.extract(page, 'center;"><img src="', '"')
url , pos = text.extract(page, '<img src="', '"')
filename, pos = text.extract(page, ' alt="', '"', pos)
return url, filename

View File

@@ -1,60 +1,73 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2019 Mike Fährmann
# Copyright 2015-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://imgth.com/"""
"""Extractors for https://imgth.com/"""
from .common import Extractor, Message
from .common import GalleryExtractor
from .. import text
class ImgthGalleryExtractor(Extractor):
class ImgthGalleryExtractor(GalleryExtractor):
"""Extractor for image galleries from imgth.com"""
category = "imgth"
subcategory = "gallery"
directory_fmt = ("{category}", "{gallery_id} {title}")
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
archive_fmt = "{gallery_id}_{num}"
pattern = r"(?:https?://)?imgth\.com/gallery/(\d+)"
test = ("http://imgth.com/gallery/37/wallpaper-anime", {
"url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",
"keyword": "6f8c00d6849ea89d1a028764675ec1fe9dbd87e2",
})
root = "https://imgth.com"
pattern = r"(?:https?://)?(?:www\.)?imgth\.com/gallery/(\d+)"
test = (
("https://imgth.com/gallery/37/wallpaper-anime", {
"url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",
"pattern": r"https://imgth\.com/images/2009/11/25"
r"/wallpaper-anime_\w+\.jpg",
"keyword": {
"count": 12,
"date": "dt:2009-11-25 18:21:00",
"extension": "jpg",
"filename": r"re:wallpaper-anime_\w+",
"gallery_id": 37,
"num": int,
"title": "Wallpaper anime",
"user": "celebrities",
},
}),
("https://www.imgth.com/gallery/37/wallpaper-anime"),
)
def __init__(self, match):
Extractor.__init__(self, match)
self.gid = match.group(1)
self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/"
self.gallery_id = gid = match.group(1)
url = "{}/gallery/{}/g/".format(self.root, gid)
GalleryExtractor.__init__(self, match, url)
def items(self):
page = self.request(self.url_base + "0").text
data = self.metadata(page)
yield Message.Directory, data
for data["num"], url in enumerate(self.images(page), 1):
yield Message.Url, url, text.nameext_from_url(url, data)
def metadata(self, page):
    """Return a dict with the gallery's metadata parsed from 'page'"""
    # 'extr' walks through the page front to back, so the calls below
    # have to stay in the order the values appear in the HTML
    extr = text.extract_from(page)
    title = text.unescape(extr("<h1>", "</h1>"))
    count = text.parse_int(extr("total of images in this gallery: ", " "))

    # Drop the ordinal suffix after the day number so the value fits the
    # '%d' directive (presumably "November 25th, 2009 at 18:21" - confirm
    # against a live page). "rd, " (3rd, 23rd) must be handled in addition
    # to "th, ", "nd, " and "st, ", otherwise those dates never parse.
    date = extr("created on ", " by <")
    for suffix in ("th, ", "nd, ", "st, ", "rd, "):
        date = date.replace(suffix, " ", 1)

    return {
        "gallery_id": text.parse_int(self.gallery_id),
        "title": title,
        "count": count,
        "date" : text.parse_datetime(date, "%B %d %Y at %H:%M"),
        "user" : text.unescape(extr(">", "<")),
    }
def images(self, page):
"""Yield all image urls for this gallery"""
pnum = 0
while True:
thumbs = text.extr(page, '<ul class="thumbnails">', '</ul>')
for url in text.extract_iter(thumbs, '<img src="', '"'):
yield "https://imgth.com/images" + url[24:]
path = url.partition("/thumbs/")[2]
yield ("{}/images/{}".format(self.root, path), None)
if '<li class="next">' not in page:
return
pnum += 1
page = self.request(self.url_base + str(pnum)).text
def metadata(self, page):
"""Collect metadata for extractor-job"""
return text.extract_all(page, (
("title", '<h1>', '</h1>'),
("count", 'total of images in this gallery: ', ' '),
("date" , 'created on ', ' by <'),
(None , 'href="/users/', ''),
("user" , '>', '<'),
), values={"gallery_id": self.gid})[0]
pnum += 1
url = "{}/gallery/{}/g/page/{}".format(
self.root, self.gallery_id, pnum)
page = self.request(url).text

View File

@@ -65,7 +65,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
"count": text.parse_int(extr("Number of Files: <b>", "<")),
"size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]),
"date" : extr("Date Added: <b>", "<"),
"type" : extr("Album type: <b>", "<"),
"type" : text.remove_html(extr("Album type: <b>", "</b>")),
}}
def tracks(self, page):

View File

@@ -1,82 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://kissgoddess.com/"""
from .common import GalleryExtractor, Extractor, Message
from .. import text, exception
class KissgoddessGalleryExtractor(GalleryExtractor):
    """Extractor for image galleries on kissgoddess.com"""
    category = "kissgoddess"
    root = "https://kissgoddess.com"
    pattern = r"(?:https?://)?(?:www\.)?kissgoddess\.com/album/(\d+)"
    test = ("https://kissgoddess.com/album/18285.html", {
        "pattern": r"https://pic\.kissgoddess\.com"
                   r"/gallery/16473/18285/s/\d+\.jpg",
        "count": 19,
        "keyword": {
            "gallery_id": 18285,
            "title": "[Young Champion Extra] 2016.02 No.03 菜乃花 安枝瞳 葉月あや",
        },
    })

    def __init__(self, match):
        """Store the album ID from 'match' and build the first page's URL"""
        self.gallery_id = match.group(1)
        url = "{}/album/{}.html".format(self.root, self.gallery_id)
        GalleryExtractor.__init__(self, match, url)

    def metadata(self, page):
        """Return a dict with the gallery ID and title of this album"""
        # text.extr() yields the extracted string itself; indexing it with
        # [0] would only keep its first character. The index belongs on
        # the rpartition() result instead: keep everything in front of
        # the last " | " separator of the <title> content.
        title = text.extr(page, '<title>', "<").rpartition(" | ")[0]
        return {
            "gallery_id": text.parse_int(self.gallery_id),
            "title"     : title,
        }

    def images(self, page):
        """Yield (url, None) tuples for the images of all album pages"""
        pnum = 1

        while page:
            # image URLs are taken from both 'src' and
            # 'data-original' attributes
            for url in text.extract_iter(page, "<img src='", "'"):
                yield url, None
            for url in text.extract_iter(page, "<img data-original='", "'"):
                yield url, None

            # follow-up pages are requested as '<id>_<pnum>.html';
            # an HTTP error on such a request ends the gallery
            pnum += 1
            url = "{}/album/{}_{}.html".format(
                self.root, self.gallery_id, pnum)
            try:
                page = self.request(url).text
            except exception.HttpError:
                return
class KissgoddessModelExtractor(Extractor):
    """Extractor that queues every album listed on a model's page"""
    category = "kissgoddess"
    subcategory = "model"
    root = "https://kissgoddess.com"
    pattern = r"(?:https?://)?(?:www\.)?kissgoddess\.com/people/([^./?#]+)"
    test = ("https://kissgoddess.com/people/aya-hazuki.html", {
        "pattern": KissgoddessGalleryExtractor.pattern,
        "count": ">= 7",
    })

    def __init__(self, match):
        """Remember the model name captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.model = match.group(1)

    def items(self):
        """Yield one Queue message per album link found on the model page"""
        model_page = self.request(
            "{}/people/{}.html".format(self.root, self.model)).text

        # every queued URL is handed to the gallery extractor
        queue_data = {"_extractor": KissgoddessGalleryExtractor}
        album_root = self.root + "/album/"

        for album_path in text.extract_iter(
                model_page, 'thumb"><a href="/album/', '"'):
            yield Message.Queue, album_root + album_path, queue_data

View File

@@ -6,19 +6,19 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://komikcast.me/"""
"""Extractors for https://komikcast.site/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:me|com)"
BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)"
class KomikcastBase():
"""Base class for komikcast extractors"""
category = "komikcast"
root = "https://komikcast.me"
root = "https://komikcast.site"
@staticmethod
def parse_chapter_string(chapter_string, data=None):
@@ -46,23 +46,23 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
"""Extractor for manga-chapters from komikcast.me"""
"""Extractor for manga-chapters from komikcast.site"""
pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
test = (
(("https://komikcast.me/chapter"
(("https://komikcast.site/chapter"
"/apotheosis-chapter-02-2-bahasa-indonesia/"), {
"url": "74eca5c9b27b896816497f9b2d847f2a1fcfc209",
"url": "f6b43fbc027697749b3ea1c14931c83f878d7936",
"keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4",
}),
(("https://komikcast.me/chapter"
"/soul-land-ii-chapter-300-1-bahasa-indonesia/"), {
"url": "243a5250e210b40d17217e83b7547cefea5638bd",
"url": "efd00a9bd95461272d51990d7bc54b79ff3ff2e6",
"keyword": "cb646cfed3d45105bd645ab38b2e9f7d8c436436",
}),
)
def metadata(self, page):
info = text.extr(page, "<title>", " Komikcast<")
info = text.extr(page, "<title>", " - Komikcast<")
return self.parse_chapter_string(info)
@staticmethod
@@ -76,12 +76,12 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
"""Extractor for manga from komikcast.me"""
"""Extractor for manga from komikcast.site"""
chapterclass = KomikcastChapterExtractor
pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
test = (
("https://komikcast.me/komik/090-eko-to-issho/", {
"url": "08204f0a703ec5272121abcf0632ecacba1e588f",
("https://komikcast.site/komik/090-eko-to-issho/", {
"url": "19d3d50d532e84be6280a3d61ff0fd0ca04dd6b4",
"keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1",
}),
("https://komikcast.me/tonari-no-kashiwagi-san/"),
@@ -101,7 +101,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
@staticmethod
def metadata(page):
"""Return a dict with general metadata"""
manga , pos = text.extract(page, "<title>" , " Komikcast<")
manga , pos = text.extract(page, "<title>" , " - Komikcast<")
genres, pos = text.extract(
page, 'class="komik_info-content-genre">', "</span>", pos)
author, pos = text.extract(page, ">Author:", "</span>", pos)

View File

@@ -109,7 +109,7 @@ class MangadexChapterExtractor(MangadexExtractor):
}),
# 'externalUrl', but still downloadable (#2503)
("https://mangadex.org/chapter/364728a4-6909-4164-9eea-6b56354f7c78", {
"count": 39,
"count": 0, # 404
}),
)

View File

@@ -72,7 +72,7 @@ class RedgifsUserExtractor(RedgifsExtractor):
pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?#]+)"
test = ("https://www.redgifs.com/users/Natalifiction", {
"pattern": r"https://\w+\.redgifs\.com/[A-Za-z]+\.mp4",
"count": ">= 120",
"count": ">= 100",
})
def metadata(self):
@@ -89,7 +89,7 @@ class RedgifsSearchExtractor(RedgifsExtractor):
pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/browse/?\?([^#]+)"
test = (
("https://www.redgifs.com/browse?tags=JAV", {
"pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.mp4",
"pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)",
"range": "1-10",
"count": 10,
}),

View File

@@ -43,7 +43,8 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
}),
("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", {
"range": "34",
"content": ("52b5a310587de1048030ab13a912f6a3a9cc7dab",
"content": ("276eb2c902187bb177ae8013e310e1d6641fba9a",
"52b5a310587de1048030ab13a912f6a3a9cc7dab",
"cec6630e659dc72db1ee1a9a6f3b525189261988",
"6f81e1e74c6cd6db36844e7211eef8e7cd30055d",
"22e83645fc242bc3584eca7ec982c8a53a4d8a44"),

View File

@@ -117,7 +117,7 @@ class SmugmugImageExtractor(SmugmugExtractor):
# video
("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
"url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee",
"keyword": "4cef98133ace511adc874c9d9abac5817ba0d856",
"keyword": "2b545184592c282b365fcbb7df6ca7952b8a3173",
}),
)

View File

@@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor):
"tag_ids": list,
"tags": list,
"thumbnails_generated": True,
"updated_at": "2022-09-21T14:31:50.441Z",
"updated_at": "2022-11-27T00:34:50.483Z",
"upvotes": int,
"view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png",
"width": 576,

View File

@@ -633,7 +633,7 @@ class TwitterEventExtractor(TwitterExtractor):
pattern = BASE_PATTERN + r"/i/events/(\d+)"
test = ("https://twitter.com/i/events/1484669206993903616", {
"range": "1-20",
"count": ">5",
"count": ">=1",
})
def metadata(self):
@@ -759,7 +759,7 @@ class TwitterTweetExtractor(TwitterExtractor):
# retweet with missing media entities (#1555)
("https://twitter.com/morino_ya/status/1392763691599237121", {
"options": (("retweets", True),),
"count": 4,
"count": 0, # private
}),
# deleted quote tweet (#2225)
("https://twitter.com/i/web/status/1460044411165888515", {
@@ -782,7 +782,7 @@ class TwitterTweetExtractor(TwitterExtractor):
# '?format=...&name=...'-style URLs
("https://twitter.com/poco_dandy/status/1150646424461176832", {
"options": (("cards", True),),
"pattern": r"https://pbs.twimg.com/card_img/157\d+/\w+"
"pattern": r"https://pbs.twimg.com/card_img/157\d+/[\w-]+"
r"\?format=(jpg|png)&name=orig$",
"range": "1-2",
}),

View File

@@ -78,11 +78,11 @@ class UnsplashImageExtractor(UnsplashExtractor):
pattern = BASE_PATTERN + r"/photos/([^/?#]+)"
test = ("https://unsplash.com/photos/lsoogGC_5dg", {
"pattern": r"https://images\.unsplash\.com/photo-1586348943529-"
r"beaae6c28db9\?ixid=\w+&ixlib=rb-1.2.1",
r"beaae6c28db9\?ixid=\w+&ixlib=rb-4.0.3",
"keyword": {
"alt_description": "re:silhouette of trees near body of water ",
"blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz",
"categories": list,
"? categories": list,
"color": "#f3c08c",
"created_at": "2020-04-08T12:29:42Z",
"date": "dt:2020-04-08 12:29:42",
@@ -108,9 +108,8 @@ class UnsplashImageExtractor(UnsplashExtractor):
"name": "Beaver Dam, WI 53916, USA",
"position": {
"latitude": 43.457769,
"longitude": -88.837329
"longitude": -88.837329,
},
"title": "Beaver Dam, WI 53916, USA"
},
"promoted_at": "2020-04-08T15:12:03Z",
"sponsorship": None,
@@ -149,7 +148,7 @@ class UnsplashUserExtractor(UnsplashExtractor):
pattern = BASE_PATTERN + r"/@(\w+)/?$"
test = ("https://unsplash.com/@davehoefler", {
"pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$",
"range": "1-30",
"count": 30,
})
@@ -166,7 +165,7 @@ class UnsplashFavoriteExtractor(UnsplashExtractor):
pattern = BASE_PATTERN + r"/@(\w+)/likes"
test = ("https://unsplash.com/@davehoefler/likes", {
"pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$",
"range": "1-30",
"count": 30,
})
@@ -184,7 +183,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor):
test = (
("https://unsplash.com/collections/3178572/winter", {
"pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$",
"keyword": {"collection_id": "3178572",
"collection_title": "winter"},
"range": "1-30",
@@ -212,8 +211,9 @@ class UnsplashSearchExtractor(UnsplashExtractor):
subcategory = "search"
pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?"
test = ("https://unsplash.com/s/photos/hair-style", {
"pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
"pattern": r"https://(images|plus)\.unsplash\.com"
r"/((flagged/|premium_)?photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$",
"range": "1-30",
"count": 30,
})

View File

@@ -1,21 +1,22 @@
# -*- coding: utf-8 -*-
# Copyright 2017-2019 Mike Fährmann
# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://warosu.org/"""
"""Extractors for https://warosu.org/"""
from .common import Extractor, Message
from .. import text
class WarosuThreadExtractor(Extractor):
"""Extractor for images from threads on warosu.org"""
"""Extractor for threads on warosu.org"""
category = "warosu"
subcategory = "thread"
root = "https://warosu.org"
directory_fmt = ("{category}", "{board}", "{thread} - {title}")
filename_fmt = "{tim}-{filename}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
@@ -31,7 +32,6 @@ class WarosuThreadExtractor(Extractor):
"content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c",
}),
)
root = "https://warosu.org"
def __init__(self, match):
Extractor.__init__(self, match)
@@ -40,12 +40,12 @@ class WarosuThreadExtractor(Extractor):
def items(self):
url = "{}/{}/thread/{}".format(self.root, self.board, self.thread)
page = self.request(url).text
data = self.get_metadata(page)
data = self.metadata(page)
posts = self.posts(page)
if not data["title"]:
title = text.remove_html(posts[0]["com"])
data["title"] = text.unescape(title)[:50]
data["title"] = text.unescape(text.remove_html(
posts[0]["com"]))[:50]
yield Message.Directory, data
for post in posts:
@@ -55,25 +55,24 @@ class WarosuThreadExtractor(Extractor):
post.update(data)
yield Message.Url, post["image"], post
def get_metadata(self, page):
"""Collect metadata for extractor-job"""
def metadata(self, page):
boardname = text.extr(page, "<title>", "</title>")
title = text.extr(page, 'filetitle" itemprop="name">', '<')
return {
"board": self.board,
"board" : self.board,
"board_name": boardname.rpartition(" - ")[2],
"thread": self.thread,
"title": title,
"thread" : self.thread,
"title" : title,
}
def posts(self, page):
"""Build a list of all post-objects"""
"""Build a list of all post objects"""
page = text.extr(page, '<div class="content">', '<table>')
needle = '<table itemscope itemtype="http://schema.org/Comment">'
return [self.parse(post) for post in page.split(needle)]
def parse(self, post):
"""Build post-object by extracting data from an HTML post"""
"""Build post object by extracting data from an HTML post"""
data = self._extract_post(post)
if "<span>File:" in post:
self._extract_image(post, data)
@@ -84,24 +83,23 @@ class WarosuThreadExtractor(Extractor):
@staticmethod
def _extract_post(post):
data = text.extract_all(post, (
("no" , 'id="p', '"'),
("name", '<span itemprop="name">', '</span>'),
("time", '<span class="posttime" title="', '000">'),
("now" , '', '<'),
("com" , '<blockquote><p itemprop="text">', '</p></blockquote>'),
))[0]
data["com"] = text.unescape(text.remove_html(data["com"].strip()))
return data
extr = text.extract_from(post)
return {
"no" : extr('id="p', '"'),
"name": extr('<span itemprop="name">', "</span>"),
"time": extr('<span class="posttime" title="', '000">'),
"now" : extr("", "<"),
"com" : text.unescape(text.remove_html(extr(
'<blockquote><p itemprop="text">', '</p></blockquote>'
).strip())),
}
@staticmethod
def _extract_image(post, data):
text.extract_all(post, (
("fsize" , '<span>File: ', ', '),
("w" , '', 'x'),
("h" , '', ', '),
("filename", '', '<'),
("image" , '<br />\n<a href="', '"'),
), 0, data)
data["filename"] = text.unquote(data["filename"].rpartition(".")[0])
data["image"] = "https:" + data["image"]
extr = text.extract_from(post)
data["fsize"] = extr("<span>File: ", ", ")
data["w"] = extr("", "x")
data["h"] = extr("", ", ")
data["filename"] = text.unquote(extr("", "<").rpartition(".")[0])
extr("<br />", "")
data["image"] = "https:" + extr('<a href="', '"')

View File

@@ -57,6 +57,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
}),
(("https://www.webtoons.com/en/challenge/punderworld"
"/happy-earth-day-/viewer?title_no=312584&episode_no=40"), {
"exception": exception.NotFoundError,
"keyword": {
"comic": "punderworld",
"description": str,