* [arca.live] Add extractor skeleton
* [arcalive] update names and formatting
* [arcalive] implement initial file extraction code
* [arcalive] improve '_extract_media()' performance
  compile and cache regex on demand
* [arcalive] improve image extraction
  - extract 'data-originalurl' URLs if available
  - replace URL query strings with 'type=orig'
  - ignore emoticons by default
* [arcalive] update defaults
  - include 'title' in filenames
  - use 0.5-1.5s delay between requests
* [arcalive] use ext from 'data-orig' if available
* [arcalive] update docs/supportedsites
* [arcalive] add tests
* [arcalive] update 'board' extractor pattern so it doesn't also match 'post' URLs
---------
Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
This commit is contained in:
@@ -384,6 +384,7 @@ Type
|
|||||||
Default
|
Default
|
||||||
* ``"0.5-1.5"``
|
* ``"0.5-1.5"``
|
||||||
``ao3``,
|
``ao3``,
|
||||||
|
``arcalive``,
|
||||||
``civitai``,
|
``civitai``,
|
||||||
``[Danbooru]``,
|
``[Danbooru]``,
|
||||||
``[E621]``,
|
``[E621]``,
|
||||||
@@ -1394,6 +1395,16 @@ Description
|
|||||||
Format(s) to download.
|
Format(s) to download.
|
||||||
|
|
||||||
|
|
||||||
|
extractor.arcalive.emoticons
|
||||||
|
----------------------------
|
||||||
|
Type
|
||||||
|
``bool``
|
||||||
|
Default
|
||||||
|
``false``
|
||||||
|
Description
|
||||||
|
Download emoticon images.
|
||||||
|
|
||||||
|
|
||||||
extractor.artstation.external
|
extractor.artstation.external
|
||||||
-----------------------------
|
-----------------------------
|
||||||
Type
|
Type
|
||||||
|
|||||||
@@ -99,6 +99,12 @@
|
|||||||
|
|
||||||
"formats": ["pdf"]
|
"formats": ["pdf"]
|
||||||
},
|
},
|
||||||
|
"arcalive":
|
||||||
|
{
|
||||||
|
"sleep-request": "0.5-1.5",
|
||||||
|
|
||||||
|
"emoticons": false
|
||||||
|
},
|
||||||
"artstation":
|
"artstation":
|
||||||
{
|
{
|
||||||
"external" : false,
|
"external" : false,
|
||||||
|
|||||||
@@ -97,6 +97,12 @@ Consider all listed sites to potentially be NSFW.
|
|||||||
<td>Posts, Tag Searches</td>
|
<td>Posts, Tag Searches</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Arcalive</td>
|
||||||
|
<td>https://arca.live/</td>
|
||||||
|
<td>Boards, Posts</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Architizer</td>
|
<td>Architizer</td>
|
||||||
<td>https://architizer.com/</td>
|
<td>https://architizer.com/</td>
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ modules = [
|
|||||||
"adultempire",
|
"adultempire",
|
||||||
"agnph",
|
"agnph",
|
||||||
"ao3",
|
"ao3",
|
||||||
|
"arcalive",
|
||||||
"architizer",
|
"architizer",
|
||||||
"artstation",
|
"artstation",
|
||||||
"aryion",
|
"aryion",
|
||||||
|
|||||||
157
gallery_dl/extractor/arcalive.py
Normal file
157
gallery_dl/extractor/arcalive.py
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
"""Extractors for https://arca.live/"""
|
||||||
|
|
||||||
|
from .common import Extractor, Message
|
||||||
|
from .. import text, util, exception
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Regex prefix shared by the extractor patterns in this module:
# matches "arca.live" with an optional "http(s)://" scheme and "www." prefix.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?arca\.live"
|
||||||
|
|
||||||
|
|
||||||
|
class ArcaliveExtractor(Extractor):
    """Common base for all arca.live extractors"""
    root = "https://arca.live"
    category = "arcalive"
    # 0.5-1.5 second delay between requests
    # (presumably read by the base Extractor as a (min, max) range)
    request_interval = (0.5, 1.5)

    def _init(self):
        """Create the API client used by the subclasses as 'self.api'"""
        client = ArcaliveAPI(self)
        self.api = client
|
||||||
|
|
||||||
|
|
||||||
|
class ArcalivePostExtractor(ArcaliveExtractor):
    """Extractor for an arca.live post"""
    subcategory = "post"
    directory_fmt = ("{category}", "{boardSlug}")
    filename_fmt = "{id}_{num}{title:? //[b:230]}.{extension}"
    archive_fmt = "{id}_{num}"
    pattern = BASE_PATTERN + r"/b/(?:\w+)/(\d+)"
    example = "https://arca.live/b/breaking/123456789"

    def items(self):
        """Yield a Directory message for the post and a Url message per file

        The post object returned by the API is used as metadata and is
        extended with 'count', 'date', 'post_url', '_http_headers', and,
        for each file, 'num' plus that file's own fields.
        """
        self.emoticons = self.config("emoticons", False)

        # groups[0] is the numeric post ID captured by 'pattern'
        post = self.api.post(self.groups[0])
        files = self._extract_files(post)

        post["count"] = len(files)
        # only the first 19 characters ("YYYY-MM-DDTHH:MM:SS") are parsed;
        # fractional seconds and the trailing "Z" are cut off
        post["date"] = text.parse_datetime(
            post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
        post["post_url"] = post_url = "{}/b/{}/{}".format(
            self.root, post["boardSlug"], post["id"])
        post["_http_headers"] = {"Referer": post_url + "?p=1"}

        yield Message.Directory, post
        # the same 'post' dict is updated in place and reused for every file
        for post["num"], file in enumerate(files, 1):
            post.update(file)
            url = file["url"]
            yield Message.Url, url, text.nameext_from_url(url, post)

    def _extract_files(self, post):
        """Collect the media files embedded in a post's HTML content

        Returns a list of dicts with the keys 'url', 'width', 'height',
        and '_fallback' - one for each <img>/<video> tag that has a usable
        source. Emoticons are skipped unless 'self.emoticons' is enabled.
        """
        files = []

        for media in self._extract_media(post["content"]):

            if not self.emoticons and 'class="arca-emoticon"' in media:
                continue

            # prefer the 'data-originalurl' attribute over plain 'src'
            src = (text.extr(media, 'data-originalurl="', '"') or
                   text.extr(media, 'src="', '"'))
            if not src:
                continue

            # drop the existing query string; it gets replaced by
            # '?type=orig' below
            src = text.unescape(src.partition("?")[0])
            if not src:
                # the source consisted of nothing but a query string;
                # indexing into it below would raise IndexError
                continue

            if src.startswith("//"):
                # protocol-relative URL
                url = "https:" + src
            elif src[0] == "/":
                # path relative to the site root
                url = self.root + src
            else:
                url = src

            fallback = ()
            orig = text.extr(media, 'data-orig="', '"')
            if orig:
                # 'data-orig' names a file extension; when it differs from
                # the one in the URL, request the 'data-orig' variant and
                # keep the URL with its current extension as fallback
                path, _, ext = url.rpartition(".")
                if ext != orig:
                    fallback = (url + "?type=orig",)
                    url = path + "." + orig

            files.append({
                "url"   : url + "?type=orig",
                "width" : text.parse_int(text.extr(media, 'width="', '"')),
                "height": text.parse_int(text.extr(media, 'height="', '"')),
                "_fallback": fallback,
            })

        return files

    def _extract_media(self, content):
        """Return the attribute strings of all <img> and <video> tags

        The regex is compiled on the first call only: this method replaces
        itself on the class with the compiled pattern's 'findall', so all
        later calls go directly to that function.
        """
        ArcalivePostExtractor._extract_media = extr = re.compile(
            r"<(?:img|video) ([^>]+)").findall
        return extr(content)
|
||||||
|
|
||||||
|
|
||||||
|
class ArcaliveBoardExtractor(ArcaliveExtractor):
    """Extractor for an arca.live board's posts"""
    subcategory = "board"
    pattern = BASE_PATTERN + r"/b/(\w+)(?:/?\?([^#]+))?$"
    example = "https://arca.live/b/breaking"

    def items(self):
        """Yield a Queue message for every article listed on the board

        Each article object is tagged with ArcalivePostExtractor as its
        '_extractor' and sent along as metadata for the queued post URL.
        """
        # pattern groups: board slug and the optional URL query string
        slug, query = self.groups
        listing = self.api.board(slug, text.parse_query(query))
        prefix = "{}/b/{}/".format(self.root, slug)

        for entry in listing:
            entry["_extractor"] = ArcalivePostExtractor
            yield Message.Queue, prefix + str(entry["id"]), entry
|
||||||
|
|
||||||
|
|
||||||
|
class ArcaliveAPI():
    """Client for the API endpoints below '<root>/api/app'

    All requests are sent through the given extractor's 'request()'
    method and therefore through its session.
    """

    def __init__(self, extractor):
        self.extractor = extractor
        self.log = extractor.log
        self.root = extractor.root + "/api/app"

        # the extractor's session headers are modified in place
        headers = extractor.session.headers
        headers["User-Agent"] = "net.umanle.arca.android.playstore/0.9.75"
        headers["X-Device-Token"] = util.generate_token(64)

    def board(self, board_slug, params):
        """Return a generator over the 'articles' of a board listing"""
        endpoint = "/list/channel/" + board_slug
        return self._pagination(endpoint, params, "articles")

    def post(self, post_id):
        """Return the API object for the post with the given ID"""
        # the board slug in this path is always "breaking"
        # (presumably any post ID resolves under it - verify against API)
        endpoint = "/view/article/breaking/" + str(post_id)
        return self._call(endpoint)

    def _call(self, endpoint, params=None):
        """Request an endpoint and return its decoded JSON response

        Raises exception.StopExtraction when the response status code is
        not 200, including the response's 'message' value if there is one.
        """
        url = self.root + endpoint
        response = self.extractor.request(url, params=params)

        data = response.json()
        if response.status_code == 200:
            return data

        self.log.debug("Server response: %s", data)
        msg = data.get("message")
        raise exception.StopExtraction(
            "API request failed%s", ": " + msg if msg else "")

    def _pagination(self, endpoint, params, key):
        """Yield the items stored under 'key' from consecutive result pages

        'params' is updated in place with the 'next' object of each page
        before requesting the following one. Iteration ends with the first
        page that has no items under 'key' or no 'next' values.
        """
        if params is None:
            params = {}

        while True:
            data = self._call(endpoint, params)

            posts = data.get(key)
            if not posts:
                return
            yield from posts

            # a missing or empty 'next' would either raise here or leave
            # 'params' unchanged and request the same page forever
            cursor = data.get("next")
            if not cursor:
                return
            params.update(cursor)
|
||||||
130
test/results/arcalive.py
Normal file
130
test/results/arcalive.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
from gallery_dl.extractor import arcalive
|
||||||
|
|
||||||
|
|
||||||
|
__tests__ = (
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/arknights/66031722?p=1",
|
||||||
|
"#class" : arcalive.ArcalivePostExtractor,
|
||||||
|
"#urls" : "https://ac.namu.la/20221225sac2/e06dcf8edd29c597240898a6752c74dbdd0680fc932cfd0ecc898795f1db34b5.jpg?type=orig",
|
||||||
|
|
||||||
|
"isEditable": False,
|
||||||
|
"isDeletable": False,
|
||||||
|
"isReportable": False,
|
||||||
|
"id": 66031722,
|
||||||
|
"nickname": "Si리링",
|
||||||
|
"title": "엑샤 스작함",
|
||||||
|
"contentType": "html",
|
||||||
|
"content": r"re:^<p>알게또 뽑으려했는데 못뽑아서 엑샤 스작함<br />엑샤에 보카디 3스나 와파린 2스 붙이는거 맞음.+/></p>$",
|
||||||
|
"viewCount": range(8000, 20000),
|
||||||
|
"ratingUp": 0,
|
||||||
|
"ratingDown": 0,
|
||||||
|
"ratingUpIp": 0,
|
||||||
|
"ratingDownIp": 0,
|
||||||
|
"createdAt": "2022-12-25T05:16:55.000Z",
|
||||||
|
"updatedAt": "2022-12-25T05:16:55.000Z",
|
||||||
|
"lastComment": "2022-12-25T05:22:12.000Z",
|
||||||
|
"commentCount": range(2, 9),
|
||||||
|
"publicId": None,
|
||||||
|
"token": "44bb2dfd0bbc672e",
|
||||||
|
"isUser": True,
|
||||||
|
"gravatar": "//secure.gravatar.com/avatar/6c3fdbdeea149b29eea8d887c37fc119?d=retro&f=y",
|
||||||
|
"preventDelete": False,
|
||||||
|
"channelPermission": dict,
|
||||||
|
"captcha": True,
|
||||||
|
"isSensitive": False,
|
||||||
|
"categoryDisplayName": None,
|
||||||
|
"blockPreview": False,
|
||||||
|
"isSpoilerAlert": False,
|
||||||
|
"boardName": "명일방주 채널",
|
||||||
|
"boardSlug": "arknights",
|
||||||
|
"isBest": False,
|
||||||
|
"vote": [],
|
||||||
|
"date": "dt:2022-12-25 05:16:55",
|
||||||
|
"post_url": "https://arca.live/b/arknights/66031722",
|
||||||
|
"count": 1,
|
||||||
|
"num": 1,
|
||||||
|
"url": "https://ac.namu.la/20221225sac2/e06dcf8edd29c597240898a6752c74dbdd0680fc932cfd0ecc898795f1db34b5.jpg?type=orig",
|
||||||
|
"width": 3200,
|
||||||
|
"height": 1440,
|
||||||
|
"filename": "e06dcf8edd29c597240898a6752c74dbdd0680fc932cfd0ecc898795f1db34b5",
|
||||||
|
"extension": "jpg",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/breaking/66031722",
|
||||||
|
"#comment": "/b/breaking page URL",
|
||||||
|
"#class" : arcalive.ArcalivePostExtractor,
|
||||||
|
"#urls" : "https://ac.namu.la/20221225sac2/e06dcf8edd29c597240898a6752c74dbdd0680fc932cfd0ecc898795f1db34b5.jpg?type=orig",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/bluearchive/65031202",
|
||||||
|
"#comment": "animated gif",
|
||||||
|
"#class" : arcalive.ArcalivePostExtractor,
|
||||||
|
"#urls" : (
|
||||||
|
"https://ac.namu.la/20221211sac/5ea7fbca5e49ec16beb099fc6fc991690d37552e599b1de8462533908346241e.png?type=orig",
|
||||||
|
"https://ac.namu.la/20221211sac/7f73beefc4f18a2f986bc4c6821caba706e27f4c94cb828fc16e2af1253402d9.gif?type=orig",
|
||||||
|
"https://ac.namu.la/20221211sac2/3e72f9e05ca97c0c3c0fe5f25632b06eb21ab9f211e9ea22816e16468ee241ca.png?type=orig",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/arknights/122263340",
|
||||||
|
"#comment": "animated webp",
|
||||||
|
"#class" : arcalive.ArcalivePostExtractor,
|
||||||
|
"#urls" : (
|
||||||
|
"https://ac.namu.la/20241126sac/b2175d9ef4504945d3d989526120dbb6aded501ddedfba8ecc44a64e7aae9059.gif?type=orig",
|
||||||
|
"https://ac.namu.la/20241126sac/bc1f3cb388a3a2d099ab67bc09b28f0a93c2c4755152b3ef9190690a9f0a28fb.webp?type=orig",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/bluearchive/117240135",
|
||||||
|
"#comment": "video",
|
||||||
|
"#class" : arcalive.ArcalivePostExtractor,
|
||||||
|
"#urls" : "https://ac.namu.la/20240926sac/16f07778a97f91b935c8a3394ead01a223d96b2a619fdb25c4628ddba88b5fad.mp4?type=orig",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/bluearchive/111191955",
|
||||||
|
"#comment": "fake .mp4 GIF",
|
||||||
|
"#skip" : "not implemented",
|
||||||
|
"#class" : arcalive.ArcalivePostExtractor,
|
||||||
|
# "#urls" : "https://ac.namu.la/20240714sac/c8fcadeb0b578e5121eb7a7e8fb05984cb87c68e7a6e0481a1c8869bf0ecfd2b.gif?type=orig",
|
||||||
|
"#urls" : "https://ac.namu.la/20240714sac/c8fcadeb0b578e5121eb7a7e8fb05984cb87c68e7a6e0481a1c8869bf0ecfd2b.mp4?type=orig",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/arknights/49406926",
|
||||||
|
"#comment": "static emoticon",
|
||||||
|
"#class" : arcalive.ArcalivePostExtractor,
|
||||||
|
"#urls" : "https://ac.namu.la/20220428sac2/41f472adcea674aff75f15f146e81c27032bc4d6c8073bd7c19325bd1c97d335.png?type=orig",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/commission/63658702",
|
||||||
|
"#comment": "animated emoticon",
|
||||||
|
"#class" : arcalive.ArcalivePostExtractor,
|
||||||
|
"#options": {"emoticons": True},
|
||||||
|
"#urls" : (
|
||||||
|
"https://ac.namu.la/20221123sac2/14925c5e22ab9f17f2923ae60a39b7af0794c43e478ecaba054ab6131e57e022.png?type=orig",
|
||||||
|
"https://ac.namu.la/20221123sac2/50c385a4004bca44271a2f6133990f086cfefd29a7968514e9c14d6017d61265.png?type=orig",
|
||||||
|
"https://ac.namu.la/20221005sac2/28ebe073fffbb2b88f710c2d380b0fe6dd99a856070c4a836db57634a5371366.gif?type=orig",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://arca.live/b/arknights",
|
||||||
|
"#class" : arcalive.ArcaliveBoardExtractor,
|
||||||
|
"#pattern": arcalive.ArcalivePostExtractor.pattern,
|
||||||
|
"#range" : "1-100",
|
||||||
|
"#count" : 100,
|
||||||
|
},
|
||||||
|
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user