merge branch 'dt': move datetime utils into separate module
- use 'datetime.fromisoformat()' when possible (#7671) - return a datetime-compatible object for invalid datetimes (instead of a 'str' value)
This commit is contained in:
115
gallery_dl/dt.py
Normal file
115
gallery_dl/dt.py
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2025 Mike Fährmann
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
"""Date/Time utilities"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime, date, timedelta, timezone # noqa F401
|
||||||
|
|
||||||
|
|
||||||
|
class NullDatetime(datetime):
    """datetime subclass representing an invalid or missing date

    Instances evaluate as False and render as '[Invalid DateTime]',
    yet remain compatible with code expecting a real datetime object.
    """

    def __bool__(self):
        # invalid datetimes are falsy
        return False

    def __str__(self):
        return "[Invalid DateTime]"

    def __format__(self, format_spec):
        # 'format_spec' is ignored; there is no valid value to format
        return "[Invalid DateTime]"
|
||||||
|
|
||||||
|
|
||||||
|
# Sentinel returned by the parsers for missing/invalid input;
# falsy and rendered as '[Invalid DateTime]', but still datetime-compatible
NONE = NullDatetime(1, 1, 1)
# Unix epoch as naive UTC datetime; reference point for timestamp conversion
EPOCH = datetime(1970, 1, 1)
# one second as timedelta; divisor for datetime -> timestamp conversion
SECOND = timedelta(0, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(dt):
    """Return 'dt' as a naive UTC datetime with second precision"""
    if dt.tzinfo is None:
        # already naive; only strip sub-second precision when present
        return dt.replace(microsecond=0) if dt.microsecond else dt
    # timezone-aware: shift to UTC, then drop tzinfo and microseconds
    return dt.astimezone(timezone.utc).replace(tzinfo=None, microsecond=0)
|
||||||
|
|
||||||
|
|
||||||
|
def convert(value):
    """Convert 'value' to a naive UTC datetime object

    Accepts datetime objects, ISO 8601 strings, and Unix timestamps;
    falsy input yields the NONE sentinel.
    """
    if not value:
        return NONE
    if isinstance(value, datetime):
        return normalize(value)
    if isinstance(value, str):
        # try ISO 8601 first; fall through to timestamp parsing on failure
        result = parse_iso(value)
        if result is not NONE:
            return result
    return parse_ts(value)
|
||||||
|
|
||||||
|
|
||||||
|
def parse(dt_string, format):
    """Parse 'dt_string' according to 'format'

    Returns the NONE sentinel when parsing fails.
    """
    try:
        parsed = datetime.strptime(dt_string, format)
        return normalize(parsed)
    except Exception:
        return NONE
|
||||||
|
|
||||||
|
|
||||||
|
# Select version-appropriate implementations of 'parse_iso', 'from_ts',
# and 'now' at import time (0x30c0000 == Python 3.12.0)
if sys.hexversion < 0x30c0000:
    # Python <= 3.11

    def parse_iso(dt_string):
        """Parse 'dt_string' as ISO 8601 value

        Returns the NONE sentinel on failure.
        """
        try:
            if dt_string[-1] == "Z":
                # compat for Python < 3.11
                # (strip 'Z' suffix 'fromisoformat' cannot handle there)
                dt_string = dt_string[:-1]
            elif dt_string[-5] in "+-":
                # compat for Python < 3.11
                # (rewrite '+HHMM' offsets as '+HH:MM')
                dt_string = f"{dt_string[:-2]}:{dt_string[-2:]}"
            return normalize(datetime.fromisoformat(dt_string))
        except Exception:
            return NONE

    # NOTE(review): both are deprecated as of Python 3.12,
    # which presumably is the reason for this version split
    from_ts = datetime.utcfromtimestamp
    now = datetime.utcnow

else:
    # Python >= 3.12

    def parse_iso(dt_string):
        """Parse 'dt_string' as ISO 8601 value

        Returns the NONE sentinel on failure.
        """
        try:
            return normalize(datetime.fromisoformat(dt_string))
        except Exception:
            return NONE

    def from_ts(ts=None):
        """Convert Unix timestamp to naive UTC datetime

        Replacement for the deprecated 'datetime.utcfromtimestamp';
        'ts=None' means the current time (per 'time.gmtime').
        """
        Y, m, d, H, M, S, _, _, _ = time.gmtime(ts)
        return datetime(Y, m, d, H, M, S)

    # calling 'from_ts' without arguments yields the current UTC time
    now = from_ts
|
||||||
|
|
||||||
|
|
||||||
|
def parse_ts(ts, default=NONE):
    """Create a datetime object from a Unix timestamp

    'default' is returned when 'ts' cannot be converted.
    """
    try:
        seconds = int(ts)
        return from_ts(seconds)
    except Exception:
        return default
|
||||||
|
|
||||||
|
|
||||||
|
def to_ts(dt):
    """Convert naive UTC datetime to Unix timestamp (float)"""
    # timedelta.total_seconds() is documented as equivalent to
    # td / timedelta(seconds=1), i.e. dividing by the SECOND constant
    return (dt - EPOCH).total_seconds()
|
||||||
|
|
||||||
|
|
||||||
|
def to_ts_string(dt):
    """Convert naive UTC datetime to Unix timestamp string

    Returns an empty string when 'dt' does not support the conversion.
    """
    try:
        # floor division yields a whole-second integer timestamp
        seconds = (dt - EPOCH) // SECOND
    except Exception:
        return ""
    return str(seconds)
|
||||||
@@ -46,7 +46,7 @@ class _2chThreadExtractor(Extractor):
|
|||||||
for post in posts:
|
for post in posts:
|
||||||
if files := post.get("files"):
|
if files := post.get("files"):
|
||||||
post["post_name"] = post["name"]
|
post["post_name"] = post["name"]
|
||||||
post["date"] = text.parse_timestamp(post["timestamp"])
|
post["date"] = self.parse_timestamp(post["timestamp"])
|
||||||
del post["files"]
|
del post["files"]
|
||||||
del post["name"]
|
del post["name"]
|
||||||
|
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ class _2chenThreadExtractor(Extractor):
|
|||||||
extr = text.extract_from(post)
|
extr = text.extract_from(post)
|
||||||
return {
|
return {
|
||||||
"name" : text.unescape(extr("<span>", "</span>")),
|
"name" : text.unescape(extr("<span>", "</span>")),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime(
|
||||||
extr("<time", "<").partition(">")[2],
|
extr("<time", "<").partition(">")[2],
|
||||||
"%d %b %Y (%a) %H:%M:%S"
|
"%d %b %Y (%a) %H:%M:%S"
|
||||||
),
|
),
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
"""Extractors for https://4archive.org/"""
|
"""Extractors for https://4archive.org/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util
|
from .. import text, dt
|
||||||
|
|
||||||
|
|
||||||
class _4archiveThreadExtractor(Extractor):
|
class _4archiveThreadExtractor(Extractor):
|
||||||
@@ -37,7 +37,7 @@ class _4archiveThreadExtractor(Extractor):
|
|||||||
|
|
||||||
for post in posts:
|
for post in posts:
|
||||||
post.update(data)
|
post.update(data)
|
||||||
post["time"] = int(util.datetime_to_timestamp(post["date"]))
|
post["time"] = int(dt.to_ts(post["date"]))
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
if "url" in post:
|
if "url" in post:
|
||||||
yield Message.Url, post["url"], text.nameext_from_url(
|
yield Message.Url, post["url"], text.nameext_from_url(
|
||||||
@@ -61,10 +61,9 @@ class _4archiveThreadExtractor(Extractor):
|
|||||||
extr = text.extract_from(post)
|
extr = text.extract_from(post)
|
||||||
data = {
|
data = {
|
||||||
"name": extr('class="name">', "</span>"),
|
"name": extr('class="name">', "</span>"),
|
||||||
"date": text.parse_datetime(
|
"date": self.parse_datetime_iso(
|
||||||
(extr('class="dateTime">', "<") or
|
(extr('class="dateTime">', "<") or
|
||||||
extr('class="dateTime postNum" >', "<")).strip(),
|
extr('class="dateTime postNum" >', "<")).strip()),
|
||||||
"%Y-%m-%d %H:%M:%S"),
|
|
||||||
"no" : text.parse_int(extr(">Post No.", "<")),
|
"no" : text.parse_int(extr(">Post No.", "<")),
|
||||||
}
|
}
|
||||||
if 'class="file"' in post:
|
if 'class="file"' in post:
|
||||||
|
|||||||
@@ -9,9 +9,8 @@
|
|||||||
"""Extractors for https://8chan.moe/"""
|
"""Extractors for https://8chan.moe/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util
|
from .. import text, dt
|
||||||
from ..cache import memcache
|
from ..cache import memcache
|
||||||
from datetime import timedelta
|
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"
|
BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"
|
||||||
@@ -44,7 +43,7 @@ class _8chanExtractor(Extractor):
|
|||||||
def cookies_prepare(self):
|
def cookies_prepare(self):
|
||||||
# fetch captcha cookies
|
# fetch captcha cookies
|
||||||
# (necessary to download without getting interrupted)
|
# (necessary to download without getting interrupted)
|
||||||
now = util.datetime_utcnow()
|
now = dt.now()
|
||||||
url = self.root + "/captcha.js"
|
url = self.root + "/captcha.js"
|
||||||
params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")}
|
params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")}
|
||||||
self.request(url, params=params).content
|
self.request(url, params=params).content
|
||||||
@@ -57,7 +56,7 @@ class _8chanExtractor(Extractor):
|
|||||||
if cookie.domain.endswith(domain):
|
if cookie.domain.endswith(domain):
|
||||||
cookie.expires = None
|
cookie.expires = None
|
||||||
if cookie.name == "captchaexpiration":
|
if cookie.name == "captchaexpiration":
|
||||||
cookie.value = (now + timedelta(30, 300)).strftime(
|
cookie.value = (now + dt.timedelta(30, 300)).strftime(
|
||||||
"%a, %d %b %Y %H:%M:%S GMT")
|
"%a, %d %b %Y %H:%M:%S GMT")
|
||||||
|
|
||||||
return self.cookies
|
return self.cookies
|
||||||
|
|||||||
@@ -85,8 +85,7 @@ class _8musesAlbumExtractor(Extractor):
|
|||||||
"parent" : text.parse_int(album["parentId"]),
|
"parent" : text.parse_int(album["parentId"]),
|
||||||
"views" : text.parse_int(album["numberViews"]),
|
"views" : text.parse_int(album["numberViews"]),
|
||||||
"likes" : text.parse_int(album["numberLikes"]),
|
"likes" : text.parse_int(album["numberLikes"]),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime_iso(album["updatedAt"]),
|
||||||
album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def _unobfuscate(self, data):
|
def _unobfuscate(self, data):
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ class AdultempireGalleryExtractor(GalleryExtractor):
|
|||||||
"gallery_id": text.parse_int(self.gallery_id),
|
"gallery_id": text.parse_int(self.gallery_id),
|
||||||
"title" : text.unescape(extr('title="', '"')),
|
"title" : text.unescape(extr('title="', '"')),
|
||||||
"studio" : extr(">studio</small>", "<").strip(),
|
"studio" : extr(">studio</small>", "<").strip(),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime(extr(
|
||||||
">released</small>", "<").strip(), "%m/%d/%Y"),
|
">released</small>", "<").strip(), "%m/%d/%Y"),
|
||||||
"actors" : sorted(text.split_html(extr(
|
"actors" : sorted(text.split_html(extr(
|
||||||
'<ul class="item-details item-cast-list ', '</ul>'))[1:]),
|
'<ul class="item-details item-cast-list ', '</ul>'))[1:]),
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ class AgnphExtractor(booru.BooruExtractor):
|
|||||||
self.cookies.set("confirmed_age", "true", domain="agn.ph")
|
self.cookies.set("confirmed_age", "true", domain="agn.ph")
|
||||||
|
|
||||||
def _prepare(self, post):
|
def _prepare(self, post):
|
||||||
post["date"] = text.parse_timestamp(post["created_at"])
|
post["date"] = self.parse_timestamp(post["created_at"])
|
||||||
post["status"] = post["status"].strip()
|
post["status"] = post["status"].strip()
|
||||||
post["has_children"] = ("true" in post["has_children"])
|
post["has_children"] = ("true" in post["has_children"])
|
||||||
|
|
||||||
|
|||||||
@@ -182,11 +182,11 @@ class Ao3WorkExtractor(Ao3Extractor):
|
|||||||
extr('<dd class="freeform tags">', "</dd>")),
|
extr('<dd class="freeform tags">', "</dd>")),
|
||||||
"lang" : extr('<dd class="language" lang="', '"'),
|
"lang" : extr('<dd class="language" lang="', '"'),
|
||||||
"series" : extr('<dd class="series">', "</dd>"),
|
"series" : extr('<dd class="series">', "</dd>"),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime_iso(extr(
|
||||||
extr('<dd class="published">', "<"), "%Y-%m-%d"),
|
'<dd class="published">', "<")),
|
||||||
"date_completed": text.parse_datetime(
|
"date_completed": self.parse_datetime_iso(extr(
|
||||||
extr('>Completed:</dt><dd class="status">', "<"), "%Y-%m-%d"),
|
'>Completed:</dt><dd class="status">', "<")),
|
||||||
"date_updated" : text.parse_timestamp(
|
"date_updated" : self.parse_timestamp(
|
||||||
path.rpartition("updated_at=")[2]),
|
path.rpartition("updated_at=")[2]),
|
||||||
"words" : text.parse_int(
|
"words" : text.parse_int(
|
||||||
extr('<dd class="words">', "<").replace(",", "")),
|
extr('<dd class="words">', "<").replace(",", "")),
|
||||||
|
|||||||
@@ -49,8 +49,7 @@ class ArcalivePostExtractor(ArcaliveExtractor):
|
|||||||
files = self._extract_files(post)
|
files = self._extract_files(post)
|
||||||
|
|
||||||
post["count"] = len(files)
|
post["count"] = len(files)
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["createdAt"][:19])
|
||||||
post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
post["post_url"] = post_url = \
|
post["post_url"] = post_url = \
|
||||||
f"{self.root}/b/{post['boardSlug']}/{post['id']}"
|
f"{self.root}/b/{post['boardSlug']}/{post['id']}"
|
||||||
post["_http_headers"] = {"Referer": post_url + "?p=1"}
|
post["_http_headers"] = {"Referer": post_url + "?p=1"}
|
||||||
|
|||||||
@@ -126,8 +126,7 @@ class ArtstationExtractor(Extractor):
|
|||||||
data["title"] = text.unescape(data["title"])
|
data["title"] = text.unescape(data["title"])
|
||||||
data["description"] = text.unescape(text.remove_html(
|
data["description"] = text.unescape(text.remove_html(
|
||||||
data["description"]))
|
data["description"]))
|
||||||
data["date"] = text.parse_datetime(
|
data["date"] = self.parse_datetime_iso(data["created_at"])
|
||||||
data["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
|
|
||||||
assets = data["assets"]
|
assets = data["assets"]
|
||||||
del data["assets"]
|
del data["assets"]
|
||||||
|
|||||||
@@ -9,10 +9,9 @@
|
|||||||
"""Extractors for https://aryion.com/"""
|
"""Extractors for https://aryion.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util, exception
|
from .. import text, util, dt, exception
|
||||||
from ..cache import cache
|
from ..cache import cache
|
||||||
from email.utils import parsedate_tz
|
from email.utils import parsedate_tz
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4"
|
BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4"
|
||||||
|
|
||||||
@@ -156,7 +155,7 @@ class AryionExtractor(Extractor):
|
|||||||
"artist": artist,
|
"artist": artist,
|
||||||
"path" : text.split_html(extr(
|
"path" : text.split_html(extr(
|
||||||
"cookiecrumb'>", '</span'))[4:-1:2],
|
"cookiecrumb'>", '</span'))[4:-1:2],
|
||||||
"date" : datetime(*parsedate_tz(lmod)[:6]),
|
"date" : dt.datetime(*parsedate_tz(lmod)[:6]),
|
||||||
"size" : text.parse_int(clen),
|
"size" : text.parse_int(clen),
|
||||||
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
|
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
|
||||||
"width" : text.parse_int(extr("Resolution</b>:", "x")),
|
"width" : text.parse_int(extr("Resolution</b>:", "x")),
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
|
|||||||
"chapter_minor" : minor,
|
"chapter_minor" : minor,
|
||||||
"chapter_string": info,
|
"chapter_string": info,
|
||||||
"chapter_id" : text.parse_int(self.chapter_id),
|
"chapter_id" : text.parse_int(self.chapter_id),
|
||||||
"date" : text.parse_timestamp(extr(' time="', '"')[:-3]),
|
"date" : self.parse_timestamp(extr(' time="', '"')[:-3]),
|
||||||
}
|
}
|
||||||
|
|
||||||
def images(self, page):
|
def images(self, page):
|
||||||
@@ -167,8 +167,7 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
|
|||||||
|
|
||||||
data["chapter"] = text.parse_int(chapter)
|
data["chapter"] = text.parse_int(chapter)
|
||||||
data["chapter_minor"] = sep + minor
|
data["chapter_minor"] = sep + minor
|
||||||
data["date"] = text.parse_datetime(
|
data["date"] = self.parse_datetime_iso(extr('time="', '"'))
|
||||||
extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
|
|
||||||
url = f"{self.root}/title/{href}"
|
url = f"{self.root}/title/{href}"
|
||||||
results.append((url, data.copy()))
|
results.append((url, data.copy()))
|
||||||
@@ -188,9 +187,9 @@ def _manga_info(self, manga_id, page=None):
|
|||||||
"manga" : data["name"][1],
|
"manga" : data["name"][1],
|
||||||
"manga_id" : text.parse_int(manga_id),
|
"manga_id" : text.parse_int(manga_id),
|
||||||
"manga_slug" : data["slug"][1],
|
"manga_slug" : data["slug"][1],
|
||||||
"manga_date" : text.parse_timestamp(
|
"manga_date" : self.parse_timestamp(
|
||||||
data["dateCreate"][1] // 1000),
|
data["dateCreate"][1] // 1000),
|
||||||
"manga_date_updated": text.parse_timestamp(
|
"manga_date_updated": self.parse_timestamp(
|
||||||
data["dateUpdate"][1] / 1000),
|
data["dateUpdate"][1] / 1000),
|
||||||
"author" : json_list(data["authors"]),
|
"author" : json_list(data["authors"]),
|
||||||
"artist" : json_list(data["artists"]),
|
"artist" : json_list(data["artists"]),
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ class BehanceExtractor(Extractor):
|
|||||||
tags = [tag["title"] for tag in tags]
|
tags = [tag["title"] for tag in tags]
|
||||||
data["tags"] = tags
|
data["tags"] = tags
|
||||||
|
|
||||||
data["date"] = text.parse_timestamp(
|
data["date"] = self.parse_timestamp(
|
||||||
data.get("publishedOn") or data.get("conceived_on") or 0)
|
data.get("publishedOn") or data.get("conceived_on") or 0)
|
||||||
|
|
||||||
if creator := data.get("creator"):
|
if creator := data.get("creator"):
|
||||||
|
|||||||
@@ -144,8 +144,8 @@ class BellazonExtractor(Extractor):
|
|||||||
"title": schema["headline"],
|
"title": schema["headline"],
|
||||||
"views": stats[0]["userInteractionCount"],
|
"views": stats[0]["userInteractionCount"],
|
||||||
"posts": stats[1]["userInteractionCount"],
|
"posts": stats[1]["userInteractionCount"],
|
||||||
"date" : text.parse_datetime(schema["datePublished"]),
|
"date" : self.parse_datetime_iso(schema["datePublished"]),
|
||||||
"date_updated": text.parse_datetime(schema["dateModified"]),
|
"date_updated": self.parse_datetime_iso(schema["dateModified"]),
|
||||||
"description" : text.unescape(schema["text"]).strip(),
|
"description" : text.unescape(schema["text"]).strip(),
|
||||||
"section" : path[-2],
|
"section" : path[-2],
|
||||||
"author" : author["name"],
|
"author" : author["name"],
|
||||||
@@ -169,7 +169,7 @@ class BellazonExtractor(Extractor):
|
|||||||
post = {
|
post = {
|
||||||
"id": extr('id="elComment_', '"'),
|
"id": extr('id="elComment_', '"'),
|
||||||
"author_url": extr(" href='", "'"),
|
"author_url": extr(" href='", "'"),
|
||||||
"date": text.parse_datetime(extr("datetime='", "'")),
|
"date": self.parse_datetime_iso(extr("datetime='", "'")),
|
||||||
"content": extr("<!-- Post content -->", "\n\t\t</div>"),
|
"content": extr("<!-- Post content -->", "\n\t\t</div>"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ class BloggerExtractor(BaseExtractor):
|
|||||||
blog = self.api.blog_by_url("http://" + self.blog)
|
blog = self.api.blog_by_url("http://" + self.blog)
|
||||||
blog["pages"] = blog["pages"]["totalItems"]
|
blog["pages"] = blog["pages"]["totalItems"]
|
||||||
blog["posts"] = blog["posts"]["totalItems"]
|
blog["posts"] = blog["posts"]["totalItems"]
|
||||||
blog["date"] = text.parse_datetime(blog["published"])
|
blog["date"] = self.parse_datetime_iso(blog["published"])
|
||||||
del blog["selfLink"]
|
del blog["selfLink"]
|
||||||
|
|
||||||
findall_image = util.re(
|
findall_image = util.re(
|
||||||
@@ -65,7 +65,7 @@ class BloggerExtractor(BaseExtractor):
|
|||||||
post["author"] = post["author"]["displayName"]
|
post["author"] = post["author"]["displayName"]
|
||||||
post["replies"] = post["replies"]["totalItems"]
|
post["replies"] = post["replies"]["totalItems"]
|
||||||
post["content"] = text.remove_html(content)
|
post["content"] = text.remove_html(content)
|
||||||
post["date"] = text.parse_datetime(post["published"])
|
post["date"] = self.parse_datetime_iso(post["published"])
|
||||||
del post["selfLink"]
|
del post["selfLink"]
|
||||||
del post["blog"]
|
del post["blog"]
|
||||||
|
|
||||||
|
|||||||
@@ -135,8 +135,7 @@ class BlueskyExtractor(Extractor):
|
|||||||
|
|
||||||
post["instance"] = self.instance
|
post["instance"] = self.instance
|
||||||
post["post_id"] = self._pid(post)
|
post["post_id"] = self._pid(post)
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["createdAt"][:19])
|
||||||
post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
|
|
||||||
def _extract_files(self, post):
|
def _extract_files(self, post):
|
||||||
if "embed" not in post:
|
if "embed" not in post:
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ class BoostyExtractor(Extractor):
|
|||||||
post["links"] = links = []
|
post["links"] = links = []
|
||||||
|
|
||||||
if "createdAt" in post:
|
if "createdAt" in post:
|
||||||
post["date"] = text.parse_timestamp(post["createdAt"])
|
post["date"] = self.parse_timestamp(post["createdAt"])
|
||||||
|
|
||||||
for block in post["data"]:
|
for block in post["data"]:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -70,8 +70,7 @@ class BoothItemExtractor(BoothExtractor):
|
|||||||
url + ".json", headers=headers, interval=False)
|
url + ".json", headers=headers, interval=False)
|
||||||
|
|
||||||
item["booth_category"] = item.pop("category", None)
|
item["booth_category"] = item.pop("category", None)
|
||||||
item["date"] = text.parse_datetime(
|
item["date"] = self.parse_datetime_iso(item["published_at"])
|
||||||
item["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
item["tags"] = [t["name"] for t in item["tags"]]
|
item["tags"] = [t["name"] for t in item["tags"]]
|
||||||
|
|
||||||
shop = item["shop"]
|
shop = item["shop"]
|
||||||
|
|||||||
@@ -168,7 +168,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
|
|||||||
item, 'name: "', ".")
|
item, 'name: "', ".")
|
||||||
file["size"] = text.parse_int(text.extr(
|
file["size"] = text.parse_int(text.extr(
|
||||||
item, "size: ", " ,\n"))
|
item, "size: ", " ,\n"))
|
||||||
file["date"] = text.parse_datetime(text.extr(
|
file["date"] = self.parse_datetime(text.extr(
|
||||||
item, 'timestamp: "', '"'), "%H:%M:%S %d/%m/%Y")
|
item, 'timestamp: "', '"'), "%H:%M:%S %d/%m/%Y")
|
||||||
|
|
||||||
yield file
|
yield file
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ class CatboxAlbumExtractor(GalleryExtractor):
|
|||||||
return {
|
return {
|
||||||
"album_id" : self.page_url.rpartition("/")[2],
|
"album_id" : self.page_url.rpartition("/")[2],
|
||||||
"album_name" : text.unescape(extr("<h1>", "<")),
|
"album_name" : text.unescape(extr("<h1>", "<")),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime(extr(
|
||||||
"<p>Created ", "<"), "%B %d %Y"),
|
"<p>Created ", "<"), "%B %d %Y"),
|
||||||
"description": text.unescape(extr("<p>", "<")),
|
"description": text.unescape(extr("<p>", "<")),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -79,8 +79,7 @@ class CheveretoImageExtractor(CheveretoExtractor):
|
|||||||
"url" : url,
|
"url" : url,
|
||||||
"album": text.remove_html(extr(
|
"album": text.remove_html(extr(
|
||||||
"Added to <a", "</a>").rpartition(">")[2]),
|
"Added to <a", "</a>").rpartition(">")[2]),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime_iso(extr('<span title="', '"')),
|
||||||
'<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
|
|
||||||
"user" : extr('username: "', '"'),
|
"user" : extr('username: "', '"'),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -116,8 +115,7 @@ class CheveretoVideoExtractor(CheveretoExtractor):
|
|||||||
'class="far fa-clock"></i>', "—"),
|
'class="far fa-clock"></i>', "—"),
|
||||||
"album": text.remove_html(extr(
|
"album": text.remove_html(extr(
|
||||||
"Added to <a", "</a>").rpartition(">")[2]),
|
"Added to <a", "</a>").rpartition(">")[2]),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime_iso(extr('<span title="', '"')),
|
||||||
'<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
|
|
||||||
"user" : extr('username: "', '"'),
|
"user" : extr('username: "', '"'),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class CienArticleExtractor(CienExtractor):
|
|||||||
post["post_url"] = url
|
post["post_url"] = url
|
||||||
post["post_id"] = text.parse_int(post_id)
|
post["post_id"] = text.parse_int(post_id)
|
||||||
post["count"] = len(files)
|
post["count"] = len(files)
|
||||||
post["date"] = text.parse_datetime(post["datePublished"])
|
post["date"] = self.parse_datetime_iso(post["datePublished"])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
post["author"]["id"] = text.parse_int(author_id)
|
post["author"]["id"] = text.parse_int(author_id)
|
||||||
|
|||||||
@@ -86,8 +86,7 @@ class CivitaiExtractor(Extractor):
|
|||||||
images = self.api.images_post(post["id"])
|
images = self.api.images_post(post["id"])
|
||||||
|
|
||||||
post = self.api.post(post["id"])
|
post = self.api.post(post["id"])
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["publishedAt"])
|
||||||
post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
data = {
|
data = {
|
||||||
"post": post,
|
"post": post,
|
||||||
"user": post.pop("user"),
|
"user": post.pop("user"),
|
||||||
@@ -122,8 +121,7 @@ class CivitaiExtractor(Extractor):
|
|||||||
data["post"] = post = self._extract_meta_post(file)
|
data["post"] = post = self._extract_meta_post(file)
|
||||||
if post:
|
if post:
|
||||||
post.pop("user", None)
|
post.pop("user", None)
|
||||||
file["date"] = text.parse_datetime(
|
file["date"] = self.parse_datetime_iso(file["createdAt"])
|
||||||
file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
|
|
||||||
data["url"] = url = self._url(file)
|
data["url"] = url = self._url(file)
|
||||||
text.nameext_from_url(url, data)
|
text.nameext_from_url(url, data)
|
||||||
@@ -180,8 +178,7 @@ class CivitaiExtractor(Extractor):
|
|||||||
if "id" not in file and data["filename"].isdecimal():
|
if "id" not in file and data["filename"].isdecimal():
|
||||||
file["id"] = text.parse_int(data["filename"])
|
file["id"] = text.parse_int(data["filename"])
|
||||||
if "date" not in file:
|
if "date" not in file:
|
||||||
file["date"] = text.parse_datetime(
|
file["date"] = self.parse_datetime_iso(file["createdAt"])
|
||||||
file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
if self._meta_generation:
|
if self._meta_generation:
|
||||||
file["generation"] = self._extract_meta_generation(file)
|
file["generation"] = self._extract_meta_generation(file)
|
||||||
yield data
|
yield data
|
||||||
@@ -216,8 +213,7 @@ class CivitaiExtractor(Extractor):
|
|||||||
def _extract_meta_post(self, image):
|
def _extract_meta_post(self, image):
|
||||||
try:
|
try:
|
||||||
post = self.api.post(image["postId"])
|
post = self.api.post(image["postId"])
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["publishedAt"])
|
||||||
post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
return post
|
return post
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
return self.log.traceback(exc)
|
return self.log.traceback(exc)
|
||||||
@@ -278,8 +274,7 @@ class CivitaiModelExtractor(CivitaiExtractor):
|
|||||||
versions = (version,)
|
versions = (version,)
|
||||||
|
|
||||||
for version in versions:
|
for version in versions:
|
||||||
version["date"] = text.parse_datetime(
|
version["date"] = self.parse_datetime_iso(version["createdAt"])
|
||||||
version["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"model" : model,
|
"model" : model,
|
||||||
@@ -593,8 +588,7 @@ class CivitaiGeneratedExtractor(CivitaiExtractor):
|
|||||||
self._require_auth()
|
self._require_auth()
|
||||||
|
|
||||||
for gen in self.api.orchestrator_queryGeneratedImages():
|
for gen in self.api.orchestrator_queryGeneratedImages():
|
||||||
gen["date"] = text.parse_datetime(
|
gen["date"] = self.parse_datetime_iso(gen["createdAt"])
|
||||||
gen["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
yield Message.Directory, gen
|
yield Message.Directory, gen
|
||||||
for step in gen.pop("steps", ()):
|
for step in gen.pop("steps", ()):
|
||||||
for image in step.pop("images", ()):
|
for image in step.pop("images", ()):
|
||||||
|
|||||||
@@ -114,10 +114,8 @@ class ComickChapterExtractor(ComickBase, ChapterExtractor):
|
|||||||
"chapter_hid" : ch["hid"],
|
"chapter_hid" : ch["hid"],
|
||||||
"chapter_string": chstr,
|
"chapter_string": chstr,
|
||||||
"group" : ch["group_name"],
|
"group" : ch["group_name"],
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime_iso(ch["created_at"][:19]),
|
||||||
ch["created_at"][:19], "%Y-%m-%dT%H:%M:%S"),
|
"date_updated" : self.parse_datetime_iso(ch["updated_at"][:19]),
|
||||||
"date_updated" : text.parse_datetime(
|
|
||||||
ch["updated_at"][:19], "%Y-%m-%dT%H:%M:%S"),
|
|
||||||
"lang" : ch["lang"],
|
"lang" : ch["lang"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -60,6 +60,6 @@ class ComicvineTagExtractor(BooruExtractor):
|
|||||||
_file_url = operator.itemgetter("original")
|
_file_url = operator.itemgetter("original")
|
||||||
|
|
||||||
def _prepare(self, post):
|
def _prepare(self, post):
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime(
|
||||||
post["dateCreated"], "%a, %b %d %Y")
|
post["dateCreated"], "%a, %b %d %Y")
|
||||||
post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
|
post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
|
||||||
|
|||||||
@@ -19,11 +19,10 @@ import getpass
|
|||||||
import logging
|
import logging
|
||||||
import requests
|
import requests
|
||||||
import threading
|
import threading
|
||||||
from datetime import datetime
|
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
from .message import Message
|
from .message import Message
|
||||||
from .. import config, output, text, util, cache, exception
|
from .. import config, output, text, util, dt, cache, exception
|
||||||
urllib3 = requests.packages.urllib3
|
urllib3 = requests.packages.urllib3
|
||||||
|
|
||||||
|
|
||||||
@@ -64,6 +63,10 @@ class Extractor():
|
|||||||
else:
|
else:
|
||||||
self.category = CATEGORY_MAP[self.category]
|
self.category = CATEGORY_MAP[self.category]
|
||||||
|
|
||||||
|
self.parse_datetime = dt.parse
|
||||||
|
self.parse_datetime_iso = dt.parse_iso
|
||||||
|
self.parse_timestamp = dt.parse_ts
|
||||||
|
|
||||||
self._cfgpath = ("extractor", self.category, self.subcategory)
|
self._cfgpath = ("extractor", self.category, self.subcategory)
|
||||||
self._parentdir = ""
|
self._parentdir = ""
|
||||||
|
|
||||||
@@ -313,9 +316,9 @@ class Extractor():
|
|||||||
seconds = float(seconds)
|
seconds = float(seconds)
|
||||||
until = now + seconds
|
until = now + seconds
|
||||||
elif until:
|
elif until:
|
||||||
if isinstance(until, datetime):
|
if isinstance(until, dt.datetime):
|
||||||
# convert to UTC timestamp
|
# convert to UTC timestamp
|
||||||
until = util.datetime_to_timestamp(until)
|
until = dt.to_ts(until)
|
||||||
else:
|
else:
|
||||||
until = float(until)
|
until = float(until)
|
||||||
seconds = until - now
|
seconds = until - now
|
||||||
@@ -327,7 +330,7 @@ class Extractor():
|
|||||||
return
|
return
|
||||||
|
|
||||||
if reason:
|
if reason:
|
||||||
t = datetime.fromtimestamp(until).time()
|
t = dt.datetime.fromtimestamp(until).time()
|
||||||
isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}"
|
isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}"
|
||||||
self.log.info("Waiting until %s (%s)", isotime, reason)
|
self.log.info("Waiting until %s (%s)", isotime, reason)
|
||||||
time.sleep(seconds)
|
time.sleep(seconds)
|
||||||
@@ -652,7 +655,7 @@ class Extractor():
|
|||||||
self.log.warning(
|
self.log.warning(
|
||||||
"cookies: %s/%s expired at %s",
|
"cookies: %s/%s expired at %s",
|
||||||
cookie.domain.lstrip("."), cookie.name,
|
cookie.domain.lstrip("."), cookie.name,
|
||||||
datetime.fromtimestamp(cookie.expires))
|
dt.datetime.fromtimestamp(cookie.expires))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif diff <= 86400:
|
elif diff <= 86400:
|
||||||
@@ -694,7 +697,7 @@ class Extractor():
|
|||||||
ts = self.config(key, default)
|
ts = self.config(key, default)
|
||||||
if isinstance(ts, str):
|
if isinstance(ts, str):
|
||||||
try:
|
try:
|
||||||
ts = int(datetime.strptime(ts, fmt).timestamp())
|
ts = int(dt.parse(ts, fmt).timestamp())
|
||||||
except ValueError as exc:
|
except ValueError as exc:
|
||||||
self.log.warning("Unable to parse '%s': %s", key, exc)
|
self.log.warning("Unable to parse '%s': %s", key, exc)
|
||||||
ts = default
|
ts = default
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
|
|||||||
"album_name" : text.unescape(extr('title="', '"')),
|
"album_name" : text.unescape(extr('title="', '"')),
|
||||||
"album_size" : text.parse_bytes(extr(
|
"album_size" : text.parse_bytes(extr(
|
||||||
'<p class="title">', "B")),
|
'<p class="title">', "B")),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime(extr(
|
||||||
'<p class="title">', '<'), "%d.%m.%Y"),
|
'<p class="title">', '<'), "%d.%m.%Y"),
|
||||||
"description": text.unescape(text.unescape( # double
|
"description": text.unescape(text.unescape( # double
|
||||||
desc.rpartition(" [R")[0])),
|
desc.rpartition(" [R")[0])),
|
||||||
|
|||||||
@@ -113,7 +113,7 @@ class CyberfileFileExtractor(CyberfileExtractor):
|
|||||||
"Filesize:", "</tr>"))[:-1]),
|
"Filesize:", "</tr>"))[:-1]),
|
||||||
"tags" : text.split_html(extr(
|
"tags" : text.split_html(extr(
|
||||||
"Keywords:", "</tr>")),
|
"Keywords:", "</tr>")),
|
||||||
"date" : text.parse_datetime(text.remove_html(extr(
|
"date" : self.parse_datetime(text.remove_html(extr(
|
||||||
"Uploaded:", "</tr>")), "%d/%m/%Y %H:%M:%S"),
|
"Uploaded:", "</tr>")), "%d/%m/%Y %H:%M:%S"),
|
||||||
"permissions": text.remove_html(extr(
|
"permissions": text.remove_html(extr(
|
||||||
"Permissions:", "</tr>")).split(" & "),
|
"Permissions:", "</tr>")).split(" & "),
|
||||||
|
|||||||
@@ -9,8 +9,7 @@
|
|||||||
"""Extractors for https://danbooru.donmai.us/ and other Danbooru instances"""
|
"""Extractors for https://danbooru.donmai.us/ and other Danbooru instances"""
|
||||||
|
|
||||||
from .common import BaseExtractor, Message
|
from .common import BaseExtractor, Message
|
||||||
from .. import text, util
|
from .. import text, util, dt
|
||||||
import datetime
|
|
||||||
|
|
||||||
|
|
||||||
class DanbooruExtractor(BaseExtractor):
|
class DanbooruExtractor(BaseExtractor):
|
||||||
@@ -69,8 +68,7 @@ class DanbooruExtractor(BaseExtractor):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
text.nameext_from_url(url, post)
|
text.nameext_from_url(url, post)
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = dt.parse_iso(post["created_at"])
|
||||||
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
|
|
||||||
post["tags"] = (
|
post["tags"] = (
|
||||||
post["tag_string"].split(" ")
|
post["tag_string"].split(" ")
|
||||||
@@ -357,11 +355,11 @@ class DanbooruPopularExtractor(DanbooruExtractor):
|
|||||||
def metadata(self):
|
def metadata(self):
|
||||||
self.params = params = text.parse_query(self.groups[-1])
|
self.params = params = text.parse_query(self.groups[-1])
|
||||||
scale = params.get("scale", "day")
|
scale = params.get("scale", "day")
|
||||||
date = params.get("date") or datetime.date.today().isoformat()
|
date = params.get("date") or dt.date.today().isoformat()
|
||||||
|
|
||||||
if scale == "week":
|
if scale == "week":
|
||||||
date = datetime.date.fromisoformat(date)
|
date = dt.date.fromisoformat(date)
|
||||||
date = (date - datetime.timedelta(days=date.weekday())).isoformat()
|
date = (date - dt.timedelta(days=date.weekday())).isoformat()
|
||||||
elif scale == "month":
|
elif scale == "month":
|
||||||
date = date[:-3]
|
date = date[:-3]
|
||||||
|
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ class DankefuerslesenChapterExtractor(DankefuerslesenBase, ChapterExtractor):
|
|||||||
"chapter_minor": minor,
|
"chapter_minor": minor,
|
||||||
"group" : manga["groups"][group_id].split(" & "),
|
"group" : manga["groups"][group_id].split(" & "),
|
||||||
"group_id" : text.parse_int(group_id),
|
"group_id" : text.parse_int(group_id),
|
||||||
"date" : text.parse_timestamp(data["release_date"][group_id]),
|
"date" : self.parse_timestamp(data["release_date"][group_id]),
|
||||||
"lang" : util.NONE,
|
"lang" : util.NONE,
|
||||||
"language" : util.NONE,
|
"language" : util.NONE,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for https://www.deviantart.com/"""
|
"""Extractors for https://www.deviantart.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message, Dispatch
|
from .common import Extractor, Message, Dispatch
|
||||||
from .. import text, util, exception
|
from .. import text, util, dt, exception
|
||||||
from ..cache import cache, memcache
|
from ..cache import cache, memcache
|
||||||
import collections
|
import collections
|
||||||
import mimetypes
|
import mimetypes
|
||||||
@@ -259,7 +259,7 @@ class DeviantartExtractor(Extractor):
|
|||||||
|
|
||||||
deviation["published_time"] = text.parse_int(
|
deviation["published_time"] = text.parse_int(
|
||||||
deviation["published_time"])
|
deviation["published_time"])
|
||||||
deviation["date"] = text.parse_timestamp(
|
deviation["date"] = self.parse_timestamp(
|
||||||
deviation["published_time"])
|
deviation["published_time"])
|
||||||
|
|
||||||
if self.comments:
|
if self.comments:
|
||||||
@@ -1187,8 +1187,8 @@ class DeviantartStatusExtractor(DeviantartExtractor):
|
|||||||
deviation["username"] = deviation["author"]["username"]
|
deviation["username"] = deviation["author"]["username"]
|
||||||
deviation["_username"] = deviation["username"].lower()
|
deviation["_username"] = deviation["username"].lower()
|
||||||
|
|
||||||
deviation["date"] = dt = text.parse_datetime(deviation["ts"])
|
deviation["date"] = d = self.parse_datetime_iso(deviation["ts"])
|
||||||
deviation["published_time"] = int(util.datetime_to_timestamp(dt))
|
deviation["published_time"] = int(dt.to_ts(d))
|
||||||
|
|
||||||
deviation["da_category"] = "Status"
|
deviation["da_category"] = "Status"
|
||||||
deviation["category_path"] = "status"
|
deviation["category_path"] = "status"
|
||||||
|
|||||||
@@ -72,9 +72,7 @@ class DiscordExtractor(Extractor):
|
|||||||
"author_files": [],
|
"author_files": [],
|
||||||
"message": self.extract_message_text(message),
|
"message": self.extract_message_text(message),
|
||||||
"message_id": message["id"],
|
"message_id": message["id"],
|
||||||
"date": text.parse_datetime(
|
"date": self.parse_datetime_iso(message["timestamp"]),
|
||||||
message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
|
|
||||||
),
|
|
||||||
"files": []
|
"files": []
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
|
|||||||
"author" : text.remove_html(author),
|
"author" : text.remove_html(author),
|
||||||
"group" : (text.remove_html(group) or
|
"group" : (text.remove_html(group) or
|
||||||
text.extr(group, ' alt="', '"')),
|
text.extr(group, ' alt="', '"')),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime(extr(
|
||||||
'"icon-calendar"></i> ', '<'), "%b %d, %Y"),
|
'"icon-calendar"></i> ', '<'), "%b %d, %Y"),
|
||||||
"tags" : text.split_html(extr(
|
"tags" : text.split_html(extr(
|
||||||
"class='tags'>", "<div id='chapter-actions'")),
|
"class='tags'>", "<div id='chapter-actions'")),
|
||||||
@@ -166,8 +166,6 @@ class DynastyscansAnthologyExtractor(DynastyscansSearchExtractor):
|
|||||||
data["scanlator"] = content[1].text[11:]
|
data["scanlator"] = content[1].text[11:]
|
||||||
data["tags"] = content[2].text[6:].lower().split(", ")
|
data["tags"] = content[2].text[6:].lower().split(", ")
|
||||||
data["title"] = element[5].text
|
data["title"] = element[5].text
|
||||||
data["date"] = text.parse_datetime(
|
data["date"] = self.parse_datetime_iso(element[1].text)
|
||||||
element[1].text, "%Y-%m-%dT%H:%M:%S%z")
|
data["date_updated"] = self.parse_datetime_iso(element[2].text)
|
||||||
data["date_updated"] = text.parse_datetime(
|
|
||||||
element[2].text, "%Y-%m-%dT%H:%M:%S%z")
|
|
||||||
yield Message.Queue, element[4].text, data
|
yield Message.Queue, element[4].text, data
|
||||||
|
|||||||
@@ -51,8 +51,7 @@ class E621Extractor(danbooru.DanbooruExtractor):
|
|||||||
|
|
||||||
post["filename"] = file["md5"]
|
post["filename"] = file["md5"]
|
||||||
post["extension"] = file["ext"]
|
post["extension"] = file["ext"]
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created_at"])
|
||||||
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
|
|
||||||
post.update(data)
|
post.update(data)
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ class EromeAlbumExtractor(EromeExtractor):
|
|||||||
if not date:
|
if not date:
|
||||||
ts = text.extr(group, '?v=', '"')
|
ts = text.extr(group, '?v=', '"')
|
||||||
if len(ts) > 1:
|
if len(ts) > 1:
|
||||||
date = text.parse_timestamp(ts)
|
date = self.parse_timestamp(ts)
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"album_id": album_id,
|
"album_id": album_id,
|
||||||
|
|||||||
@@ -216,7 +216,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
|
|||||||
def _items_hitomi(self):
|
def _items_hitomi(self):
|
||||||
if self.config("metadata", False):
|
if self.config("metadata", False):
|
||||||
data = self.metadata_from_api()
|
data = self.metadata_from_api()
|
||||||
data["date"] = text.parse_timestamp(data["posted"])
|
data["date"] = self.parse_timestamp(data["posted"])
|
||||||
else:
|
else:
|
||||||
data = {}
|
data = {}
|
||||||
|
|
||||||
@@ -233,7 +233,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
|
|||||||
data = self.metadata_from_page(page)
|
data = self.metadata_from_page(page)
|
||||||
if self.config("metadata", False):
|
if self.config("metadata", False):
|
||||||
data.update(self.metadata_from_api())
|
data.update(self.metadata_from_api())
|
||||||
data["date"] = text.parse_timestamp(data["posted"])
|
data["date"] = self.parse_timestamp(data["posted"])
|
||||||
if self.config("tags", False):
|
if self.config("tags", False):
|
||||||
tags = collections.defaultdict(list)
|
tags = collections.defaultdict(list)
|
||||||
for tag in data["tags"]:
|
for tag in data["tags"]:
|
||||||
@@ -258,8 +258,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
|
|||||||
"_" : extr('<div id="gdc"><div class="cs ct', '"'),
|
"_" : extr('<div id="gdc"><div class="cs ct', '"'),
|
||||||
"eh_category" : extr('>', '<'),
|
"eh_category" : extr('>', '<'),
|
||||||
"uploader" : extr('<div id="gdn">', '</div>'),
|
"uploader" : extr('<div id="gdn">', '</div>'),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime_iso(extr(
|
||||||
'>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
|
'>Posted:</td><td class="gdt2">', '</td>')),
|
||||||
"parent" : extr(
|
"parent" : extr(
|
||||||
'>Parent:</td><td class="gdt2"><a href="', '"'),
|
'>Parent:</td><td class="gdt2"><a href="', '"'),
|
||||||
"expunged" : "Yes" != extr(
|
"expunged" : "Yes" != extr(
|
||||||
|
|||||||
@@ -108,7 +108,7 @@ class FacebookExtractor(Extractor):
|
|||||||
'"message":{"delight_ranges"',
|
'"message":{"delight_ranges"',
|
||||||
'"},"message_preferred_body"'
|
'"},"message_preferred_body"'
|
||||||
).rsplit('],"text":"', 1)[-1]),
|
).rsplit('],"text":"', 1)[-1]),
|
||||||
"date": text.parse_timestamp(
|
"date": self.parse_timestamp(
|
||||||
text.extr(photo_page, '\\"publish_time\\":', ',') or
|
text.extr(photo_page, '\\"publish_time\\":', ',') or
|
||||||
text.extr(photo_page, '"created_time":', ',')
|
text.extr(photo_page, '"created_time":', ',')
|
||||||
),
|
),
|
||||||
@@ -172,7 +172,7 @@ class FacebookExtractor(Extractor):
|
|||||||
"user_id": text.extr(
|
"user_id": text.extr(
|
||||||
video_page, '"owner":{"__typename":"User","id":"', '"'
|
video_page, '"owner":{"__typename":"User","id":"', '"'
|
||||||
),
|
),
|
||||||
"date": text.parse_timestamp(text.extr(
|
"date": self.parse_timestamp(text.extr(
|
||||||
video_page, '\\"publish_time\\":', ','
|
video_page, '\\"publish_time\\":', ','
|
||||||
)),
|
)),
|
||||||
"type": "video"
|
"type": "video"
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ class FanboxExtractor(Extractor):
|
|||||||
if file.get("extension", "").lower() in exts
|
if file.get("extension", "").lower() in exts
|
||||||
]
|
]
|
||||||
|
|
||||||
post["date"] = text.parse_datetime(post["publishedDatetime"])
|
post["date"] = self.parse_datetime_iso(post["publishedDatetime"])
|
||||||
post["text"] = content_body.get("text") if content_body else None
|
post["text"] = content_body.get("text") if content_body else None
|
||||||
post["isCoverImage"] = False
|
post["isCoverImage"] = False
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class FanslyExtractor(Extractor):
|
|||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
files = self._extract_files(post)
|
files = self._extract_files(post)
|
||||||
post["count"] = len(files)
|
post["count"] = len(files)
|
||||||
post["date"] = text.parse_timestamp(post["createdAt"])
|
post["date"] = self.parse_timestamp(post["createdAt"])
|
||||||
|
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
for post["num"], file in enumerate(files, 1):
|
for post["num"], file in enumerate(files, 1):
|
||||||
@@ -117,8 +117,8 @@ class FanslyExtractor(Extractor):
|
|||||||
file = {
|
file = {
|
||||||
**variant,
|
**variant,
|
||||||
"format": variant["type"],
|
"format": variant["type"],
|
||||||
"date": text.parse_timestamp(media["createdAt"]),
|
"date": self.parse_timestamp(media["createdAt"]),
|
||||||
"date_updated": text.parse_timestamp(media["updatedAt"]),
|
"date_updated": self.parse_timestamp(media["updatedAt"]),
|
||||||
}
|
}
|
||||||
|
|
||||||
if "metadata" in location:
|
if "metadata" in location:
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ class FantiaExtractor(Extractor):
|
|||||||
"comment": resp["comment"],
|
"comment": resp["comment"],
|
||||||
"rating": resp["rating"],
|
"rating": resp["rating"],
|
||||||
"posted_at": resp["posted_at"],
|
"posted_at": resp["posted_at"],
|
||||||
"date": text.parse_datetime(
|
"date": self.parse_datetime(
|
||||||
resp["posted_at"], "%a, %d %b %Y %H:%M:%S %z"),
|
resp["posted_at"], "%a, %d %b %Y %H:%M:%S %z"),
|
||||||
"fanclub_id": resp["fanclub"]["id"],
|
"fanclub_id": resp["fanclub"]["id"],
|
||||||
"fanclub_user_id": resp["fanclub"]["user"]["id"],
|
"fanclub_user_id": resp["fanclub"]["user"]["id"],
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ class FlickrImageExtractor(FlickrExtractor):
|
|||||||
photo["comments"] = text.parse_int(photo["comments"]["_content"])
|
photo["comments"] = text.parse_int(photo["comments"]["_content"])
|
||||||
photo["description"] = photo["description"]["_content"]
|
photo["description"] = photo["description"]["_content"]
|
||||||
photo["tags"] = [t["raw"] for t in photo["tags"]["tag"]]
|
photo["tags"] = [t["raw"] for t in photo["tags"]["tag"]]
|
||||||
photo["date"] = text.parse_timestamp(photo["dateuploaded"])
|
photo["date"] = self.parse_timestamp(photo["dateuploaded"])
|
||||||
photo["views"] = text.parse_int(photo["views"])
|
photo["views"] = text.parse_int(photo["views"])
|
||||||
photo["id"] = text.parse_int(photo["id"])
|
photo["id"] = text.parse_int(photo["id"])
|
||||||
|
|
||||||
@@ -489,7 +489,7 @@ class FlickrAPI(oauth.OAuth1API):
|
|||||||
def _extract_format(self, photo):
|
def _extract_format(self, photo):
|
||||||
photo["description"] = photo["description"]["_content"].strip()
|
photo["description"] = photo["description"]["_content"].strip()
|
||||||
photo["views"] = text.parse_int(photo["views"])
|
photo["views"] = text.parse_int(photo["views"])
|
||||||
photo["date"] = text.parse_timestamp(photo["dateupload"])
|
photo["date"] = self.parse_timestamp(photo["dateupload"])
|
||||||
photo["tags"] = photo["tags"].split()
|
photo["tags"] = photo["tags"].split()
|
||||||
|
|
||||||
self._extract_metadata(photo)
|
self._extract_metadata(photo)
|
||||||
|
|||||||
@@ -143,7 +143,7 @@ class FuraffinityExtractor(Extractor):
|
|||||||
data["folders"] = () # folders not present in old layout
|
data["folders"] = () # folders not present in old layout
|
||||||
|
|
||||||
data["user"] = self.user or data["artist_url"]
|
data["user"] = self.user or data["artist_url"]
|
||||||
data["date"] = text.parse_timestamp(data["filename"].partition(".")[0])
|
data["date"] = self.parse_timestamp(data["filename"].partition(".")[0])
|
||||||
data["description"] = self._process_description(data["_description"])
|
data["description"] = self._process_description(data["_description"])
|
||||||
data["thumbnail"] = (f"https://t.furaffinity.net/{post_id}@600-"
|
data["thumbnail"] = (f"https://t.furaffinity.net/{post_id}@600-"
|
||||||
f"{path.rsplit('/', 2)[1]}.jpg")
|
f"{path.rsplit('/', 2)[1]}.jpg")
|
||||||
|
|||||||
@@ -55,8 +55,7 @@ class Furry34Extractor(BooruExtractor):
|
|||||||
|
|
||||||
def _prepare(self, post):
|
def _prepare(self, post):
|
||||||
post.pop("files", None)
|
post.pop("files", None)
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created"])
|
||||||
post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
post["filename"], _, post["format"] = post["filename"].rpartition(".")
|
post["filename"], _, post["format"] = post["filename"].rpartition(".")
|
||||||
if "tags" in post:
|
if "tags" in post:
|
||||||
post["tags"] = [t["value"] for t in post["tags"]]
|
post["tags"] = [t["value"] for t in post["tags"]]
|
||||||
|
|||||||
@@ -246,7 +246,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
|
|||||||
|
|
||||||
for fav in favs:
|
for fav in favs:
|
||||||
for post in self._api_request({"id": fav["favorite"]}):
|
for post in self._api_request({"id": fav["favorite"]}):
|
||||||
post["date_favorited"] = text.parse_timestamp(fav["added"])
|
post["date_favorited"] = self.parse_timestamp(fav["added"])
|
||||||
yield post
|
yield post
|
||||||
|
|
||||||
params["pid"] += 1
|
params["pid"] += 1
|
||||||
@@ -273,7 +273,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
|
|||||||
|
|
||||||
for fav in favs:
|
for fav in favs:
|
||||||
for post in self._api_request({"id": fav["favorite"]}):
|
for post in self._api_request({"id": fav["favorite"]}):
|
||||||
post["date_favorited"] = text.parse_timestamp(fav["added"])
|
post["date_favorited"] = self.parse_timestamp(fav["added"])
|
||||||
yield post
|
yield post
|
||||||
|
|
||||||
params["pid"] -= 1
|
params["pid"] -= 1
|
||||||
|
|||||||
@@ -35,8 +35,7 @@ class GelbooruV01Extractor(booru.BooruExtractor):
|
|||||||
}
|
}
|
||||||
|
|
||||||
post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
|
post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created_at"])
|
||||||
post["created_at"], "%Y-%m-%d %H:%M:%S")
|
|
||||||
|
|
||||||
return post
|
return post
|
||||||
|
|
||||||
|
|||||||
@@ -122,7 +122,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
|
|||||||
|
|
||||||
def _prepare(self, post):
|
def _prepare(self, post):
|
||||||
post["tags"] = post["tags"].strip()
|
post["tags"] = post["tags"].strip()
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime(
|
||||||
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||||
|
|
||||||
def _html(self, post):
|
def _html(self, post):
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ class GirlsreleasedSetExtractor(GirlsreleasedExtractor):
|
|||||||
"id": json["id"],
|
"id": json["id"],
|
||||||
"site": json["site"],
|
"site": json["site"],
|
||||||
"model": [model for _, model in json["models"]],
|
"model": [model for _, model in json["models"]],
|
||||||
"date": text.parse_timestamp(json["date"]),
|
"date": self.parse_timestamp(json["date"]),
|
||||||
"count": len(json["images"]),
|
"count": len(json["images"]),
|
||||||
"url": "https://girlsreleased.com/set/" + json["id"],
|
"url": "https://girlsreleased.com/set/" + json["id"],
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -101,9 +101,8 @@ class GirlswithmusclePostExtractor(GirlswithmuscleExtractor):
|
|||||||
"model": model,
|
"model": model,
|
||||||
"model_list": self._parse_model_list(model),
|
"model_list": self._parse_model_list(model),
|
||||||
"tags": text.split_html(tags)[1::2],
|
"tags": text.split_html(tags)[1::2],
|
||||||
"date": text.parse_datetime(
|
"date": self.parse_datetime_iso(text.extr(
|
||||||
text.extr(page, 'class="hover-time" title="', '"')[:19],
|
page, 'class="hover-time" title="', '"')[:19]),
|
||||||
"%Y-%m-%d %H:%M:%S"),
|
|
||||||
"is_favorite": self._parse_is_favorite(page),
|
"is_favorite": self._parse_is_favorite(page),
|
||||||
"source_filename": source,
|
"source_filename": source,
|
||||||
"uploader": uploader,
|
"uploader": uploader,
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ class HatenablogExtractor(Extractor):
|
|||||||
|
|
||||||
def _handle_article(self, article: str):
|
def _handle_article(self, article: str):
|
||||||
extr = text.extract_from(article)
|
extr = text.extract_from(article)
|
||||||
date = text.parse_datetime(extr('<time datetime="', '"'))
|
date = self.parse_datetime_iso(extr('<time datetime="', '"'))
|
||||||
entry_link = text.unescape(extr('<a href="', '"'))
|
entry_link = text.unescape(extr('<a href="', '"'))
|
||||||
entry = entry_link.partition("/entry/")[2]
|
entry = entry_link.partition("/entry/")[2]
|
||||||
title = text.unescape(extr('>', '<'))
|
title = text.unescape(extr('>', '<'))
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ class HentaifoundryExtractor(Extractor):
|
|||||||
.replace("\r\n", "\n")),
|
.replace("\r\n", "\n")),
|
||||||
"ratings" : [text.unescape(r) for r in text.extract_iter(extr(
|
"ratings" : [text.unescape(r) for r in text.extract_iter(extr(
|
||||||
"class='ratings_box'", "</div>"), "title='", "'")],
|
"class='ratings_box'", "</div>"), "title='", "'")],
|
||||||
"date" : text.parse_datetime(extr("datetime='", "'")),
|
"date" : self.parse_datetime_iso(extr("datetime='", "'")),
|
||||||
"views" : text.parse_int(extr(">Views</span>", "<")),
|
"views" : text.parse_int(extr(">Views</span>", "<")),
|
||||||
"score" : text.parse_int(extr(">Vote Score</span>", "<")),
|
"score" : text.parse_int(extr(">Vote Score</span>", "<")),
|
||||||
"media" : text.unescape(extr(">Media</span>", "<").strip()),
|
"media" : text.unescape(extr(">Media</span>", "<").strip()),
|
||||||
@@ -126,7 +126,7 @@ class HentaifoundryExtractor(Extractor):
|
|||||||
"title" : text.unescape(extr(
|
"title" : text.unescape(extr(
|
||||||
"<div class='titlebar'>", "</a>").rpartition(">")[2]),
|
"<div class='titlebar'>", "</a>").rpartition(">")[2]),
|
||||||
"author" : text.unescape(extr('alt="', '"')),
|
"author" : text.unescape(extr('alt="', '"')),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime(extr(
|
||||||
">Updated<", "</span>").rpartition(">")[2], "%B %d, %Y"),
|
">Updated<", "</span>").rpartition(">")[2], "%B %d, %Y"),
|
||||||
"status" : extr("class='indent'>", "<"),
|
"status" : extr("class='indent'>", "<"),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,8 +35,7 @@ class HentaihandGalleryExtractor(GalleryExtractor):
|
|||||||
"language" : info["language"]["name"],
|
"language" : info["language"]["name"],
|
||||||
"lang" : util.language_to_code(info["language"]["name"]),
|
"lang" : util.language_to_code(info["language"]["name"]),
|
||||||
"tags" : [t["slug"] for t in info["tags"]],
|
"tags" : [t["slug"] for t in info["tags"]],
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime_iso(info["uploaded_at"]),
|
||||||
info["uploaded_at"], "%Y-%m-%d"),
|
|
||||||
}
|
}
|
||||||
for key in ("artists", "authors", "groups", "characters",
|
for key in ("artists", "authors", "groups", "characters",
|
||||||
"relationships", "parodies"):
|
"relationships", "parodies"):
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
|
|||||||
"type" : info["type"].capitalize(),
|
"type" : info["type"].capitalize(),
|
||||||
"language" : language,
|
"language" : language,
|
||||||
"lang" : util.language_to_code(language),
|
"lang" : util.language_to_code(language),
|
||||||
"date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"),
|
"date" : self.parse_datetime_iso(date),
|
||||||
"tags" : tags,
|
"tags" : tags,
|
||||||
"artist" : [o["artist"] for o in iget("artists") or ()],
|
"artist" : [o["artist"] for o in iget("artists") or ()],
|
||||||
"group" : [o["group"] for o in iget("groups") or ()],
|
"group" : [o["group"] for o in iget("groups") or ()],
|
||||||
|
|||||||
@@ -53,11 +53,9 @@ class ImagechestGalleryExtractor(GalleryExtractor):
|
|||||||
def _metadata_api(self, page):
|
def _metadata_api(self, page):
|
||||||
post = self.api.post(self.gallery_id)
|
post = self.api.post(self.gallery_id)
|
||||||
|
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created"])
|
||||||
post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
for img in post["images"]:
|
for img in post["images"]:
|
||||||
img["date"] = text.parse_datetime(
|
img["date"] = self.parse_datetime_iso(img["created"])
|
||||||
img["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
|
|
||||||
post["gallery_id"] = self.gallery_id
|
post["gallery_id"] = self.gallery_id
|
||||||
post.pop("image_count", None)
|
post.pop("image_count", None)
|
||||||
|
|||||||
@@ -159,8 +159,7 @@ class ImgbbImageExtractor(ImgbbExtractor):
|
|||||||
"width" : text.parse_int(extr('"og:image:width" content="', '"')),
|
"width" : text.parse_int(extr('"og:image:width" content="', '"')),
|
||||||
"height": text.parse_int(extr('"og:image:height" content="', '"')),
|
"height": text.parse_int(extr('"og:image:height" content="', '"')),
|
||||||
"album" : extr("Added to <a", "</a>"),
|
"album" : extr("Added to <a", "</a>"),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime_iso(extr('<span title="', '"')),
|
||||||
'<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
|
|
||||||
"user" : util.json_loads(extr(
|
"user" : util.json_loads(extr(
|
||||||
"CHV.obj.resource=", "};") + "}").get("user"),
|
"CHV.obj.resource=", "};") + "}").get("user"),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class ImgthGalleryExtractor(GalleryExtractor):
|
|||||||
"title": text.unescape(extr("<h1>", "</h1>")),
|
"title": text.unescape(extr("<h1>", "</h1>")),
|
||||||
"count": text.parse_int(extr(
|
"count": text.parse_int(extr(
|
||||||
"total of images in this gallery: ", " ")),
|
"total of images in this gallery: ", " ")),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime(
|
||||||
extr("created on ", " by <")
|
extr("created on ", " by <")
|
||||||
.replace("th, ", " ", 1).replace("nd, ", " ", 1)
|
.replace("th, ", " ", 1).replace("nd, ", " ", 1)
|
||||||
.replace("st, ", " ", 1), "%B %d %Y at %H:%M"),
|
.replace("st, ", " ", 1), "%B %d %Y at %H:%M"),
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class ImgurExtractor(Extractor):
|
|||||||
|
|
||||||
image["url"] = url = \
|
image["url"] = url = \
|
||||||
f"https://i.imgur.com/{image['id']}.{image['ext']}"
|
f"https://i.imgur.com/{image['id']}.{image['ext']}"
|
||||||
image["date"] = text.parse_datetime(image["created_at"])
|
image["date"] = self.parse_datetime_iso(image["created_at"])
|
||||||
image["_http_validate"] = self._validate
|
image["_http_validate"] = self._validate
|
||||||
text.nameext_from_url(url, image)
|
text.nameext_from_url(url, image)
|
||||||
|
|
||||||
@@ -106,7 +106,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
|
|||||||
|
|
||||||
del album["media"]
|
del album["media"]
|
||||||
count = len(images)
|
count = len(images)
|
||||||
album["date"] = text.parse_datetime(album["created_at"])
|
album["date"] = self.parse_datetime_iso(album["created_at"])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
del album["ad_url"]
|
del album["ad_url"]
|
||||||
|
|||||||
@@ -35,8 +35,8 @@ class InkbunnyExtractor(Extractor):
|
|||||||
|
|
||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
post.update(metadata)
|
post.update(metadata)
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(
|
||||||
post["create_datetime"] + "00", "%Y-%m-%d %H:%M:%S.%f%z")
|
post["create_datetime"][:19])
|
||||||
post["tags"] = [kw["keyword_name"] for kw in post["keywords"]]
|
post["tags"] = [kw["keyword_name"] for kw in post["keywords"]]
|
||||||
post["ratings"] = [r["name"] for r in post["ratings"]]
|
post["ratings"] = [r["name"] for r in post["ratings"]]
|
||||||
files = post["files"]
|
files = post["files"]
|
||||||
@@ -52,8 +52,8 @@ class InkbunnyExtractor(Extractor):
|
|||||||
for post["num"], file in enumerate(files, 1):
|
for post["num"], file in enumerate(files, 1):
|
||||||
post.update(file)
|
post.update(file)
|
||||||
post["deleted"] = (file["deleted"] == "t")
|
post["deleted"] = (file["deleted"] == "t")
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(
|
||||||
file["create_datetime"] + "00", "%Y-%m-%d %H:%M:%S.%f%z")
|
file["create_datetime"][:19])
|
||||||
text.nameext_from_url(file["file_name"], post)
|
text.nameext_from_url(file["file_name"], post)
|
||||||
|
|
||||||
url = file["file_url_full"]
|
url = file["file_url_full"]
|
||||||
|
|||||||
@@ -173,7 +173,7 @@ class InstagramExtractor(Extractor):
|
|||||||
post_url = f"{self.root}/stories/highlights/{reel_id}/"
|
post_url = f"{self.root}/stories/highlights/{reel_id}/"
|
||||||
data = {
|
data = {
|
||||||
"user" : post.get("user"),
|
"user" : post.get("user"),
|
||||||
"expires": text.parse_timestamp(expires),
|
"expires": self.parse_timestamp(expires),
|
||||||
"post_id": reel_id,
|
"post_id": reel_id,
|
||||||
"post_shortcode": shortcode_from_id(reel_id),
|
"post_shortcode": shortcode_from_id(reel_id),
|
||||||
"post_url": post_url,
|
"post_url": post_url,
|
||||||
@@ -224,7 +224,7 @@ class InstagramExtractor(Extractor):
|
|||||||
data["owner_id"] = owner["pk"]
|
data["owner_id"] = owner["pk"]
|
||||||
data["username"] = owner.get("username")
|
data["username"] = owner.get("username")
|
||||||
data["fullname"] = owner.get("full_name")
|
data["fullname"] = owner.get("full_name")
|
||||||
data["post_date"] = data["date"] = text.parse_timestamp(
|
data["post_date"] = data["date"] = self.parse_timestamp(
|
||||||
post.get("taken_at") or post.get("created_at") or post.get("seen"))
|
post.get("taken_at") or post.get("created_at") or post.get("seen"))
|
||||||
data["_files"] = files = []
|
data["_files"] = files = []
|
||||||
for num, item in enumerate(items, 1):
|
for num, item in enumerate(items, 1):
|
||||||
@@ -278,7 +278,7 @@ class InstagramExtractor(Extractor):
|
|||||||
|
|
||||||
media = {
|
media = {
|
||||||
"num" : num,
|
"num" : num,
|
||||||
"date" : text.parse_timestamp(item.get("taken_at") or
|
"date" : self.parse_timestamp(item.get("taken_at") or
|
||||||
media.get("taken_at") or
|
media.get("taken_at") or
|
||||||
post.get("taken_at")),
|
post.get("taken_at")),
|
||||||
"media_id" : item["pk"],
|
"media_id" : item["pk"],
|
||||||
@@ -299,7 +299,7 @@ class InstagramExtractor(Extractor):
|
|||||||
if "reshared_story_media_author" in item:
|
if "reshared_story_media_author" in item:
|
||||||
media["author"] = item["reshared_story_media_author"]
|
media["author"] = item["reshared_story_media_author"]
|
||||||
if "expiring_at" in item:
|
if "expiring_at" in item:
|
||||||
media["expires"] = text.parse_timestamp(post["expiring_at"])
|
media["expires"] = self.parse_timestamp(post["expiring_at"])
|
||||||
|
|
||||||
self._extract_tagged_users(item, media)
|
self._extract_tagged_users(item, media)
|
||||||
files.append(media)
|
files.append(media)
|
||||||
@@ -342,7 +342,7 @@ class InstagramExtractor(Extractor):
|
|||||||
"post_id" : post["id"],
|
"post_id" : post["id"],
|
||||||
"post_shortcode": post["shortcode"],
|
"post_shortcode": post["shortcode"],
|
||||||
"post_url" : f"{self.root}/p/{post['shortcode']}/",
|
"post_url" : f"{self.root}/p/{post['shortcode']}/",
|
||||||
"post_date" : text.parse_timestamp(post["taken_at_timestamp"]),
|
"post_date" : self.parse_timestamp(post["taken_at_timestamp"]),
|
||||||
"description": text.parse_unicode_escapes("\n".join(
|
"description": text.parse_unicode_escapes("\n".join(
|
||||||
edge["node"]["text"]
|
edge["node"]["text"]
|
||||||
for edge in post["edge_media_to_caption"]["edges"]
|
for edge in post["edge_media_to_caption"]["edges"]
|
||||||
@@ -634,7 +634,7 @@ class InstagramStoriesTrayExtractor(InstagramExtractor):
|
|||||||
def items(self):
|
def items(self):
|
||||||
base = f"{self.root}/stories/id:"
|
base = f"{self.root}/stories/id:"
|
||||||
for story in self.api.reels_tray():
|
for story in self.api.reels_tray():
|
||||||
story["date"] = text.parse_timestamp(story["latest_reel_media"])
|
story["date"] = self.parse_timestamp(story["latest_reel_media"])
|
||||||
story["_extractor"] = InstagramStoriesExtractor
|
story["_extractor"] = InstagramStoriesExtractor
|
||||||
yield Message.Queue, f"{base}{story['id']}/", story
|
yield Message.Queue, f"{base}{story['id']}/", story
|
||||||
|
|
||||||
|
|||||||
@@ -36,8 +36,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
|
|||||||
'{"":' + data.replace('\\"', '"')))
|
'{"":' + data.replace('\\"', '"')))
|
||||||
|
|
||||||
doc = data["initialDocumentData"]["document"]
|
doc = data["initialDocumentData"]["document"]
|
||||||
doc["date"] = text.parse_datetime(
|
doc["date"] = self.parse_datetime_iso(
|
||||||
doc["originalPublishDateInISOString"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
doc["originalPublishDateInISOString"])
|
||||||
|
|
||||||
self.count = text.parse_int(doc["pageCount"])
|
self.count = text.parse_int(doc["pageCount"])
|
||||||
self.base = (f"https://image.isu.pub/{doc['revisionId']}-"
|
self.base = (f"https://image.isu.pub/{doc['revisionId']}-"
|
||||||
|
|||||||
@@ -32,8 +32,7 @@ class ItakuExtractor(Extractor):
|
|||||||
def items(self):
|
def items(self):
|
||||||
if images := self.images():
|
if images := self.images():
|
||||||
for image in images:
|
for image in images:
|
||||||
image["date"] = text.parse_datetime(
|
image["date"] = self.parse_datetime_iso(image["date_added"])
|
||||||
image["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
for category, tags in image.pop("categorized_tags").items():
|
for category, tags in image.pop("categorized_tags").items():
|
||||||
image[f"tags_{category.lower()}"] = [
|
image[f"tags_{category.lower()}"] = [
|
||||||
t["name"] for t in tags]
|
t["name"] for t in tags]
|
||||||
@@ -60,15 +59,14 @@ class ItakuExtractor(Extractor):
|
|||||||
for post in posts:
|
for post in posts:
|
||||||
images = post.pop("gallery_images") or ()
|
images = post.pop("gallery_images") or ()
|
||||||
post["count"] = len(images)
|
post["count"] = len(images)
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["date_added"])
|
||||||
post["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
post["tags"] = [t["name"] for t in post["tags"]]
|
post["tags"] = [t["name"] for t in post["tags"]]
|
||||||
|
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
for post["num"], image in enumerate(images, 1):
|
for post["num"], image in enumerate(images, 1):
|
||||||
post["file"] = image
|
post["file"] = image
|
||||||
image["date"] = text.parse_datetime(
|
image["date"] = self.parse_datetime_iso(
|
||||||
image["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
image["date_added"])
|
||||||
|
|
||||||
url = image["image"]
|
url = image["image"]
|
||||||
yield Message.Url, url, text.nameext_from_url(url, post)
|
yield Message.Url, url, text.nameext_from_url(url, post)
|
||||||
|
|||||||
@@ -122,10 +122,10 @@ class IwaraExtractor(Extractor):
|
|||||||
info["file_id"] = file_info.get("id")
|
info["file_id"] = file_info.get("id")
|
||||||
info["filename"] = filename
|
info["filename"] = filename
|
||||||
info["extension"] = extension
|
info["extension"] = extension
|
||||||
info["date"] = text.parse_datetime(
|
info["date"] = self.parse_datetime_iso(
|
||||||
file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
|
file_info.get("createdAt"))
|
||||||
info["date_updated"] = text.parse_datetime(
|
info["date_updated"] = self.parse_datetime_iso(
|
||||||
file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
|
file_info.get("updatedAt"))
|
||||||
info["mime"] = file_info.get("mime")
|
info["mime"] = file_info.get("mime")
|
||||||
info["size"] = file_info.get("size")
|
info["size"] = file_info.get("size")
|
||||||
info["width"] = file_info.get("width")
|
info["width"] = file_info.get("width")
|
||||||
@@ -144,8 +144,7 @@ class IwaraExtractor(Extractor):
|
|||||||
"status" : user.get("status"),
|
"status" : user.get("status"),
|
||||||
"role" : user.get("role"),
|
"role" : user.get("role"),
|
||||||
"premium": user.get("premium"),
|
"premium": user.get("premium"),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime_iso(user.get("createdAt")),
|
||||||
user.get("createdAt"), "%Y-%m-%dT%H:%M:%S.000Z"),
|
|
||||||
"description": profile.get("body"),
|
"description": profile.get("body"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -32,8 +32,7 @@ class KabeuchiUserExtractor(Extractor):
|
|||||||
if post.get("is_ad") or not post["image1"]:
|
if post.get("is_ad") or not post["image1"]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created_at"])
|
||||||
post["created_at"], "%Y-%m-%d %H:%M:%S")
|
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
|
|
||||||
for key in keys:
|
for key in keys:
|
||||||
|
|||||||
@@ -244,7 +244,7 @@ class KemonoExtractor(Extractor):
|
|||||||
def _parse_datetime(self, date_string):
|
def _parse_datetime(self, date_string):
|
||||||
if len(date_string) > 19:
|
if len(date_string) > 19:
|
||||||
date_string = date_string[:19]
|
date_string = date_string[:19]
|
||||||
return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S")
|
return self.parse_datetime_iso(date_string)
|
||||||
|
|
||||||
def _revisions(self, posts):
|
def _revisions(self, posts):
|
||||||
return itertools.chain.from_iterable(
|
return itertools.chain.from_iterable(
|
||||||
|
|||||||
@@ -119,8 +119,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
|
|||||||
'property="image:width" content="', '"')),
|
'property="image:width" content="', '"')),
|
||||||
"height": text.parse_int(extr(
|
"height": text.parse_int(extr(
|
||||||
'property="image:height" content="', '"')),
|
'property="image:height" content="', '"')),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime_iso(extr('<span title="', '"')),
|
||||||
'<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
text.nameext_from_url(data["url"], data)
|
text.nameext_from_url(data["url"], data)
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ class LivedoorExtractor(Extractor):
|
|||||||
"title" : text.unescape(extr('dc:title="', '"')),
|
"title" : text.unescape(extr('dc:title="', '"')),
|
||||||
"categories" : extr('dc:subject="', '"').partition(",")[::2],
|
"categories" : extr('dc:subject="', '"').partition(",")[::2],
|
||||||
"description": extr('dc:description="', '"'),
|
"description": extr('dc:description="', '"'),
|
||||||
"date" : text.parse_datetime(extr('dc:date="', '"')),
|
"date" : self.parse_datetime_iso(extr('dc:date="', '"')),
|
||||||
"tags" : text.split_html(tags)[1:] if tags else [],
|
"tags" : text.split_html(tags)[1:] if tags else [],
|
||||||
"user" : self.user,
|
"user" : self.user,
|
||||||
"body" : body,
|
"body" : body,
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ class LofterExtractor(Extractor):
|
|||||||
post = post["post"]
|
post = post["post"]
|
||||||
|
|
||||||
post["blog_name"] = post["blogInfo"]["blogName"]
|
post["blog_name"] = post["blogInfo"]["blogName"]
|
||||||
post["date"] = text.parse_timestamp(post["publishTime"] // 1000)
|
post["date"] = self.parse_timestamp(post["publishTime"] // 1000)
|
||||||
post_type = post["type"]
|
post_type = post["type"]
|
||||||
|
|
||||||
# Article
|
# Article
|
||||||
|
|||||||
@@ -69,7 +69,7 @@ class LusciousAlbumExtractor(LusciousExtractor):
|
|||||||
image["thumbnail"] = ""
|
image["thumbnail"] = ""
|
||||||
|
|
||||||
image["tags"] = [item["text"] for item in image["tags"]]
|
image["tags"] = [item["text"] for item in image["tags"]]
|
||||||
image["date"] = text.parse_timestamp(image["created"])
|
image["date"] = self.parse_timestamp(image["created"])
|
||||||
image["id"] = text.parse_int(image["id"])
|
image["id"] = text.parse_int(image["id"])
|
||||||
|
|
||||||
url = (image["url_to_original"] or image["url_to_video"]
|
url = (image["url_to_original"] or image["url_to_video"]
|
||||||
@@ -188,7 +188,7 @@ fragment AlbumStandard on Album {
|
|||||||
album["created_by"] = album["created_by"]["display_name"]
|
album["created_by"] = album["created_by"]["display_name"]
|
||||||
|
|
||||||
album["id"] = text.parse_int(album["id"])
|
album["id"] = text.parse_int(album["id"])
|
||||||
album["date"] = text.parse_timestamp(album["created"])
|
album["date"] = self.parse_timestamp(album["created"])
|
||||||
|
|
||||||
return album
|
return album
|
||||||
|
|
||||||
|
|||||||
@@ -47,8 +47,7 @@ class MadokamiMangaExtractor(MadokamiExtractor):
|
|||||||
"path": text.unescape(extr('href="', '"')),
|
"path": text.unescape(extr('href="', '"')),
|
||||||
"chapter_string": text.unescape(extr(">", "<")),
|
"chapter_string": text.unescape(extr(">", "<")),
|
||||||
"size": text.parse_bytes(extr("<td>", "</td>")),
|
"size": text.parse_bytes(extr("<td>", "</td>")),
|
||||||
"date": text.parse_datetime(
|
"date": self.parse_datetime_iso(extr("<td>", "</td>").strip()),
|
||||||
extr("<td>", "</td>").strip(), "%Y-%m-%d %H:%M"),
|
|
||||||
})
|
})
|
||||||
|
|
||||||
if self.config("chapter-reverse"):
|
if self.config("chapter-reverse"):
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ class MangadexExtractor(Extractor):
|
|||||||
"chapter" : text.parse_int(chnum),
|
"chapter" : text.parse_int(chnum),
|
||||||
"chapter_minor": f"{sep}{minor}",
|
"chapter_minor": f"{sep}{minor}",
|
||||||
"chapter_id": chapter["id"],
|
"chapter_id": chapter["id"],
|
||||||
"date" : text.parse_datetime(cattributes["publishAt"]),
|
"date" : self.parse_datetime_iso(cattributes["publishAt"]),
|
||||||
"group" : [group["attributes"]["name"]
|
"group" : [group["attributes"]["name"]
|
||||||
for group in relationships["scanlation_group"]],
|
for group in relationships["scanlation_group"]],
|
||||||
"lang" : lang,
|
"lang" : lang,
|
||||||
@@ -109,8 +109,8 @@ class MangadexCoversExtractor(MangadexExtractor):
|
|||||||
"cover" : cattributes["fileName"],
|
"cover" : cattributes["fileName"],
|
||||||
"lang" : cattributes.get("locale"),
|
"lang" : cattributes.get("locale"),
|
||||||
"volume" : text.parse_int(cattributes["volume"]),
|
"volume" : text.parse_int(cattributes["volume"]),
|
||||||
"date" : text.parse_datetime(cattributes["createdAt"]),
|
"date" : self.parse_datetime_iso(cattributes["createdAt"]),
|
||||||
"date_updated": text.parse_datetime(cattributes["updatedAt"]),
|
"date_updated": self.parse_datetime_iso(cattributes["updatedAt"]),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -454,7 +454,7 @@ def _manga_info(self, uuid):
|
|||||||
"manga_id": manga["id"],
|
"manga_id": manga["id"],
|
||||||
"manga_titles": [t.popitem()[1]
|
"manga_titles": [t.popitem()[1]
|
||||||
for t in mattr.get("altTitles") or ()],
|
for t in mattr.get("altTitles") or ()],
|
||||||
"manga_date" : text.parse_datetime(mattr.get("createdAt")),
|
"manga_date" : self.parse_datetime_iso(mattr.get("createdAt")),
|
||||||
"description" : (mattr["description"].get("en") or
|
"description" : (mattr["description"].get("en") or
|
||||||
next(iter(mattr["description"].values()), "")),
|
next(iter(mattr["description"].values()), "")),
|
||||||
"demographic": mattr.get("publicationDemographic"),
|
"demographic": mattr.get("publicationDemographic"),
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ class MangafoxMangaExtractor(MangaExtractor):
|
|||||||
"chapter" : text.parse_int(chapter),
|
"chapter" : text.parse_int(chapter),
|
||||||
"chapter_minor" : minor or "",
|
"chapter_minor" : minor or "",
|
||||||
"chapter_string": cstr,
|
"chapter_string": cstr,
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime(
|
||||||
extr('right">', '</span>'), "%b %d, %Y"),
|
extr('right">', '</span>'), "%b %d, %Y"),
|
||||||
}
|
}
|
||||||
chapter.update(data)
|
chapter.update(data)
|
||||||
|
|||||||
@@ -50,10 +50,10 @@ class ManganeloChapterExtractor(ManganeloExtractor, ChapterExtractor):
|
|||||||
extr = text.extract_from(page)
|
extr = text.extract_from(page)
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime_iso(extr(
|
||||||
'"datePublished": "', '"')[:19], "%Y-%m-%dT%H:%M:%S"),
|
'"datePublished": "', '"')[:19]),
|
||||||
"date_updated": text.parse_datetime(extr(
|
"date_updated": self.parse_datetime_iso(extr(
|
||||||
'"dateModified": "', '"')[:19], "%Y-%m-%dT%H:%M:%S"),
|
'"dateModified": "', '"')[:19]),
|
||||||
"manga_id" : text.parse_int(extr("comic_id =", ";")),
|
"manga_id" : text.parse_int(extr("comic_id =", ";")),
|
||||||
"chapter_id" : text.parse_int(extr("chapter_id =", ";")),
|
"chapter_id" : text.parse_int(extr("chapter_id =", ";")),
|
||||||
"manga" : extr("comic_name =", ";").strip('" '),
|
"manga" : extr("comic_name =", ";").strip('" '),
|
||||||
@@ -99,7 +99,7 @@ class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor):
|
|||||||
manga = text.unescape(extr("<h1>", "<"))
|
manga = text.unescape(extr("<h1>", "<"))
|
||||||
author = text.remove_html(extr("<li>Author(s) :", "</a>"))
|
author = text.remove_html(extr("<li>Author(s) :", "</a>"))
|
||||||
status = extr("<li>Status :", "<").strip()
|
status = extr("<li>Status :", "<").strip()
|
||||||
update = text.parse_datetime(extr(
|
update = self.parse_datetime(extr(
|
||||||
"<li>Last updated :", "<").strip(), "%b-%d-%Y %I:%M:%S %p")
|
"<li>Last updated :", "<").strip(), "%b-%d-%Y %I:%M:%S %p")
|
||||||
tags = text.split_html(extr(">Genres :", "</li>"))[::2]
|
tags = text.split_html(extr(">Genres :", "</li>"))[::2]
|
||||||
|
|
||||||
@@ -121,7 +121,7 @@ class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor):
|
|||||||
"chapter" : text.parse_int(chapter),
|
"chapter" : text.parse_int(chapter),
|
||||||
"chapter_minor": (sep and ".") + minor,
|
"chapter_minor": (sep and ".") + minor,
|
||||||
"title" : title.partition(": ")[2],
|
"title" : title.partition(": ")[2],
|
||||||
"date" : text.parse_datetime(date, "%b-%d-%Y %H:%M"),
|
"date" : self.parse_datetime(date, "%b-%d-%Y %H:%M"),
|
||||||
"lang" : "en",
|
"lang" : "en",
|
||||||
"language": "English",
|
"language": "English",
|
||||||
}))
|
}))
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
|
|||||||
"language" : util.code_to_language(lang),
|
"language" : util.code_to_language(lang),
|
||||||
"source" : chapter["srcTitle"],
|
"source" : chapter["srcTitle"],
|
||||||
"source_id" : chapter["sourceId"],
|
"source_id" : chapter["sourceId"],
|
||||||
"date" : text.parse_timestamp(chapter["dateCreate"] // 1000),
|
"date" : self.parse_timestamp(chapter["dateCreate"] // 1000),
|
||||||
}
|
}
|
||||||
|
|
||||||
def images(self, _):
|
def images(self, _):
|
||||||
@@ -138,7 +138,7 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
|
|||||||
"language" : util.code_to_language(lang),
|
"language" : util.code_to_language(lang),
|
||||||
"source" : chapter["srcTitle"],
|
"source" : chapter["srcTitle"],
|
||||||
"source_id" : chapter["sourceId"],
|
"source_id" : chapter["sourceId"],
|
||||||
"date" : text.parse_timestamp(
|
"date" : self.parse_timestamp(
|
||||||
chapter["dateCreate"] // 1000),
|
chapter["dateCreate"] // 1000),
|
||||||
"_extractor": MangaparkChapterExtractor,
|
"_extractor": MangaparkChapterExtractor,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,10 +40,8 @@ class MangataroChapterExtractor(MangataroBase, ChapterExtractor):
|
|||||||
"chapter_minor": str(round(minor, 5))[1:] if minor else "",
|
"chapter_minor": str(round(minor, 5))[1:] if minor else "",
|
||||||
"chapter_id" : text.parse_int(chapter_id),
|
"chapter_id" : text.parse_int(chapter_id),
|
||||||
"chapter_url" : comic["url"],
|
"chapter_url" : comic["url"],
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime_iso(comic["datePublished"]),
|
||||||
comic["datePublished"], "%Y-%m-%dT%H:%M:%S%z"),
|
"date_updated" : self.parse_datetime_iso(comic["dateModified"]),
|
||||||
"date_updated" : text.parse_datetime(
|
|
||||||
comic["dateModified"], "%Y-%m-%dT%H:%M:%S%z"),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def images(self, page):
|
def images(self, page):
|
||||||
|
|||||||
@@ -119,7 +119,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
|
|||||||
"album": {
|
"album": {
|
||||||
"id": self.album_id,
|
"id": self.album_id,
|
||||||
"name": text.unescape(title),
|
"name": text.unescape(title),
|
||||||
"date": text.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"),
|
"date": self.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"),
|
||||||
"description": text.unescape(descr),
|
"description": text.unescape(descr),
|
||||||
},
|
},
|
||||||
"count": text.parse_int(count),
|
"count": text.parse_int(count),
|
||||||
|
|||||||
@@ -64,8 +64,7 @@ class MastodonExtractor(BaseExtractor):
|
|||||||
|
|
||||||
status["count"] = len(attachments)
|
status["count"] = len(attachments)
|
||||||
status["tags"] = [tag["name"] for tag in status["tags"]]
|
status["tags"] = [tag["name"] for tag in status["tags"]]
|
||||||
status["date"] = text.parse_datetime(
|
status["date"] = self.parse_datetime_iso(status["created_at"][:19])
|
||||||
status["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
|
|
||||||
yield Message.Directory, status
|
yield Message.Directory, status
|
||||||
for status["num"], media in enumerate(attachments, 1):
|
for status["num"], media in enumerate(attachments, 1):
|
||||||
@@ -319,10 +318,8 @@ class MastodonAPI():
|
|||||||
if code == 404:
|
if code == 404:
|
||||||
raise exception.NotFoundError()
|
raise exception.NotFoundError()
|
||||||
if code == 429:
|
if code == 429:
|
||||||
self.extractor.wait(until=text.parse_datetime(
|
self.extractor.wait(until=self.parse_datetime_iso(
|
||||||
response.headers["x-ratelimit-reset"],
|
response.headers["x-ratelimit-reset"]))
|
||||||
"%Y-%m-%dT%H:%M:%S.%fZ",
|
|
||||||
))
|
|
||||||
continue
|
continue
|
||||||
raise exception.AbortExtraction(response.json().get("error"))
|
raise exception.AbortExtraction(response.json().get("error"))
|
||||||
|
|
||||||
|
|||||||
@@ -48,13 +48,11 @@ class MisskeyExtractor(BaseExtractor):
|
|||||||
note["instance"] = self.instance
|
note["instance"] = self.instance
|
||||||
note["instance_remote"] = note["user"]["host"]
|
note["instance_remote"] = note["user"]["host"]
|
||||||
note["count"] = len(files)
|
note["count"] = len(files)
|
||||||
note["date"] = text.parse_datetime(
|
note["date"] = self.parse_datetime_iso(note["createdAt"])
|
||||||
note["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
|
|
||||||
yield Message.Directory, note
|
yield Message.Directory, note
|
||||||
for note["num"], file in enumerate(files, 1):
|
for note["num"], file in enumerate(files, 1):
|
||||||
file["date"] = text.parse_datetime(
|
file["date"] = self.parse_datetime_iso(file["createdAt"])
|
||||||
file["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
note["file"] = file
|
note["file"] = file
|
||||||
url = file["url"]
|
url = file["url"]
|
||||||
yield Message.Url, url, text.nameext_from_url(url, note)
|
yield Message.Url, url, text.nameext_from_url(url, note)
|
||||||
|
|||||||
@@ -9,9 +9,8 @@
|
|||||||
"""Extractors for Moebooru based sites"""
|
"""Extractors for Moebooru based sites"""
|
||||||
|
|
||||||
from .booru import BooruExtractor
|
from .booru import BooruExtractor
|
||||||
from .. import text, util
|
from .. import text, util, dt
|
||||||
import collections
|
import collections
|
||||||
import datetime
|
|
||||||
|
|
||||||
|
|
||||||
class MoebooruExtractor(BooruExtractor):
|
class MoebooruExtractor(BooruExtractor):
|
||||||
@@ -21,7 +20,7 @@ class MoebooruExtractor(BooruExtractor):
|
|||||||
page_start = 1
|
page_start = 1
|
||||||
|
|
||||||
def _prepare(self, post):
|
def _prepare(self, post):
|
||||||
post["date"] = text.parse_timestamp(post["created_at"])
|
post["date"] = dt.parse_ts(post["created_at"])
|
||||||
|
|
||||||
def _html(self, post):
|
def _html(self, post):
|
||||||
url = f"{self.root}/post/show/{post['id']}"
|
url = f"{self.root}/post/show/{post['id']}"
|
||||||
@@ -164,14 +163,14 @@ class MoebooruPopularExtractor(MoebooruExtractor):
|
|||||||
date = (f"{params['year']:>04}-{params.get('month', '01'):>02}-"
|
date = (f"{params['year']:>04}-{params.get('month', '01'):>02}-"
|
||||||
f"{params.get('day', '01'):>02}")
|
f"{params.get('day', '01'):>02}")
|
||||||
else:
|
else:
|
||||||
date = datetime.date.today().isoformat()
|
date = dt.date.today().isoformat()
|
||||||
|
|
||||||
scale = self.scale
|
scale = self.scale
|
||||||
if scale.startswith("by_"):
|
if scale.startswith("by_"):
|
||||||
scale = scale[3:]
|
scale = scale[3:]
|
||||||
if scale == "week":
|
if scale == "week":
|
||||||
date = datetime.date.fromisoformat(date)
|
date = dt.date.fromisoformat(date)
|
||||||
date = (date - datetime.timedelta(days=date.weekday())).isoformat()
|
date = (date - dt.timedelta(days=date.weekday())).isoformat()
|
||||||
elif scale == "month":
|
elif scale == "month":
|
||||||
date = date[:-3]
|
date = date[:-3]
|
||||||
|
|
||||||
|
|||||||
@@ -9,9 +9,8 @@
|
|||||||
"""Extractors for https://motherless.com/"""
|
"""Extractors for https://motherless.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util, exception
|
from .. import text, dt, exception
|
||||||
from ..cache import memcache
|
from ..cache import memcache
|
||||||
from datetime import timedelta
|
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?motherless\.com"
|
BASE_PATTERN = r"(?:https?://)?motherless\.com"
|
||||||
|
|
||||||
@@ -115,14 +114,14 @@ class MotherlessExtractor(Extractor):
|
|||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _parse_datetime(self, dt):
|
def _parse_datetime(self, dt_string):
|
||||||
if " ago" not in dt:
|
if " ago" not in dt_string:
|
||||||
return text.parse_datetime(dt, "%d %b %Y")
|
return dt.parse(dt_string, "%d %b %Y")
|
||||||
|
|
||||||
value = text.parse_int(dt[:-5])
|
value = text.parse_int(dt_string[:-5])
|
||||||
delta = timedelta(0, value*3600) if dt[-5] == "h" else timedelta(value)
|
delta = (dt.timedelta(0, value*3600) if dt_string[-5] == "h" else
|
||||||
return (util.datetime_utcnow() - delta).replace(
|
dt.timedelta(value))
|
||||||
hour=0, minute=0, second=0)
|
return (dt.now() - delta).replace(hour=0, minute=0, second=0)
|
||||||
|
|
||||||
@memcache(keyarg=2)
|
@memcache(keyarg=2)
|
||||||
def _extract_gallery_title(self, page, gallery_id):
|
def _extract_gallery_title(self, page, gallery_id):
|
||||||
|
|||||||
@@ -9,8 +9,7 @@
|
|||||||
"""Extractors for https://blog.naver.com/"""
|
"""Extractors for https://blog.naver.com/"""
|
||||||
|
|
||||||
from .common import GalleryExtractor, Extractor, Message
|
from .common import GalleryExtractor, Extractor, Message
|
||||||
from .. import text, util
|
from .. import text, util, dt
|
||||||
import datetime
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|
||||||
@@ -67,11 +66,11 @@ class NaverBlogPostExtractor(NaverBlogBase, GalleryExtractor):
|
|||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _parse_datetime(self, date_string):
|
def _parse_datetime(self, dt_string):
|
||||||
if "전" in date_string:
|
if "전" in dt_string:
|
||||||
ts = time.gmtime()
|
ts = time.gmtime()
|
||||||
return datetime.datetime(ts.tm_year, ts.tm_mon, ts.tm_mday)
|
return dt.datetime(ts.tm_year, ts.tm_mon, ts.tm_mday)
|
||||||
return text.parse_datetime(date_string, "%Y. %m. %d. %H:%M")
|
return dt.parse(dt_string, "%Y. %m. %d. %H:%M")
|
||||||
|
|
||||||
def images(self, page):
|
def images(self, page):
|
||||||
files = []
|
files = []
|
||||||
|
|||||||
@@ -31,17 +31,17 @@ class NaverChzzkExtractor(Extractor):
|
|||||||
data["uid"] = data["objectId"]
|
data["uid"] = data["objectId"]
|
||||||
data["user"] = comment["user"]
|
data["user"] = comment["user"]
|
||||||
data["count"] = len(files)
|
data["count"] = len(files)
|
||||||
data["date"] = text.parse_datetime(
|
data["date"] = self.parse_datetime(
|
||||||
data["createdDate"], "%Y%m%d%H%M%S")
|
data["createdDate"], "%Y%m%d%H%M%S")
|
||||||
|
|
||||||
yield Message.Directory, data
|
yield Message.Directory, data
|
||||||
for data["num"], file in enumerate(files, 1):
|
for data["num"], file in enumerate(files, 1):
|
||||||
if extra := file.get("extraJson"):
|
if extra := file.get("extraJson"):
|
||||||
file.update(util.json_loads(extra))
|
file.update(util.json_loads(extra))
|
||||||
file["date"] = text.parse_datetime(
|
file["date"] = self.parse_datetime_iso(
|
||||||
file["createdDate"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
file["createdDate"])
|
||||||
file["date_updated"] = text.parse_datetime(
|
file["date_updated"] = self.parse_datetime_iso(
|
||||||
file["updatedDate"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
file["updatedDate"])
|
||||||
data["file"] = file
|
data["file"] = file
|
||||||
url = file["attachValue"]
|
url = file["attachValue"]
|
||||||
yield Message.Url, url, text.nameext_from_url(url, data)
|
yield Message.Url, url, text.nameext_from_url(url, data)
|
||||||
|
|||||||
@@ -59,8 +59,8 @@ class NekohousePostExtractor(NekohouseExtractor):
|
|||||||
'class="scrape__user-name', '</').rpartition(">")[2].strip()),
|
'class="scrape__user-name', '</').rpartition(">")[2].strip()),
|
||||||
"title" : text.unescape(extr(
|
"title" : text.unescape(extr(
|
||||||
'class="scrape__title', '</').rpartition(">")[2]),
|
'class="scrape__title', '</').rpartition(">")[2]),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : self.parse_datetime_iso(extr(
|
||||||
'datetime="', '"')[:19], "%Y-%m-%d %H:%M:%S"),
|
'datetime="', '"')[:19]),
|
||||||
"content": text.unescape(extr(
|
"content": text.unescape(extr(
|
||||||
'class="scrape__content">', "</div>").strip()),
|
'class="scrape__content">', "</div>").strip()),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for https://www.newgrounds.com/"""
|
"""Extractors for https://www.newgrounds.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message, Dispatch
|
from .common import Extractor, Message, Dispatch
|
||||||
from .. import text, util, exception
|
from .. import text, util, dt, exception
|
||||||
from ..cache import cache
|
from ..cache import cache
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
@@ -218,7 +218,7 @@ class NewgroundsExtractor(Extractor):
|
|||||||
"description": text.unescape(extr(':description" content="', '"')),
|
"description": text.unescape(extr(':description" content="', '"')),
|
||||||
"type" : "art",
|
"type" : "art",
|
||||||
"_type" : "i",
|
"_type" : "i",
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : dt.parse_iso(extr(
|
||||||
'itemprop="datePublished" content="', '"')),
|
'itemprop="datePublished" content="', '"')),
|
||||||
"rating" : extr('class="rated-', '"'),
|
"rating" : extr('class="rated-', '"'),
|
||||||
"url" : full('src="', '"'),
|
"url" : full('src="', '"'),
|
||||||
@@ -268,7 +268,7 @@ class NewgroundsExtractor(Extractor):
|
|||||||
"description": text.unescape(extr(':description" content="', '"')),
|
"description": text.unescape(extr(':description" content="', '"')),
|
||||||
"type" : "audio",
|
"type" : "audio",
|
||||||
"_type" : "a",
|
"_type" : "a",
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : dt.parse_iso(extr(
|
||||||
'itemprop="datePublished" content="', '"')),
|
'itemprop="datePublished" content="', '"')),
|
||||||
"url" : extr('{"url":"', '"').replace("\\/", "/"),
|
"url" : extr('{"url":"', '"').replace("\\/", "/"),
|
||||||
"index" : text.parse_int(index),
|
"index" : text.parse_int(index),
|
||||||
@@ -287,7 +287,7 @@ class NewgroundsExtractor(Extractor):
|
|||||||
src = src.replace("\\/", "/")
|
src = src.replace("\\/", "/")
|
||||||
formats = ()
|
formats = ()
|
||||||
type = extr(',"description":"', '"')
|
type = extr(',"description":"', '"')
|
||||||
date = text.parse_datetime(extr(
|
date = dt.parse_iso(extr(
|
||||||
'itemprop="datePublished" content="', '"'))
|
'itemprop="datePublished" content="', '"'))
|
||||||
if type:
|
if type:
|
||||||
type = type.rpartition(" ")[2].lower()
|
type = type.rpartition(" ")[2].lower()
|
||||||
@@ -302,7 +302,7 @@ class NewgroundsExtractor(Extractor):
|
|||||||
sources = self.request_json(url, headers=headers)["sources"]
|
sources = self.request_json(url, headers=headers)["sources"]
|
||||||
formats = self._video_formats(sources)
|
formats = self._video_formats(sources)
|
||||||
src = next(formats, "")
|
src = next(formats, "")
|
||||||
date = text.parse_timestamp(src.rpartition("?")[2])
|
date = self.parse_timestamp(src.rpartition("?")[2])
|
||||||
type = "movie"
|
type = "movie"
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for nijie instances"""
|
"""Extractors for nijie instances"""
|
||||||
|
|
||||||
from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
|
from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
|
||||||
from .. import text, exception
|
from .. import text, dt, exception
|
||||||
from ..cache import cache
|
from ..cache import cache
|
||||||
|
|
||||||
|
|
||||||
@@ -82,8 +82,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
|
|||||||
"title" : keywords[0].strip(),
|
"title" : keywords[0].strip(),
|
||||||
"description": text.unescape(extr(
|
"description": text.unescape(extr(
|
||||||
'"description": "', '"').replace("&", "&")),
|
'"description": "', '"').replace("&", "&")),
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : dt.parse(extr(
|
||||||
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y", 9),
|
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"
|
||||||
|
) - dt.timedelta(hours=9),
|
||||||
"artist_id" : text.parse_int(extr('/members.php?id=', '"')),
|
"artist_id" : text.parse_int(extr('/members.php?id=', '"')),
|
||||||
"artist_name": keywords[1],
|
"artist_name": keywords[1],
|
||||||
"tags" : keywords[2:-1],
|
"tags" : keywords[2:-1],
|
||||||
@@ -101,9 +102,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
|
|||||||
"artist_id" : text.parse_int(extr('members.php?id=', '"')),
|
"artist_id" : text.parse_int(extr('members.php?id=', '"')),
|
||||||
"artist_name": keywords[1],
|
"artist_name": keywords[1],
|
||||||
"tags" : keywords[2:-1],
|
"tags" : keywords[2:-1],
|
||||||
"date" : text.parse_datetime(extr(
|
"date" : dt.parse_iso(extr(
|
||||||
"itemprop='datePublished' content=", "<").rpartition(">")[2],
|
"itemprop='datePublished' content=", "<").rpartition(">")[2]
|
||||||
"%Y-%m-%d %H:%M:%S", 9),
|
) - dt.timedelta(hours=9),
|
||||||
}
|
}
|
||||||
|
|
||||||
def _extract_images(self, image_id, page):
|
def _extract_images(self, image_id, page):
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class NitterExtractor(BaseExtractor):
|
|||||||
return {
|
return {
|
||||||
"author" : author,
|
"author" : author,
|
||||||
"user" : self.user_obj or author,
|
"user" : self.user_obj or author,
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime(
|
||||||
extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
|
extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
|
||||||
"tweet_id": link.rpartition("/")[2].partition("#")[0],
|
"tweet_id": link.rpartition("/")[2].partition("#")[0],
|
||||||
"content": extr('class="tweet-content', "</div").partition(">")[2],
|
"content": extr('class="tweet-content', "</div").partition(">")[2],
|
||||||
@@ -142,7 +142,7 @@ class NitterExtractor(BaseExtractor):
|
|||||||
return {
|
return {
|
||||||
"author" : author,
|
"author" : author,
|
||||||
"user" : self.user_obj or author,
|
"user" : self.user_obj or author,
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime(
|
||||||
extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
|
extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
|
||||||
"tweet_id": link.rpartition("/")[2].partition("#")[0],
|
"tweet_id": link.rpartition("/")[2].partition("#")[0],
|
||||||
"content" : extr('class="quote-text', "</div").partition(">")[2],
|
"content" : extr('class="quote-text', "</div").partition(">")[2],
|
||||||
@@ -173,7 +173,7 @@ class NitterExtractor(BaseExtractor):
|
|||||||
"nick" : extr('title="', '"'),
|
"nick" : extr('title="', '"'),
|
||||||
"name" : extr('title="@', '"'),
|
"name" : extr('title="@', '"'),
|
||||||
"description" : extr('<p dir="auto">', '<'),
|
"description" : extr('<p dir="auto">', '<'),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime(
|
||||||
extr('class="profile-joindate"><span title="', '"'),
|
extr('class="profile-joindate"><span title="', '"'),
|
||||||
"%I:%M %p - %d %b %Y"),
|
"%I:%M %p - %d %b %Y"),
|
||||||
"statuses_count" : text.parse_int(extr(
|
"statuses_count" : text.parse_int(extr(
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for https://nozomi.la/"""
|
"""Extractors for https://nozomi.la/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text
|
from .. import text, dt
|
||||||
|
|
||||||
|
|
||||||
def decode_nozomi(n):
|
def decode_nozomi(n):
|
||||||
@@ -49,10 +49,9 @@ class NozomiExtractor(Extractor):
|
|||||||
post["character"] = self._list(post.get("character"))
|
post["character"] = self._list(post.get("character"))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = dt.parse_iso(post["date"] + ":00")
|
||||||
post["date"] + ":00", "%Y-%m-%d %H:%M:%S%z")
|
|
||||||
except Exception:
|
except Exception:
|
||||||
post["date"] = None
|
post["date"] = dt.NONE
|
||||||
|
|
||||||
post.update(data)
|
post.update(data)
|
||||||
|
|
||||||
|
|||||||
@@ -53,8 +53,7 @@ class PahealExtractor(Extractor):
|
|||||||
extr("<source src='", "'")),
|
extr("<source src='", "'")),
|
||||||
"uploader": text.unquote(extr(
|
"uploader": text.unquote(extr(
|
||||||
"class='username' href='/user/", "'")),
|
"class='username' href='/user/", "'")),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime_iso(extr("datetime='", "'")),
|
||||||
extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
|
|
||||||
"source" : text.unescape(text.extr(
|
"source" : text.unescape(text.extr(
|
||||||
extr(">Source Link<", "</td>"), "href='", "'")),
|
extr(">Source Link<", "</td>"), "href='", "'")),
|
||||||
}
|
}
|
||||||
@@ -133,7 +132,7 @@ class PahealTagExtractor(PahealExtractor):
|
|||||||
"duration" : text.parse_float(duration[:-1]),
|
"duration" : text.parse_float(duration[:-1]),
|
||||||
"tags" : text.unescape(tags),
|
"tags" : text.unescape(tags),
|
||||||
"size" : text.parse_bytes(size[:-1]),
|
"size" : text.parse_bytes(size[:-1]),
|
||||||
"date" : text.parse_datetime(date, "%B %d, %Y; %H:%M"),
|
"date" : self.parse_datetime(date, "%B %d, %Y; %H:%M"),
|
||||||
"filename" : f"{pid} - {tags}",
|
"filename" : f"{pid} - {tags}",
|
||||||
"extension": ext,
|
"extension": ext,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for https://www.patreon.com/"""
|
"""Extractors for https://www.patreon.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util, exception
|
from .. import text, util, dt, exception
|
||||||
from ..cache import memcache
|
from ..cache import memcache
|
||||||
import collections
|
import collections
|
||||||
import itertools
|
import itertools
|
||||||
@@ -177,8 +177,7 @@ class PatreonExtractor(Extractor):
|
|||||||
post, included, "attachments")
|
post, included, "attachments")
|
||||||
attr["attachments_media"] = self._files(
|
attr["attachments_media"] = self._files(
|
||||||
post, included, "attachments_media")
|
post, included, "attachments_media")
|
||||||
attr["date"] = text.parse_datetime(
|
attr["date"] = self.parse_datetime_iso(attr["published_at"])
|
||||||
attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
attr["campaign"] = (included["campaign"][
|
attr["campaign"] = (included["campaign"][
|
||||||
@@ -226,8 +225,7 @@ class PatreonExtractor(Extractor):
|
|||||||
user = response.json()["data"]
|
user = response.json()["data"]
|
||||||
attr = user["attributes"]
|
attr = user["attributes"]
|
||||||
attr["id"] = user["id"]
|
attr["id"] = user["id"]
|
||||||
attr["date"] = text.parse_datetime(
|
attr["date"] = self.parse_datetime_iso(attr["created"])
|
||||||
attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
return attr
|
return attr
|
||||||
|
|
||||||
def _collection(self, collection_id):
|
def _collection(self, collection_id):
|
||||||
@@ -236,8 +234,7 @@ class PatreonExtractor(Extractor):
|
|||||||
coll = data["data"]
|
coll = data["data"]
|
||||||
attr = coll["attributes"]
|
attr = coll["attributes"]
|
||||||
attr["id"] = coll["id"]
|
attr["id"] = coll["id"]
|
||||||
attr["date"] = text.parse_datetime(
|
attr["date"] = self.parse_datetime_iso(attr["created_at"])
|
||||||
attr["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
return attr
|
return attr
|
||||||
|
|
||||||
def _filename(self, url):
|
def _filename(self, url):
|
||||||
@@ -445,8 +442,7 @@ class PatreonUserExtractor(PatreonExtractor):
|
|||||||
|
|
||||||
def posts(self):
|
def posts(self):
|
||||||
if date_max := self._get_date_min_max(None, None)[1]:
|
if date_max := self._get_date_min_max(None, None)[1]:
|
||||||
self._cursor = cursor = \
|
self._cursor = cursor = dt.from_ts(date_max).isoformat()
|
||||||
util.datetime_from_timestamp(date_max).isoformat()
|
|
||||||
self._init_cursor = lambda: cursor
|
self._init_cursor = lambda: cursor
|
||||||
|
|
||||||
url = self._build_url("stream", (
|
url = self._build_url("stream", (
|
||||||
|
|||||||
@@ -35,8 +35,7 @@ class PexelsExtractor(Extractor):
|
|||||||
post["type"] = attr["type"]
|
post["type"] = attr["type"]
|
||||||
|
|
||||||
post.update(metadata)
|
post.update(metadata)
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created_at"][:-5])
|
||||||
post["created_at"][:-5], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
|
|
||||||
if "image" in post:
|
if "image" in post:
|
||||||
url, _, query = post["image"]["download_link"].partition("?")
|
url, _, query = post["image"]["download_link"].partition("?")
|
||||||
|
|||||||
@@ -36,8 +36,7 @@ class PhilomenaExtractor(BooruExtractor):
|
|||||||
return url
|
return url
|
||||||
|
|
||||||
def _prepare(self, post):
|
def _prepare(self, post):
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created_at"][:19])
|
||||||
post["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
|
|
||||||
|
|
||||||
BASE_PATTERN = PhilomenaExtractor.update({
|
BASE_PATTERN = PhilomenaExtractor.update({
|
||||||
|
|||||||
@@ -29,8 +29,7 @@ class PhotovogueUserExtractor(Extractor):
|
|||||||
for photo in self.photos():
|
for photo in self.photos():
|
||||||
url = photo["gallery_image"]
|
url = photo["gallery_image"]
|
||||||
photo["title"] = photo["title"].strip()
|
photo["title"] = photo["title"].strip()
|
||||||
photo["date"] = text.parse_datetime(
|
photo["date"] = self.parse_datetime_iso(photo["date"])
|
||||||
photo["date"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
|
|
||||||
yield Message.Directory, photo
|
yield Message.Directory, photo
|
||||||
yield Message.Url, url, text.nameext_from_url(url, photo)
|
yield Message.Url, url, text.nameext_from_url(url, photo)
|
||||||
|
|||||||
@@ -29,8 +29,7 @@ class PicartoGalleryExtractor(Extractor):
|
|||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created_at"])
|
||||||
post["created_at"], "%Y-%m-%d %H:%M:%S")
|
|
||||||
variations = post.pop("variations", ())
|
variations = post.pop("variations", ())
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
|
|
||||||
|
|||||||
@@ -26,8 +26,7 @@ class PiczelExtractor(Extractor):
|
|||||||
def items(self):
|
def items(self):
|
||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
post["tags"] = [t["title"] for t in post["tags"] if t["title"]]
|
post["tags"] = [t["title"] for t in post["tags"] if t["title"]]
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created_at"])
|
||||||
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
|
|
||||||
if post["multi"]:
|
if post["multi"]:
|
||||||
images = post["images"]
|
images = post["images"]
|
||||||
|
|||||||
@@ -48,8 +48,7 @@ class PillowfortExtractor(Extractor):
|
|||||||
for url in inline(post["content"]):
|
for url in inline(post["content"]):
|
||||||
files.append({"url": url})
|
files.append({"url": url})
|
||||||
|
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(post["created_at"])
|
||||||
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
post["post_id"] = post.pop("id")
|
post["post_id"] = post.pop("id")
|
||||||
post["count"] = len(files)
|
post["count"] = len(files)
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
@@ -76,8 +75,7 @@ class PillowfortExtractor(Extractor):
|
|||||||
if "id" not in file:
|
if "id" not in file:
|
||||||
post["id"] = post["hash"]
|
post["id"] = post["hash"]
|
||||||
if "created_at" in file:
|
if "created_at" in file:
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = self.parse_datetime_iso(file["created_at"])
|
||||||
file["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
|
|
||||||
yield msgtype, url, post
|
yield msgtype, url, post
|
||||||
|
|
||||||
|
|||||||
@@ -24,10 +24,6 @@ class PixeldrainExtractor(Extractor):
|
|||||||
if api_key := self.config("api-key"):
|
if api_key := self.config("api-key"):
|
||||||
self.session.auth = util.HTTPBasicAuth("", api_key)
|
self.session.auth = util.HTTPBasicAuth("", api_key)
|
||||||
|
|
||||||
def parse_datetime(self, date_string):
|
|
||||||
return text.parse_datetime(
|
|
||||||
date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
||||||
|
|
||||||
|
|
||||||
class PixeldrainFileExtractor(PixeldrainExtractor):
|
class PixeldrainFileExtractor(PixeldrainExtractor):
|
||||||
"""Extractor for pixeldrain files"""
|
"""Extractor for pixeldrain files"""
|
||||||
@@ -45,7 +41,7 @@ class PixeldrainFileExtractor(PixeldrainExtractor):
|
|||||||
file = self.request_json(url + "/info")
|
file = self.request_json(url + "/info")
|
||||||
|
|
||||||
file["url"] = url + "?download"
|
file["url"] = url + "?download"
|
||||||
file["date"] = self.parse_datetime(file["date_upload"])
|
file["date"] = self.parse_datetime_iso(file["date_upload"])
|
||||||
|
|
||||||
text.nameext_from_url(file["name"], file)
|
text.nameext_from_url(file["name"], file)
|
||||||
yield Message.Directory, file
|
yield Message.Directory, file
|
||||||
@@ -72,7 +68,7 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
|
|||||||
|
|
||||||
files = album["files"]
|
files = album["files"]
|
||||||
album["count"] = album["file_count"]
|
album["count"] = album["file_count"]
|
||||||
album["date"] = self.parse_datetime(album["date_created"])
|
album["date"] = self.parse_datetime_iso(album["date_created"])
|
||||||
|
|
||||||
if self.file_index:
|
if self.file_index:
|
||||||
idx = text.parse_int(self.file_index)
|
idx = text.parse_int(self.file_index)
|
||||||
@@ -91,7 +87,7 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
|
|||||||
file["album"] = album
|
file["album"] = album
|
||||||
file["num"] = num
|
file["num"] = num
|
||||||
file["url"] = url = f"{self.root}/api/file/{file['id']}?download"
|
file["url"] = url = f"{self.root}/api/file/{file['id']}?download"
|
||||||
file["date"] = self.parse_datetime(file["date_upload"])
|
file["date"] = self.parse_datetime_iso(file["date_upload"])
|
||||||
text.nameext_from_url(file["name"], file)
|
text.nameext_from_url(file["name"], file)
|
||||||
yield Message.Url, url, file
|
yield Message.Url, url, file
|
||||||
|
|
||||||
@@ -112,7 +108,7 @@ class PixeldrainFolderExtractor(PixeldrainExtractor):
|
|||||||
"mime_type" : data["file_type"],
|
"mime_type" : data["file_type"],
|
||||||
"size" : data["file_size"],
|
"size" : data["file_size"],
|
||||||
"hash_sha256": data["sha256_sum"],
|
"hash_sha256": data["sha256_sum"],
|
||||||
"date" : self.parse_datetime(data["created"]),
|
"date" : self.parse_datetime_iso(data["created"]),
|
||||||
}
|
}
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
|
|||||||
@@ -9,9 +9,8 @@
|
|||||||
"""Extractors for https://www.pixiv.net/"""
|
"""Extractors for https://www.pixiv.net/"""
|
||||||
|
|
||||||
from .common import Extractor, Message, Dispatch
|
from .common import Extractor, Message, Dispatch
|
||||||
from .. import text, util, exception
|
from .. import text, util, dt, exception
|
||||||
from ..cache import cache, memcache
|
from ..cache import cache, memcache
|
||||||
from datetime import datetime, timedelta
|
|
||||||
import itertools
|
import itertools
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
@@ -96,7 +95,7 @@ class PixivExtractor(Extractor):
|
|||||||
if transform_tags:
|
if transform_tags:
|
||||||
transform_tags(work)
|
transform_tags(work)
|
||||||
work["num"] = 0
|
work["num"] = 0
|
||||||
work["date"] = text.parse_datetime(work["create_date"])
|
work["date"] = dt.parse_iso(work["create_date"])
|
||||||
work["rating"] = ratings.get(work["x_restrict"])
|
work["rating"] = ratings.get(work["x_restrict"])
|
||||||
work["suffix"] = ""
|
work["suffix"] = ""
|
||||||
work.update(metadata)
|
work.update(metadata)
|
||||||
@@ -353,10 +352,10 @@ class PixivExtractor(Extractor):
|
|||||||
if fmt in urls:
|
if fmt in urls:
|
||||||
yield urls[fmt]
|
yield urls[fmt]
|
||||||
|
|
||||||
def _date_from_url(self, url, offset=timedelta(hours=9)):
|
def _date_from_url(self, url, offset=dt.timedelta(hours=9)):
|
||||||
try:
|
try:
|
||||||
_, _, _, _, _, y, m, d, H, M, S, _ = url.split("/")
|
_, _, _, _, _, y, m, d, H, M, S, _ = url.split("/")
|
||||||
return datetime(
|
return dt.datetime(
|
||||||
int(y), int(m), int(d), int(H), int(M), int(S)) - offset
|
int(y), int(m), int(d), int(H), int(M), int(S)) - offset
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
@@ -715,8 +714,7 @@ class PixivRankingExtractor(PixivExtractor):
|
|||||||
self.log.warning("invalid date '%s'", date)
|
self.log.warning("invalid date '%s'", date)
|
||||||
date = None
|
date = None
|
||||||
if not date:
|
if not date:
|
||||||
now = util.datetime_utcnow()
|
date = (dt.now() - dt.timedelta(days=1)).strftime("%Y-%m-%d")
|
||||||
date = (now - timedelta(days=1)).strftime("%Y-%m-%d")
|
|
||||||
self.date = date
|
self.date = date
|
||||||
|
|
||||||
self.type = type = query.get("content")
|
self.type = type = query.get("content")
|
||||||
@@ -891,8 +889,7 @@ class PixivSketchExtractor(Extractor):
|
|||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
media = post["media"]
|
media = post["media"]
|
||||||
post["post_id"] = post["id"]
|
post["post_id"] = post["id"]
|
||||||
post["date"] = text.parse_datetime(
|
post["date"] = dt.parse_iso(post["created_at"])
|
||||||
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
|
|
||||||
util.delete_items(post, ("id", "media", "_links"))
|
util.delete_items(post, ("id", "media", "_links"))
|
||||||
|
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
@@ -972,7 +969,7 @@ class PixivNovelExtractor(PixivExtractor):
|
|||||||
if transform_tags:
|
if transform_tags:
|
||||||
transform_tags(novel)
|
transform_tags(novel)
|
||||||
novel["num"] = 0
|
novel["num"] = 0
|
||||||
novel["date"] = text.parse_datetime(novel["create_date"])
|
novel["date"] = dt.parse_iso(novel["create_date"])
|
||||||
novel["rating"] = ratings.get(novel["x_restrict"])
|
novel["rating"] = ratings.get(novel["x_restrict"])
|
||||||
novel["suffix"] = ""
|
novel["suffix"] = ""
|
||||||
|
|
||||||
@@ -1154,7 +1151,7 @@ class PixivAppAPI():
|
|||||||
"get_secure_url": "1",
|
"get_secure_url": "1",
|
||||||
}
|
}
|
||||||
|
|
||||||
time = util.datetime_utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00")
|
time = dt.now().strftime("%Y-%m-%dT%H:%M:%S+00:00")
|
||||||
headers = {
|
headers = {
|
||||||
"X-Client-Time": time,
|
"X-Client-Time": time,
|
||||||
"X-Client-Hash": hashlib.md5(
|
"X-Client-Hash": hashlib.md5(
|
||||||
@@ -1329,11 +1326,11 @@ class PixivAppAPI():
|
|||||||
sort = params["sort"]
|
sort = params["sort"]
|
||||||
if sort == "date_desc":
|
if sort == "date_desc":
|
||||||
date_key = "end_date"
|
date_key = "end_date"
|
||||||
date_off = timedelta(days=1)
|
date_off = dt.timedelta(days=1)
|
||||||
date_cmp = lambda lhs, rhs: lhs >= rhs # noqa E731
|
date_cmp = lambda lhs, rhs: lhs >= rhs # noqa E731
|
||||||
elif sort == "date_asc":
|
elif sort == "date_asc":
|
||||||
date_key = "start_date"
|
date_key = "start_date"
|
||||||
date_off = timedelta(days=-1)
|
date_off = dt.timedelta(days=-1)
|
||||||
date_cmp = lambda lhs, rhs: lhs <= rhs # noqa E731
|
date_cmp = lambda lhs, rhs: lhs <= rhs # noqa E731
|
||||||
else:
|
else:
|
||||||
date_key = None
|
date_key = None
|
||||||
@@ -1360,8 +1357,8 @@ class PixivAppAPI():
|
|||||||
|
|
||||||
if date_key and text.parse_int(params.get("offset")) >= 5000:
|
if date_key and text.parse_int(params.get("offset")) >= 5000:
|
||||||
date_last = data["illusts"][-1]["create_date"]
|
date_last = data["illusts"][-1]["create_date"]
|
||||||
date_val = (text.parse_datetime(
|
date_val = (dt.parse_iso(date_last) + date_off).strftime(
|
||||||
date_last) + date_off).strftime("%Y-%m-%d")
|
"%Y-%m-%d")
|
||||||
self.log.info("Reached 'offset' >= 5000; "
|
self.log.info("Reached 'offset' >= 5000; "
|
||||||
"Updating '%s' to '%s'", date_key, date_val)
|
"Updating '%s' to '%s'", date_key, date_val)
|
||||||
params[date_key] = date_val
|
params[date_key] = date_val
|
||||||
|
|||||||
@@ -9,8 +9,7 @@
|
|||||||
"""Extractors for https://www.plurk.com/"""
|
"""Extractors for https://www.plurk.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util, exception
|
from .. import text, util, dt, exception
|
||||||
import datetime
|
|
||||||
|
|
||||||
|
|
||||||
class PlurkExtractor(Extractor):
|
class PlurkExtractor(Extractor):
|
||||||
@@ -88,12 +87,10 @@ class PlurkTimelineExtractor(PlurkExtractor):
|
|||||||
while plurks:
|
while plurks:
|
||||||
yield from plurks
|
yield from plurks
|
||||||
|
|
||||||
offset = datetime.datetime.strptime(
|
offset = dt.parse(plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z")
|
||||||
plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z")
|
|
||||||
data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z")
|
data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z")
|
||||||
response = self.request(
|
plurks = self.request_json(
|
||||||
url, method="POST", headers=headers, data=data)
|
url, method="POST", headers=headers, data=data)["plurks"]
|
||||||
plurks = response.json()["plurks"]
|
|
||||||
|
|
||||||
|
|
||||||
class PlurkPostExtractor(PlurkExtractor):
|
class PlurkPostExtractor(PlurkExtractor):
|
||||||
|
|||||||
@@ -150,8 +150,7 @@ class PornhubGifExtractor(PornhubExtractor):
|
|||||||
"tags" : extr("data-context-tag='", "'").split(","),
|
"tags" : extr("data-context-tag='", "'").split(","),
|
||||||
"title": extr('"name": "', '"'),
|
"title": extr('"name": "', '"'),
|
||||||
"url" : extr('"contentUrl": "', '"'),
|
"url" : extr('"contentUrl": "', '"'),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime_iso(extr('"uploadDate": "', '"')),
|
||||||
extr('"uploadDate": "', '"'), "%Y-%m-%d"),
|
|
||||||
"viewkey" : extr('From this video: '
|
"viewkey" : extr('From this video: '
|
||||||
'<a href="/view_video.php?viewkey=', '"'),
|
'<a href="/view_video.php?viewkey=', '"'),
|
||||||
"timestamp": extr('lass="directLink tstamp" rel="nofollow">', '<'),
|
"timestamp": extr('lass="directLink tstamp" rel="nofollow">', '<'),
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class PostmillExtractor(BaseExtractor):
|
|||||||
|
|
||||||
title = text.unescape(extr(
|
title = text.unescape(extr(
|
||||||
'<meta property="og:title" content="', '">'))
|
'<meta property="og:title" content="', '">'))
|
||||||
date = text.parse_datetime(extr(
|
date = self.parse_datetime_iso(extr(
|
||||||
'<meta property="og:article:published_time" content="', '">'))
|
'<meta property="og:article:published_time" content="', '">'))
|
||||||
username = extr(
|
username = extr(
|
||||||
'<meta property="og:article:author" content="', '">')
|
'<meta property="og:article:author" content="', '">')
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class RawkumaChapterExtractor(RawkumaBase, ChapterExtractor):
|
|||||||
"chapter_minor": sep + minor,
|
"chapter_minor": sep + minor,
|
||||||
"chapter_id" : text.parse_int(item["cid"]),
|
"chapter_id" : text.parse_int(item["cid"]),
|
||||||
"title" : text.unescape(title),
|
"title" : text.unescape(title),
|
||||||
"date" : text.parse_datetime(
|
"date" : self.parse_datetime(
|
||||||
date, "%Y-%m-%dWIB%H:%M:%S%z"),
|
date, "%Y-%m-%dWIB%H:%M:%S%z"),
|
||||||
"thumbnail" : item.get("t"),
|
"thumbnail" : item.get("t"),
|
||||||
"lang" : "ja",
|
"lang" : "ja",
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user