[dt] update 'parse_datetime' calls with one argument

This commit is contained in:
Mike Fährmann
2025-10-17 22:49:41 +02:00
parent 085616e0a8
commit 6c71b279b6
26 changed files with 49 additions and 56 deletions

View File

@@ -74,7 +74,7 @@ if sys.hexversion < 0x30c0000:
except Exception:
return NONE
def parse_compat(dt_string, format):
def parse_compat(dt_string, format="%Y-%m-%dT%H:%M:%S%z"):
    """Parse 'dt_string' as ISO 8601 value using 'format'

    Forwards both arguments unchanged to parse() and returns its result.
    When 'format' is omitted, "%Y-%m-%dT%H:%M:%S%z" is used,
    so the function can be called with 'dt_string' alone.
    """
    return parse(dt_string, format)
@@ -90,7 +90,7 @@ else:
except Exception:
return NONE
def parse_compat(dt_string, format):
def parse_compat(dt_string, format=None):
    """Parse 'dt_string' as ISO 8601 value

    Passes 'dt_string' to parse_iso() and returns its result.
    'format' is never read by this variant; NOTE(review): it is presumably
    kept so that both version-dependent 'parse_compat' definitions accept
    the same arguments - confirm against the callers.
    """
    return parse_iso(dt_string)

View File

@@ -61,10 +61,9 @@ class _4archiveThreadExtractor(Extractor):
extr = text.extract_from(post)
data = {
"name": extr('class="name">', "</span>"),
"date": self.parse_datetime(
"date": self.parse_datetime_iso(
(extr('class="dateTime">', "<") or
extr('class="dateTime postNum" >', "<")).strip(),
"%Y-%m-%d %H:%M:%S"),
extr('class="dateTime postNum" >', "<")).strip()),
"no" : text.parse_int(extr(">Post No.", "<")),
}
if 'class="file"' in post:

View File

@@ -141,8 +141,8 @@ class BellazonExtractor(Extractor):
"title": schema["headline"],
"views": stats[0]["userInteractionCount"],
"posts": stats[1]["userInteractionCount"],
"date" : self.parse_datetime(schema["datePublished"]),
"date_updated": self.parse_datetime(schema["dateModified"]),
"date" : self.parse_datetime_iso(schema["datePublished"]),
"date_updated": self.parse_datetime_iso(schema["dateModified"]),
"description" : text.unescape(schema["text"]).strip(),
"section" : path[-2],
"author" : author["name"],
@@ -162,7 +162,7 @@ class BellazonExtractor(Extractor):
post = {
"id": extr('id="elComment_', '"'),
"author_url": extr(" href='", "'"),
"date": self.parse_datetime(extr("datetime='", "'")),
"date": self.parse_datetime_iso(extr("datetime='", "'")),
"content": extr("<!-- Post content -->", "\n\t\t</div>"),
}

View File

@@ -40,7 +40,7 @@ class BloggerExtractor(BaseExtractor):
blog = self.api.blog_by_url("http://" + self.blog)
blog["pages"] = blog["pages"]["totalItems"]
blog["posts"] = blog["posts"]["totalItems"]
blog["date"] = self.parse_datetime(blog["published"])
blog["date"] = self.parse_datetime_iso(blog["published"])
del blog["selfLink"]
findall_image = util.re(
@@ -65,7 +65,7 @@ class BloggerExtractor(BaseExtractor):
post["author"] = post["author"]["displayName"]
post["replies"] = post["replies"]["totalItems"]
post["content"] = text.remove_html(content)
post["date"] = self.parse_datetime(post["published"])
post["date"] = self.parse_datetime_iso(post["published"])
del post["selfLink"]
del post["blog"]

View File

@@ -61,7 +61,7 @@ class CienArticleExtractor(CienExtractor):
post["post_url"] = url
post["post_id"] = text.parse_int(post_id)
post["count"] = len(files)
post["date"] = self.parse_datetime(post["datePublished"])
post["date"] = self.parse_datetime_iso(post["datePublished"])
try:
post["author"]["id"] = text.parse_int(author_id)

View File

@@ -1187,7 +1187,7 @@ class DeviantartStatusExtractor(DeviantartExtractor):
deviation["username"] = deviation["author"]["username"]
deviation["_username"] = deviation["username"].lower()
deviation["date"] = d = self.parse_datetime(deviation["ts"])
deviation["date"] = d = self.parse_datetime_iso(deviation["ts"])
deviation["published_time"] = int(dt.to_ts(d))
deviation["da_category"] = "Status"

View File

@@ -72,9 +72,7 @@ class DiscordExtractor(Extractor):
"author_files": [],
"message": self.extract_message_text(message),
"message_id": message["id"],
"date": self.parse_datetime(
message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
),
"date": self.parse_datetime_iso(message["timestamp"]),
"files": []
})

View File

@@ -128,7 +128,7 @@ class FanboxExtractor(Extractor):
if file.get("extension", "").lower() in exts
]
post["date"] = self.parse_datetime(post["publishedDatetime"])
post["date"] = self.parse_datetime_iso(post["publishedDatetime"])
post["text"] = content_body.get("text") if content_body else None
post["isCoverImage"] = False

View File

@@ -34,7 +34,7 @@ class HatenablogExtractor(Extractor):
def _handle_article(self, article: str):
extr = text.extract_from(article)
date = self.parse_datetime(extr('<time datetime="', '"'))
date = self.parse_datetime_iso(extr('<time datetime="', '"'))
entry_link = text.unescape(extr('<a href="', '"'))
entry = entry_link.partition("/entry/")[2]
title = text.unescape(extr('>', '<'))

View File

@@ -86,7 +86,7 @@ class HentaifoundryExtractor(Extractor):
.replace("\r\n", "\n")),
"ratings" : [text.unescape(r) for r in text.extract_iter(extr(
"class='ratings_box'", "</div>"), "title='", "'")],
"date" : self.parse_datetime(extr("datetime='", "'")),
"date" : self.parse_datetime_iso(extr("datetime='", "'")),
"views" : text.parse_int(extr(">Views</span>", "<")),
"score" : text.parse_int(extr(">Vote Score</span>", "<")),
"media" : text.unescape(extr(">Media</span>", "<").strip()),

View File

@@ -38,7 +38,7 @@ class ImgurExtractor(Extractor):
image["url"] = url = \
f"https://i.imgur.com/{image['id']}.{image['ext']}"
image["date"] = self.parse_datetime(image["created_at"])
image["date"] = self.parse_datetime_iso(image["created_at"])
image["_http_validate"] = self._validate
text.nameext_from_url(url, image)
@@ -106,7 +106,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
del album["media"]
count = len(images)
album["date"] = self.parse_datetime(album["created_at"])
album["date"] = self.parse_datetime_iso(album["created_at"])
try:
del album["ad_url"]

View File

@@ -45,7 +45,7 @@ class LivedoorExtractor(Extractor):
"title" : text.unescape(extr('dc:title="', '"')),
"categories" : extr('dc:subject="', '"').partition(",")[::2],
"description": extr('dc:description="', '"'),
"date" : self.parse_datetime(extr('dc:date="', '"')),
"date" : self.parse_datetime_iso(extr('dc:date="', '"')),
"tags" : text.split_html(tags)[1:] if tags else [],
"user" : self.user,
"body" : body,

View File

@@ -68,7 +68,7 @@ class MangadexExtractor(Extractor):
"chapter" : text.parse_int(chnum),
"chapter_minor": f"{sep}{minor}",
"chapter_id": chapter["id"],
"date" : self.parse_datetime(cattributes["publishAt"]),
"date" : self.parse_datetime_iso(cattributes["publishAt"]),
"group" : [group["attributes"]["name"]
for group in relationships["scanlation_group"]],
"lang" : lang,
@@ -109,8 +109,8 @@ class MangadexCoversExtractor(MangadexExtractor):
"cover" : cattributes["fileName"],
"lang" : cattributes.get("locale"),
"volume" : text.parse_int(cattributes["volume"]),
"date" : self.parse_datetime(cattributes["createdAt"]),
"date_updated": self.parse_datetime(cattributes["updatedAt"]),
"date" : self.parse_datetime_iso(cattributes["createdAt"]),
"date_updated": self.parse_datetime_iso(cattributes["updatedAt"]),
}
@@ -454,7 +454,7 @@ def _manga_info(self, uuid):
"manga_id": manga["id"],
"manga_titles": [t.popitem()[1]
for t in mattr.get("altTitles") or ()],
"manga_date" : self.parse_datetime(mattr.get("createdAt")),
"manga_date" : self.parse_datetime_iso(mattr.get("createdAt")),
"description" : (mattr["description"].get("en") or
next(iter(mattr["description"].values()), "")),
"demographic": mattr.get("publicationDemographic"),

View File

@@ -64,8 +64,7 @@ class MastodonExtractor(BaseExtractor):
status["count"] = len(attachments)
status["tags"] = [tag["name"] for tag in status["tags"]]
status["date"] = self.parse_datetime(
status["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
status["date"] = self.parse_datetime_iso(status["created_at"][:19])
yield Message.Directory, status
for status["num"], media in enumerate(attachments, 1):
@@ -319,10 +318,8 @@ class MastodonAPI():
if code == 404:
raise exception.NotFoundError()
if code == 429:
self.extractor.wait(until=self.parse_datetime(
response.headers["x-ratelimit-reset"],
"%Y-%m-%dT%H:%M:%S.%fZ",
))
self.extractor.wait(until=self.parse_datetime_iso(
response.headers["x-ratelimit-reset"]))
continue
raise exception.AbortExtraction(response.json().get("error"))

View File

@@ -9,7 +9,7 @@
"""Extractors for https://www.newgrounds.com/"""
from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from .. import text, util, dt, exception
from ..cache import cache
import itertools
@@ -218,7 +218,7 @@ class NewgroundsExtractor(Extractor):
"description": text.unescape(extr(':description" content="', '"')),
"type" : "art",
"_type" : "i",
"date" : self.parse_datetime(extr(
"date" : dt.parse_compat(extr(
'itemprop="datePublished" content="', '"')),
"rating" : extr('class="rated-', '"'),
"url" : full('src="', '"'),
@@ -268,7 +268,7 @@ class NewgroundsExtractor(Extractor):
"description": text.unescape(extr(':description" content="', '"')),
"type" : "audio",
"_type" : "a",
"date" : self.parse_datetime(extr(
"date" : dt.parse_compat(extr(
'itemprop="datePublished" content="', '"')),
"url" : extr('{"url":"', '"').replace("\\/", "/"),
"index" : text.parse_int(index),
@@ -287,7 +287,7 @@ class NewgroundsExtractor(Extractor):
src = src.replace("\\/", "/")
formats = ()
type = extr(',"description":"', '"')
date = self.parse_datetime(extr(
date = dt.parse_compat(extr(
'itemprop="datePublished" content="', '"'))
if type:
type = type.rpartition(" ")[2].lower()

View File

@@ -9,7 +9,7 @@
"""Extractors for nijie instances"""
from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
from .. import text, exception
from .. import text, dt, exception
from ..cache import cache
@@ -82,8 +82,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"title" : keywords[0].strip(),
"description": text.unescape(extr(
'"description": "', '"').replace("&amp;", "&")),
"date" : self.parse_datetime(extr(
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y", 9),
"date" : dt.parse(extr(
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"
) - dt.timedelta(hours=9),
"artist_id" : text.parse_int(extr('/members.php?id=', '"')),
"artist_name": keywords[1],
"tags" : keywords[2:-1],
@@ -101,9 +102,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"artist_id" : text.parse_int(extr('members.php?id=', '"')),
"artist_name": keywords[1],
"tags" : keywords[2:-1],
"date" : self.parse_datetime(extr(
"itemprop='datePublished' content=", "<").rpartition(">")[2],
"%Y-%m-%d %H:%M:%S", 9),
"date" : dt.parse_iso(extr(
"itemprop='datePublished' content=", "<").rpartition(">")[2]
) - dt.timedelta(hours=9),
}
def _extract_images(self, image_id, page):

View File

@@ -31,7 +31,7 @@ class PostmillExtractor(BaseExtractor):
title = text.unescape(extr(
'<meta property="og:title" content="', '">'))
date = self.parse_datetime(extr(
date = self.parse_datetime_iso(extr(
'<meta property="og:article:published_time" content="', '">'))
username = extr(
'<meta property="og:article:author" content="', '">')

View File

@@ -97,7 +97,7 @@ class ReactorExtractor(BaseExtractor):
return
num = 0
date = self.parse_datetime(data["datePublished"])
date = self.parse_datetime_iso(data["datePublished"])
user = data["author"]["name"]
description = text.unescape(data["description"])
title, _, tags = text.unescape(data["headline"]).partition(" / ")

View File

@@ -40,7 +40,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
extr('property="og:title" content="', '"')),
"description": text.unescape(
extr('property="og:description" content="', '"')),
"date" : self.parse_datetime(
"date" : self.parse_datetime_iso(
extr('property="article:published_time" content="', '"')),
}
content = extr('<div class="entry-content">', '</article>')

View File

@@ -98,7 +98,7 @@ class SimpcityExtractor(Extractor):
"id" : url_t[url_t.rfind(".")+1:-1],
"url" : url_t,
"title": schema["headline"],
"date" : self.parse_datetime(schema["datePublished"]),
"date" : self.parse_datetime_iso(schema["datePublished"]),
"views": stats[0]["userInteractionCount"],
"posts": stats[1]["userInteractionCount"],
"tags" : (schema["keywords"].split(", ")
@@ -119,7 +119,7 @@ class SimpcityExtractor(Extractor):
"author": extr('data-author="', '"'),
"id": extr('data-content="post-', '"'),
"author_url": extr('itemprop="url" content="', '"'),
"date": self.parse_datetime(extr('datetime="', '"')),
"date": self.parse_datetime_iso(extr('datetime="', '"')),
"content": extr('<div itemprop="text">',
'<div class="js-selectToQuote').strip(),
}

View File

@@ -89,7 +89,7 @@ class TapasEpisodeExtractor(TapasExtractor):
html = data["html"]
episode["series"] = self._extract_series(html)
episode["date"] = self.parse_datetime(episode["publish_date"])
episode["date"] = self.parse_datetime_iso(episode["publish_date"])
yield Message.Directory, episode
if episode["book"]:

View File

@@ -27,9 +27,8 @@ class TelegraphGalleryExtractor(GalleryExtractor):
'property="og:title" content="', '"')),
"description": text.unescape(extr(
'property="og:description" content="', '"')),
"date": self.parse_datetime(extr(
'property="article:published_time" content="', '"'),
"%Y-%m-%dT%H:%M:%S%z"),
"date": self.parse_datetime_iso(extr(
'property="article:published_time" content="', '"')),
"author": text.unescape(extr(
'property="article:author" content="', '"')),
"post_url": text.unescape(extr(

View File

@@ -23,7 +23,7 @@ class TungstenExtractor(Extractor):
def items(self):
for post in self.posts():
url = post["original_url"]
post["date"] = self.parse_datetime(post["created_at"])
post["date"] = self.parse_datetime_iso(post["created_at"])
post["filename"] = url[url.rfind("/")+1:]
post["extension"] = "webp"
yield Message.Directory, post

View File

@@ -41,7 +41,7 @@ class UnsplashExtractor(Extractor):
if metadata:
photo.update(metadata)
photo["extension"] = "jpg"
photo["date"] = self.parse_datetime(photo["created_at"])
photo["date"] = self.parse_datetime_iso(photo["created_at"])
if "tags" in photo:
photo["tags"] = [t["title"] for t in photo["tags"]]

View File

@@ -24,8 +24,7 @@ class WeasylExtractor(Extractor):
# Some submissions don't have content and can be skipped
if "submission" in data["media"]:
data["url"] = data["media"]["submission"][0]["url"]
data["date"] = self.parse_datetime(
data["posted_at"][:19], "%Y-%m-%dT%H:%M:%S")
data["date"] = self.parse_datetime_iso(data["posted_at"][:19])
text.nameext_from_url(data["url"], data)
return True
return False
@@ -42,7 +41,7 @@ class WeasylExtractor(Extractor):
f"{self.root}/api/journals/{journalid}/view")
data["extension"] = "html"
data["html"] = "text:" + data["content"]
data["date"] = self.parse_datetime(data["posted_at"])
data["date"] = self.parse_datetime_iso(data["posted_at"])
return data
def submissions(self, owner_login, folderid=None):

View File

@@ -76,7 +76,7 @@ class ZerochanExtractor(BooruExtractor):
data = {
"id" : text.parse_int(entry_id),
"file_url": jsonld["contentUrl"],
"date" : self.parse_datetime(jsonld["datePublished"]),
"date" : self.parse_datetime_iso(jsonld["datePublished"]),
"width" : text.parse_int(jsonld["width"][:-3]),
"height" : text.parse_int(jsonld["height"][:-3]),
"size" : text.parse_bytes(jsonld["contentSize"][:-1]),