[dt] update 'parse_datetime' calls with one argument

This commit is contained in:
Mike Fährmann
2025-10-17 22:49:41 +02:00
parent 085616e0a8
commit 6c71b279b6
26 changed files with 49 additions and 56 deletions

View File

@@ -74,7 +74,7 @@ if sys.hexversion < 0x30c0000:
except Exception: except Exception:
return NONE return NONE
def parse_compat(dt_string, format): def parse_compat(dt_string, format="%Y-%m-%dT%H:%M:%S%z"):
"""Parse 'dt_string' as ISO 8601 value using 'format'""" """Parse 'dt_string' as ISO 8601 value using 'format'"""
return parse(dt_string, format) return parse(dt_string, format)
@@ -90,7 +90,7 @@ else:
except Exception: except Exception:
return NONE return NONE
def parse_compat(dt_string, format): def parse_compat(dt_string, format=None):
"""Parse 'dt_string' as ISO 8601 value""" """Parse 'dt_string' as ISO 8601 value"""
return parse_iso(dt_string) return parse_iso(dt_string)

View File

@@ -61,10 +61,9 @@ class _4archiveThreadExtractor(Extractor):
extr = text.extract_from(post) extr = text.extract_from(post)
data = { data = {
"name": extr('class="name">', "</span>"), "name": extr('class="name">', "</span>"),
"date": self.parse_datetime( "date": self.parse_datetime_iso(
(extr('class="dateTime">', "<") or (extr('class="dateTime">', "<") or
extr('class="dateTime postNum" >', "<")).strip(), extr('class="dateTime postNum" >', "<")).strip()),
"%Y-%m-%d %H:%M:%S"),
"no" : text.parse_int(extr(">Post No.", "<")), "no" : text.parse_int(extr(">Post No.", "<")),
} }
if 'class="file"' in post: if 'class="file"' in post:

View File

@@ -141,8 +141,8 @@ class BellazonExtractor(Extractor):
"title": schema["headline"], "title": schema["headline"],
"views": stats[0]["userInteractionCount"], "views": stats[0]["userInteractionCount"],
"posts": stats[1]["userInteractionCount"], "posts": stats[1]["userInteractionCount"],
"date" : self.parse_datetime(schema["datePublished"]), "date" : self.parse_datetime_iso(schema["datePublished"]),
"date_updated": self.parse_datetime(schema["dateModified"]), "date_updated": self.parse_datetime_iso(schema["dateModified"]),
"description" : text.unescape(schema["text"]).strip(), "description" : text.unescape(schema["text"]).strip(),
"section" : path[-2], "section" : path[-2],
"author" : author["name"], "author" : author["name"],
@@ -162,7 +162,7 @@ class BellazonExtractor(Extractor):
post = { post = {
"id": extr('id="elComment_', '"'), "id": extr('id="elComment_', '"'),
"author_url": extr(" href='", "'"), "author_url": extr(" href='", "'"),
"date": self.parse_datetime(extr("datetime='", "'")), "date": self.parse_datetime_iso(extr("datetime='", "'")),
"content": extr("<!-- Post content -->", "\n\t\t</div>"), "content": extr("<!-- Post content -->", "\n\t\t</div>"),
} }

View File

@@ -40,7 +40,7 @@ class BloggerExtractor(BaseExtractor):
blog = self.api.blog_by_url("http://" + self.blog) blog = self.api.blog_by_url("http://" + self.blog)
blog["pages"] = blog["pages"]["totalItems"] blog["pages"] = blog["pages"]["totalItems"]
blog["posts"] = blog["posts"]["totalItems"] blog["posts"] = blog["posts"]["totalItems"]
blog["date"] = self.parse_datetime(blog["published"]) blog["date"] = self.parse_datetime_iso(blog["published"])
del blog["selfLink"] del blog["selfLink"]
findall_image = util.re( findall_image = util.re(
@@ -65,7 +65,7 @@ class BloggerExtractor(BaseExtractor):
post["author"] = post["author"]["displayName"] post["author"] = post["author"]["displayName"]
post["replies"] = post["replies"]["totalItems"] post["replies"] = post["replies"]["totalItems"]
post["content"] = text.remove_html(content) post["content"] = text.remove_html(content)
post["date"] = self.parse_datetime(post["published"]) post["date"] = self.parse_datetime_iso(post["published"])
del post["selfLink"] del post["selfLink"]
del post["blog"] del post["blog"]

View File

@@ -61,7 +61,7 @@ class CienArticleExtractor(CienExtractor):
post["post_url"] = url post["post_url"] = url
post["post_id"] = text.parse_int(post_id) post["post_id"] = text.parse_int(post_id)
post["count"] = len(files) post["count"] = len(files)
post["date"] = self.parse_datetime(post["datePublished"]) post["date"] = self.parse_datetime_iso(post["datePublished"])
try: try:
post["author"]["id"] = text.parse_int(author_id) post["author"]["id"] = text.parse_int(author_id)

View File

@@ -1187,7 +1187,7 @@ class DeviantartStatusExtractor(DeviantartExtractor):
deviation["username"] = deviation["author"]["username"] deviation["username"] = deviation["author"]["username"]
deviation["_username"] = deviation["username"].lower() deviation["_username"] = deviation["username"].lower()
deviation["date"] = d = self.parse_datetime(deviation["ts"]) deviation["date"] = d = self.parse_datetime_iso(deviation["ts"])
deviation["published_time"] = int(dt.to_ts(d)) deviation["published_time"] = int(dt.to_ts(d))
deviation["da_category"] = "Status" deviation["da_category"] = "Status"

View File

@@ -72,9 +72,7 @@ class DiscordExtractor(Extractor):
"author_files": [], "author_files": [],
"message": self.extract_message_text(message), "message": self.extract_message_text(message),
"message_id": message["id"], "message_id": message["id"],
"date": self.parse_datetime( "date": self.parse_datetime_iso(message["timestamp"]),
message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
),
"files": [] "files": []
}) })

View File

@@ -128,7 +128,7 @@ class FanboxExtractor(Extractor):
if file.get("extension", "").lower() in exts if file.get("extension", "").lower() in exts
] ]
post["date"] = self.parse_datetime(post["publishedDatetime"]) post["date"] = self.parse_datetime_iso(post["publishedDatetime"])
post["text"] = content_body.get("text") if content_body else None post["text"] = content_body.get("text") if content_body else None
post["isCoverImage"] = False post["isCoverImage"] = False

View File

@@ -34,7 +34,7 @@ class HatenablogExtractor(Extractor):
def _handle_article(self, article: str): def _handle_article(self, article: str):
extr = text.extract_from(article) extr = text.extract_from(article)
date = self.parse_datetime(extr('<time datetime="', '"')) date = self.parse_datetime_iso(extr('<time datetime="', '"'))
entry_link = text.unescape(extr('<a href="', '"')) entry_link = text.unescape(extr('<a href="', '"'))
entry = entry_link.partition("/entry/")[2] entry = entry_link.partition("/entry/")[2]
title = text.unescape(extr('>', '<')) title = text.unescape(extr('>', '<'))

View File

@@ -86,7 +86,7 @@ class HentaifoundryExtractor(Extractor):
.replace("\r\n", "\n")), .replace("\r\n", "\n")),
"ratings" : [text.unescape(r) for r in text.extract_iter(extr( "ratings" : [text.unescape(r) for r in text.extract_iter(extr(
"class='ratings_box'", "</div>"), "title='", "'")], "class='ratings_box'", "</div>"), "title='", "'")],
"date" : self.parse_datetime(extr("datetime='", "'")), "date" : self.parse_datetime_iso(extr("datetime='", "'")),
"views" : text.parse_int(extr(">Views</span>", "<")), "views" : text.parse_int(extr(">Views</span>", "<")),
"score" : text.parse_int(extr(">Vote Score</span>", "<")), "score" : text.parse_int(extr(">Vote Score</span>", "<")),
"media" : text.unescape(extr(">Media</span>", "<").strip()), "media" : text.unescape(extr(">Media</span>", "<").strip()),

View File

@@ -38,7 +38,7 @@ class ImgurExtractor(Extractor):
image["url"] = url = \ image["url"] = url = \
f"https://i.imgur.com/{image['id']}.{image['ext']}" f"https://i.imgur.com/{image['id']}.{image['ext']}"
image["date"] = self.parse_datetime(image["created_at"]) image["date"] = self.parse_datetime_iso(image["created_at"])
image["_http_validate"] = self._validate image["_http_validate"] = self._validate
text.nameext_from_url(url, image) text.nameext_from_url(url, image)
@@ -106,7 +106,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
del album["media"] del album["media"]
count = len(images) count = len(images)
album["date"] = self.parse_datetime(album["created_at"]) album["date"] = self.parse_datetime_iso(album["created_at"])
try: try:
del album["ad_url"] del album["ad_url"]

View File

@@ -45,7 +45,7 @@ class LivedoorExtractor(Extractor):
"title" : text.unescape(extr('dc:title="', '"')), "title" : text.unescape(extr('dc:title="', '"')),
"categories" : extr('dc:subject="', '"').partition(",")[::2], "categories" : extr('dc:subject="', '"').partition(",")[::2],
"description": extr('dc:description="', '"'), "description": extr('dc:description="', '"'),
"date" : self.parse_datetime(extr('dc:date="', '"')), "date" : self.parse_datetime_iso(extr('dc:date="', '"')),
"tags" : text.split_html(tags)[1:] if tags else [], "tags" : text.split_html(tags)[1:] if tags else [],
"user" : self.user, "user" : self.user,
"body" : body, "body" : body,

View File

@@ -68,7 +68,7 @@ class MangadexExtractor(Extractor):
"chapter" : text.parse_int(chnum), "chapter" : text.parse_int(chnum),
"chapter_minor": f"{sep}{minor}", "chapter_minor": f"{sep}{minor}",
"chapter_id": chapter["id"], "chapter_id": chapter["id"],
"date" : self.parse_datetime(cattributes["publishAt"]), "date" : self.parse_datetime_iso(cattributes["publishAt"]),
"group" : [group["attributes"]["name"] "group" : [group["attributes"]["name"]
for group in relationships["scanlation_group"]], for group in relationships["scanlation_group"]],
"lang" : lang, "lang" : lang,
@@ -109,8 +109,8 @@ class MangadexCoversExtractor(MangadexExtractor):
"cover" : cattributes["fileName"], "cover" : cattributes["fileName"],
"lang" : cattributes.get("locale"), "lang" : cattributes.get("locale"),
"volume" : text.parse_int(cattributes["volume"]), "volume" : text.parse_int(cattributes["volume"]),
"date" : self.parse_datetime(cattributes["createdAt"]), "date" : self.parse_datetime_iso(cattributes["createdAt"]),
"date_updated": self.parse_datetime(cattributes["updatedAt"]), "date_updated": self.parse_datetime_iso(cattributes["updatedAt"]),
} }
@@ -454,7 +454,7 @@ def _manga_info(self, uuid):
"manga_id": manga["id"], "manga_id": manga["id"],
"manga_titles": [t.popitem()[1] "manga_titles": [t.popitem()[1]
for t in mattr.get("altTitles") or ()], for t in mattr.get("altTitles") or ()],
"manga_date" : self.parse_datetime(mattr.get("createdAt")), "manga_date" : self.parse_datetime_iso(mattr.get("createdAt")),
"description" : (mattr["description"].get("en") or "description" : (mattr["description"].get("en") or
next(iter(mattr["description"].values()), "")), next(iter(mattr["description"].values()), "")),
"demographic": mattr.get("publicationDemographic"), "demographic": mattr.get("publicationDemographic"),

View File

@@ -64,8 +64,7 @@ class MastodonExtractor(BaseExtractor):
status["count"] = len(attachments) status["count"] = len(attachments)
status["tags"] = [tag["name"] for tag in status["tags"]] status["tags"] = [tag["name"] for tag in status["tags"]]
status["date"] = self.parse_datetime( status["date"] = self.parse_datetime_iso(status["created_at"][:19])
status["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
yield Message.Directory, status yield Message.Directory, status
for status["num"], media in enumerate(attachments, 1): for status["num"], media in enumerate(attachments, 1):
@@ -319,10 +318,8 @@ class MastodonAPI():
if code == 404: if code == 404:
raise exception.NotFoundError() raise exception.NotFoundError()
if code == 429: if code == 429:
self.extractor.wait(until=self.parse_datetime( self.extractor.wait(until=self.parse_datetime_iso(
response.headers["x-ratelimit-reset"], response.headers["x-ratelimit-reset"]))
"%Y-%m-%dT%H:%M:%S.%fZ",
))
continue continue
raise exception.AbortExtraction(response.json().get("error")) raise exception.AbortExtraction(response.json().get("error"))

View File

@@ -9,7 +9,7 @@
"""Extractors for https://www.newgrounds.com/""" """Extractors for https://www.newgrounds.com/"""
from .common import Extractor, Message, Dispatch from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, dt, exception
from ..cache import cache from ..cache import cache
import itertools import itertools
@@ -218,7 +218,7 @@ class NewgroundsExtractor(Extractor):
"description": text.unescape(extr(':description" content="', '"')), "description": text.unescape(extr(':description" content="', '"')),
"type" : "art", "type" : "art",
"_type" : "i", "_type" : "i",
"date" : self.parse_datetime(extr( "date" : dt.parse_compat(extr(
'itemprop="datePublished" content="', '"')), 'itemprop="datePublished" content="', '"')),
"rating" : extr('class="rated-', '"'), "rating" : extr('class="rated-', '"'),
"url" : full('src="', '"'), "url" : full('src="', '"'),
@@ -268,7 +268,7 @@ class NewgroundsExtractor(Extractor):
"description": text.unescape(extr(':description" content="', '"')), "description": text.unescape(extr(':description" content="', '"')),
"type" : "audio", "type" : "audio",
"_type" : "a", "_type" : "a",
"date" : self.parse_datetime(extr( "date" : dt.parse_compat(extr(
'itemprop="datePublished" content="', '"')), 'itemprop="datePublished" content="', '"')),
"url" : extr('{"url":"', '"').replace("\\/", "/"), "url" : extr('{"url":"', '"').replace("\\/", "/"),
"index" : text.parse_int(index), "index" : text.parse_int(index),
@@ -287,7 +287,7 @@ class NewgroundsExtractor(Extractor):
src = src.replace("\\/", "/") src = src.replace("\\/", "/")
formats = () formats = ()
type = extr(',"description":"', '"') type = extr(',"description":"', '"')
date = self.parse_datetime(extr( date = dt.parse_compat(extr(
'itemprop="datePublished" content="', '"')) 'itemprop="datePublished" content="', '"'))
if type: if type:
type = type.rpartition(" ")[2].lower() type = type.rpartition(" ")[2].lower()

View File

@@ -9,7 +9,7 @@
"""Extractors for nijie instances""" """Extractors for nijie instances"""
from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
from .. import text, exception from .. import text, dt, exception
from ..cache import cache from ..cache import cache
@@ -82,8 +82,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"title" : keywords[0].strip(), "title" : keywords[0].strip(),
"description": text.unescape(extr( "description": text.unescape(extr(
'"description": "', '"').replace("&amp;", "&")), '"description": "', '"').replace("&amp;", "&")),
"date" : self.parse_datetime(extr( "date" : dt.parse(extr(
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y", 9), '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"
) - dt.timedelta(hours=9),
"artist_id" : text.parse_int(extr('/members.php?id=', '"')), "artist_id" : text.parse_int(extr('/members.php?id=', '"')),
"artist_name": keywords[1], "artist_name": keywords[1],
"tags" : keywords[2:-1], "tags" : keywords[2:-1],
@@ -101,9 +102,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"artist_id" : text.parse_int(extr('members.php?id=', '"')), "artist_id" : text.parse_int(extr('members.php?id=', '"')),
"artist_name": keywords[1], "artist_name": keywords[1],
"tags" : keywords[2:-1], "tags" : keywords[2:-1],
"date" : self.parse_datetime(extr( "date" : dt.parse_iso(extr(
"itemprop='datePublished' content=", "<").rpartition(">")[2], "itemprop='datePublished' content=", "<").rpartition(">")[2]
"%Y-%m-%d %H:%M:%S", 9), ) - dt.timedelta(hours=9),
} }
def _extract_images(self, image_id, page): def _extract_images(self, image_id, page):

View File

@@ -31,7 +31,7 @@ class PostmillExtractor(BaseExtractor):
title = text.unescape(extr( title = text.unescape(extr(
'<meta property="og:title" content="', '">')) '<meta property="og:title" content="', '">'))
date = self.parse_datetime(extr( date = self.parse_datetime_iso(extr(
'<meta property="og:article:published_time" content="', '">')) '<meta property="og:article:published_time" content="', '">'))
username = extr( username = extr(
'<meta property="og:article:author" content="', '">') '<meta property="og:article:author" content="', '">')

View File

@@ -97,7 +97,7 @@ class ReactorExtractor(BaseExtractor):
return return
num = 0 num = 0
date = self.parse_datetime(data["datePublished"]) date = self.parse_datetime_iso(data["datePublished"])
user = data["author"]["name"] user = data["author"]["name"]
description = text.unescape(data["description"]) description = text.unescape(data["description"])
title, _, tags = text.unescape(data["headline"]).partition(" / ") title, _, tags = text.unescape(data["headline"]).partition(" / ")

View File

@@ -40,7 +40,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
extr('property="og:title" content="', '"')), extr('property="og:title" content="', '"')),
"description": text.unescape( "description": text.unescape(
extr('property="og:description" content="', '"')), extr('property="og:description" content="', '"')),
"date" : self.parse_datetime( "date" : self.parse_datetime_iso(
extr('property="article:published_time" content="', '"')), extr('property="article:published_time" content="', '"')),
} }
content = extr('<div class="entry-content">', '</article>') content = extr('<div class="entry-content">', '</article>')

View File

@@ -98,7 +98,7 @@ class SimpcityExtractor(Extractor):
"id" : url_t[url_t.rfind(".")+1:-1], "id" : url_t[url_t.rfind(".")+1:-1],
"url" : url_t, "url" : url_t,
"title": schema["headline"], "title": schema["headline"],
"date" : self.parse_datetime(schema["datePublished"]), "date" : self.parse_datetime_iso(schema["datePublished"]),
"views": stats[0]["userInteractionCount"], "views": stats[0]["userInteractionCount"],
"posts": stats[1]["userInteractionCount"], "posts": stats[1]["userInteractionCount"],
"tags" : (schema["keywords"].split(", ") "tags" : (schema["keywords"].split(", ")
@@ -119,7 +119,7 @@ class SimpcityExtractor(Extractor):
"author": extr('data-author="', '"'), "author": extr('data-author="', '"'),
"id": extr('data-content="post-', '"'), "id": extr('data-content="post-', '"'),
"author_url": extr('itemprop="url" content="', '"'), "author_url": extr('itemprop="url" content="', '"'),
"date": self.parse_datetime(extr('datetime="', '"')), "date": self.parse_datetime_iso(extr('datetime="', '"')),
"content": extr('<div itemprop="text">', "content": extr('<div itemprop="text">',
'<div class="js-selectToQuote').strip(), '<div class="js-selectToQuote').strip(),
} }

View File

@@ -89,7 +89,7 @@ class TapasEpisodeExtractor(TapasExtractor):
html = data["html"] html = data["html"]
episode["series"] = self._extract_series(html) episode["series"] = self._extract_series(html)
episode["date"] = self.parse_datetime(episode["publish_date"]) episode["date"] = self.parse_datetime_iso(episode["publish_date"])
yield Message.Directory, episode yield Message.Directory, episode
if episode["book"]: if episode["book"]:

View File

@@ -27,9 +27,8 @@ class TelegraphGalleryExtractor(GalleryExtractor):
'property="og:title" content="', '"')), 'property="og:title" content="', '"')),
"description": text.unescape(extr( "description": text.unescape(extr(
'property="og:description" content="', '"')), 'property="og:description" content="', '"')),
"date": self.parse_datetime(extr( "date": self.parse_datetime_iso(extr(
'property="article:published_time" content="', '"'), 'property="article:published_time" content="', '"')),
"%Y-%m-%dT%H:%M:%S%z"),
"author": text.unescape(extr( "author": text.unescape(extr(
'property="article:author" content="', '"')), 'property="article:author" content="', '"')),
"post_url": text.unescape(extr( "post_url": text.unescape(extr(

View File

@@ -23,7 +23,7 @@ class TungstenExtractor(Extractor):
def items(self): def items(self):
for post in self.posts(): for post in self.posts():
url = post["original_url"] url = post["original_url"]
post["date"] = self.parse_datetime(post["created_at"]) post["date"] = self.parse_datetime_iso(post["created_at"])
post["filename"] = url[url.rfind("/")+1:] post["filename"] = url[url.rfind("/")+1:]
post["extension"] = "webp" post["extension"] = "webp"
yield Message.Directory, post yield Message.Directory, post

View File

@@ -41,7 +41,7 @@ class UnsplashExtractor(Extractor):
if metadata: if metadata:
photo.update(metadata) photo.update(metadata)
photo["extension"] = "jpg" photo["extension"] = "jpg"
photo["date"] = self.parse_datetime(photo["created_at"]) photo["date"] = self.parse_datetime_iso(photo["created_at"])
if "tags" in photo: if "tags" in photo:
photo["tags"] = [t["title"] for t in photo["tags"]] photo["tags"] = [t["title"] for t in photo["tags"]]

View File

@@ -24,8 +24,7 @@ class WeasylExtractor(Extractor):
# Some submissions don't have content and can be skipped # Some submissions don't have content and can be skipped
if "submission" in data["media"]: if "submission" in data["media"]:
data["url"] = data["media"]["submission"][0]["url"] data["url"] = data["media"]["submission"][0]["url"]
data["date"] = self.parse_datetime( data["date"] = self.parse_datetime_iso(data["posted_at"][:19])
data["posted_at"][:19], "%Y-%m-%dT%H:%M:%S")
text.nameext_from_url(data["url"], data) text.nameext_from_url(data["url"], data)
return True return True
return False return False
@@ -42,7 +41,7 @@ class WeasylExtractor(Extractor):
f"{self.root}/api/journals/{journalid}/view") f"{self.root}/api/journals/{journalid}/view")
data["extension"] = "html" data["extension"] = "html"
data["html"] = "text:" + data["content"] data["html"] = "text:" + data["content"]
data["date"] = self.parse_datetime(data["posted_at"]) data["date"] = self.parse_datetime_iso(data["posted_at"])
return data return data
def submissions(self, owner_login, folderid=None): def submissions(self, owner_login, folderid=None):

View File

@@ -76,7 +76,7 @@ class ZerochanExtractor(BooruExtractor):
data = { data = {
"id" : text.parse_int(entry_id), "id" : text.parse_int(entry_id),
"file_url": jsonld["contentUrl"], "file_url": jsonld["contentUrl"],
"date" : self.parse_datetime(jsonld["datePublished"]), "date" : self.parse_datetime_iso(jsonld["datePublished"]),
"width" : text.parse_int(jsonld["width"][:-3]), "width" : text.parse_int(jsonld["width"][:-3]),
"height" : text.parse_int(jsonld["height"][:-3]), "height" : text.parse_int(jsonld["height"][:-3]),
"size" : text.parse_bytes(jsonld["contentSize"][:-1]), "size" : text.parse_bytes(jsonld["contentSize"][:-1]),