[dt] update 'parse_datetime' calls with one argument
This commit is contained in:
@@ -74,7 +74,7 @@ if sys.hexversion < 0x30c0000:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return NONE
|
return NONE
|
||||||
|
|
||||||
def parse_compat(dt_string, format):
|
def parse_compat(dt_string, format="%Y-%m-%dT%H:%M:%S%z"):
|
||||||
"""Parse 'dt_string' as ISO 8601 value using 'format'"""
|
"""Parse 'dt_string' as ISO 8601 value using 'format'"""
|
||||||
return parse(dt_string, format)
|
return parse(dt_string, format)
|
||||||
|
|
||||||
@@ -90,7 +90,7 @@ else:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return NONE
|
return NONE
|
||||||
|
|
||||||
def parse_compat(dt_string, format):
|
def parse_compat(dt_string, format=None):
|
||||||
"""Parse 'dt_string' as ISO 8601 value"""
|
"""Parse 'dt_string' as ISO 8601 value"""
|
||||||
return parse_iso(dt_string)
|
return parse_iso(dt_string)
|
||||||
|
|
||||||
|
|||||||
@@ -61,10 +61,9 @@ class _4archiveThreadExtractor(Extractor):
|
|||||||
extr = text.extract_from(post)
|
extr = text.extract_from(post)
|
||||||
data = {
|
data = {
|
||||||
"name": extr('class="name">', "</span>"),
|
"name": extr('class="name">', "</span>"),
|
||||||
"date": self.parse_datetime(
|
"date": self.parse_datetime_iso(
|
||||||
(extr('class="dateTime">', "<") or
|
(extr('class="dateTime">', "<") or
|
||||||
extr('class="dateTime postNum" >', "<")).strip(),
|
extr('class="dateTime postNum" >', "<")).strip()),
|
||||||
"%Y-%m-%d %H:%M:%S"),
|
|
||||||
"no" : text.parse_int(extr(">Post No.", "<")),
|
"no" : text.parse_int(extr(">Post No.", "<")),
|
||||||
}
|
}
|
||||||
if 'class="file"' in post:
|
if 'class="file"' in post:
|
||||||
|
|||||||
@@ -141,8 +141,8 @@ class BellazonExtractor(Extractor):
|
|||||||
"title": schema["headline"],
|
"title": schema["headline"],
|
||||||
"views": stats[0]["userInteractionCount"],
|
"views": stats[0]["userInteractionCount"],
|
||||||
"posts": stats[1]["userInteractionCount"],
|
"posts": stats[1]["userInteractionCount"],
|
||||||
"date" : self.parse_datetime(schema["datePublished"]),
|
"date" : self.parse_datetime_iso(schema["datePublished"]),
|
||||||
"date_updated": self.parse_datetime(schema["dateModified"]),
|
"date_updated": self.parse_datetime_iso(schema["dateModified"]),
|
||||||
"description" : text.unescape(schema["text"]).strip(),
|
"description" : text.unescape(schema["text"]).strip(),
|
||||||
"section" : path[-2],
|
"section" : path[-2],
|
||||||
"author" : author["name"],
|
"author" : author["name"],
|
||||||
@@ -162,7 +162,7 @@ class BellazonExtractor(Extractor):
|
|||||||
post = {
|
post = {
|
||||||
"id": extr('id="elComment_', '"'),
|
"id": extr('id="elComment_', '"'),
|
||||||
"author_url": extr(" href='", "'"),
|
"author_url": extr(" href='", "'"),
|
||||||
"date": self.parse_datetime(extr("datetime='", "'")),
|
"date": self.parse_datetime_iso(extr("datetime='", "'")),
|
||||||
"content": extr("<!-- Post content -->", "\n\t\t</div>"),
|
"content": extr("<!-- Post content -->", "\n\t\t</div>"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ class BloggerExtractor(BaseExtractor):
|
|||||||
blog = self.api.blog_by_url("http://" + self.blog)
|
blog = self.api.blog_by_url("http://" + self.blog)
|
||||||
blog["pages"] = blog["pages"]["totalItems"]
|
blog["pages"] = blog["pages"]["totalItems"]
|
||||||
blog["posts"] = blog["posts"]["totalItems"]
|
blog["posts"] = blog["posts"]["totalItems"]
|
||||||
blog["date"] = self.parse_datetime(blog["published"])
|
blog["date"] = self.parse_datetime_iso(blog["published"])
|
||||||
del blog["selfLink"]
|
del blog["selfLink"]
|
||||||
|
|
||||||
findall_image = util.re(
|
findall_image = util.re(
|
||||||
@@ -65,7 +65,7 @@ class BloggerExtractor(BaseExtractor):
|
|||||||
post["author"] = post["author"]["displayName"]
|
post["author"] = post["author"]["displayName"]
|
||||||
post["replies"] = post["replies"]["totalItems"]
|
post["replies"] = post["replies"]["totalItems"]
|
||||||
post["content"] = text.remove_html(content)
|
post["content"] = text.remove_html(content)
|
||||||
post["date"] = self.parse_datetime(post["published"])
|
post["date"] = self.parse_datetime_iso(post["published"])
|
||||||
del post["selfLink"]
|
del post["selfLink"]
|
||||||
del post["blog"]
|
del post["blog"]
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class CienArticleExtractor(CienExtractor):
|
|||||||
post["post_url"] = url
|
post["post_url"] = url
|
||||||
post["post_id"] = text.parse_int(post_id)
|
post["post_id"] = text.parse_int(post_id)
|
||||||
post["count"] = len(files)
|
post["count"] = len(files)
|
||||||
post["date"] = self.parse_datetime(post["datePublished"])
|
post["date"] = self.parse_datetime_iso(post["datePublished"])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
post["author"]["id"] = text.parse_int(author_id)
|
post["author"]["id"] = text.parse_int(author_id)
|
||||||
|
|||||||
@@ -1187,7 +1187,7 @@ class DeviantartStatusExtractor(DeviantartExtractor):
|
|||||||
deviation["username"] = deviation["author"]["username"]
|
deviation["username"] = deviation["author"]["username"]
|
||||||
deviation["_username"] = deviation["username"].lower()
|
deviation["_username"] = deviation["username"].lower()
|
||||||
|
|
||||||
deviation["date"] = d = self.parse_datetime(deviation["ts"])
|
deviation["date"] = d = self.parse_datetime_iso(deviation["ts"])
|
||||||
deviation["published_time"] = int(dt.to_ts(d))
|
deviation["published_time"] = int(dt.to_ts(d))
|
||||||
|
|
||||||
deviation["da_category"] = "Status"
|
deviation["da_category"] = "Status"
|
||||||
|
|||||||
@@ -72,9 +72,7 @@ class DiscordExtractor(Extractor):
|
|||||||
"author_files": [],
|
"author_files": [],
|
||||||
"message": self.extract_message_text(message),
|
"message": self.extract_message_text(message),
|
||||||
"message_id": message["id"],
|
"message_id": message["id"],
|
||||||
"date": self.parse_datetime(
|
"date": self.parse_datetime_iso(message["timestamp"]),
|
||||||
message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
|
|
||||||
),
|
|
||||||
"files": []
|
"files": []
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ class FanboxExtractor(Extractor):
|
|||||||
if file.get("extension", "").lower() in exts
|
if file.get("extension", "").lower() in exts
|
||||||
]
|
]
|
||||||
|
|
||||||
post["date"] = self.parse_datetime(post["publishedDatetime"])
|
post["date"] = self.parse_datetime_iso(post["publishedDatetime"])
|
||||||
post["text"] = content_body.get("text") if content_body else None
|
post["text"] = content_body.get("text") if content_body else None
|
||||||
post["isCoverImage"] = False
|
post["isCoverImage"] = False
|
||||||
|
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ class HatenablogExtractor(Extractor):
|
|||||||
|
|
||||||
def _handle_article(self, article: str):
|
def _handle_article(self, article: str):
|
||||||
extr = text.extract_from(article)
|
extr = text.extract_from(article)
|
||||||
date = self.parse_datetime(extr('<time datetime="', '"'))
|
date = self.parse_datetime_iso(extr('<time datetime="', '"'))
|
||||||
entry_link = text.unescape(extr('<a href="', '"'))
|
entry_link = text.unescape(extr('<a href="', '"'))
|
||||||
entry = entry_link.partition("/entry/")[2]
|
entry = entry_link.partition("/entry/")[2]
|
||||||
title = text.unescape(extr('>', '<'))
|
title = text.unescape(extr('>', '<'))
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ class HentaifoundryExtractor(Extractor):
|
|||||||
.replace("\r\n", "\n")),
|
.replace("\r\n", "\n")),
|
||||||
"ratings" : [text.unescape(r) for r in text.extract_iter(extr(
|
"ratings" : [text.unescape(r) for r in text.extract_iter(extr(
|
||||||
"class='ratings_box'", "</div>"), "title='", "'")],
|
"class='ratings_box'", "</div>"), "title='", "'")],
|
||||||
"date" : self.parse_datetime(extr("datetime='", "'")),
|
"date" : self.parse_datetime_iso(extr("datetime='", "'")),
|
||||||
"views" : text.parse_int(extr(">Views</span>", "<")),
|
"views" : text.parse_int(extr(">Views</span>", "<")),
|
||||||
"score" : text.parse_int(extr(">Vote Score</span>", "<")),
|
"score" : text.parse_int(extr(">Vote Score</span>", "<")),
|
||||||
"media" : text.unescape(extr(">Media</span>", "<").strip()),
|
"media" : text.unescape(extr(">Media</span>", "<").strip()),
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class ImgurExtractor(Extractor):
|
|||||||
|
|
||||||
image["url"] = url = \
|
image["url"] = url = \
|
||||||
f"https://i.imgur.com/{image['id']}.{image['ext']}"
|
f"https://i.imgur.com/{image['id']}.{image['ext']}"
|
||||||
image["date"] = self.parse_datetime(image["created_at"])
|
image["date"] = self.parse_datetime_iso(image["created_at"])
|
||||||
image["_http_validate"] = self._validate
|
image["_http_validate"] = self._validate
|
||||||
text.nameext_from_url(url, image)
|
text.nameext_from_url(url, image)
|
||||||
|
|
||||||
@@ -106,7 +106,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
|
|||||||
|
|
||||||
del album["media"]
|
del album["media"]
|
||||||
count = len(images)
|
count = len(images)
|
||||||
album["date"] = self.parse_datetime(album["created_at"])
|
album["date"] = self.parse_datetime_iso(album["created_at"])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
del album["ad_url"]
|
del album["ad_url"]
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ class LivedoorExtractor(Extractor):
|
|||||||
"title" : text.unescape(extr('dc:title="', '"')),
|
"title" : text.unescape(extr('dc:title="', '"')),
|
||||||
"categories" : extr('dc:subject="', '"').partition(",")[::2],
|
"categories" : extr('dc:subject="', '"').partition(",")[::2],
|
||||||
"description": extr('dc:description="', '"'),
|
"description": extr('dc:description="', '"'),
|
||||||
"date" : self.parse_datetime(extr('dc:date="', '"')),
|
"date" : self.parse_datetime_iso(extr('dc:date="', '"')),
|
||||||
"tags" : text.split_html(tags)[1:] if tags else [],
|
"tags" : text.split_html(tags)[1:] if tags else [],
|
||||||
"user" : self.user,
|
"user" : self.user,
|
||||||
"body" : body,
|
"body" : body,
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ class MangadexExtractor(Extractor):
|
|||||||
"chapter" : text.parse_int(chnum),
|
"chapter" : text.parse_int(chnum),
|
||||||
"chapter_minor": f"{sep}{minor}",
|
"chapter_minor": f"{sep}{minor}",
|
||||||
"chapter_id": chapter["id"],
|
"chapter_id": chapter["id"],
|
||||||
"date" : self.parse_datetime(cattributes["publishAt"]),
|
"date" : self.parse_datetime_iso(cattributes["publishAt"]),
|
||||||
"group" : [group["attributes"]["name"]
|
"group" : [group["attributes"]["name"]
|
||||||
for group in relationships["scanlation_group"]],
|
for group in relationships["scanlation_group"]],
|
||||||
"lang" : lang,
|
"lang" : lang,
|
||||||
@@ -109,8 +109,8 @@ class MangadexCoversExtractor(MangadexExtractor):
|
|||||||
"cover" : cattributes["fileName"],
|
"cover" : cattributes["fileName"],
|
||||||
"lang" : cattributes.get("locale"),
|
"lang" : cattributes.get("locale"),
|
||||||
"volume" : text.parse_int(cattributes["volume"]),
|
"volume" : text.parse_int(cattributes["volume"]),
|
||||||
"date" : self.parse_datetime(cattributes["createdAt"]),
|
"date" : self.parse_datetime_iso(cattributes["createdAt"]),
|
||||||
"date_updated": self.parse_datetime(cattributes["updatedAt"]),
|
"date_updated": self.parse_datetime_iso(cattributes["updatedAt"]),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -454,7 +454,7 @@ def _manga_info(self, uuid):
|
|||||||
"manga_id": manga["id"],
|
"manga_id": manga["id"],
|
||||||
"manga_titles": [t.popitem()[1]
|
"manga_titles": [t.popitem()[1]
|
||||||
for t in mattr.get("altTitles") or ()],
|
for t in mattr.get("altTitles") or ()],
|
||||||
"manga_date" : self.parse_datetime(mattr.get("createdAt")),
|
"manga_date" : self.parse_datetime_iso(mattr.get("createdAt")),
|
||||||
"description" : (mattr["description"].get("en") or
|
"description" : (mattr["description"].get("en") or
|
||||||
next(iter(mattr["description"].values()), "")),
|
next(iter(mattr["description"].values()), "")),
|
||||||
"demographic": mattr.get("publicationDemographic"),
|
"demographic": mattr.get("publicationDemographic"),
|
||||||
|
|||||||
@@ -64,8 +64,7 @@ class MastodonExtractor(BaseExtractor):
|
|||||||
|
|
||||||
status["count"] = len(attachments)
|
status["count"] = len(attachments)
|
||||||
status["tags"] = [tag["name"] for tag in status["tags"]]
|
status["tags"] = [tag["name"] for tag in status["tags"]]
|
||||||
status["date"] = self.parse_datetime(
|
status["date"] = self.parse_datetime_iso(status["created_at"][:19])
|
||||||
status["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
|
|
||||||
yield Message.Directory, status
|
yield Message.Directory, status
|
||||||
for status["num"], media in enumerate(attachments, 1):
|
for status["num"], media in enumerate(attachments, 1):
|
||||||
@@ -319,10 +318,8 @@ class MastodonAPI():
|
|||||||
if code == 404:
|
if code == 404:
|
||||||
raise exception.NotFoundError()
|
raise exception.NotFoundError()
|
||||||
if code == 429:
|
if code == 429:
|
||||||
self.extractor.wait(until=self.parse_datetime(
|
self.extractor.wait(until=self.parse_datetime_iso(
|
||||||
response.headers["x-ratelimit-reset"],
|
response.headers["x-ratelimit-reset"]))
|
||||||
"%Y-%m-%dT%H:%M:%S.%fZ",
|
|
||||||
))
|
|
||||||
continue
|
continue
|
||||||
raise exception.AbortExtraction(response.json().get("error"))
|
raise exception.AbortExtraction(response.json().get("error"))
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for https://www.newgrounds.com/"""
|
"""Extractors for https://www.newgrounds.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message, Dispatch
|
from .common import Extractor, Message, Dispatch
|
||||||
from .. import text, util, exception
|
from .. import text, util, dt, exception
|
||||||
from ..cache import cache
|
from ..cache import cache
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
@@ -218,7 +218,7 @@ class NewgroundsExtractor(Extractor):
|
|||||||
"description": text.unescape(extr(':description" content="', '"')),
|
"description": text.unescape(extr(':description" content="', '"')),
|
||||||
"type" : "art",
|
"type" : "art",
|
||||||
"_type" : "i",
|
"_type" : "i",
|
||||||
"date" : self.parse_datetime(extr(
|
"date" : dt.parse_compat(extr(
|
||||||
'itemprop="datePublished" content="', '"')),
|
'itemprop="datePublished" content="', '"')),
|
||||||
"rating" : extr('class="rated-', '"'),
|
"rating" : extr('class="rated-', '"'),
|
||||||
"url" : full('src="', '"'),
|
"url" : full('src="', '"'),
|
||||||
@@ -268,7 +268,7 @@ class NewgroundsExtractor(Extractor):
|
|||||||
"description": text.unescape(extr(':description" content="', '"')),
|
"description": text.unescape(extr(':description" content="', '"')),
|
||||||
"type" : "audio",
|
"type" : "audio",
|
||||||
"_type" : "a",
|
"_type" : "a",
|
||||||
"date" : self.parse_datetime(extr(
|
"date" : dt.parse_compat(extr(
|
||||||
'itemprop="datePublished" content="', '"')),
|
'itemprop="datePublished" content="', '"')),
|
||||||
"url" : extr('{"url":"', '"').replace("\\/", "/"),
|
"url" : extr('{"url":"', '"').replace("\\/", "/"),
|
||||||
"index" : text.parse_int(index),
|
"index" : text.parse_int(index),
|
||||||
@@ -287,7 +287,7 @@ class NewgroundsExtractor(Extractor):
|
|||||||
src = src.replace("\\/", "/")
|
src = src.replace("\\/", "/")
|
||||||
formats = ()
|
formats = ()
|
||||||
type = extr(',"description":"', '"')
|
type = extr(',"description":"', '"')
|
||||||
date = self.parse_datetime(extr(
|
date = dt.parse_compat(extr(
|
||||||
'itemprop="datePublished" content="', '"'))
|
'itemprop="datePublished" content="', '"'))
|
||||||
if type:
|
if type:
|
||||||
type = type.rpartition(" ")[2].lower()
|
type = type.rpartition(" ")[2].lower()
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for nijie instances"""
|
"""Extractors for nijie instances"""
|
||||||
|
|
||||||
from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
|
from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
|
||||||
from .. import text, exception
|
from .. import text, dt, exception
|
||||||
from ..cache import cache
|
from ..cache import cache
|
||||||
|
|
||||||
|
|
||||||
@@ -82,8 +82,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
|
|||||||
"title" : keywords[0].strip(),
|
"title" : keywords[0].strip(),
|
||||||
"description": text.unescape(extr(
|
"description": text.unescape(extr(
|
||||||
'"description": "', '"').replace("&amp;", "&")),
|
'"description": "', '"').replace("&amp;", "&")),
|
||||||
"date" : self.parse_datetime(extr(
|
"date" : dt.parse(extr(
|
||||||
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y", 9),
|
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"
|
||||||
|
) - dt.timedelta(hours=9),
|
||||||
"artist_id" : text.parse_int(extr('/members.php?id=', '"')),
|
"artist_id" : text.parse_int(extr('/members.php?id=', '"')),
|
||||||
"artist_name": keywords[1],
|
"artist_name": keywords[1],
|
||||||
"tags" : keywords[2:-1],
|
"tags" : keywords[2:-1],
|
||||||
@@ -101,9 +102,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
|
|||||||
"artist_id" : text.parse_int(extr('members.php?id=', '"')),
|
"artist_id" : text.parse_int(extr('members.php?id=', '"')),
|
||||||
"artist_name": keywords[1],
|
"artist_name": keywords[1],
|
||||||
"tags" : keywords[2:-1],
|
"tags" : keywords[2:-1],
|
||||||
"date" : self.parse_datetime(extr(
|
"date" : dt.parse_iso(extr(
|
||||||
"itemprop='datePublished' content=", "<").rpartition(">")[2],
|
"itemprop='datePublished' content=", "<").rpartition(">")[2]
|
||||||
"%Y-%m-%d %H:%M:%S", 9),
|
) - dt.timedelta(hours=9),
|
||||||
}
|
}
|
||||||
|
|
||||||
def _extract_images(self, image_id, page):
|
def _extract_images(self, image_id, page):
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class PostmillExtractor(BaseExtractor):
|
|||||||
|
|
||||||
title = text.unescape(extr(
|
title = text.unescape(extr(
|
||||||
'<meta property="og:title" content="', '">'))
|
'<meta property="og:title" content="', '">'))
|
||||||
date = self.parse_datetime(extr(
|
date = self.parse_datetime_iso(extr(
|
||||||
'<meta property="og:article:published_time" content="', '">'))
|
'<meta property="og:article:published_time" content="', '">'))
|
||||||
username = extr(
|
username = extr(
|
||||||
'<meta property="og:article:author" content="', '">')
|
'<meta property="og:article:author" content="', '">')
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ class ReactorExtractor(BaseExtractor):
|
|||||||
return
|
return
|
||||||
|
|
||||||
num = 0
|
num = 0
|
||||||
date = self.parse_datetime(data["datePublished"])
|
date = self.parse_datetime_iso(data["datePublished"])
|
||||||
user = data["author"]["name"]
|
user = data["author"]["name"]
|
||||||
description = text.unescape(data["description"])
|
description = text.unescape(data["description"])
|
||||||
title, _, tags = text.unescape(data["headline"]).partition(" / ")
|
title, _, tags = text.unescape(data["headline"]).partition(" / ")
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
|
|||||||
extr('property="og:title" content="', '"')),
|
extr('property="og:title" content="', '"')),
|
||||||
"description": text.unescape(
|
"description": text.unescape(
|
||||||
extr('property="og:description" content="', '"')),
|
extr('property="og:description" content="', '"')),
|
||||||
"date" : self.parse_datetime(
|
"date" : self.parse_datetime_iso(
|
||||||
extr('property="article:published_time" content="', '"')),
|
extr('property="article:published_time" content="', '"')),
|
||||||
}
|
}
|
||||||
content = extr('<div class="entry-content">', '</article>')
|
content = extr('<div class="entry-content">', '</article>')
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ class SimpcityExtractor(Extractor):
|
|||||||
"id" : url_t[url_t.rfind(".")+1:-1],
|
"id" : url_t[url_t.rfind(".")+1:-1],
|
||||||
"url" : url_t,
|
"url" : url_t,
|
||||||
"title": schema["headline"],
|
"title": schema["headline"],
|
||||||
"date" : self.parse_datetime(schema["datePublished"]),
|
"date" : self.parse_datetime_iso(schema["datePublished"]),
|
||||||
"views": stats[0]["userInteractionCount"],
|
"views": stats[0]["userInteractionCount"],
|
||||||
"posts": stats[1]["userInteractionCount"],
|
"posts": stats[1]["userInteractionCount"],
|
||||||
"tags" : (schema["keywords"].split(", ")
|
"tags" : (schema["keywords"].split(", ")
|
||||||
@@ -119,7 +119,7 @@ class SimpcityExtractor(Extractor):
|
|||||||
"author": extr('data-author="', '"'),
|
"author": extr('data-author="', '"'),
|
||||||
"id": extr('data-content="post-', '"'),
|
"id": extr('data-content="post-', '"'),
|
||||||
"author_url": extr('itemprop="url" content="', '"'),
|
"author_url": extr('itemprop="url" content="', '"'),
|
||||||
"date": self.parse_datetime(extr('datetime="', '"')),
|
"date": self.parse_datetime_iso(extr('datetime="', '"')),
|
||||||
"content": extr('<div itemprop="text">',
|
"content": extr('<div itemprop="text">',
|
||||||
'<div class="js-selectToQuote').strip(),
|
'<div class="js-selectToQuote').strip(),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -89,7 +89,7 @@ class TapasEpisodeExtractor(TapasExtractor):
|
|||||||
|
|
||||||
html = data["html"]
|
html = data["html"]
|
||||||
episode["series"] = self._extract_series(html)
|
episode["series"] = self._extract_series(html)
|
||||||
episode["date"] = self.parse_datetime(episode["publish_date"])
|
episode["date"] = self.parse_datetime_iso(episode["publish_date"])
|
||||||
yield Message.Directory, episode
|
yield Message.Directory, episode
|
||||||
|
|
||||||
if episode["book"]:
|
if episode["book"]:
|
||||||
|
|||||||
@@ -27,9 +27,8 @@ class TelegraphGalleryExtractor(GalleryExtractor):
|
|||||||
'property="og:title" content="', '"')),
|
'property="og:title" content="', '"')),
|
||||||
"description": text.unescape(extr(
|
"description": text.unescape(extr(
|
||||||
'property="og:description" content="', '"')),
|
'property="og:description" content="', '"')),
|
||||||
"date": self.parse_datetime(extr(
|
"date": self.parse_datetime_iso(extr(
|
||||||
'property="article:published_time" content="', '"'),
|
'property="article:published_time" content="', '"')),
|
||||||
"%Y-%m-%dT%H:%M:%S%z"),
|
|
||||||
"author": text.unescape(extr(
|
"author": text.unescape(extr(
|
||||||
'property="article:author" content="', '"')),
|
'property="article:author" content="', '"')),
|
||||||
"post_url": text.unescape(extr(
|
"post_url": text.unescape(extr(
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ class TungstenExtractor(Extractor):
|
|||||||
def items(self):
|
def items(self):
|
||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
url = post["original_url"]
|
url = post["original_url"]
|
||||||
post["date"] = self.parse_datetime(post["created_at"])
|
post["date"] = self.parse_datetime_iso(post["created_at"])
|
||||||
post["filename"] = url[url.rfind("/")+1:]
|
post["filename"] = url[url.rfind("/")+1:]
|
||||||
post["extension"] = "webp"
|
post["extension"] = "webp"
|
||||||
yield Message.Directory, post
|
yield Message.Directory, post
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ class UnsplashExtractor(Extractor):
|
|||||||
if metadata:
|
if metadata:
|
||||||
photo.update(metadata)
|
photo.update(metadata)
|
||||||
photo["extension"] = "jpg"
|
photo["extension"] = "jpg"
|
||||||
photo["date"] = self.parse_datetime(photo["created_at"])
|
photo["date"] = self.parse_datetime_iso(photo["created_at"])
|
||||||
if "tags" in photo:
|
if "tags" in photo:
|
||||||
photo["tags"] = [t["title"] for t in photo["tags"]]
|
photo["tags"] = [t["title"] for t in photo["tags"]]
|
||||||
|
|
||||||
|
|||||||
@@ -24,8 +24,7 @@ class WeasylExtractor(Extractor):
|
|||||||
# Some submissions don't have content and can be skipped
|
# Some submissions don't have content and can be skipped
|
||||||
if "submission" in data["media"]:
|
if "submission" in data["media"]:
|
||||||
data["url"] = data["media"]["submission"][0]["url"]
|
data["url"] = data["media"]["submission"][0]["url"]
|
||||||
data["date"] = self.parse_datetime(
|
data["date"] = self.parse_datetime_iso(data["posted_at"][:19])
|
||||||
data["posted_at"][:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
text.nameext_from_url(data["url"], data)
|
text.nameext_from_url(data["url"], data)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
@@ -42,7 +41,7 @@ class WeasylExtractor(Extractor):
|
|||||||
f"{self.root}/api/journals/{journalid}/view")
|
f"{self.root}/api/journals/{journalid}/view")
|
||||||
data["extension"] = "html"
|
data["extension"] = "html"
|
||||||
data["html"] = "text:" + data["content"]
|
data["html"] = "text:" + data["content"]
|
||||||
data["date"] = self.parse_datetime(data["posted_at"])
|
data["date"] = self.parse_datetime_iso(data["posted_at"])
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def submissions(self, owner_login, folderid=None):
|
def submissions(self, owner_login, folderid=None):
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ class ZerochanExtractor(BooruExtractor):
|
|||||||
data = {
|
data = {
|
||||||
"id" : text.parse_int(entry_id),
|
"id" : text.parse_int(entry_id),
|
||||||
"file_url": jsonld["contentUrl"],
|
"file_url": jsonld["contentUrl"],
|
||||||
"date" : self.parse_datetime(jsonld["datePublished"]),
|
"date" : self.parse_datetime_iso(jsonld["datePublished"]),
|
||||||
"width" : text.parse_int(jsonld["width"][:-3]),
|
"width" : text.parse_int(jsonld["width"][:-3]),
|
||||||
"height" : text.parse_int(jsonld["height"][:-3]),
|
"height" : text.parse_int(jsonld["height"][:-3]),
|
||||||
"size" : text.parse_bytes(jsonld["contentSize"][:-1]),
|
"size" : text.parse_bytes(jsonld["contentSize"][:-1]),
|
||||||
|
|||||||
Reference in New Issue
Block a user