unescape items in text.split_html()
This commit is contained in:
@@ -126,8 +126,8 @@ class AryionExtractor(Extractor):
|
|||||||
"user" : self.user or artist,
|
"user" : self.user or artist,
|
||||||
"title" : title,
|
"title" : title,
|
||||||
"artist": artist,
|
"artist": artist,
|
||||||
"path" : text.split_html(text.unescape(extr(
|
"path" : text.split_html(extr(
|
||||||
"cookiecrumb'>", '</span')))[4:-1:2],
|
"cookiecrumb'>", '</span'))[4:-1:2],
|
||||||
"date" : extr("class='pretty-date' title='", "'"),
|
"date" : extr("class='pretty-date' title='", "'"),
|
||||||
"size" : text.parse_int(clen),
|
"size" : text.parse_int(clen),
|
||||||
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
|
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
|
||||||
|
|||||||
@@ -1,19 +1,18 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2015-2019 Mike Fährmann
|
# Copyright 2015-2021 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
# published by the Free Software Foundation.
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
"""Extract manga-chapters from https://dynasty-scans.com/"""
|
"""Extractors for https://dynasty-scans.com/"""
|
||||||
|
|
||||||
from .common import ChapterExtractor, Extractor, Message
|
from .common import ChapterExtractor, Extractor, Message
|
||||||
from .. import text
|
from .. import text
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
|
BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
|
||||||
|
|
||||||
|
|
||||||
@@ -36,7 +35,7 @@ class DynastyscansBase():
|
|||||||
return {
|
return {
|
||||||
"url" : self.root + url,
|
"url" : self.root + url,
|
||||||
"image_id": text.parse_int(image_id),
|
"image_id": text.parse_int(image_id),
|
||||||
"tags" : text.split_html(text.unescape(tags)),
|
"tags" : text.split_html(tags),
|
||||||
"date" : text.remove_html(date),
|
"date" : text.remove_html(date),
|
||||||
"source" : text.unescape(src),
|
"source" : text.unescape(src),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "):
|
|||||||
return txt.strip()
|
return txt.strip()
|
||||||
|
|
||||||
|
|
||||||
def split_html(txt, sep=None):
|
def split_html(txt):
|
||||||
"""Split input string by html-tags"""
|
"""Split input string by HTML tags"""
|
||||||
try:
|
try:
|
||||||
return [
|
return [
|
||||||
x.strip() for x in HTML_RE.split(txt)
|
unescape(x).strip()
|
||||||
|
for x in HTML_RE.split(txt)
|
||||||
if x and not x.isspace()
|
if x and not x.isspace()
|
||||||
]
|
]
|
||||||
except TypeError:
|
except TypeError:
|
||||||
|
|||||||
@@ -59,6 +59,10 @@ class TestText(unittest.TestCase):
|
|||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)
|
f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)
|
||||||
|
|
||||||
|
# escaped HTML entities
|
||||||
|
self.assertEqual(
|
||||||
|
f("<i>&lt;foo&gt;</i> <i>&lt;bar&gt; </i>"), ["<foo>", "<bar>"])
|
||||||
|
|
||||||
# empty HTML
|
# empty HTML
|
||||||
self.assertEqual(f("<div></div>"), empty)
|
self.assertEqual(f("<div></div>"), empty)
|
||||||
self.assertEqual(f(" <div> </div> "), empty)
|
self.assertEqual(f(" <div> </div> "), empty)
|
||||||
|
|||||||
Reference in New Issue
Block a user