[seiga] fix file extension and xml parsing
- The file extension of the first image had been used for all further images
- API responses can contain invalid characters, which cause the XML parser to fail (http://seiga.nicovideo.jp/user/illust/26377934 contains several \x08 characters)
This commit is contained in:
@@ -15,6 +15,20 @@ import html
|
||||
import urllib.parse
|
||||
|
||||
|
||||
# Characters that are not allowed in XML 1.0 documents: every C0 control
# character, NUL included, except tab (0x09), line feed (0x0A) and
# carriage return (0x0D).
INVALID_XML_CHARS = [
    chr(c) for c in range(0x20) if c not in (0x09, 0x0A, 0x0D)
]
|
||||
|
||||
|
||||
def clean_xml(xmldata, repl=""):
    """Replace/Remove invalid control characters in XML data

    Every character of 'xmldata' that is listed in INVALID_XML_CHARS is
    substituted with 'repl'; the default empty string removes it.
    Returns the cleaned string.
    """
    # One translate() pass instead of one scan-and-copy per listed character.
    # The table is rebuilt on each call so it reflects the current contents
    # of INVALID_XML_CHARS and the requested replacement.
    table = dict.fromkeys(map(ord, INVALID_XML_CHARS), repl)
    return xmldata.translate(table)
|
||||
|
||||
|
||||
def remove_html(text):
    """Strip html-tags from a string and collapse whitespace"""
    # Each tag becomes a single space so that adjacent words stay separated.
    without_tags = re.sub("<[^>]+?>", " ", text)
    # split() without arguments drops leading/trailing whitespace and
    # treats any run of whitespace as one separator.
    words = without_tags.split()
    return " ".join(words)
|
||||
|
||||
Reference in New Issue
Block a user