Force the lxml factory to pass around unicode strings

This is not necessarily the most compatible choice for downstream XML
parsers, but it should at least ensure that we manage to write the XML file.
The encoding declared in the header is not necessarily the same as the one
we get from the API.

See also:
https://lxml.de/FAQ.html#why-can-t-lxml-parse-my-xml-from-unicode-strings
https://lxml.de/3.7/parsing.html#serialising-to-unicode-strings

Fixes https://github.com/WikiTeam/wikiteam/issues/363
pull/367/head
Federico Leva 4 years ago
parent 6dc86d1964
commit d1619392f4

@@ -1033,10 +1033,10 @@ def makeXmlFromPage(page):
 E.parentid(to_unicode(rev['parentid'])),
 E.timestamp(rev['timestamp']),
 E.contributor(
-E.id(to_unicode(userid)),
 E.username(to_unicode(rev['user'])),
+E.id(to_unicode(userid)),
 ),
-E.text(rev['*'], space="preserve", bytes=to_unicode(size)),
+E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
 )
 if 'comment' in rev:
 revision.append(E.comment(to_unicode(rev['comment'])))
@@ -1049,7 +1049,7 @@ def makeXmlFromPage(page):
 except KeyError as e:
 print(e)
 raise PageMissingError(page['title'], e)
-return etree.tostring(p, pretty_print=True)
+return etree.tostring(p, pretty_print=True, encoding='unicode')
 def readTitles(config={}, start=None):
 """ Read title list from a file, from the title "start" """

Loading…
Cancel
Save