# This seems to work for me.
#
# Pattern matching any single character that is NOT in the allowed set
# spelled out in the class: tab, LF, CR, U+0020-U+007E, U+0085,
# U+00A0-U+D7FF, U+E000-U+FDCF and U+FDE0-U+FFFD.
#
# The literals use the plain u'' prefix rather than ur'': ur'' is a syntax
# error on Python 3, while u'' is accepted by Python 2 and Python 3.3+.
# Without the raw flag the \xNN and \uNNNN escapes are resolved by the string
# literal itself, which produces the same character class.
r = re.compile(
    u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\xFF'
    u'\u0100-\uD7FF\uE000-\uFDCF\uFDE0-\uFFFD]'
)
def escapeInvalidXML(string):
    """Replace every character matched by the module-level pattern ``r``.

    Each matched character is substituted with its code point written as
    uppercase hexadecimal, zero-padded to at least four digits.

    Args:
        string: The text to process.

    Returns:
        The text with every matched character replaced.
    """
    def hex_of_match(match):
        code_point = ord(match.group(0))
        # NOTE(review): the empty prefix/suffix strings contribute nothing.
        # They look like placeholders for wrapper markup (the SAX-based
        # variant wraps the same hex text in a "u" element) -- confirm.
        return "" + ('%04X' % code_point) + ""

    return r.sub(hex_of_match, string)
示例:
>>> escapeInvalidXML(u'this is a \x01 test')
u'this is a 0001 test'
更新:哎呀,之前忘了改用 SAX 的 startElement/characters 方法,并正确处理多行文本:
import re
import xml.sax.xmlreader
import xml.sax.saxutils
# Splitting pattern. Every match is a (possibly empty) run of ordinary text
# captured in group 1, followed by exactly one of:
#   group 2 - a single character outside the allowed set,
#   group 3 - a newline,
#   or the end of the string (groups 2 and 3 both unset).
#
# The literals use the plain u'' prefix rather than ur'': ur'' is a syntax
# error on Python 3, while u'' is accepted by Python 2 and Python 3.3+.
# Without the raw flag the \xNN, \uNNNN and \n escapes are resolved by the
# string literal itself, which produces the same pattern.
r = re.compile(
    u'(.*?)(?:([^\x09\x0A\x0D\x20-\x7E\x85\xA0-\xFF'
    u'\u0100-\uD7FF\uE000-\uFDCF\uFDE0-\uFFFD])|([\n])|$)'
)
# Empty attribute set passed to the startElement() calls in this file.
attr0 = xml.sax.xmlreader.AttributesImpl({})
def splitInvalidXML(string):
    """Split *string* into text fragments and invalid-character code points.

    The module-level pattern ``r`` is applied repeatedly across *string*.
    This function reads groups 1-3 of each match, so it assumes ``r`` is the
    three-group splitting pattern (text run, invalid character, newline).

    Args:
        string: The text to split.

    Returns:
        A list, in input order, whose items are either strings (non-empty
        runs of ordinary text, and each matched newline as its own item) or
        ints (``ord()`` of each character captured as invalid).
    """
    fragments = []
    # finditer() visits the same successive matches that re.sub() would,
    # without building a replacement string only to throw it away.
    for match in r.finditer(string):
        text, invalid, newline = match.group(1, 2, 3)
        if text:
            fragments.append(text)
        if invalid is not None:
            # Stored as an int so callers can tell it apart from plain text.
            fragments.append(ord(invalid))
        if newline is not None:
            fragments.append(newline)
    return fragments
def submitCharacters(x, string):
    """Send *string* to the handler *x*, wrapping invalid characters.

    The text is split with ``splitInvalidXML``. String fragments are passed
    straight to ``x.characters``. Each int fragment (a code point) is emitted
    as a ``u`` element whose text is the code point in uppercase hex,
    zero-padded to at least four digits.

    Args:
        x: Object providing ``startElement``, ``characters`` and
            ``endElement`` methods.
        string: The text to submit.
    """
    for piece in splitInvalidXML(string):
        if not isinstance(piece, int):
            x.characters(piece)
            continue
        x.startElement("u", attr0)
        x.characters('%04X' % piece)
        x.endElement("u")
def test1(fname):
    """Write a small sample XML document to the file *fname*.

    The document consists of a single ``document`` element whose text is a
    sample string containing the control characters ``\\x01``, ``\\x02`` and
    ``\\x0b``; the text is sent through ``submitCharacters``.

    Args:
        fname: Path of the file to open for writing.
    """
    sample = 'this is a \x01 test\nof the \x02\x0b xml system.'
    with open(fname, 'w') as out:
        gen = xml.sax.saxutils.XMLGenerator(out)
        gen.startDocument()
        gen.startElement('document', attr0)
        submitCharacters(gen, sample)
        gen.endElement('document')
        gen.endDocument()
# Run the demo: writes the sample document to test.xml.
test1(fname='test.xml')
这会产生:
<?xml version="1.0" encoding="iso-8859-1"?>
<document>this is a <u>0001</u> test
of the <u>0002</u><u>000B</u> xml system.</document>