[Yum-devel] [PATCH] clean up misc.to_xml(), make it faster, add tests. BZ 716235.
Zdeněk Pavlas
zpavlas at redhat.com
Fri Nov 16 13:47:12 UTC 2012
Returns valid UTF8, and the input is mostly UTF8, too.
Avoid conversion to unicode, avoid saxutils.
---
test/misc-tests.py | 32 +++++++++++++++++++++
yum/misc.py | 77 ++++++++++++++-------------------------------------
2 files changed, 53 insertions(+), 56 deletions(-)
diff --git a/test/misc-tests.py b/test/misc-tests.py
index d34c161..9c62e24 100644
--- a/test/misc-tests.py
+++ b/test/misc-tests.py
@@ -114,6 +114,38 @@ class MiscTests(DepsolveTests):
res, msg = solver.buildTransaction()
return self.res[res], msg
+ def testXML(self):
+ import yum.misc
+ for i in (
+
+# valid utf8 and unicode
+('\xc4\x9b\xc5\xa1\xc4\x8d', '\xc4\x9b\xc5\xa1\xc4\x8d'),
+(u'\u011b\u0161\u010d', '\xc4\x9b\xc5\xa1\xc4\x8d'),
+
+# invalid utf8
+('\xc3\x28', '\xef\xbf\xbd' + '\x28'),
+('\xa0\xa1', '\xef\xbf\xbd' * 2),
+
+# entity expansion
+('&<>', '&amp;&lt;&gt;'),
+
+# removal of invalid bytes
+('\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f', '\t\n\r'),
+
+# attr flag
+('&"', '&amp;"'),
+('&"', True, '&amp;&quot;'),
+
+# weirdness
+(None, ''),
+('abc ', 'abc'),
+
+ ):
+ i = list(i); ok = i.pop()
+ ret = yum.misc.to_xml(*i)
+ self.assertEqual(type(ret), str)
+ self.assertEqual(ret, ok)
+
def setup_logging():
logging.basicConfig()
plainformatter = logging.Formatter("%(message)s")
diff --git a/yum/misc.py b/yum/misc.py
index a0bac7b..9f403db 100644
--- a/yum/misc.py
+++ b/yum/misc.py
@@ -897,67 +897,32 @@ def seq_max_split(seq, max_entries):
ret.append(seq[beg:])
return ret
-def _ugly_utf8_string_hack(item):
- """hands back a unicoded string"""
- # this is backward compat for handling non-utf8 filenames
- # and content inside packages. :(
- # content that xml can cope with but isn't really kosher
-
- # if we're anything obvious - do them first
- if item is None:
- return ''
- elif isinstance(item, unicode):
- return item
-
- # this handles any bogon formats we see
- du = False
- try:
- x = unicode(item, 'ascii')
- du = True
- except UnicodeError:
- encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
- for enc in encodings:
- try:
- x = unicode(item, enc)
- except UnicodeError:
- pass
-
- else:
- if x.encode(enc) == item:
- if enc != 'utf-8':
- print '\n%s encoding on %s\n' % (enc, item)
- return x.encode('utf-8')
-
-
- # Kill bytes (or libxml will die) not in the small byte portion of:
- # http://www.w3.org/TR/REC-xml/#NT-Char
- # we allow high bytes, if it passed the utf8 check above. Eg.
- # good chars = #x9 | #xA | #xD | [#x20-...]
- newitem = ''
- bad_small_bytes = range(0, 8) + [11, 12] + range(14, 32)
- for char in item:
- if ord(char) in bad_small_bytes:
- pass # Just ignore these bytes...
- elif not du and ord(char) > 127:
- newitem = newitem + '?' # byte by byte equiv of escape
- else:
- newitem = newitem + char
- return newitem
+_deletechars = ''.join(chr(i) for i in range(32) if i not in (9, 10, 13))
-__cached_saxutils = None
def to_xml(item, attrib=False):
- global __cached_saxutils
- if __cached_saxutils is None:
- import xml.sax.saxutils
- __cached_saxutils = xml.sax.saxutils
+ if type(item) is str:
+ # check if valid utf8
+ try: unicode(item, 'utf8')
+ except UnicodeDecodeError:
+ item = unicode(item, 'utf8', 'replace').encode('utf8')
+ elif type(item) is unicode:
+ item = item.encode('utf8')
+ elif item is None:
+ return ''
- item = _ugly_utf8_string_hack(item)
- item = to_utf8(item)
+ # compat cruft...
item = item.rstrip()
+
+ # kill invalid low bytes
+ item = item.translate(None, _deletechars)
+
+ # quote reserved XML characters
+ item = item.replace("&", "&amp;")
+ item = item.replace("<", "&lt;")
+ item = item.replace(">", "&gt;")
if attrib:
- item = __cached_saxutils.escape(item, entities={'"':"&quot;"})
- else:
- item = __cached_saxutils.escape(item)
+ item = item.replace('"', '&quot;')
+
return item
def unlink_f(filename):
--
1.7.4.4
More information about the Yum-devel
mailing list