[Yum-devel] [PATCH] clean up misc.to_xml(), make it faster, add tests. BZ 716235.

Zdeněk Pavlas zpavlas at redhat.com
Fri Nov 16 13:47:12 UTC 2012


Returns valid UTF8, and the input is mostly UTF8, too.
Avoid conversion to unicode, avoid saxutils.
---
 test/misc-tests.py |   32 +++++++++++++++++++++
 yum/misc.py        |   77 ++++++++++++++-------------------------------------
 2 files changed, 53 insertions(+), 56 deletions(-)

diff --git a/test/misc-tests.py b/test/misc-tests.py
index d34c161..9c62e24 100644
--- a/test/misc-tests.py
+++ b/test/misc-tests.py
@@ -114,6 +114,38 @@ class MiscTests(DepsolveTests):
         res, msg = solver.buildTransaction()
         return self.res[res], msg
 
+    def testXML(self):
+        import yum.misc
+        for i in (
+
+# valid utf8 and unicode
+('\xc4\x9b\xc5\xa1\xc4\x8d', '\xc4\x9b\xc5\xa1\xc4\x8d'),
+(u'\u011b\u0161\u010d',      '\xc4\x9b\xc5\xa1\xc4\x8d'),
+
+# invalid utf8
+('\xc3\x28', '\xef\xbf\xbd' + '\x28'),
+('\xa0\xa1', '\xef\xbf\xbd' * 2),
+
+# entity expansion
+('&<>', '&amp;&lt;&gt;'),
+
+# removal of invalid bytes
+('\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f', '\t\n\r'),
+
+# attr flag
+('&"', '&amp;"'),
+('&"', True, '&amp;&quot;'),
+
+# weirdness
+(None, ''),
+('abc ', 'abc'),
+
+        ):
+            i = list(i); ok = i.pop()
+            ret = yum.misc.to_xml(*i)
+            self.assertEqual(type(ret), str)
+            self.assertEqual(ret, ok)
+
 def setup_logging():
     logging.basicConfig()    
     plainformatter = logging.Formatter("%(message)s")    
diff --git a/yum/misc.py b/yum/misc.py
index a0bac7b..9f403db 100644
--- a/yum/misc.py
+++ b/yum/misc.py
@@ -897,67 +897,32 @@ def seq_max_split(seq, max_entries):
     ret.append(seq[beg:])
     return ret
 
-def _ugly_utf8_string_hack(item):
-    """hands back a unicoded string"""
-    # this is backward compat for handling non-utf8 filenames 
-    # and content inside packages. :(
-    # content that xml can cope with but isn't really kosher
-
-    # if we're anything obvious - do them first
-    if item is None:
-        return ''
-    elif isinstance(item, unicode):    
-        return item
-    
-    # this handles any bogon formats we see
-    du = False
-    try:
-        x = unicode(item, 'ascii')
-        du = True
-    except UnicodeError:
-        encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
-        for enc in encodings:
-            try:
-                x = unicode(item, enc)
-            except UnicodeError:
-                pass
-                
-            else:
-                if x.encode(enc) == item:
-                    if enc != 'utf-8':
-                        print '\n%s encoding on %s\n' % (enc, item)
-                    return x.encode('utf-8')
-    
-    
-    # Kill bytes (or libxml will die) not in the small byte portion of:
-    #  http://www.w3.org/TR/REC-xml/#NT-Char
-    # we allow high bytes, if it passed the utf8 check above. Eg.
-    # good chars = #x9 | #xA | #xD | [#x20-...]
-    newitem = ''
-    bad_small_bytes = range(0, 8) + [11, 12] + range(14, 32)
-    for char in item:
-        if ord(char) in bad_small_bytes:
-            pass # Just ignore these bytes...
-        elif not du and ord(char) > 127:
-            newitem = newitem + '?' # byte by byte equiv of escape
-        else:
-            newitem = newitem + char
-    return newitem
+_deletechars = ''.join(chr(i) for i in range(32) if i not in (9, 10, 13))
 
-__cached_saxutils = None
 def to_xml(item, attrib=False):
-    global __cached_saxutils
-    if __cached_saxutils is None:
-        import xml.sax.saxutils
-        __cached_saxutils = xml.sax.saxutils
+    if type(item) is str:
+        # check if valid utf8
+        try: unicode(item, 'utf8')
+        except UnicodeDecodeError:
+            item = unicode(item, 'utf8', 'replace').encode('utf8')
+    elif type(item) is unicode:
+        item = item.encode('utf8')
+    elif item is None:
+        return ''
 
-    item = _ugly_utf8_string_hack(item)
-    item = to_utf8(item)
+    # compat cruft...
     item = item.rstrip()
+
+    # kill ivalid low bytes
+    item = item.translate(None, _deletechars)
+
+    # quote reserved XML characters
+    item = item.replace("&", "&amp;")
+    item = item.replace("<", "&lt;")
+    item = item.replace(">", "&gt;")
     if attrib:
-        item = __cached_saxutils.escape(item, entities={'"':"&quot;"})
-    else:
-        item = __cached_saxutils.escape(item)
+        item = item.replace('"', '&quot;')
+
     return item
 
 def unlink_f(filename):
-- 
1.7.4.4



More information about the Yum-devel mailing list