[yum-commits] test/misc-tests.py yum/misc.py

zpavlas at osuosl.org zpavlas at osuosl.org
Mon Nov 19 11:18:48 UTC 2012


 test/misc-tests.py |   33 ++++++++++++++++++++
 yum/misc.py        |   84 +++++++++++++++++------------------------------------
 2 files changed, 61 insertions(+), 56 deletions(-)

New commits:
commit 22586dce865ee6509daf94d3930ee53a20fc2a0e
Author: Zdeněk Pavlas <zpavlas at redhat.com>
Date:   Fri Nov 16 11:21:00 2012 +0100

    clean up misc.to_xml(), make it faster, add tests.  BZ 716235.

diff --git a/test/misc-tests.py b/test/misc-tests.py
index d34c161..4bf0821 100644
--- a/test/misc-tests.py
+++ b/test/misc-tests.py
@@ -114,6 +114,39 @@ class MiscTests(DepsolveTests):
         res, msg = solver.buildTransaction()
         return self.res[res], msg
 
+    def testXML(self):
+        import yum.misc
+        for i in (
+
+# valid utf8 and unicode
+('\xc4\x9b\xc5\xa1\xc4\x8d', '\xc4\x9b\xc5\xa1\xc4\x8d'),
+(u'\u011b\u0161\u010d',      '\xc4\x9b\xc5\xa1\xc4\x8d'),
+
+# invalid utf8
+('\xc3\x28', '\xc3\x83\x28'),
+('\xa0\xa1', '\xc2\xa0\xc2\xa1'),
+('Skytt\xe4', 'Skytt\xc3\xa4'),
+
+# entity expansion
+('&<>', '&amp;&lt;&gt;'),
+
+# removal of invalid bytes
+('\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f', '\t\n\r'),
+
+# attr flag
+('&"\'', '&amp;"\''),
+('&"\'', True, '&amp;&quot;&apos;'),
+
+# weirdness
+(None, ''),
+('abc ', 'abc'),
+
+        ):
+            i = list(i); ok = i.pop()
+            ret = yum.misc.to_xml(*i)
+            self.assertEqual(type(ret), str)
+            self.assertEqual(ret, ok)
+
 def setup_logging():
     logging.basicConfig()    
     plainformatter = logging.Formatter("%(message)s")    
diff --git a/yum/misc.py b/yum/misc.py
index a0bac7b..6c3c349 100644
--- a/yum/misc.py
+++ b/yum/misc.py
@@ -897,67 +897,39 @@ def seq_max_split(seq, max_entries):
     ret.append(seq[beg:])
     return ret
 
-def _ugly_utf8_string_hack(item):
-    """hands back a unicoded string"""
-    # this is backward compat for handling non-utf8 filenames 
-    # and content inside packages. :(
-    # content that xml can cope with but isn't really kosher
-
-    # if we're anything obvious - do them first
-    if item is None:
-        return ''
-    elif isinstance(item, unicode):    
-        return item
-    
-    # this handles any bogon formats we see
-    du = False
-    try:
-        x = unicode(item, 'ascii')
-        du = True
-    except UnicodeError:
-        encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
-        for enc in encodings:
-            try:
-                x = unicode(item, enc)
-            except UnicodeError:
-                pass
-                
-            else:
-                if x.encode(enc) == item:
-                    if enc != 'utf-8':
-                        print '\n%s encoding on %s\n' % (enc, item)
-                    return x.encode('utf-8')
-    
-    
-    # Kill bytes (or libxml will die) not in the small byte portion of:
-    #  http://www.w3.org/TR/REC-xml/#NT-Char
-    # we allow high bytes, if it passed the utf8 check above. Eg.
-    # good chars = #x9 | #xA | #xD | [#x20-...]
-    newitem = ''
-    bad_small_bytes = range(0, 8) + [11, 12] + range(14, 32)
-    for char in item:
-        if ord(char) in bad_small_bytes:
-            pass # Just ignore these bytes...
-        elif not du and ord(char) > 127:
-            newitem = newitem + '?' # byte by byte equiv of escape
-        else:
-            newitem = newitem + char
-    return newitem
+_deletechars = ''.join(chr(i) for i in range(32) if i not in (9, 10, 13))
 
-__cached_saxutils = None
 def to_xml(item, attrib=False):
-    global __cached_saxutils
-    if __cached_saxutils is None:
-        import xml.sax.saxutils
-        __cached_saxutils = xml.sax.saxutils
+    """ Returns xml-friendly utf-8 encoded string.
+        Accepts utf-8, iso-8859-1, or unicode.
+    """
+    if type(item) is str:
+        # check if valid utf8
+        try: unicode(item, 'utf-8')
+        except UnicodeDecodeError:
+            # assume iso-8859-1
+            item = unicode(item, 'iso-8859-1').encode('utf-8')
+    elif type(item) is unicode:
+        item = item.encode('utf-8')
+    elif item is None:
+        return ''
+    else:
+        raise ValueError, 'String expected, got %s' % repr(item)
 
-    item = _ugly_utf8_string_hack(item)
-    item = to_utf8(item)
+    # compat cruft...
     item = item.rstrip()
+
+    # kill ivalid low bytes
+    item = item.translate(None, _deletechars)
+
+    # quote reserved XML characters
+    item = item.replace('&', '&amp;')
+    item = item.replace('<', '&lt;')
+    item = item.replace('>', '&gt;')
     if attrib:
-        item = __cached_saxutils.escape(item, entities={'"':"&quot;"})
-    else:
-        item = __cached_saxutils.escape(item)
+        item = item.replace('"', '&quot;')
+        item = item.replace("'", '&apos;')
+
     return item
 
 def unlink_f(filename):


More information about the Yum-commits mailing list