[Yum-devel] [PATCH] Fix main speed issue in to_xml(), slows down new createrepo a lot. BZ 716235.

Zdenek Pavlas zpavlas at redhat.com
Thu Nov 15 11:02:52 UTC 2012


> +        return unicode(item, 'utf-8')

ACK, when amended with the following.

> bad_small_bytes = range(0, 8) + [11, 12] + range(14, 32)

Byte 0x08 should be removed as well: range(0, 8) stops at 7, so 0x08 is missing from the list above.

diff --git a/yum/misc.py b/yum/misc.py
index 072c99b..9d3be16 100644
--- a/yum/misc.py
+++ b/yum/misc.py
@@ -897,6 +897,12 @@ def seq_max_split(seq, max_entries):
     ret.append(seq[beg:])
     return ret
 
+_bad_small_bytes = {}
+for i in range(0x20):
+    if chr(i) not in '\t\n\r':
+        _bad_small_bytes[i] = None
+del i
+
 def _ugly_utf8_string_hack(item):
     """hands back a unicoded string"""
     # this is backward compat for handling non-utf8 filenames 
@@ -911,7 +917,7 @@ def _ugly_utf8_string_hack(item):
     
     # this handles any bogon formats we see
     try:
-        return unicode(item, 'utf-8')
+        return unicode(item, 'utf-8').translate(_bad_small_bytes)
     except UnicodeError:
         encodings = ['iso-8859-1', 'iso-8859-15', 'iso-8859-2']
         for enc in encodings:
@@ -932,9 +938,8 @@ def _ugly_utf8_string_hack(item):
     # we allow high bytes, if it passed the utf8 check above. Eg.
     # good chars = #x9 | #xA | #xD | [#x20-...]
     newitem = ''
-    bad_small_bytes = range(0, 8) + [11, 12] + range(14, 32)
     for char in item:
-        if ord(char) in bad_small_bytes:
+        if ord(char) in _bad_small_bytes:
             pass # Just ignore these bytes...
         elif ord(char) > 127:
             newitem = newitem + '?' # byte by byte equiv of escape


More information about the Yum-devel mailing list