[yum-git] Branch 'yum-3_2_X' - 2 commits - yum/misc.py

Mon Sep 15 20:48:10 UTC 2008

yum/misc.py |   50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

New commits:
commit 14a0123336784314311f6ce019c40cdbf7b51088
Merge: 876bdf5... 3bde1ea...
Author: Seth Vidal <skvidal at fedoraproject.org>
Date:   Mon Sep 15 16:47:00 2008 -0400

    Merge branch 'yum-3_2_X' of ssh://login.linux.duke.edu/home/groups/yum/git/yum into yum-3_2_X
    
    * 'yum-3_2_X' of ssh://login.linux.duke.edu/home/groups/yum/git/yum:
      Make sure we get two packages with the same nevra, but different pkgKey's
      Fix the testcases to not die due to missing persistdir
      Speedup for includepkgs, use a set for main in test
      Don't use parsePackages where just returnPackages() dtrt. now
      Make rpmdb.returnPackages dtrt. like sqlitesack, always doing the minimization
      Patterns max constant change, adding PATTERNS_INDEXED_MAX and comments.

commit 876bdf5559ed483ce75446eb6112742b0523cab8
Author: Seth Vidal <skvidal at fedoraproject.org>
Date:   Mon Sep 15 16:45:13 2008 -0400

    - add crazy utf8 conversion routine for outputting xml for metadata
    - if/when non-utf8 filenames/metadata content becomes good and illegal then
      we should be able to clean this all out

diff --git a/yum/misc.py b/yum/misc.py
index 2f2b27b..2a64bea 100644
--- a/yum/misc.py
+++ b/yum/misc.py
@@ -621,10 +621,58 @@ def seq_max_split(seq, max_entries):
         num -= max_entries
     ret.append(seq[beg:])
     return ret
+
+def _ugly_utf8_string_hack(item):
+    """hands back a unicoded string"""
+    # this is backward compat for handling non-utf8 filenames 
+    # and content inside packages. :(
+    # content that xml can cope with but isn't really kosher
+
+    # if we're anything obvious - do them first
+    if item is None:
+        return ''
+    elif isinstance(item, unicode):    
+        return item
+    
+    # this handles any bogon formats we see
+    du = False
+    try:
+        x = unicode(item, 'ascii')
+        du = True
+    except UnicodeError:
+        encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
+        for enc in encodings:
+            try:
+                x = unicode(item, enc)
+            except UnicodeError:
+                pass
+                
+            else:
+                if x.encode(enc) == item:
+                    if enc != 'utf-8':
+                        print '\n%s encoding on %s\n' % (enc, item)
+                    return x.encode('utf-8')
     
+    
+    # Kill bytes (or libxml will die) not in the small byte portion of:
+    #  http://www.w3.org/TR/REC-xml/#NT-Char
+    # we allow high bytes, if it passed the utf8 check above. Eg.
+    # good chars = #x9 | #xA | #xD | [#x20-...]
+    newitem = ''
+    bad_small_bytes = range(0, 8) + [11, 12] + range(14, 32)
+    for char in item:
+        if ord(char) in bad_small_bytes:
+            pass # Just ignore these bytes...
+        elif not du and ord(char) > 127:
+            newitem = newitem + '?' # byte by byte equiv of escape
+        else:
+            newitem = newitem + char
+    return newitem
+
 def to_xml(item, attrib=False):
     import xml.sax.saxutils
-    item = to_utf8(item) # verify this does enough conversion
+    item = _ugly_utf8_string_hack(item)
+    item = to_utf8(item)
     item = item.rstrip()
     if attrib:
         item = xml.sax.saxutils.escape(item, entities={'"':"&quot;"})