[yum-commits] Branch 'yum-3_2_X' - 6 commits - output.py yum/i18n.py yum/plugins.py

James Antill james at osuosl.org
Sat Nov 1 04:51:04 UTC 2008


 output.py      |   57 ++++++-----
 yum/i18n.py    |  294 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 yum/plugins.py |    4 
 3 files changed, 327 insertions(+), 28 deletions(-)

New commits:
commit 7ae8c7a3aafce74b6855a6c86b1459c404b0e0ae
Author: James Antill <james at and.org>
Date:   Sat Nov 1 00:50:55 2008 -0400

    Fix the total download line to use utf8_width_fill too

diff --git a/output.py b/output.py
index 9972d1e..5191b9b 100755
--- a/output.py
+++ b/output.py
@@ -1012,8 +1012,8 @@ Remove   %5.5s Package(s)
         ui_time = tl.add(' %9s' % self.format_time(dl_time))
         ui_end  = tl.add(' ' * 5)
         ui_bs   = tl.add(' %5sB/s' % self.format_number(remote_size / dl_time))
-        msg = "%-*.*s%s%s%s%s" % (tl.rest(), tl.rest(), _("Total"),
-                                  ui_bs, ui_size, ui_time, ui_end)
+        msg = "%s%s%s%s%s" % (utf8_width_fill(_("Total"), tl.rest(), tl.rest()),
+                              ui_bs, ui_size, ui_time, ui_end)
         self.verbose_logger.log(logginglevels.INFO_2, msg)
 
 
commit 713b04ee43f6d718014846e1fbe82556a904c4d1
Author: James Antill <james at and.org>
Date:   Fri Oct 31 13:58:36 2008 -0400

    Fix "Loaded plugins" line to use correct utf8 widths

diff --git a/yum/plugins.py b/yum/plugins.py
index 82efdee..287fba7 100644
--- a/yum/plugins.py
+++ b/yum/plugins.py
@@ -37,6 +37,8 @@ from weakref import proxy as weakref
 
 from yum import _
 
+from yum.i18n import utf8_width, utf8_width_fill
+
 # TODO: expose rpm package sack objects to plugins (once finished)
 # TODO: allow plugins to use the existing config stuff to define options for
 # their own configuration files (would replace confString() etc).
@@ -198,7 +200,7 @@ class YumPlugins:
             # Mostly copied from YumOutput._outKeyValFill()
             key = _("Loaded plugins: ")
             val = ", ".join(sorted(self._plugins))
-            nxt = ' ' * (len(key) - 2) + ': '
+            nxt = ' ' * (utf8_width(key) - 2) + ': '
             width = 80
             if hasattr(self.base, 'term'):
                 width = self.base.term.columns
commit 2082cb6a16e980fac6e6eedd899858fb167f2b8e
Author: James Antill <james at and.org>
Date:   Fri Oct 31 13:57:25 2008 -0400

    Fix fmtKeyValFill, fmtColumns and progress to work with correct utf8 widths

diff --git a/output.py b/output.py
old mode 100644
new mode 100755
index d98e281..9972d1e
--- a/output.py
+++ b/output.py
@@ -39,6 +39,7 @@ from yum.rpmtrans import RPMBaseCallback
 from yum.packageSack import packagesNewestByNameArch
 
 from textwrap import fill
+from yum.i18n import utf8_width, utf8_width_fill
 
 def _term_width():
     """ Simple terminal width, limit to 20 chars. and make 0 == 80. """
@@ -299,7 +300,7 @@ class YumOutput:
         if columns is None:
             columns = [1] * cols
 
-        total_width -= (sum(columns) + (cols - 1) + len(indent))
+        total_width -= (sum(columns) + (cols - 1) + utf8_width(indent))
         while total_width > 0:
             # Find which field all the spaces left will help best
             helps = 0
@@ -351,8 +352,9 @@ class YumOutput:
                 continue
 
             (align, width) = self._fmt_column_align_width(width)
-            if len(val) <= width:
-                msg += u"%%%s%ds " % (align, width)
+            if utf8_width(val) <= width:
+                msg += u"%s "
+                val = utf8_width_fill(val, width, left=(align == u'-'))
             else:
                 msg += u"%s\n" + " " * (total_width + width + 1)
             total_width += width
@@ -360,7 +362,8 @@ class YumOutput:
             data.append(val)
         (val, width) = columns[-1]
         (align, width) = self._fmt_column_align_width(width)
-        msg += u"%%%s%ds%s" % (align, width, end)
+        val = utf8_width_fill(val, width, left=(align == u'-'))
+        msg += u"%%s%s" % end
         data.append(val)
         return msg % tuple(data)
 
@@ -395,7 +398,7 @@ class YumOutput:
     def fmtKeyValFill(self, key, val):
         """ Return a key value pair in the common two column output format. """
         val = to_str(val)
-        keylen = len(key)
+        keylen = utf8_width(key)
         cols = self.term.columns
         nxt = ' ' * (keylen - 2) + ': '
         ret = fill(val, width=cols,
@@ -572,7 +575,7 @@ class YumOutput:
                 continue
             for (apkg, ipkg) in pkg_names2pkgs[item]:
                 pkg = ipkg or apkg
-                envra = len(str(pkg)) + len(indent)
+                envra = utf8_width(str(pkg)) + utf8_width(indent)
                 rid   = len(pkg.repoid)
                 for (d, v) in (('envra', envra), ('rid', rid)):
                     data[d].setdefault(v, 0)
@@ -1133,10 +1136,10 @@ class YumCliRPMCallBack(RPMBaseCallback):
             percent = (te_current*100L)/te_total
         
         if self.output and (sys.stdout.isatty() or te_current == te_total):
-            fmt = self._makefmt(percent, ts_current, ts_total, pkgname=pkgname)
-            # FIXME: Converting to utf8 here is a HACK ... but it's better
-            # to underflow than overflow, see the i18n-rpm-progress example
-            msg = fmt % (to_utf8(process), pkgname)
+            (fmt, wid1, wid2) = self._makefmt(percent, ts_current, ts_total,
+                                              pkgname=pkgname)
+            msg = fmt % (utf8_width_fill(process, wid1, wid1),
+                         utf8_width_fill(pkgname, wid2, wid2))
             if msg != self.lastmsg:
                 sys.stdout.write(to_unicode(msg))
                 sys.stdout.flush()
@@ -1161,7 +1164,7 @@ class YumCliRPMCallBack(RPMBaseCallback):
         if pkgname is None:
             pnl = 22
         else:
-            pnl = len(pkgname)
+            pnl = utf8_width(pkgname)
 
         overhead  = (2 * l) + 2 # Length of done, above
         overhead += 19          # Length of begining
@@ -1179,20 +1182,23 @@ class YumCliRPMCallBack(RPMBaseCallback):
         width = "%s.%s" % (marks, marks)
         fmt_bar = "[%-" + width + "s]"
         # pnl = str(28 + marks + 1)
-        full_pnl = "%%-%d.%ds" % (pnl + marks + 1, pnl + marks + 1)
-        half_pnl = "%%-%d.%ds" % (pnl, pnl)
+        full_pnl = pnl + marks + 1
 
         if progress and percent == 100: # Don't chop pkg name on 100%
-            fmt = "\r  %-15.15s: " + full_pnl + "   " + done
+            fmt = "\r  %s: %s   " + done
+            wid2 = full_pnl
         elif progress:
             bar = fmt_bar % (self.mark * int(marks * (percent / 100.0)), )
-            fmt = "\r  %-15.15s: " + half_pnl + " " + bar + " " + done
+            fmt = "\r  %s: %s " + bar + " " + done
+            wid2 = pnl
         elif percent == 100:
-            fmt = "  %-15.15s: " + full_pnl + "   " + done
+            fmt = "  %s: %s   " + done
+            wid2 = full_pnl
         else:
             bar = fmt_bar % (self.mark * marks, )
-            fmt = "  %-15.15s: " + half_pnl + " " + bar + " " + done
-        return fmt
+            fmt = "  %s: %s " + bar + " " + done
+            wid2 = pnl
+        return fmt, 15, wid2
 
 
 def progressbar(current, total, name=None):
@@ -1214,8 +1220,6 @@ def progressbar(current, total, name=None):
 
     if name is None and current == total:
         name = '-'
-    if name is not None: # FIXME: This is a hack without utf8_width()
-        width -= len(to_utf8(name)) - len(name)
 
     end = ' %d/%d' % (current, total)
     width -= len(end) + 1
@@ -1228,17 +1232,18 @@ def progressbar(current, total, name=None):
         hashbar = mark * int(width * percent)
         output = '\r[%-*s]%s' % (width, hashbar, end)
     elif current == total: # Don't chop name on 100%
-        output = '\r%-*.*s%s' % (width, width, name, end)
+        output = '\r%s%s' % (utf8_width_fill(name, width, width), end)
     else:
         width -= 4
         if width < 0:
             width = 0
         nwid = width / 2
-        if nwid > len(name):
-            nwid = len(name)
+        if nwid > utf8_width(name):
+            nwid = utf8_width(name)
         width -= nwid
         hashbar = mark * int(width * percent)
-        output = '\r%-*.*s: [%-*s]%s' % (nwid, nwid, name, width, hashbar, end)
+        output = '\r%s: [%-*s]%s' % (utf8_width_fill(name, nwid, nwid), width,
+                                     hashbar, end)
      
     if current <= total:
         sys.stdout.write(output)
commit 7cb7bddd642150b5274772b5cb7b13f7441367c4
Author: James Antill <james at and.org>
Date:   Fri Oct 31 13:56:17 2008 -0400

    Add utf8_width chop/fill as helper functions for printing utf8 stuff

diff --git a/yum/i18n.py b/yum/i18n.py
index ba71e4f..27dcc62 100755
--- a/yum/i18n.py
+++ b/yum/i18n.py
@@ -211,6 +211,52 @@ def utf8_width(msg):
             ret += __utf8_ucp_width(ucs)
     return ret
 
+def utf8_width_chop(msg, chop=None):
+    """ Return the textual width of a utf8 string, chopping it to a specified
+        value. """
+
+    if chop is None or utf8_width(msg) <= chop:
+        return utf8_width(msg), msg
+
+    ret = 0
+    passed_unicode = isinstance(msg, unicode)
+    msg_bytes = 0
+    msg = to_utf8(msg)
+    for (ucs, bytes) in __utf8_iter_ucs(msg):
+        if ucs is None:
+            width = bytes # Ugly ... should not feed bad utf8
+        else:
+            width = __utf8_ucp_width(ucs)
+
+        if chop is not None and (ret + width) > chop:
+            msg = msg[:msg_bytes]
+            break
+        ret += width
+        msg_bytes += bytes
+
+    if passed_unicode:
+        msg = to_unicode(msg)
+
+    return ret, msg
+
+def utf8_width_fill(msg, fill, chop=None, left=True):
+    """ Expand a utf8 msg to a specified "width" or chop to same.
+        Expansion can be left or right. """
+    passed_msg = msg
+    width, msg = utf8_width_chop(msg, chop)
+
+    if width < fill:
+        extra = " " * (fill - width)
+        if left:
+            msg = ''.join([msg, extra])
+        else:
+            msg = ''.join([extra, msg])
+
+    if isinstance(passed_msg, unicode):
+        return to_unicode(msg)
+
+    return msg
+
 def utf8_valid(msg):
     """ Return True/False is the text is valid utf8. """
     for (ucs, bytes) in __utf8_iter_ucs(msg):
@@ -238,36 +284,45 @@ except:
 if __name__ == "__main__":
     import sys
 
-    print " ---- Arguments/str ---- "
-    for arg in sys.argv[1:]:
+    def out(arg):
         arg = to_utf8(arg)
         print "UTF8 :", arg
         print "len  :", len(arg)
+        arg = to_unicode(arg)
+        print "USC  :", arg
+        print "len  :", len(arg)
         print "valid:", utf8_valid(arg)
         print "width:", utf8_width(arg)
+        print "4.8  :", "%s%s%s" % ('<', utf8_width_fill(arg,  4,  8), '>')
+        print "4.3  :", "%s%s%s" % ('<', utf8_width_fill(arg,  4,  3), '>')
+        print "4.2  :", "%s%s%s" % ('<', utf8_width_fill(arg,  4,  2), '>')
+        print "4.1  :", "%s%s%s" % ('<', utf8_width_fill(arg,  4,  1), '>')
+        print "3.3  :", "%s%s%s" % ('<', utf8_width_fill(arg,  3,  3), '>')
+        print "3.2  :", "%s%s%s" % ('<', utf8_width_fill(arg,  3,  2), '>')
+        print "3.1  :", "%s%s%s" % ('<', utf8_width_fill(arg,  3,  1), '>')
+        print "40.79:", "%s%s%s" % ('<', utf8_width_fill(arg, 40, 79), '>')
+        print "40.20:", "%s%s%s" % ('<', utf8_width_fill(arg, 40, 20), '>')
         print ''
 
+    print " ---- Arguments/str ---- "
+    for arg in sys.argv[1:]:
+        out(arg)
+
     print " ---- Arguments/gettext ---- "
     for arg in sys.argv[1:]:
-        arg = to_utf8(_(arg))
-        print "UTF8 :", arg
-        print "len  :", len(arg)
-        print "valid:", utf8_valid(arg)
-        print "width:", utf8_width(arg)
-        print ''
+        try:
+            arg = _(arg)
+        except UnicodeDecodeError:
+            continue
+        out(arg)
 
     if len(sys.argv) > 2:
         print " ---- Arguments/str/all ---- "
-        arg = to_utf8(sys.argv[1] % sys.argv[2:])
-        print "UTF8 :", arg
-        print "len  :", len(arg)
-        print "valid:", utf8_valid(arg)
-        print "width:", utf8_width(arg)
-        print ''
+        out(sys.argv[1] % sys.argv[2:])
+
         print " ---- Arguments/gettext/all ---- "
-        arg = to_utf8(_(sys.argv[1]) % map(_, sys.argv[2:]))
-        print "UTF8 :", arg
-        print "len  :", len(arg)
-        print "valid:", utf8_valid(arg)
-        print "width:", utf8_width(arg)
-        print ''
+        try:
+            arg = _(sys.argv[1]) % map(_, sys.argv[2:])
+        except UnicodeDecodeError:
+            sys.exit(0)
+        out(arg)
commit fed9a04e3973d4d827092e66b60f48004a28cd46
Author: James Antill <james at and.org>
Date:   Fri Oct 31 11:45:44 2008 -0400

    Add testing code for utf8_width

diff --git a/yum/i18n.py b/yum/i18n.py
old mode 100644
new mode 100755
index f14dc19..ba71e4f
--- a/yum/i18n.py
+++ b/yum/i18n.py
@@ -234,3 +234,40 @@ except:
     returning the same text
     '''
     _ = dummy_wrapper
+
+if __name__ == "__main__":
+    import sys
+
+    print " ---- Arguments/str ---- "
+    for arg in sys.argv[1:]:
+        arg = to_utf8(arg)
+        print "UTF8 :", arg
+        print "len  :", len(arg)
+        print "valid:", utf8_valid(arg)
+        print "width:", utf8_width(arg)
+        print ''
+
+    print " ---- Arguments/gettext ---- "
+    for arg in sys.argv[1:]:
+        arg = to_utf8(_(arg))
+        print "UTF8 :", arg
+        print "len  :", len(arg)
+        print "valid:", utf8_valid(arg)
+        print "width:", utf8_width(arg)
+        print ''
+
+    if len(sys.argv) > 2:
+        print " ---- Arguments/str/all ---- "
+        arg = to_utf8(sys.argv[1] % sys.argv[2:])
+        print "UTF8 :", arg
+        print "len  :", len(arg)
+        print "valid:", utf8_valid(arg)
+        print "width:", utf8_width(arg)
+        print ''
+        print " ---- Arguments/gettext/all ---- "
+        arg = to_utf8(_(sys.argv[1]) % map(_, sys.argv[2:]))
+        print "UTF8 :", arg
+        print "len  :", len(arg)
+        print "valid:", utf8_valid(arg)
+        print "width:", utf8_width(arg)
+        print ''
commit ea1caa2bdd7e9d8fd22e2eeebef53ad810c94fdb
Author: James Antill <james at and.org>
Date:   Wed Oct 29 17:05:05 2008 -0400

    Add utf8_width to i18n, so we can do progress bars etc. well

diff --git a/yum/i18n.py b/yum/i18n.py
index 86f3ce2..f14dc19 100644
--- a/yum/i18n.py
+++ b/yum/i18n.py
@@ -13,11 +13,211 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 
+from yum.misc import to_unicode, to_utf8
+
 def dummy_wrapper(str):
     '''
     Dummy Translation wrapper, just returning the same string.
     '''
-    return str
+    return to_unicode(str)
+
+# This is ported from ustr_utf8_* which I got from:
+#     http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+#  I've tried to leave it close to the original C (same names etc.) so that
+# it is easy to read/compare both versions...
+
+# ----------------------------- BEG utf8 -----------------------------
+# This is an implementation of wcwidth() and wcswidth() (defined in
+# IEEE Std 1002.1-2001) for Unicode.
+#
+# http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
+# http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
+#
+# In fixed-width output devices, Latin characters all occupy a single
+# "cell" position of equal width, whereas ideographic CJK characters
+# occupy two such cells. Interoperability between terminal-line
+# applications and (teletype-style) character terminals using the
+# UTF-8 encoding requires agreement on which character should advance
+# the cursor by how many cell positions. No established formal
+# standards exist at present on which Unicode character shall occupy
+# how many cell positions on character terminals. These routines are
+# a first attempt of defining such behavior based on simple rules
+# applied to data provided by the Unicode Consortium.
+#
+# [...]
+#
+# Markus Kuhn -- 2007-05-26 (Unicode 5.0)
+#
+# Permission to use, copy, modify, and distribute this software
+# for any purpose and without fee is hereby granted. The author
+# disclaims all warranties with regard to this software.
+#
+# Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+
+def __utf8_bisearch(ucs, table):
+    """ auxiliary function for binary search in interval table. """
+
+    min = 0
+    max = len(table) - 1
+    if ucs < table[min][0] or ucs > table[max][1]:
+        return False
+
+    while max >= min:
+        mid = (min + max) / 2;
+        if ucs > table[mid][1]:
+            min = mid + 1;
+        elif ucs < table[mid][0]:
+            max = mid - 1;
+        else:
+          return True
+
+    return False
+
+def __utf8_ucp_width(ucs):
+    """ Get the textual width of a ucs character. """
+    # sorted list of non-overlapping intervals of non-spacing characters
+    # generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
+    combining = [
+    ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ),
+    ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ),
+    ( 0x05C4, 0x05C5 ), ( 0x05C7, 0x05C7 ), ( 0x0600, 0x0603 ),
+    ( 0x0610, 0x0615 ), ( 0x064B, 0x065E ), ( 0x0670, 0x0670 ),
+    ( 0x06D6, 0x06E4 ), ( 0x06E7, 0x06E8 ), ( 0x06EA, 0x06ED ),
+    ( 0x070F, 0x070F ), ( 0x0711, 0x0711 ), ( 0x0730, 0x074A ),
+    ( 0x07A6, 0x07B0 ), ( 0x07EB, 0x07F3 ), ( 0x0901, 0x0902 ),
+    ( 0x093C, 0x093C ), ( 0x0941, 0x0948 ), ( 0x094D, 0x094D ),
+    ( 0x0951, 0x0954 ), ( 0x0962, 0x0963 ), ( 0x0981, 0x0981 ),
+    ( 0x09BC, 0x09BC ), ( 0x09C1, 0x09C4 ), ( 0x09CD, 0x09CD ),
+    ( 0x09E2, 0x09E3 ), ( 0x0A01, 0x0A02 ), ( 0x0A3C, 0x0A3C ),
+    ( 0x0A41, 0x0A42 ), ( 0x0A47, 0x0A48 ), ( 0x0A4B, 0x0A4D ),
+    ( 0x0A70, 0x0A71 ), ( 0x0A81, 0x0A82 ), ( 0x0ABC, 0x0ABC ),
+    ( 0x0AC1, 0x0AC5 ), ( 0x0AC7, 0x0AC8 ), ( 0x0ACD, 0x0ACD ),
+    ( 0x0AE2, 0x0AE3 ), ( 0x0B01, 0x0B01 ), ( 0x0B3C, 0x0B3C ),
+    ( 0x0B3F, 0x0B3F ), ( 0x0B41, 0x0B43 ), ( 0x0B4D, 0x0B4D ),
+    ( 0x0B56, 0x0B56 ), ( 0x0B82, 0x0B82 ), ( 0x0BC0, 0x0BC0 ),
+    ( 0x0BCD, 0x0BCD ), ( 0x0C3E, 0x0C40 ), ( 0x0C46, 0x0C48 ),
+    ( 0x0C4A, 0x0C4D ), ( 0x0C55, 0x0C56 ), ( 0x0CBC, 0x0CBC ),
+    ( 0x0CBF, 0x0CBF ), ( 0x0CC6, 0x0CC6 ), ( 0x0CCC, 0x0CCD ),
+    ( 0x0CE2, 0x0CE3 ), ( 0x0D41, 0x0D43 ), ( 0x0D4D, 0x0D4D ),
+    ( 0x0DCA, 0x0DCA ), ( 0x0DD2, 0x0DD4 ), ( 0x0DD6, 0x0DD6 ),
+    ( 0x0E31, 0x0E31 ), ( 0x0E34, 0x0E3A ), ( 0x0E47, 0x0E4E ),
+    ( 0x0EB1, 0x0EB1 ), ( 0x0EB4, 0x0EB9 ), ( 0x0EBB, 0x0EBC ),
+    ( 0x0EC8, 0x0ECD ), ( 0x0F18, 0x0F19 ), ( 0x0F35, 0x0F35 ),
+    ( 0x0F37, 0x0F37 ), ( 0x0F39, 0x0F39 ), ( 0x0F71, 0x0F7E ),
+    ( 0x0F80, 0x0F84 ), ( 0x0F86, 0x0F87 ), ( 0x0F90, 0x0F97 ),
+    ( 0x0F99, 0x0FBC ), ( 0x0FC6, 0x0FC6 ), ( 0x102D, 0x1030 ),
+    ( 0x1032, 0x1032 ), ( 0x1036, 0x1037 ), ( 0x1039, 0x1039 ),
+    ( 0x1058, 0x1059 ), ( 0x1160, 0x11FF ), ( 0x135F, 0x135F ),
+    ( 0x1712, 0x1714 ), ( 0x1732, 0x1734 ), ( 0x1752, 0x1753 ),
+    ( 0x1772, 0x1773 ), ( 0x17B4, 0x17B5 ), ( 0x17B7, 0x17BD ),
+    ( 0x17C6, 0x17C6 ), ( 0x17C9, 0x17D3 ), ( 0x17DD, 0x17DD ),
+    ( 0x180B, 0x180D ), ( 0x18A9, 0x18A9 ), ( 0x1920, 0x1922 ),
+    ( 0x1927, 0x1928 ), ( 0x1932, 0x1932 ), ( 0x1939, 0x193B ),
+    ( 0x1A17, 0x1A18 ), ( 0x1B00, 0x1B03 ), ( 0x1B34, 0x1B34 ),
+    ( 0x1B36, 0x1B3A ), ( 0x1B3C, 0x1B3C ), ( 0x1B42, 0x1B42 ),
+    ( 0x1B6B, 0x1B73 ), ( 0x1DC0, 0x1DCA ), ( 0x1DFE, 0x1DFF ),
+    ( 0x200B, 0x200F ), ( 0x202A, 0x202E ), ( 0x2060, 0x2063 ),
+    ( 0x206A, 0x206F ), ( 0x20D0, 0x20EF ), ( 0x302A, 0x302F ),
+    ( 0x3099, 0x309A ), ( 0xA806, 0xA806 ), ( 0xA80B, 0xA80B ),
+    ( 0xA825, 0xA826 ), ( 0xFB1E, 0xFB1E ), ( 0xFE00, 0xFE0F ),
+    ( 0xFE20, 0xFE23 ), ( 0xFEFF, 0xFEFF ), ( 0xFFF9, 0xFFFB ),
+    ( 0x10A01, 0x10A03 ), ( 0x10A05, 0x10A06 ), ( 0x10A0C, 0x10A0F ),
+    ( 0x10A38, 0x10A3A ), ( 0x10A3F, 0x10A3F ), ( 0x1D167, 0x1D169 ),
+    ( 0x1D173, 0x1D182 ), ( 0x1D185, 0x1D18B ), ( 0x1D1AA, 0x1D1AD ),
+    ( 0x1D242, 0x1D244 ), ( 0xE0001, 0xE0001 ), ( 0xE0020, 0xE007F ),
+    ( 0xE0100, 0xE01EF )]
+
+    # test for 8-bit control characters
+    if ucs == 0:
+        return 0
+
+    if ucs < 32 or (ucs >= 0x7f and ucs < 0xa0):
+        return (-1)
+
+    if __utf8_bisearch(ucs, combining):
+        return 0
+
+    # if we arrive here, ucs is not a combining or C0/C1 control character
+
+    return (1 + 
+      (ucs >= 0x1100 and
+       (ucs <= 0x115f or                     # Hangul Jamo init. consonants
+        ucs == 0x2329 or ucs == 0x232a or
+        (ucs >= 0x2e80 and ucs <= 0xa4cf and
+         ucs != 0x303f) or                   # CJK ... Yi
+        (ucs >= 0xac00 and ucs <= 0xd7a3) or # Hangul Syllables
+        (ucs >= 0xf900 and ucs <= 0xfaff) or # CJK Compatibility Ideographs
+        (ucs >= 0xfe10 and ucs <= 0xfe19) or # Vertical forms
+        (ucs >= 0xfe30 and ucs <= 0xfe6f) or # CJK Compatibility Forms
+        (ucs >= 0xff00 and ucs <= 0xff60) or # Fullwidth Forms
+        (ucs >= 0xffe0 and ucs <= 0xffe6) or
+        (ucs >= 0x20000 and ucs <= 0x2fffd) or
+        (ucs >= 0x30000 and ucs <= 0x3fffd))))
+
+
+def __utf8_iter_ints(msg):
+    for byte in to_utf8(msg):
+        yield ord(byte)
+def __utf8_iter_ucs(msg):
+    uiter = __utf8_iter_ints(msg)
+    for byte0 in uiter:
+        if byte0 < 0x80:             # 0xxxxxxx
+            yield (byte0, 1)
+        elif (byte0 & 0xe0) == 0xc0: # 110XXXXx 10xxxxxx
+            byte1 = uiter.next()
+            if (((byte0 & 0xc0) != 0x80) or 
+                ((byte1 & 0xfe) == 0xc0)):                          # overlong?
+                yield (None, 2)
+                return
+            yield ((((byte0 & 0x1f) << 6) | (byte1 & 0x3f)), 2)
+        elif (byte0 & 0xf0) == 0xe0: # 1110XXXX 10Xxxxxx 10xxxxxx
+            byte1 = uiter.next()
+            byte2 = uiter.next()
+            if (((byte1 & 0xc0) != 0x80) or ((byte2 & 0xc0) != 0x80) or
+                ((byte0 == 0xe0) and ((byte1 & 0xe0) == 0x80)) or   # overlong?
+                ((byte0 == 0xed) and ((byte1 & 0xe0) == 0xa0)) or   # surrogate?
+                ((byte0 == 0xef) and  (byte1 == 0xbf) and
+                 ((byte2 & 0xfe) == 0xbe))): # U+FFFE or U+FFFF?
+                yield (None, 3)
+                return
+            yield ((((byte0 & 0x0f) << 12) | ((byte1 & 0x3f) << 6) |
+                   (byte2 & 0x3f)), 3)
+        elif (byte0 & 0xf8) == 0xf0: # 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
+            byte1 = uiter.next()
+            byte2 = uiter.next()
+            byte3 = uiter.next()
+            if (((byte1 & 0xc0) != 0x80) or
+                ((byte2 & 0xc0) != 0x80) or
+                ((byte3 & 0xc0) != 0x80) or
+                ((byte0 == 0xf0) and ((byte1 & 0xf0) == 0x80)) or # overlong?
+                ((byte0 == 0xf4) and (byte1 > 0x8f)) or           # > U+10FFFF?
+                (byte0 > 0xf4)):                                  # > U+10FFFF?
+                yield (None, 4)
+                return
+
+            yield ((((byte0 & 0x07) << 18) | ((byte1 & 0x3f) << 12) |
+                    ((byte2 & 0x3f) <<  6) |  (byte3 & 0x3f)), 4)
+        else:
+            yield (None, 1)
+            return
+
+def utf8_width(msg):
+    """ Get the textual width of a utf8 string. """
+    ret = 0
+    for (ucs, bytes) in __utf8_iter_ucs(msg):
+        if ucs is None:
+            ret += bytes # Ugly ... should not feed bad utf8
+        else:
+            ret += __utf8_ucp_width(ucs)
+    return ret
+
+def utf8_valid(msg):
+    """ Return True/False is the text is valid utf8. """
+    for (ucs, bytes) in __utf8_iter_ucs(msg):
+        if ucs is None:
+            return False
+    return True
+# ----------------------------- END utf8 -----------------------------
 
 try: 
     '''


More information about the Yum-commits mailing list