[yum-commits] Branch 'yum-3_2_X' - 6 commits - output.py yum/i18n.py yum/plugins.py
James Antill
james at osuosl.org
Sat Nov 1 04:51:04 UTC 2008
output.py | 57 ++++++-----
yum/i18n.py | 294 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
yum/plugins.py | 4
3 files changed, 327 insertions(+), 28 deletions(-)
New commits:
commit 7ae8c7a3aafce74b6855a6c86b1459c404b0e0ae
Author: James Antill <james at and.org>
Date: Sat Nov 1 00:50:55 2008 -0400
Fix the total download line to use utf8_width_fill too
diff --git a/output.py b/output.py
index 9972d1e..5191b9b 100755
--- a/output.py
+++ b/output.py
@@ -1012,8 +1012,8 @@ Remove %5.5s Package(s)
ui_time = tl.add(' %9s' % self.format_time(dl_time))
ui_end = tl.add(' ' * 5)
ui_bs = tl.add(' %5sB/s' % self.format_number(remote_size / dl_time))
- msg = "%-*.*s%s%s%s%s" % (tl.rest(), tl.rest(), _("Total"),
- ui_bs, ui_size, ui_time, ui_end)
+ msg = "%s%s%s%s%s" % (utf8_width_fill(_("Total"), tl.rest(), tl.rest()),
+ ui_bs, ui_size, ui_time, ui_end)
self.verbose_logger.log(logginglevels.INFO_2, msg)
commit 713b04ee43f6d718014846e1fbe82556a904c4d1
Author: James Antill <james at and.org>
Date: Fri Oct 31 13:58:36 2008 -0400
Fix "Loaded plugins" line to use correct utf8 widths
diff --git a/yum/plugins.py b/yum/plugins.py
index 82efdee..287fba7 100644
--- a/yum/plugins.py
+++ b/yum/plugins.py
@@ -37,6 +37,8 @@ from weakref import proxy as weakref
from yum import _
+from yum.i18n import utf8_width, utf8_width_fill
+
# TODO: expose rpm package sack objects to plugins (once finished)
# TODO: allow plugins to use the existing config stuff to define options for
# their own configuration files (would replace confString() etc).
@@ -198,7 +200,7 @@ class YumPlugins:
# Mostly copied from YumOutput._outKeyValFill()
key = _("Loaded plugins: ")
val = ", ".join(sorted(self._plugins))
- nxt = ' ' * (len(key) - 2) + ': '
+ nxt = ' ' * (utf8_width(key) - 2) + ': '
width = 80
if hasattr(self.base, 'term'):
width = self.base.term.columns
commit 2082cb6a16e980fac6e6eedd899858fb167f2b8e
Author: James Antill <james at and.org>
Date: Fri Oct 31 13:57:25 2008 -0400
Fix fmtKeyValFill, fmtColumns and progress to work with correct utf8 widths
diff --git a/output.py b/output.py
old mode 100644
new mode 100755
index d98e281..9972d1e
--- a/output.py
+++ b/output.py
@@ -39,6 +39,7 @@ from yum.rpmtrans import RPMBaseCallback
from yum.packageSack import packagesNewestByNameArch
from textwrap import fill
+from yum.i18n import utf8_width, utf8_width_fill
def _term_width():
""" Simple terminal width, limit to 20 chars. and make 0 == 80. """
@@ -299,7 +300,7 @@ class YumOutput:
if columns is None:
columns = [1] * cols
- total_width -= (sum(columns) + (cols - 1) + len(indent))
+ total_width -= (sum(columns) + (cols - 1) + utf8_width(indent))
while total_width > 0:
# Find which field all the spaces left will help best
helps = 0
@@ -351,8 +352,9 @@ class YumOutput:
continue
(align, width) = self._fmt_column_align_width(width)
- if len(val) <= width:
- msg += u"%%%s%ds " % (align, width)
+ if utf8_width(val) <= width:
+ msg += u"%s "
+ val = utf8_width_fill(val, width, left=(align == u'-'))
else:
msg += u"%s\n" + " " * (total_width + width + 1)
total_width += width
@@ -360,7 +362,8 @@ class YumOutput:
data.append(val)
(val, width) = columns[-1]
(align, width) = self._fmt_column_align_width(width)
- msg += u"%%%s%ds%s" % (align, width, end)
+ val = utf8_width_fill(val, width, left=(align == u'-'))
+ msg += u"%%s%s" % end
data.append(val)
return msg % tuple(data)
@@ -395,7 +398,7 @@ class YumOutput:
def fmtKeyValFill(self, key, val):
""" Return a key value pair in the common two column output format. """
val = to_str(val)
- keylen = len(key)
+ keylen = utf8_width(key)
cols = self.term.columns
nxt = ' ' * (keylen - 2) + ': '
ret = fill(val, width=cols,
@@ -572,7 +575,7 @@ class YumOutput:
continue
for (apkg, ipkg) in pkg_names2pkgs[item]:
pkg = ipkg or apkg
- envra = len(str(pkg)) + len(indent)
+ envra = utf8_width(str(pkg)) + utf8_width(indent)
rid = len(pkg.repoid)
for (d, v) in (('envra', envra), ('rid', rid)):
data[d].setdefault(v, 0)
@@ -1133,10 +1136,10 @@ class YumCliRPMCallBack(RPMBaseCallback):
percent = (te_current*100L)/te_total
if self.output and (sys.stdout.isatty() or te_current == te_total):
- fmt = self._makefmt(percent, ts_current, ts_total, pkgname=pkgname)
- # FIXME: Converting to utf8 here is a HACK ... but it's better
- # to underflow than overflow, see the i18n-rpm-progress example
- msg = fmt % (to_utf8(process), pkgname)
+ (fmt, wid1, wid2) = self._makefmt(percent, ts_current, ts_total,
+ pkgname=pkgname)
+ msg = fmt % (utf8_width_fill(process, wid1, wid1),
+ utf8_width_fill(pkgname, wid2, wid2))
if msg != self.lastmsg:
sys.stdout.write(to_unicode(msg))
sys.stdout.flush()
@@ -1161,7 +1164,7 @@ class YumCliRPMCallBack(RPMBaseCallback):
if pkgname is None:
pnl = 22
else:
- pnl = len(pkgname)
+ pnl = utf8_width(pkgname)
overhead = (2 * l) + 2 # Length of done, above
overhead += 19 # Length of beginning
@@ -1179,20 +1182,23 @@ class YumCliRPMCallBack(RPMBaseCallback):
width = "%s.%s" % (marks, marks)
fmt_bar = "[%-" + width + "s]"
# pnl = str(28 + marks + 1)
- full_pnl = "%%-%d.%ds" % (pnl + marks + 1, pnl + marks + 1)
- half_pnl = "%%-%d.%ds" % (pnl, pnl)
+ full_pnl = pnl + marks + 1
if progress and percent == 100: # Don't chop pkg name on 100%
- fmt = "\r %-15.15s: " + full_pnl + " " + done
+ fmt = "\r %s: %s " + done
+ wid2 = full_pnl
elif progress:
bar = fmt_bar % (self.mark * int(marks * (percent / 100.0)), )
- fmt = "\r %-15.15s: " + half_pnl + " " + bar + " " + done
+ fmt = "\r %s: %s " + bar + " " + done
+ wid2 = pnl
elif percent == 100:
- fmt = " %-15.15s: " + full_pnl + " " + done
+ fmt = " %s: %s " + done
+ wid2 = full_pnl
else:
bar = fmt_bar % (self.mark * marks, )
- fmt = " %-15.15s: " + half_pnl + " " + bar + " " + done
- return fmt
+ fmt = " %s: %s " + bar + " " + done
+ wid2 = pnl
+ return fmt, 15, wid2
def progressbar(current, total, name=None):
@@ -1214,8 +1220,6 @@ def progressbar(current, total, name=None):
if name is None and current == total:
name = '-'
- if name is not None: # FIXME: This is a hack without utf8_width()
- width -= len(to_utf8(name)) - len(name)
end = ' %d/%d' % (current, total)
width -= len(end) + 1
@@ -1228,17 +1232,18 @@ def progressbar(current, total, name=None):
hashbar = mark * int(width * percent)
output = '\r[%-*s]%s' % (width, hashbar, end)
elif current == total: # Don't chop name on 100%
- output = '\r%-*.*s%s' % (width, width, name, end)
+ output = '\r%s%s' % (utf8_width_fill(name, width, width), end)
else:
width -= 4
if width < 0:
width = 0
nwid = width / 2
- if nwid > len(name):
- nwid = len(name)
+ if nwid > utf8_width(name):
+ nwid = utf8_width(name)
width -= nwid
hashbar = mark * int(width * percent)
- output = '\r%-*.*s: [%-*s]%s' % (nwid, nwid, name, width, hashbar, end)
+ output = '\r%s: [%-*s]%s' % (utf8_width_fill(name, nwid, nwid), width,
+ hashbar, end)
if current <= total:
sys.stdout.write(output)
commit 7cb7bddd642150b5274772b5cb7b13f7441367c4
Author: James Antill <james at and.org>
Date: Fri Oct 31 13:56:17 2008 -0400
Add utf8_width chop/fill as helper functions for printing utf8 stuff
diff --git a/yum/i18n.py b/yum/i18n.py
index ba71e4f..27dcc62 100755
--- a/yum/i18n.py
+++ b/yum/i18n.py
@@ -211,6 +211,52 @@ def utf8_width(msg):
ret += __utf8_ucp_width(ucs)
return ret
+def utf8_width_chop(msg, chop=None):
+ """ Return the textual width of a utf8 string, chopping it to a specified
+ value. """
+
+ if chop is None or utf8_width(msg) <= chop:
+ return utf8_width(msg), msg
+
+ ret = 0
+ passed_unicode = isinstance(msg, unicode)
+ msg_bytes = 0
+ msg = to_utf8(msg)
+ for (ucs, bytes) in __utf8_iter_ucs(msg):
+ if ucs is None:
+ width = bytes # Ugly ... should not feed bad utf8
+ else:
+ width = __utf8_ucp_width(ucs)
+
+ if chop is not None and (ret + width) > chop:
+ msg = msg[:msg_bytes]
+ break
+ ret += width
+ msg_bytes += bytes
+
+ if passed_unicode:
+ msg = to_unicode(msg)
+
+ return ret, msg
+
+def utf8_width_fill(msg, fill, chop=None, left=True):
+ """ Expand a utf8 msg to a specified "width" or chop to same.
+ Expansion can be left or right. """
+ passed_msg = msg
+ width, msg = utf8_width_chop(msg, chop)
+
+ if width < fill:
+ extra = " " * (fill - width)
+ if left:
+ msg = ''.join([msg, extra])
+ else:
+ msg = ''.join([extra, msg])
+
+ if isinstance(passed_msg, unicode):
+ return to_unicode(msg)
+
+ return msg
+
def utf8_valid(msg):
""" Return True/False if the text is valid utf8. """
for (ucs, bytes) in __utf8_iter_ucs(msg):
@@ -238,36 +284,45 @@ except:
if __name__ == "__main__":
import sys
- print " ---- Arguments/str ---- "
- for arg in sys.argv[1:]:
+ def out(arg):
arg = to_utf8(arg)
print "UTF8 :", arg
print "len :", len(arg)
+ arg = to_unicode(arg)
+ print "USC :", arg
+ print "len :", len(arg)
print "valid:", utf8_valid(arg)
print "width:", utf8_width(arg)
+ print "4.8 :", "%s%s%s" % ('<', utf8_width_fill(arg, 4, 8), '>')
+ print "4.3 :", "%s%s%s" % ('<', utf8_width_fill(arg, 4, 3), '>')
+ print "4.2 :", "%s%s%s" % ('<', utf8_width_fill(arg, 4, 2), '>')
+ print "4.1 :", "%s%s%s" % ('<', utf8_width_fill(arg, 4, 1), '>')
+ print "3.3 :", "%s%s%s" % ('<', utf8_width_fill(arg, 3, 3), '>')
+ print "3.2 :", "%s%s%s" % ('<', utf8_width_fill(arg, 3, 2), '>')
+ print "3.1 :", "%s%s%s" % ('<', utf8_width_fill(arg, 3, 1), '>')
+ print "40.79:", "%s%s%s" % ('<', utf8_width_fill(arg, 40, 79), '>')
+ print "40.20:", "%s%s%s" % ('<', utf8_width_fill(arg, 40, 20), '>')
print ''
+ print " ---- Arguments/str ---- "
+ for arg in sys.argv[1:]:
+ out(arg)
+
print " ---- Arguments/gettext ---- "
for arg in sys.argv[1:]:
- arg = to_utf8(_(arg))
- print "UTF8 :", arg
- print "len :", len(arg)
- print "valid:", utf8_valid(arg)
- print "width:", utf8_width(arg)
- print ''
+ try:
+ arg = _(arg)
+ except UnicodeDecodeError:
+ continue
+ out(arg)
if len(sys.argv) > 2:
print " ---- Arguments/str/all ---- "
- arg = to_utf8(sys.argv[1] % sys.argv[2:])
- print "UTF8 :", arg
- print "len :", len(arg)
- print "valid:", utf8_valid(arg)
- print "width:", utf8_width(arg)
- print ''
+ out(sys.argv[1] % sys.argv[2:])
+
print " ---- Arguments/gettext/all ---- "
- arg = to_utf8(_(sys.argv[1]) % map(_, sys.argv[2:]))
- print "UTF8 :", arg
- print "len :", len(arg)
- print "valid:", utf8_valid(arg)
- print "width:", utf8_width(arg)
- print ''
+ try:
+ arg = _(sys.argv[1]) % map(_, sys.argv[2:])
+ except UnicodeDecodeError:
+ sys.exit(0)
+ out(arg)
commit fed9a04e3973d4d827092e66b60f48004a28cd46
Author: James Antill <james at and.org>
Date: Fri Oct 31 11:45:44 2008 -0400
Add testing code for utf8_width
diff --git a/yum/i18n.py b/yum/i18n.py
old mode 100644
new mode 100755
index f14dc19..ba71e4f
--- a/yum/i18n.py
+++ b/yum/i18n.py
@@ -234,3 +234,40 @@ except:
returning the same text
'''
_ = dummy_wrapper
+
+if __name__ == "__main__":
+ import sys
+
+ print " ---- Arguments/str ---- "
+ for arg in sys.argv[1:]:
+ arg = to_utf8(arg)
+ print "UTF8 :", arg
+ print "len :", len(arg)
+ print "valid:", utf8_valid(arg)
+ print "width:", utf8_width(arg)
+ print ''
+
+ print " ---- Arguments/gettext ---- "
+ for arg in sys.argv[1:]:
+ arg = to_utf8(_(arg))
+ print "UTF8 :", arg
+ print "len :", len(arg)
+ print "valid:", utf8_valid(arg)
+ print "width:", utf8_width(arg)
+ print ''
+
+ if len(sys.argv) > 2:
+ print " ---- Arguments/str/all ---- "
+ arg = to_utf8(sys.argv[1] % sys.argv[2:])
+ print "UTF8 :", arg
+ print "len :", len(arg)
+ print "valid:", utf8_valid(arg)
+ print "width:", utf8_width(arg)
+ print ''
+ print " ---- Arguments/gettext/all ---- "
+ arg = to_utf8(_(sys.argv[1]) % map(_, sys.argv[2:]))
+ print "UTF8 :", arg
+ print "len :", len(arg)
+ print "valid:", utf8_valid(arg)
+ print "width:", utf8_width(arg)
+ print ''
commit ea1caa2bdd7e9d8fd22e2eeebef53ad810c94fdb
Author: James Antill <james at and.org>
Date: Wed Oct 29 17:05:05 2008 -0400
Add utf8_width to i18n, so we can do progress bars etc. well
diff --git a/yum/i18n.py b/yum/i18n.py
index 86f3ce2..f14dc19 100644
--- a/yum/i18n.py
+++ b/yum/i18n.py
@@ -13,11 +13,211 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+from yum.misc import to_unicode, to_utf8
+
def dummy_wrapper(str):
'''
Dummy Translation wrapper, just returning the same string.
'''
- return str
+ return to_unicode(str)
+
+# This is ported from ustr_utf8_* which I got from:
+# http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+# I've tried to leave it close to the original C (same names etc.) so that
+# it is easy to read/compare both versions...
+
+# ----------------------------- BEG utf8 -----------------------------
+# This is an implementation of wcwidth() and wcswidth() (defined in
+# IEEE Std 1003.1-2001) for Unicode.
+#
+# http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
+# http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
+#
+# In fixed-width output devices, Latin characters all occupy a single
+# "cell" position of equal width, whereas ideographic CJK characters
+# occupy two such cells. Interoperability between terminal-line
+# applications and (teletype-style) character terminals using the
+# UTF-8 encoding requires agreement on which character should advance
+# the cursor by how many cell positions. No established formal
+# standards exist at present on which Unicode character shall occupy
+# how many cell positions on character terminals. These routines are
+# a first attempt of defining such behavior based on simple rules
+# applied to data provided by the Unicode Consortium.
+#
+# [...]
+#
+# Markus Kuhn -- 2007-05-26 (Unicode 5.0)
+#
+# Permission to use, copy, modify, and distribute this software
+# for any purpose and without fee is hereby granted. The author
+# disclaims all warranties with regard to this software.
+#
+# Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+
+def __utf8_bisearch(ucs, table):
+ """ auxiliary function for binary search in interval table. """
+
+ min = 0
+ max = len(table) - 1
+ if ucs < table[min][0] or ucs > table[max][1]:
+ return False
+
+ while max >= min:
+ mid = (min + max) / 2;
+ if ucs > table[mid][1]:
+ min = mid + 1;
+ elif ucs < table[mid][0]:
+ max = mid - 1;
+ else:
+ return True
+
+ return False
+
+def __utf8_ucp_width(ucs):
+ """ Get the textual width of a ucs character. """
+ # sorted list of non-overlapping intervals of non-spacing characters
+ # generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
+ combining = [
+ ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ),
+ ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ),
+ ( 0x05C4, 0x05C5 ), ( 0x05C7, 0x05C7 ), ( 0x0600, 0x0603 ),
+ ( 0x0610, 0x0615 ), ( 0x064B, 0x065E ), ( 0x0670, 0x0670 ),
+ ( 0x06D6, 0x06E4 ), ( 0x06E7, 0x06E8 ), ( 0x06EA, 0x06ED ),
+ ( 0x070F, 0x070F ), ( 0x0711, 0x0711 ), ( 0x0730, 0x074A ),
+ ( 0x07A6, 0x07B0 ), ( 0x07EB, 0x07F3 ), ( 0x0901, 0x0902 ),
+ ( 0x093C, 0x093C ), ( 0x0941, 0x0948 ), ( 0x094D, 0x094D ),
+ ( 0x0951, 0x0954 ), ( 0x0962, 0x0963 ), ( 0x0981, 0x0981 ),
+ ( 0x09BC, 0x09BC ), ( 0x09C1, 0x09C4 ), ( 0x09CD, 0x09CD ),
+ ( 0x09E2, 0x09E3 ), ( 0x0A01, 0x0A02 ), ( 0x0A3C, 0x0A3C ),
+ ( 0x0A41, 0x0A42 ), ( 0x0A47, 0x0A48 ), ( 0x0A4B, 0x0A4D ),
+ ( 0x0A70, 0x0A71 ), ( 0x0A81, 0x0A82 ), ( 0x0ABC, 0x0ABC ),
+ ( 0x0AC1, 0x0AC5 ), ( 0x0AC7, 0x0AC8 ), ( 0x0ACD, 0x0ACD ),
+ ( 0x0AE2, 0x0AE3 ), ( 0x0B01, 0x0B01 ), ( 0x0B3C, 0x0B3C ),
+ ( 0x0B3F, 0x0B3F ), ( 0x0B41, 0x0B43 ), ( 0x0B4D, 0x0B4D ),
+ ( 0x0B56, 0x0B56 ), ( 0x0B82, 0x0B82 ), ( 0x0BC0, 0x0BC0 ),
+ ( 0x0BCD, 0x0BCD ), ( 0x0C3E, 0x0C40 ), ( 0x0C46, 0x0C48 ),
+ ( 0x0C4A, 0x0C4D ), ( 0x0C55, 0x0C56 ), ( 0x0CBC, 0x0CBC ),
+ ( 0x0CBF, 0x0CBF ), ( 0x0CC6, 0x0CC6 ), ( 0x0CCC, 0x0CCD ),
+ ( 0x0CE2, 0x0CE3 ), ( 0x0D41, 0x0D43 ), ( 0x0D4D, 0x0D4D ),
+ ( 0x0DCA, 0x0DCA ), ( 0x0DD2, 0x0DD4 ), ( 0x0DD6, 0x0DD6 ),
+ ( 0x0E31, 0x0E31 ), ( 0x0E34, 0x0E3A ), ( 0x0E47, 0x0E4E ),
+ ( 0x0EB1, 0x0EB1 ), ( 0x0EB4, 0x0EB9 ), ( 0x0EBB, 0x0EBC ),
+ ( 0x0EC8, 0x0ECD ), ( 0x0F18, 0x0F19 ), ( 0x0F35, 0x0F35 ),
+ ( 0x0F37, 0x0F37 ), ( 0x0F39, 0x0F39 ), ( 0x0F71, 0x0F7E ),
+ ( 0x0F80, 0x0F84 ), ( 0x0F86, 0x0F87 ), ( 0x0F90, 0x0F97 ),
+ ( 0x0F99, 0x0FBC ), ( 0x0FC6, 0x0FC6 ), ( 0x102D, 0x1030 ),
+ ( 0x1032, 0x1032 ), ( 0x1036, 0x1037 ), ( 0x1039, 0x1039 ),
+ ( 0x1058, 0x1059 ), ( 0x1160, 0x11FF ), ( 0x135F, 0x135F ),
+ ( 0x1712, 0x1714 ), ( 0x1732, 0x1734 ), ( 0x1752, 0x1753 ),
+ ( 0x1772, 0x1773 ), ( 0x17B4, 0x17B5 ), ( 0x17B7, 0x17BD ),
+ ( 0x17C6, 0x17C6 ), ( 0x17C9, 0x17D3 ), ( 0x17DD, 0x17DD ),
+ ( 0x180B, 0x180D ), ( 0x18A9, 0x18A9 ), ( 0x1920, 0x1922 ),
+ ( 0x1927, 0x1928 ), ( 0x1932, 0x1932 ), ( 0x1939, 0x193B ),
+ ( 0x1A17, 0x1A18 ), ( 0x1B00, 0x1B03 ), ( 0x1B34, 0x1B34 ),
+ ( 0x1B36, 0x1B3A ), ( 0x1B3C, 0x1B3C ), ( 0x1B42, 0x1B42 ),
+ ( 0x1B6B, 0x1B73 ), ( 0x1DC0, 0x1DCA ), ( 0x1DFE, 0x1DFF ),
+ ( 0x200B, 0x200F ), ( 0x202A, 0x202E ), ( 0x2060, 0x2063 ),
+ ( 0x206A, 0x206F ), ( 0x20D0, 0x20EF ), ( 0x302A, 0x302F ),
+ ( 0x3099, 0x309A ), ( 0xA806, 0xA806 ), ( 0xA80B, 0xA80B ),
+ ( 0xA825, 0xA826 ), ( 0xFB1E, 0xFB1E ), ( 0xFE00, 0xFE0F ),
+ ( 0xFE20, 0xFE23 ), ( 0xFEFF, 0xFEFF ), ( 0xFFF9, 0xFFFB ),
+ ( 0x10A01, 0x10A03 ), ( 0x10A05, 0x10A06 ), ( 0x10A0C, 0x10A0F ),
+ ( 0x10A38, 0x10A3A ), ( 0x10A3F, 0x10A3F ), ( 0x1D167, 0x1D169 ),
+ ( 0x1D173, 0x1D182 ), ( 0x1D185, 0x1D18B ), ( 0x1D1AA, 0x1D1AD ),
+ ( 0x1D242, 0x1D244 ), ( 0xE0001, 0xE0001 ), ( 0xE0020, 0xE007F ),
+ ( 0xE0100, 0xE01EF )]
+
+ # test for 8-bit control characters
+ if ucs == 0:
+ return 0
+
+ if ucs < 32 or (ucs >= 0x7f and ucs < 0xa0):
+ return (-1)
+
+ if __utf8_bisearch(ucs, combining):
+ return 0
+
+ # if we arrive here, ucs is not a combining or C0/C1 control character
+
+ return (1 +
+ (ucs >= 0x1100 and
+ (ucs <= 0x115f or # Hangul Jamo init. consonants
+ ucs == 0x2329 or ucs == 0x232a or
+ (ucs >= 0x2e80 and ucs <= 0xa4cf and
+ ucs != 0x303f) or # CJK ... Yi
+ (ucs >= 0xac00 and ucs <= 0xd7a3) or # Hangul Syllables
+ (ucs >= 0xf900 and ucs <= 0xfaff) or # CJK Compatibility Ideographs
+ (ucs >= 0xfe10 and ucs <= 0xfe19) or # Vertical forms
+ (ucs >= 0xfe30 and ucs <= 0xfe6f) or # CJK Compatibility Forms
+ (ucs >= 0xff00 and ucs <= 0xff60) or # Fullwidth Forms
+ (ucs >= 0xffe0 and ucs <= 0xffe6) or
+ (ucs >= 0x20000 and ucs <= 0x2fffd) or
+ (ucs >= 0x30000 and ucs <= 0x3fffd))))
+
+
+def __utf8_iter_ints(msg):
+ for byte in to_utf8(msg):
+ yield ord(byte)
+def __utf8_iter_ucs(msg):
+ uiter = __utf8_iter_ints(msg)
+ for byte0 in uiter:
+ if byte0 < 0x80: # 0xxxxxxx
+ yield (byte0, 1)
+ elif (byte0 & 0xe0) == 0xc0: # 110XXXXx 10xxxxxx
+ byte1 = uiter.next()
+ if (((byte0 & 0xc0) != 0x80) or
+ ((byte1 & 0xfe) == 0xc0)): # overlong?
+ yield (None, 2)
+ return
+ yield ((((byte0 & 0x1f) << 6) | (byte1 & 0x3f)), 2)
+ elif (byte0 & 0xf0) == 0xe0: # 1110XXXX 10Xxxxxx 10xxxxxx
+ byte1 = uiter.next()
+ byte2 = uiter.next()
+ if (((byte1 & 0xc0) != 0x80) or ((byte2 & 0xc0) != 0x80) or
+ ((byte0 == 0xe0) and ((byte1 & 0xe0) == 0x80)) or # overlong?
+ ((byte0 == 0xed) and ((byte1 & 0xe0) == 0xa0)) or # surrogate?
+ ((byte0 == 0xef) and (byte1 == 0xbf) and
+ ((byte2 & 0xfe) == 0xbe))): # U+FFFE or U+FFFF?
+ yield (None, 3)
+ return
+ yield ((((byte0 & 0x0f) << 12) | ((byte1 & 0x3f) << 6) |
+ (byte2 & 0x3f)), 3)
+ elif (byte0 & 0xf8) == 0xf0: # 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
+ byte1 = uiter.next()
+ byte2 = uiter.next()
+ byte3 = uiter.next()
+ if (((byte1 & 0xc0) != 0x80) or
+ ((byte2 & 0xc0) != 0x80) or
+ ((byte3 & 0xc0) != 0x80) or
+ ((byte0 == 0xf0) and ((byte1 & 0xf0) == 0x80)) or # overlong?
+ ((byte0 == 0xf4) and (byte1 > 0x8f)) or # > U+10FFFF?
+ (byte0 > 0xf4)): # > U+10FFFF?
+ yield (None, 4)
+ return
+
+ yield ((((byte0 & 0x07) << 18) | ((byte1 & 0x3f) << 12) |
+ ((byte2 & 0x3f) << 6) | (byte3 & 0x3f)), 4)
+ else:
+ yield (None, 1)
+ return
+
+def utf8_width(msg):
+ """ Get the textual width of a utf8 string. """
+ ret = 0
+ for (ucs, bytes) in __utf8_iter_ucs(msg):
+ if ucs is None:
+ ret += bytes # Ugly ... should not feed bad utf8
+ else:
+ ret += __utf8_ucp_width(ucs)
+ return ret
+
+def utf8_valid(msg):
+ """ Return True/False if the text is valid utf8. """
+ for (ucs, bytes) in __utf8_iter_ucs(msg):
+ if ucs is None:
+ return False
+ return True
+# ----------------------------- END utf8 -----------------------------
try:
'''
More information about the Yum-commits
mailing list