[Yum-devel] [PATCH] Avoid converting to unicode and back in dump_xml_*. BZ 716235.

Tue Nov 27 17:42:19 UTC 2012

On Fri, 2012-11-23 at 10:51 +0100, Zdeněk Pavlas wrote:
> Some speedup in createpo (from 2 to 6 percent)
> ---
>  yum/packages.py |   43 +++++++++++++++++++------------------------
>  1 files changed, 19 insertions(+), 24 deletions(-)

 What did you test this on?

> diff --git a/yum/packages.py b/yum/packages.py
> index 1ce45f4..6bbff7a 100644
> --- a/yum/packages.py
> +++ b/yum/packages.py
> @@ -857,22 +857,14 @@ class YumAvailablePackage(PackageObject, RpmBase):
>          if hasattr(self, '_committer_ret'):
>              return self._committer_ret
>  
> -        def _nf2ascii(x):
> -            """ does .encode("ascii", "replace") but it never fails. """
> -            ret = []
> -            for val in x:
> -                if ord(val) >= 128:
> -                    val = '?'
> -                ret.append(val)
> -            return "".join(ret)
> -
>          if not len(self.changelog): # Empty changelog is _possible_ I guess
>              self._committer_ret = self.packager
>              return self._committer_ret
>          val = self.changelog[0][1]
>          # Chagnelog data is in multiple locale's, so we convert to ascii
>          # ignoring "bad" chars.
> -        val = _nf2ascii(val)
> +        val = misc.to_unicode(val, 'replace')
> +        val = val.encode('ascii', 'replace')

 This is old, so maybe Python has been fixed since then ... but _nf2ascii was
written because we got a lot of randomly encoded stuff in changelog
files, and Python would just traceback when it hit it.

>          # Hacky way to get rid of version numbers...
>          ix = val.find('> ')
>          if ix != -1:
> @@ -1123,10 +1115,10 @@ class YumAvailablePackage(PackageObject, RpmBase):
>          
>          packager = url = ''
>          if self.packager:
> -            packager = misc.to_unicode(misc.to_xml(self.packager))
> +            packager = misc.to_xml(self.packager)
>          
>          if self.url:
> -            url = misc.to_unicode(misc.to_xml(self.url))
> +            url = misc.to_xml(self.url)
>          (csum_type, csum, csumid) = self.checksums[0]
>          msg = """
>    <name>%s</name>
> @@ -1140,8 +1132,8 @@ class YumAvailablePackage(PackageObject, RpmBase):
>    <time file="%s" build="%s"/>
>    <size package="%s" installed="%s" archive="%s"/>\n""" % (self.name, 
>           self.arch, self.epoch, self.ver, self.rel, csum_type, csum, 
> -         misc.to_unicode(misc.to_xml(self.summary)), 
> -         misc.to_unicode(misc.to_xml(self.description)), 
> +         misc.to_xml(self.summary),
> +         misc.to_xml(self.description),
>           packager, url, self.filetime, 
>           self.buildtime, self.packagesize, self.installedsize, self.archivesize)

 These might be fine, but...

> @@ -1314,25 +1306,28 @@ class YumAvailablePackage(PackageObject, RpmBase):
>  
>      def xml_dump_primary_metadata(self):
>          msg = """\n<package type="rpm">"""
> -        msg += misc.to_unicode(self._dump_base_items())
> -        msg += misc.to_unicode(self._dump_format_items())
> +        msg += self._dump_base_items()
> +        msg += self._dump_format_items()
>          msg += """\n</package>"""
> -        return misc.to_utf8(msg)
> +        assert type(msg) is str
> +        return msg

 This scares me, as Python has traditionally been _very_ picky about how
+= behaves when str and unicode are mixed ... e.g.:

# 1. fine, msg is still str at the end.
msg = "x"
msg += "☺"

# 2. fine, msg is now unicode at the end.
msg = "x"
msg += "☺".decode('utf-8') 

# 3. fine, msg is still unicode at the end.
msg = u"x"
msg += "☺".decode('utf-8') 

# 4. fine, msg is still unicode at the end.
msg = u"x"
msg += "y"

# 5. python goes bang.
msg = u"x"
msg += "☺"

# 6. python goes bang, due to the automatic conversion to unicode shown in #2.
msg = "x"
msg += "☺".decode('utf-8') 
msg += "☺"

# 7. python goes bang, same reason as #5.
msg = "x"
msg += u"x" 
msg += "☺"

# 8. python goes bang, because it hates everyone (appending unicode forces an implicit ASCII decode of the non-ASCII str).
msg = "x"
msg += "☺"
msg += u"x" 

>      def xml_dump_filelists_metadata(self):
>          msg = """\n<package pkgid="%s" name="%s" arch="%s">
>      <version epoch="%s" ver="%s" rel="%s"/>\n""" % (self.checksum, self.name, 
>                                       self.arch, self.epoch, self.ver, self.rel)
> -        msg += misc.to_unicode(self._dump_files())
> +        msg += self._dump_files()
>          msg += "</package>\n"
> -        return misc.to_utf8(msg)
> +        assert type(msg) is str
> +        return msg
>  
>      def xml_dump_other_metadata(self, clog_limit=0):
>          msg = """\n<package pkgid="%s" name="%s" arch="%s">
>      <version epoch="%s" ver="%s" rel="%s"/>\n""" % (self.checksum, self.name, 
>                                       self.arch, self.epoch, self.ver, self.rel)
> -        msg += "%s\n</package>\n" % misc.to_unicode(self._dump_changelog(clog_limit))
> -        return misc.to_utf8(msg)
> +        msg += "%s\n</package>\n" % self._dump_changelog(clog_limit)
> +        assert type(msg) is str
> +        return msg
>  
> 
>  # HACK: This is completely retarded. Don't blame me, someone just fix
> @@ -1519,9 +1514,9 @@ class YumHeaderPackage(YumAvailablePackage):
>          # then create a _loadChangelog() method to put them into the 
>          # self._changelog attr
>          if len(self.hdr['changelogname']) > 0:
> -            return zip(misc.to_unicode(self.hdr['changelogtime'], errors='replace'),
> -                       misc.to_unicode(self.hdr['changelogname'], errors='replace'),
> -                       misc.to_unicode(self.hdr['changelogtext'], errors='replace'))
> +            return zip(self.hdr['changelogtime'],
> +                       self.hdr['changelogname'],
> +                       self.hdr['changelogtext'])

 This also changes the types we return: the changelog values are no longer converted to unicode.