[Rpm-metadata] [PATCH] Switch --update to use the sqlite dbs instead of the xml files. This should massively reduce the memory footprint and, hopefully, only marginally impact performance.
tim.lauridsen at gmail.com
tim.lauridsen at gmail.com
Sun Jul 17 09:20:55 UTC 2011
On Fri, Jul 15, 2011 at 11:53 PM, Seth Vidal <skvidal at fedoraproject.org> wrote:
> ---
> createrepo/__init__.py | 40 ++------
> createrepo/readMetadata.py | 240 +++++++++++++------------------------------
> 2 files changed, 83 insertions(+), 197 deletions(-)
>
> diff --git a/createrepo/__init__.py b/createrepo/__init__.py
> index 44035cc..8549188 100644
> --- a/createrepo/__init__.py
> +++ b/createrepo/__init__.py
> @@ -530,39 +530,19 @@ class MetaDataGenerator:
> old_pkg = pkg
> if pkg.find("://") != -1:
> old_pkg = os.path.basename(pkg)
> - nodes = self.oldData.getNodes(old_pkg)
> - if nodes is not None: # we have a match in the old metadata
> + old_po = self.oldData.getNodes(old_pkg)
> + if old_po: # we have a match in the old metadata
> if self.conf.verbose:
> self.callback.log(_("Using data from old metadata for %s")
> % pkg)
> - (primarynode, filenode, othernode) = nodes
> -
> - for node, outfile in ((primarynode, self.primaryfile),
> - (filenode, self.flfile),
> - (othernode, self.otherfile)):
> - if node is None:
> - break
> -
> - if self.conf.baseurl:
> - anode = node.children
> - while anode is not None:
> - if anode.type != "element":
> - anode = anode.next
> - continue
> - if anode.name == "location":
> - anode.setProp('xml:base', self.conf.baseurl)
> - anode = anode.next
> -
> - output = node.serialize('UTF-8', self.conf.pretty)
> - if output:
> - outfile.write(output)
> - else:
> - if self.conf.verbose:
> - self.callback.log(_("empty serialize on write to" \
> - "%s in %s") % (outfile, pkg))
> - outfile.write('\n')
> -
> - self.oldData.freeNodes(pkg)
> +
> + if self.conf.baseurl: # if we have a baseurl set, reset the one
> + # in the old pkg
> + old_po.basepath = self.conf.baseurl
> + self.primaryfile.write(old_po.xml_dump_primary_metadata())
> + self.flfile.write(old_po.xml_dump_filelists_metadata())
> + self.otherfile.write(old_po.xml_dump_other_metadata())
> +
> #FIXME - if we're in update and we have deltas enabled
> # check the presto data for this pkg and write its info back out
> # to our deltafile
> diff --git a/createrepo/readMetadata.py b/createrepo/readMetadata.py
> index 27d3690..a449e68 100644
> --- a/createrepo/readMetadata.py
> +++ b/createrepo/readMetadata.py
> @@ -16,11 +16,25 @@
> # Copyright 2006 Red Hat
>
> import os
> -import libxml2
> import stat
> from utils import errorprint, _
>
> -from yum import repoMDObject
> +import yum
> +from yum import misc
> +
> +
> +class CreaterepoPkgOld(yum.sqlitesack.YumAvailablePackageSqlite):
> + # special for special people like us.
> + def _return_remote_location(self):
> +
> + if self.basepath:
> + msg = """<location xml:base="%s" href="%s"/>\n""" % (
> + misc.to_xml(self.basepath, attrib=True),
> + misc.to_xml(self.relativepath, attrib=True))
> + else:
> + msg = """<location href="%s"/>\n""" % misc.to_xml(self.relativepath, attrib=True)
> +
> + return msg
>
>
> class MetadataIndex(object):
> @@ -30,178 +44,70 @@ class MetadataIndex(object):
> opts = {}
> self.opts = opts
> self.outputdir = outputdir
> + realpath = os.path.realpath(outputdir)
> repodatadir = self.outputdir + '/repodata'
> - myrepomdxml = repodatadir + '/repomd.xml'
> - if os.path.exists(myrepomdxml):
> - repomd = repoMDObject.RepoMD('garbageid', myrepomdxml)
> - b = repomd.getData('primary').location[1]
> - f = repomd.getData('filelists').location[1]
> - o = repomd.getData('other').location[1]
> - basefile = os.path.join(self.outputdir, b)
> - filelistfile = os.path.join(self.outputdir, f)
> - otherfile = os.path.join(self.outputdir, o)
> - else:
> - basefile = filelistfile = otherfile = ""
> -
> - self.files = {'base' : basefile,
> - 'filelist' : filelistfile,
> - 'other' : otherfile}
> + self._repo = yum.yumRepo.YumRepository('garbageid')
> + self._repo.baseurl = 'file://' + realpath
> + self._repo.basecachedir = misc.getCacheDir()
> + self._repo.metadata_expire = 1
> + self._repo.gpgcheck = 0
> + self._repo.repo_gpgcheck = 0
> + self._repo._sack = yum.sqlitesack.YumSqlitePackageSack(CreaterepoPkgOld)
> + self.pkg_tups_by_path = {}
> self.scan()
> +
>
> def scan(self):
> - """Read in and index old repo data"""
> - self.basenodes = {}
> - self.filesnodes = {}
> - self.othernodes = {}
> - self.pkg_ids = {}
> + """Read in old repodata"""
> if self.opts.get('verbose'):
> print _("Scanning old repo data")
> - for fn in self.files.values():
> - if not os.path.exists(fn):
> - #cannot scan
> - errorprint(_("Warning: Old repodata file missing: %s") % fn)
> - return
> - root = libxml2.parseFile(self.files['base']).getRootElement()
> - self._scanPackageNodes(root, self._handleBase)
> - if self.opts.get('verbose'):
> - print _("Indexed %i base nodes" % len(self.basenodes))
> - root = libxml2.parseFile(self.files['filelist']).getRootElement()
> - self._scanPackageNodes(root, self._handleFiles)
> - if self.opts.get('verbose'):
> - print _("Indexed %i filelist nodes" % len(self.filesnodes))
> - root = libxml2.parseFile(self.files['other']).getRootElement()
> - self._scanPackageNodes(root, self._handleOther)
> - if self.opts.get('verbose'):
> - print _("Indexed %i other nodes" % len(self.othernodes))
> - #reverse index pkg ids to track references
> - self.pkgrefs = {}
> - for relpath, pkgid in self.pkg_ids.iteritems():
> - self.pkgrefs.setdefault(pkgid,[]).append(relpath)
> -
> - def _scanPackageNodes(self, root, handler):
> - node = root.children
> - while node is not None:
> - if node.type != "element":
> - node = node.next
> + self._repo.sack.populate(self._repo, 'all', None, False)
> + for thispo in self._repo.sack:
> + mtime = thispo.filetime
> + size = thispo.size
> + relpath = thispo.relativepath
> + do_stat = self.opts.get('do_stat', True)
> + if mtime is None:
> + print _("mtime missing for %s") % relpath
> continue
> - if node.name == "package":
> - handler(node)
> - node = node.next
> -
> - def _handleBase(self, node):
> - top = node
> - node = node.children
> - pkgid = None
> - mtime = None
> - size = None
> - relpath = None
> - do_stat = self.opts.get('do_stat', True)
> - while node is not None:
> - if node.type != "element":
> - node = node.next
> + if size is None:
> + print _("size missing for %s") % relpath
> continue
> - if node.name == "checksum":
> - pkgid = node.content
> - elif node.name == "time":
> - mtime = int(node.prop('file'))
> - elif node.name == "size":
> - size = int(node.prop('package'))
> - elif node.name == "location":
> - relpath = node.prop('href')
> - node = node.next
> - if relpath is None:
> - print _("Incomplete data for node")
> - return
> - if pkgid is None:
> - print _("pkgid missing for %s") % relpath
> - return
> - if mtime is None:
> - print _("mtime missing for %s") % relpath
> - return
> - if size is None:
> - print _("size missing for %s") % relpath
> - return
> - if do_stat:
> - filepath = os.path.join(self.opts['pkgdir'], relpath)
> - try:
> - st = os.stat(filepath)
> - except OSError:
> - #file missing -- ignore
> - return
> - if not stat.S_ISREG(st.st_mode):
> - #ignore non files
> - return
> - #check size and mtime
> - if st.st_size != size:
> - if self.opts.get('verbose'):
> - print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
> - return
> - if int(st.st_mtime) != mtime:
> - if self.opts.get('verbose'):
> - print _("Modification time changed for %s") % filepath
> - return
> - #otherwise we index
> - self.basenodes[relpath] = top
> - self.pkg_ids[relpath] = pkgid
> -
> - def _handleFiles(self, node):
> - pkgid = node.prop('pkgid')
> - if pkgid:
> - self.filesnodes[pkgid] = node
> -
> - def _handleOther(self, node):
> - pkgid = node.prop('pkgid')
> - if pkgid:
> - self.othernodes[pkgid] = node
> + if do_stat:
> + filepath = os.path.join(self.opts['pkgdir'], relpath)
> + try:
> + st = os.stat(filepath)
> + except OSError:
> + #file missing -- ignore
> + continue
> + if not stat.S_ISREG(st.st_mode):
> + #ignore non files
> + continue
> + #check size and mtime
> + if st.st_size != size:
> + if self.opts.get('verbose'):
> + print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
> + continue
> + if int(st.st_mtime) != mtime:
> + if self.opts.get('verbose'):
> + print _("Modification time changed for %s") % filepath
> + continue
> +
> + self.pkg_tups_by_path[relpath] = thispo.pkgtup
> +
>
> - def getNodes(self, relpath):
> - """Return base, filelist, and other nodes for file, if they exist
>
> - Returns a tuple of nodes, or None if not found
> + def getNodes(self, relpath):
> + """return a package object based on relative path of pkg
> """
> - bnode = self.basenodes.get(relpath,None)
> - if bnode is None:
> - return None
> - pkgid = self.pkg_ids.get(relpath,None)
> - if pkgid is None:
> - print _("No pkgid found for: %s") % relpath
> - return None
> - fnode = self.filesnodes.get(pkgid,None)
> - if fnode is None:
> - return None
> - onode = self.othernodes.get(pkgid,None)
> - if onode is None:
> - return None
> - return bnode, fnode, onode
> -
> - def freeNodes(self,relpath):
> - #causing problems
> - """Free up nodes corresponding to file, if possible"""
> - bnode = self.basenodes.get(relpath,None)
> - if bnode is None:
> - print "Missing node for %s" % relpath
> - return
> - bnode.unlinkNode()
> - bnode.freeNode()
> - del self.basenodes[relpath]
> - pkgid = self.pkg_ids.get(relpath,None)
> - if pkgid is None:
> - print _("No pkgid found for: %s") % relpath
> + if relpath in self.pkg_tups_by_path:
> + pkgtup = self.pkg_tups_by_path[relpath]
> + return self._repo.sack.searchPkgTuple(pkgtup)[0]
> + else:
> + print _("No pkg found for: %s") % relpath
> return None
> - del self.pkg_ids[relpath]
> - dups = self.pkgrefs.get(pkgid)
> - dups.remove(relpath)
> - if len(dups):
> - #still referenced
> - return
> - del self.pkgrefs[pkgid]
> - for nodes in self.filesnodes, self.othernodes:
> - node = nodes.get(pkgid)
> - if node is not None:
> - node.unlinkNode()
> - node.freeNode()
> - del nodes[pkgid]
>
> +
>
> if __name__ == "__main__":
> cwd = os.getcwd()
> @@ -209,9 +115,9 @@ if __name__ == "__main__":
> 'pkgdir': cwd}
>
> idx = MetadataIndex(cwd, opts)
> - for fn in idx.basenodes.keys():
> - a,b,c, = idx.getNodes(fn)
> - a.serialize()
> - b.serialize()
> - c.serialize()
> - idx.freeNodes(fn)
> + for fn in idx.pkg_tups_by_path:
> + po = idx.getNodes(fn)
> + print po.xml_dump_primary_metadata()
> + print po.xml_dump_filelists_metadata()
> + print po.xml_dump_other_metadata()
> +
> --
> 1.7.4.4
>
> _______________________________________________
> Rpm-metadata mailing list
> Rpm-metadata at lists.baseurl.org
> http://lists.baseurl.org/mailman/listinfo/rpm-metadata
>
ACK, Looks good to me
Tim
More information about the Rpm-metadata mailing list