[Rpm-metadata] [PATCH] switch --update to use the sqlite dbs instead of the xml files. Should massively reduce the memory footprint and hopefully only marginally impact performance.
Seth Vidal
skvidal at fedoraproject.org
Fri Jul 15 21:53:43 UTC 2011
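
For context, the update path now looks roughly like the sketch below: the old
repodata is loaded through yum's sqlite sacks instead of libxml2 trees, and any
package that hasn't changed on disk is re-serialized straight from the resulting
package object. This is only an illustration mirroring the __main__ block in
readMetadata.py; the repo path and option values are made up, while the class
and method names are the ones introduced by this patch.

# illustrative driver for the new sqlite-backed index (Python 2, like createrepo)
from createrepo.readMetadata import MetadataIndex

old_repo = '/srv/myrepo'   # hypothetical path to a repo that already has repodata/
opts = {'verbose': True, 'pkgdir': old_repo, 'do_stat': True}
idx = MetadataIndex(old_repo, opts)   # scan() fills idx.pkg_tups_by_path from the sqlite dbs

for relpath in idx.pkg_tups_by_path:
    po = idx.getNodes(relpath)        # now returns a package object, not libxml2 nodes
    if po is None:
        continue                      # unknown/changed path; createrepo would re-read the rpm instead
    # unchanged packages get their old metadata written back out verbatim
    print po.xml_dump_primary_metadata()
    print po.xml_dump_filelists_metadata()
    print po.xml_dump_other_metadata()
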
---
createrepo/__init__.py | 40 ++------
createrepo/readMetadata.py | 240 +++++++++++++------------------------------
2 files changed, 83 insertions(+), 197 deletions(-)
diff --git a/createrepo/__init__.py b/createrepo/__init__.py
index 44035cc..8549188 100644
--- a/createrepo/__init__.py
+++ b/createrepo/__init__.py
@@ -530,39 +530,19 @@ class MetaDataGenerator:
old_pkg = pkg
if pkg.find("://") != -1:
old_pkg = os.path.basename(pkg)
- nodes = self.oldData.getNodes(old_pkg)
- if nodes is not None: # we have a match in the old metadata
+ old_po = self.oldData.getNodes(old_pkg)
+ if old_po: # we have a match in the old metadata
if self.conf.verbose:
self.callback.log(_("Using data from old metadata for %s")
% pkg)
- (primarynode, filenode, othernode) = nodes
-
- for node, outfile in ((primarynode, self.primaryfile),
- (filenode, self.flfile),
- (othernode, self.otherfile)):
- if node is None:
- break
-
- if self.conf.baseurl:
- anode = node.children
- while anode is not None:
- if anode.type != "element":
- anode = anode.next
- continue
- if anode.name == "location":
- anode.setProp('xml:base', self.conf.baseurl)
- anode = anode.next
-
- output = node.serialize('UTF-8', self.conf.pretty)
- if output:
- outfile.write(output)
- else:
- if self.conf.verbose:
- self.callback.log(_("empty serialize on write to" \
- "%s in %s") % (outfile, pkg))
- outfile.write('\n')
-
- self.oldData.freeNodes(pkg)
+
+ if self.conf.baseurl: # if we have a baseurl set, reset the one
+ # in the old pkg
+ old_po.basepath = self.conf.baseurl
+ self.primaryfile.write(old_po.xml_dump_primary_metadata())
+ self.flfile.write(old_po.xml_dump_filelists_metadata())
+ self.otherfile.write(old_po.xml_dump_other_metadata())
+
#FIXME - if we're in update and we have deltas enabled
# check the presto data for this pkg and write its info back out
# to our deltafile
diff --git a/createrepo/readMetadata.py b/createrepo/readMetadata.py
index 27d3690..a449e68 100644
--- a/createrepo/readMetadata.py
+++ b/createrepo/readMetadata.py
@@ -16,11 +16,25 @@
# Copyright 2006 Red Hat
import os
-import libxml2
import stat
from utils import errorprint, _
-from yum import repoMDObject
+import yum
+from yum import misc
+
+
+class CreaterepoPkgOld(yum.sqlitesack.YumAvailablePackageSqlite):
+ # special for special people like us.
+ def _return_remote_location(self):
+
+ if self.basepath:
+ msg = """<location xml:base="%s" href="%s"/>\n""" % (
+ misc.to_xml(self.basepath, attrib=True),
+ misc.to_xml(self.relativepath, attrib=True))
+ else:
+ msg = """<location href="%s"/>\n""" % misc.to_xml(self.relativepath, attrib=True)
+
+ return msg
class MetadataIndex(object):
@@ -30,178 +44,70 @@ class MetadataIndex(object):
opts = {}
self.opts = opts
self.outputdir = outputdir
+ realpath = os.path.realpath(outputdir)
repodatadir = self.outputdir + '/repodata'
- myrepomdxml = repodatadir + '/repomd.xml'
- if os.path.exists(myrepomdxml):
- repomd = repoMDObject.RepoMD('garbageid', myrepomdxml)
- b = repomd.getData('primary').location[1]
- f = repomd.getData('filelists').location[1]
- o = repomd.getData('other').location[1]
- basefile = os.path.join(self.outputdir, b)
- filelistfile = os.path.join(self.outputdir, f)
- otherfile = os.path.join(self.outputdir, o)
- else:
- basefile = filelistfile = otherfile = ""
-
- self.files = {'base' : basefile,
- 'filelist' : filelistfile,
- 'other' : otherfile}
+ self._repo = yum.yumRepo.YumRepository('garbageid')
+ self._repo.baseurl = 'file://' + realpath
+ self._repo.basecachedir = misc.getCacheDir()
+ self._repo.metadata_expire = 1
+ self._repo.gpgcheck = 0
+ self._repo.repo_gpgcheck = 0
+ self._repo._sack = yum.sqlitesack.YumSqlitePackageSack(CreaterepoPkgOld)
+ self.pkg_tups_by_path = {}
self.scan()
+
def scan(self):
- """Read in and index old repo data"""
- self.basenodes = {}
- self.filesnodes = {}
- self.othernodes = {}
- self.pkg_ids = {}
+ """Read in old repodata"""
if self.opts.get('verbose'):
print _("Scanning old repo data")
- for fn in self.files.values():
- if not os.path.exists(fn):
- #cannot scan
- errorprint(_("Warning: Old repodata file missing: %s") % fn)
- return
- root = libxml2.parseFile(self.files['base']).getRootElement()
- self._scanPackageNodes(root, self._handleBase)
- if self.opts.get('verbose'):
- print _("Indexed %i base nodes" % len(self.basenodes))
- root = libxml2.parseFile(self.files['filelist']).getRootElement()
- self._scanPackageNodes(root, self._handleFiles)
- if self.opts.get('verbose'):
- print _("Indexed %i filelist nodes" % len(self.filesnodes))
- root = libxml2.parseFile(self.files['other']).getRootElement()
- self._scanPackageNodes(root, self._handleOther)
- if self.opts.get('verbose'):
- print _("Indexed %i other nodes" % len(self.othernodes))
- #reverse index pkg ids to track references
- self.pkgrefs = {}
- for relpath, pkgid in self.pkg_ids.iteritems():
- self.pkgrefs.setdefault(pkgid,[]).append(relpath)
-
- def _scanPackageNodes(self, root, handler):
- node = root.children
- while node is not None:
- if node.type != "element":
- node = node.next
+ self._repo.sack.populate(self._repo, 'all', None, False)
+ for thispo in self._repo.sack:
+ mtime = thispo.filetime
+ size = thispo.size
+ relpath = thispo.relativepath
+ do_stat = self.opts.get('do_stat', True)
+ if mtime is None:
+ print _("mtime missing for %s") % relpath
continue
- if node.name == "package":
- handler(node)
- node = node.next
-
- def _handleBase(self, node):
- top = node
- node = node.children
- pkgid = None
- mtime = None
- size = None
- relpath = None
- do_stat = self.opts.get('do_stat', True)
- while node is not None:
- if node.type != "element":
- node = node.next
+ if size is None:
+ print _("size missing for %s") % relpath
continue
- if node.name == "checksum":
- pkgid = node.content
- elif node.name == "time":
- mtime = int(node.prop('file'))
- elif node.name == "size":
- size = int(node.prop('package'))
- elif node.name == "location":
- relpath = node.prop('href')
- node = node.next
- if relpath is None:
- print _("Incomplete data for node")
- return
- if pkgid is None:
- print _("pkgid missing for %s") % relpath
- return
- if mtime is None:
- print _("mtime missing for %s") % relpath
- return
- if size is None:
- print _("size missing for %s") % relpath
- return
- if do_stat:
- filepath = os.path.join(self.opts['pkgdir'], relpath)
- try:
- st = os.stat(filepath)
- except OSError:
- #file missing -- ignore
- return
- if not stat.S_ISREG(st.st_mode):
- #ignore non files
- return
- #check size and mtime
- if st.st_size != size:
- if self.opts.get('verbose'):
- print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
- return
- if int(st.st_mtime) != mtime:
- if self.opts.get('verbose'):
- print _("Modification time changed for %s") % filepath
- return
- #otherwise we index
- self.basenodes[relpath] = top
- self.pkg_ids[relpath] = pkgid
-
- def _handleFiles(self, node):
- pkgid = node.prop('pkgid')
- if pkgid:
- self.filesnodes[pkgid] = node
-
- def _handleOther(self, node):
- pkgid = node.prop('pkgid')
- if pkgid:
- self.othernodes[pkgid] = node
+ if do_stat:
+ filepath = os.path.join(self.opts['pkgdir'], relpath)
+ try:
+ st = os.stat(filepath)
+ except OSError:
+ #file missing -- ignore
+ continue
+ if not stat.S_ISREG(st.st_mode):
+ #ignore non files
+ continue
+ #check size and mtime
+ if st.st_size != size:
+ if self.opts.get('verbose'):
+ print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
+ continue
+ if int(st.st_mtime) != mtime:
+ if self.opts.get('verbose'):
+ print _("Modification time changed for %s") % filepath
+ continue
+
+ self.pkg_tups_by_path[relpath] = thispo.pkgtup
+
- def getNodes(self, relpath):
- """Return base, filelist, and other nodes for file, if they exist
- Returns a tuple of nodes, or None if not found
+ def getNodes(self, relpath):
+ """return a package object based on relative path of pkg
"""
- bnode = self.basenodes.get(relpath,None)
- if bnode is None:
- return None
- pkgid = self.pkg_ids.get(relpath,None)
- if pkgid is None:
- print _("No pkgid found for: %s") % relpath
- return None
- fnode = self.filesnodes.get(pkgid,None)
- if fnode is None:
- return None
- onode = self.othernodes.get(pkgid,None)
- if onode is None:
- return None
- return bnode, fnode, onode
-
- def freeNodes(self,relpath):
- #causing problems
- """Free up nodes corresponding to file, if possible"""
- bnode = self.basenodes.get(relpath,None)
- if bnode is None:
- print "Missing node for %s" % relpath
- return
- bnode.unlinkNode()
- bnode.freeNode()
- del self.basenodes[relpath]
- pkgid = self.pkg_ids.get(relpath,None)
- if pkgid is None:
- print _("No pkgid found for: %s") % relpath
+ if relpath in self.pkg_tups_by_path:
+ pkgtup = self.pkg_tups_by_path[relpath]
+ return self._repo.sack.searchPkgTuple(pkgtup)[0]
+ else:
+ print _("No pkg found for: %s") % relpath
return None
- del self.pkg_ids[relpath]
- dups = self.pkgrefs.get(pkgid)
- dups.remove(relpath)
- if len(dups):
- #still referenced
- return
- del self.pkgrefs[pkgid]
- for nodes in self.filesnodes, self.othernodes:
- node = nodes.get(pkgid)
- if node is not None:
- node.unlinkNode()
- node.freeNode()
- del nodes[pkgid]
+
if __name__ == "__main__":
cwd = os.getcwd()
@@ -209,9 +115,9 @@ if __name__ == "__main__":
'pkgdir': cwd}
idx = MetadataIndex(cwd, opts)
- for fn in idx.basenodes.keys():
- a,b,c, = idx.getNodes(fn)
- a.serialize()
- b.serialize()
- c.serialize()
- idx.freeNodes(fn)
+ for fn in idx.pkg_tups_by_path:
+ po = idx.getNodes(fn)
+ print po.xml_dump_primary_metadata()
+ print po.xml_dump_filelists_metadata()
+ print po.xml_dump_other_metadata()
+
--
1.7.4.4