[Rpm-metadata] createrepo/__init__.py createrepo/readMetadata.py
skvidal at osuosl.org
Mon Jul 18 20:30:26 UTC 2011
createrepo/__init__.py | 40 +------
createrepo/readMetadata.py | 240 +++++++++++++--------------------------------
2 files changed, 83 insertions(+), 197 deletions(-)
New commits:
commit 0a67bc57a9eda626735513a4015d8087f3f4bb29
Author: Seth Vidal <skvidal at fedoraproject.org>
Date: Fri Jul 15 17:50:48 2011 -0400
switch --update to use the sqlite dbs instead of the xml files. Should massively impact
memory footprint and hopefully only marginally impact performance.
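For orientation, here is a minimal sketch of the read path this commit introduces; it follows the same flow as the new __main__ block in readMetadata.py below. The old repodata is loaded through yum's sqlite sack instead of being parsed with libxml2, and entries are re-serialized straight from the resulting package objects. The directory layout (a working directory containing repodata/) is assumed for illustration.

import os
from createrepo.readMetadata import MetadataIndex

outputdir = os.getcwd()                        # directory holding repodata/
opts = {'verbose': True, 'pkgdir': outputdir}
idx = MetadataIndex(outputdir, opts)           # populates the sqlite-backed sack

for relpath in idx.pkg_tups_by_path:
    old_po = idx.getNodes(relpath)             # package object, or None if not found
    if old_po is None:
        continue
    # re-emit the cached metadata without re-reading the rpm itself
    print old_po.xml_dump_primary_metadata()
    print old_po.xml_dump_filelists_metadata()
    print old_po.xml_dump_other_metadata()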
diff --git a/createrepo/__init__.py b/createrepo/__init__.py
index 44035cc..8549188 100644
--- a/createrepo/__init__.py
+++ b/createrepo/__init__.py
@@ -530,39 +530,19 @@ class MetaDataGenerator:
old_pkg = pkg
if pkg.find("://") != -1:
old_pkg = os.path.basename(pkg)
- nodes = self.oldData.getNodes(old_pkg)
- if nodes is not None: # we have a match in the old metadata
+ old_po = self.oldData.getNodes(old_pkg)
+ if old_po: # we have a match in the old metadata
if self.conf.verbose:
self.callback.log(_("Using data from old metadata for %s")
% pkg)
- (primarynode, filenode, othernode) = nodes
-
- for node, outfile in ((primarynode, self.primaryfile),
- (filenode, self.flfile),
- (othernode, self.otherfile)):
- if node is None:
- break
-
- if self.conf.baseurl:
- anode = node.children
- while anode is not None:
- if anode.type != "element":
- anode = anode.next
- continue
- if anode.name == "location":
- anode.setProp('xml:base', self.conf.baseurl)
- anode = anode.next
-
- output = node.serialize('UTF-8', self.conf.pretty)
- if output:
- outfile.write(output)
- else:
- if self.conf.verbose:
- self.callback.log(_("empty serialize on write to" \
- "%s in %s") % (outfile, pkg))
- outfile.write('\n')
-
- self.oldData.freeNodes(pkg)
+
+ if self.conf.baseurl: # if we have a baseurl set, reset the one
+ # in the old pkg
+ old_po.basepath = self.conf.baseurl
+ self.primaryfile.write(old_po.xml_dump_primary_metadata())
+ self.flfile.write(old_po.xml_dump_filelists_metadata())
+ self.otherfile.write(old_po.xml_dump_other_metadata())
+
#FIXME - if we're in update and we have deltas enabled
# check the presto data for this pkg and write its info back out
# to our deltafile
diff --git a/createrepo/readMetadata.py b/createrepo/readMetadata.py
index 27d3690..a449e68 100644
--- a/createrepo/readMetadata.py
+++ b/createrepo/readMetadata.py
@@ -16,11 +16,25 @@
# Copyright 2006 Red Hat
import os
-import libxml2
import stat
from utils import errorprint, _
-from yum import repoMDObject
+import yum
+from yum import misc
+
+
+class CreaterepoPkgOld(yum.sqlitesack.YumAvailablePackageSqlite):
+ # special for special people like us.
+ def _return_remote_location(self):
+
+ if self.basepath:
+ msg = """<location xml:base="%s" href="%s"/>\n""" % (
+ misc.to_xml(self.basepath, attrib=True),
+ misc.to_xml(self.relativepath, attrib=True))
+ else:
+ msg = """<location href="%s"/>\n""" % misc.to_xml(self.relativepath, attrib=True)
+
+ return msg
class MetadataIndex(object):
@@ -30,178 +44,70 @@ class MetadataIndex(object):
opts = {}
self.opts = opts
self.outputdir = outputdir
+ realpath = os.path.realpath(outputdir)
repodatadir = self.outputdir + '/repodata'
- myrepomdxml = repodatadir + '/repomd.xml'
- if os.path.exists(myrepomdxml):
- repomd = repoMDObject.RepoMD('garbageid', myrepomdxml)
- b = repomd.getData('primary').location[1]
- f = repomd.getData('filelists').location[1]
- o = repomd.getData('other').location[1]
- basefile = os.path.join(self.outputdir, b)
- filelistfile = os.path.join(self.outputdir, f)
- otherfile = os.path.join(self.outputdir, o)
- else:
- basefile = filelistfile = otherfile = ""
-
- self.files = {'base' : basefile,
- 'filelist' : filelistfile,
- 'other' : otherfile}
+ self._repo = yum.yumRepo.YumRepository('garbageid')
+ self._repo.baseurl = 'file://' + realpath
+ self._repo.basecachedir = misc.getCacheDir()
+ self._repo.metadata_expire = 1
+ self._repo.gpgcheck = 0
+ self._repo.repo_gpgcheck = 0
+ self._repo._sack = yum.sqlitesack.YumSqlitePackageSack(CreaterepoPkgOld)
+ self.pkg_tups_by_path = {}
self.scan()
+
def scan(self):
- """Read in and index old repo data"""
- self.basenodes = {}
- self.filesnodes = {}
- self.othernodes = {}
- self.pkg_ids = {}
+ """Read in old repodata"""
if self.opts.get('verbose'):
print _("Scanning old repo data")
- for fn in self.files.values():
- if not os.path.exists(fn):
- #cannot scan
- errorprint(_("Warning: Old repodata file missing: %s") % fn)
- return
- root = libxml2.parseFile(self.files['base']).getRootElement()
- self._scanPackageNodes(root, self._handleBase)
- if self.opts.get('verbose'):
- print _("Indexed %i base nodes" % len(self.basenodes))
- root = libxml2.parseFile(self.files['filelist']).getRootElement()
- self._scanPackageNodes(root, self._handleFiles)
- if self.opts.get('verbose'):
- print _("Indexed %i filelist nodes" % len(self.filesnodes))
- root = libxml2.parseFile(self.files['other']).getRootElement()
- self._scanPackageNodes(root, self._handleOther)
- if self.opts.get('verbose'):
- print _("Indexed %i other nodes" % len(self.othernodes))
- #reverse index pkg ids to track references
- self.pkgrefs = {}
- for relpath, pkgid in self.pkg_ids.iteritems():
- self.pkgrefs.setdefault(pkgid,[]).append(relpath)
-
- def _scanPackageNodes(self, root, handler):
- node = root.children
- while node is not None:
- if node.type != "element":
- node = node.next
+ self._repo.sack.populate(self._repo, 'all', None, False)
+ for thispo in self._repo.sack:
+ mtime = thispo.filetime
+ size = thispo.size
+ relpath = thispo.relativepath
+ do_stat = self.opts.get('do_stat', True)
+ if mtime is None:
+ print _("mtime missing for %s") % relpath
continue
- if node.name == "package":
- handler(node)
- node = node.next
-
- def _handleBase(self, node):
- top = node
- node = node.children
- pkgid = None
- mtime = None
- size = None
- relpath = None
- do_stat = self.opts.get('do_stat', True)
- while node is not None:
- if node.type != "element":
- node = node.next
+ if size is None:
+ print _("size missing for %s") % relpath
continue
- if node.name == "checksum":
- pkgid = node.content
- elif node.name == "time":
- mtime = int(node.prop('file'))
- elif node.name == "size":
- size = int(node.prop('package'))
- elif node.name == "location":
- relpath = node.prop('href')
- node = node.next
- if relpath is None:
- print _("Incomplete data for node")
- return
- if pkgid is None:
- print _("pkgid missing for %s") % relpath
- return
- if mtime is None:
- print _("mtime missing for %s") % relpath
- return
- if size is None:
- print _("size missing for %s") % relpath
- return
- if do_stat:
- filepath = os.path.join(self.opts['pkgdir'], relpath)
- try:
- st = os.stat(filepath)
- except OSError:
- #file missing -- ignore
- return
- if not stat.S_ISREG(st.st_mode):
- #ignore non files
- return
- #check size and mtime
- if st.st_size != size:
- if self.opts.get('verbose'):
- print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
- return
- if int(st.st_mtime) != mtime:
- if self.opts.get('verbose'):
- print _("Modification time changed for %s") % filepath
- return
- #otherwise we index
- self.basenodes[relpath] = top
- self.pkg_ids[relpath] = pkgid
-
- def _handleFiles(self, node):
- pkgid = node.prop('pkgid')
- if pkgid:
- self.filesnodes[pkgid] = node
-
- def _handleOther(self, node):
- pkgid = node.prop('pkgid')
- if pkgid:
- self.othernodes[pkgid] = node
+ if do_stat:
+ filepath = os.path.join(self.opts['pkgdir'], relpath)
+ try:
+ st = os.stat(filepath)
+ except OSError:
+ #file missing -- ignore
+ continue
+ if not stat.S_ISREG(st.st_mode):
+ #ignore non files
+ continue
+ #check size and mtime
+ if st.st_size != size:
+ if self.opts.get('verbose'):
+ print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
+ continue
+ if int(st.st_mtime) != mtime:
+ if self.opts.get('verbose'):
+ print _("Modification time changed for %s") % filepath
+ continue
+
+ self.pkg_tups_by_path[relpath] = thispo.pkgtup
+
- def getNodes(self, relpath):
- """Return base, filelist, and other nodes for file, if they exist
- Returns a tuple of nodes, or None if not found
+ def getNodes(self, relpath):
+ """return a package object based on relative path of pkg
"""
- bnode = self.basenodes.get(relpath,None)
- if bnode is None:
- return None
- pkgid = self.pkg_ids.get(relpath,None)
- if pkgid is None:
- print _("No pkgid found for: %s") % relpath
- return None
- fnode = self.filesnodes.get(pkgid,None)
- if fnode is None:
- return None
- onode = self.othernodes.get(pkgid,None)
- if onode is None:
- return None
- return bnode, fnode, onode
-
- def freeNodes(self,relpath):
- #causing problems
- """Free up nodes corresponding to file, if possible"""
- bnode = self.basenodes.get(relpath,None)
- if bnode is None:
- print "Missing node for %s" % relpath
- return
- bnode.unlinkNode()
- bnode.freeNode()
- del self.basenodes[relpath]
- pkgid = self.pkg_ids.get(relpath,None)
- if pkgid is None:
- print _("No pkgid found for: %s") % relpath
+ if relpath in self.pkg_tups_by_path:
+ pkgtup = self.pkg_tups_by_path[relpath]
+ return self._repo.sack.searchPkgTuple(pkgtup)[0]
+ else:
+ print _("No pkg found for: %s") % relpath
return None
- del self.pkg_ids[relpath]
- dups = self.pkgrefs.get(pkgid)
- dups.remove(relpath)
- if len(dups):
- #still referenced
- return
- del self.pkgrefs[pkgid]
- for nodes in self.filesnodes, self.othernodes:
- node = nodes.get(pkgid)
- if node is not None:
- node.unlinkNode()
- node.freeNode()
- del nodes[pkgid]
+
if __name__ == "__main__":
cwd = os.getcwd()
@@ -209,9 +115,9 @@ if __name__ == "__main__":
'pkgdir': cwd}
idx = MetadataIndex(cwd, opts)
- for fn in idx.basenodes.keys():
- a,b,c, = idx.getNodes(fn)
- a.serialize()
- b.serialize()
- c.serialize()
- idx.freeNodes(fn)
+ for fn in idx.pkg_tups_by_path:
+ po = idx.getNodes(fn)
+ print po.xml_dump_primary_metadata()
+ print po.xml_dump_filelists_metadata()
+ print po.xml_dump_other_metadata()
+
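As a footnote on the baseurl handling: __init__.py now pushes conf.baseurl into old_po.basepath, and CreaterepoPkgOld._return_remote_location() turns that into an xml:base attribute on the <location> element. The helper below is a hypothetical standalone re-implementation of that method, purely to show the resulting output; only yum.misc.to_xml is existing API.

from yum import misc

def render_location(relativepath, basepath=None):
    # mirrors CreaterepoPkgOld._return_remote_location() in the diff above
    if basepath:
        return """<location xml:base="%s" href="%s"/>\n""" % (
            misc.to_xml(basepath, attrib=True),
            misc.to_xml(relativepath, attrib=True))
    return """<location href="%s"/>\n""" % misc.to_xml(relativepath, attrib=True)

print render_location('Packages/foo-1.0-1.noarch.rpm')
print render_location('Packages/foo-1.0-1.noarch.rpm', 'http://example.com/repo/')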