[Rpm-metadata] createrepo/__init__.py createrepo/readMetadata.py

skvidal at osuosl.org skvidal at osuosl.org
Mon Jul 18 20:30:26 UTC 2011


 createrepo/__init__.py     |   40 +------
 createrepo/readMetadata.py |  240 +++++++++++++--------------------------------
 2 files changed, 83 insertions(+), 197 deletions(-)

New commits:
commit 0a67bc57a9eda626735513a4015d8087f3f4bb29
Author: Seth Vidal <skvidal at fedoraproject.org>
Date:   Fri Jul 15 17:50:48 2011 -0400

    switch --update to use the sqlite dbs instead of the xml files. This should massively
    reduce the memory footprint and hopefully only marginally impact performance.

diff --git a/createrepo/__init__.py b/createrepo/__init__.py
index 44035cc..8549188 100644
--- a/createrepo/__init__.py
+++ b/createrepo/__init__.py
@@ -530,39 +530,19 @@ class MetaDataGenerator:
                 old_pkg = pkg
                 if pkg.find("://") != -1:
                     old_pkg = os.path.basename(pkg)
-                nodes = self.oldData.getNodes(old_pkg)
-                if nodes is not None: # we have a match in the old metadata
+                old_po = self.oldData.getNodes(old_pkg)
+                if old_po: # we have a match in the old metadata
                     if self.conf.verbose:
                         self.callback.log(_("Using data from old metadata for %s")
                                             % pkg)
-                    (primarynode, filenode, othernode) = nodes
-
-                    for node, outfile in ((primarynode, self.primaryfile),
-                                          (filenode, self.flfile),
-                                          (othernode, self.otherfile)):
-                        if node is None:
-                            break
-
-                        if self.conf.baseurl:
-                            anode = node.children
-                            while anode is not None:
-                                if anode.type != "element":
-                                    anode = anode.next
-                                    continue
-                                if anode.name == "location":
-                                    anode.setProp('xml:base', self.conf.baseurl)
-                                anode = anode.next
-
-                        output = node.serialize('UTF-8', self.conf.pretty)
-                        if output:
-                            outfile.write(output)
-                        else:
-                            if self.conf.verbose:
-                                self.callback.log(_("empty serialize on write to" \
-                                                    "%s in %s") % (outfile, pkg))
-                        outfile.write('\n')
-
-                    self.oldData.freeNodes(pkg)
+
+                    if self.conf.baseurl: # if we have a baseurl set, reset the one
+                                          # in the old pkg
+                        old_po.basepath = self.conf.baseurl
+                    self.primaryfile.write(old_po.xml_dump_primary_metadata())
+                    self.flfile.write(old_po.xml_dump_filelists_metadata())
+                    self.otherfile.write(old_po.xml_dump_other_metadata())
+
                     #FIXME - if we're in update and we have deltas enabled
                     # check the presto data for this pkg and write its info back out
                     # to our deltafile
diff --git a/createrepo/readMetadata.py b/createrepo/readMetadata.py
index 27d3690..a449e68 100644
--- a/createrepo/readMetadata.py
+++ b/createrepo/readMetadata.py
@@ -16,11 +16,25 @@
 # Copyright 2006 Red Hat
 
 import os
-import libxml2
 import stat
 from utils import errorprint, _
 
-from yum import repoMDObject
+import yum
+from yum import misc
+
+
+class CreaterepoPkgOld(yum.sqlitesack.YumAvailablePackageSqlite):
+    # special for special people like us.
+    def _return_remote_location(self):
+
+        if self.basepath:
+            msg = """<location xml:base="%s" href="%s"/>\n""" % (
+                                     misc.to_xml(self.basepath, attrib=True),
+                                     misc.to_xml(self.relativepath, attrib=True))
+        else:
+            msg = """<location href="%s"/>\n""" % misc.to_xml(self.relativepath, attrib=True)
+
+        return msg  
 
 
 class MetadataIndex(object):
@@ -30,178 +44,70 @@ class MetadataIndex(object):
             opts = {}
         self.opts = opts
         self.outputdir = outputdir
+        realpath = os.path.realpath(outputdir)
         repodatadir = self.outputdir + '/repodata'
-        myrepomdxml = repodatadir + '/repomd.xml'
-        if os.path.exists(myrepomdxml):
-            repomd = repoMDObject.RepoMD('garbageid', myrepomdxml)
-            b = repomd.getData('primary').location[1]
-            f = repomd.getData('filelists').location[1]
-            o = repomd.getData('other').location[1]
-            basefile = os.path.join(self.outputdir, b)
-            filelistfile = os.path.join(self.outputdir, f)
-            otherfile = os.path.join(self.outputdir, o)
-        else:
-            basefile = filelistfile = otherfile = ""
-
-        self.files = {'base' : basefile,
-                      'filelist' : filelistfile,
-                      'other' : otherfile}
+        self._repo = yum.yumRepo.YumRepository('garbageid')
+        self._repo.baseurl = 'file://' + realpath
+        self._repo.basecachedir = misc.getCacheDir()
+        self._repo.metadata_expire = 1
+        self._repo.gpgcheck = 0
+        self._repo.repo_gpgcheck = 0
+        self._repo._sack = yum.sqlitesack.YumSqlitePackageSack(CreaterepoPkgOld)
+        self.pkg_tups_by_path = {}
         self.scan()
+        
 
     def scan(self):
-        """Read in and index old repo data"""
-        self.basenodes = {}
-        self.filesnodes = {}
-        self.othernodes = {}
-        self.pkg_ids = {}
+        """Read in old repodata"""
         if self.opts.get('verbose'):
             print _("Scanning old repo data")
-        for fn in self.files.values():
-            if not os.path.exists(fn):
-                #cannot scan
-                errorprint(_("Warning: Old repodata file missing: %s") % fn)
-                return
-        root = libxml2.parseFile(self.files['base']).getRootElement()
-        self._scanPackageNodes(root, self._handleBase)
-        if self.opts.get('verbose'):
-            print _("Indexed %i base nodes" % len(self.basenodes))
-        root = libxml2.parseFile(self.files['filelist']).getRootElement()
-        self._scanPackageNodes(root, self._handleFiles)
-        if self.opts.get('verbose'):
-            print _("Indexed %i filelist nodes" % len(self.filesnodes))
-        root = libxml2.parseFile(self.files['other']).getRootElement()
-        self._scanPackageNodes(root, self._handleOther)
-        if self.opts.get('verbose'):
-            print _("Indexed %i other nodes" % len(self.othernodes))
-        #reverse index pkg ids to track references
-        self.pkgrefs = {}
-        for relpath, pkgid in self.pkg_ids.iteritems():
-            self.pkgrefs.setdefault(pkgid,[]).append(relpath)
-
-    def _scanPackageNodes(self, root, handler):
-        node = root.children
-        while node is not None:
-            if node.type != "element":
-                node = node.next
+        self._repo.sack.populate(self._repo, 'all', None, False)
+        for thispo in self._repo.sack:
+            mtime = thispo.filetime
+            size = thispo.size
+            relpath = thispo.relativepath
+            do_stat = self.opts.get('do_stat', True)
+            if mtime is None:
+                print _("mtime missing for %s") % relpath
                 continue
-            if node.name == "package":
-                handler(node)
-            node = node.next
-
-    def _handleBase(self, node):
-        top = node
-        node = node.children
-        pkgid = None
-        mtime = None
-        size = None
-        relpath = None
-        do_stat = self.opts.get('do_stat', True)
-        while node is not None:
-            if node.type != "element":
-                node = node.next
+            if size is None:
+                print _("size missing for %s") % relpath
                 continue
-            if node.name == "checksum":
-                pkgid = node.content
-            elif node.name == "time":
-                mtime = int(node.prop('file'))
-            elif node.name == "size":
-                size = int(node.prop('package'))
-            elif node.name == "location":
-                relpath = node.prop('href')
-            node = node.next
-        if relpath is None:
-            print _("Incomplete data for node")
-            return
-        if pkgid is None:
-            print _("pkgid missing for %s") % relpath
-            return
-        if mtime is None:
-            print _("mtime missing for %s") % relpath
-            return
-        if size is None:
-            print _("size missing for %s") % relpath
-            return
-        if do_stat:
-            filepath = os.path.join(self.opts['pkgdir'], relpath)
-            try:
-                st = os.stat(filepath)
-            except OSError:
-                #file missing -- ignore
-                return
-            if not stat.S_ISREG(st.st_mode):
-                #ignore non files
-                return
-            #check size and mtime
-            if st.st_size != size:
-                if self.opts.get('verbose'):
-                    print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
-                return
-            if int(st.st_mtime) != mtime:
-                if self.opts.get('verbose'):
-                    print _("Modification time changed for %s") % filepath
-                return
-        #otherwise we index
-        self.basenodes[relpath] = top
-        self.pkg_ids[relpath] = pkgid
-
-    def _handleFiles(self, node):
-        pkgid = node.prop('pkgid')
-        if pkgid:
-            self.filesnodes[pkgid] = node
-
-    def _handleOther(self, node):
-        pkgid = node.prop('pkgid')
-        if pkgid:
-            self.othernodes[pkgid] = node
+            if do_stat:
+                filepath = os.path.join(self.opts['pkgdir'], relpath)
+                try:
+                    st = os.stat(filepath)
+                except OSError:
+                    #file missing -- ignore
+                    continue
+                if not stat.S_ISREG(st.st_mode):
+                    #ignore non files
+                    continue
+                #check size and mtime
+                if st.st_size != size:
+                    if self.opts.get('verbose'):
+                        print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
+                    continue
+                if int(st.st_mtime) != mtime:
+                    if self.opts.get('verbose'):
+                        print _("Modification time changed for %s") % filepath
+                    continue
+
+            self.pkg_tups_by_path[relpath] = thispo.pkgtup
+
 
-    def getNodes(self, relpath):
-        """Return base, filelist, and other nodes for file, if they exist
 
-        Returns a tuple of nodes, or None if not found
+    def getNodes(self, relpath):
+        """return a package object based on relative path of pkg
         """
-        bnode = self.basenodes.get(relpath,None)
-        if bnode is None:
-            return None
-        pkgid = self.pkg_ids.get(relpath,None)
-        if pkgid is None:
-            print _("No pkgid found for: %s") % relpath
-            return None
-        fnode = self.filesnodes.get(pkgid,None)
-        if fnode is None:
-            return None
-        onode = self.othernodes.get(pkgid,None)
-        if onode is None:
-            return None
-        return bnode, fnode, onode
-
-    def freeNodes(self,relpath):
-        #causing problems
-        """Free up nodes corresponding to file, if possible"""
-        bnode = self.basenodes.get(relpath,None)
-        if bnode is None:
-            print "Missing node for %s" % relpath
-            return
-        bnode.unlinkNode()
-        bnode.freeNode()
-        del self.basenodes[relpath]
-        pkgid = self.pkg_ids.get(relpath,None)
-        if pkgid is None:
-            print _("No pkgid found for: %s") % relpath
+        if relpath in self.pkg_tups_by_path:
+            pkgtup = self.pkg_tups_by_path[relpath]
+            return self._repo.sack.searchPkgTuple(pkgtup)[0]
+        else:
+            print _("No pkg found for: %s") % relpath
             return None
-        del self.pkg_ids[relpath]
-        dups = self.pkgrefs.get(pkgid)
-        dups.remove(relpath)
-        if len(dups):
-            #still referenced
-            return
-        del self.pkgrefs[pkgid]
-        for nodes in self.filesnodes, self.othernodes:
-            node = nodes.get(pkgid)
-            if node is not None:
-                node.unlinkNode()
-                node.freeNode()
-                del nodes[pkgid]
 
+    
 
 if __name__ == "__main__":
     cwd = os.getcwd()
@@ -209,9 +115,9 @@ if __name__ == "__main__":
             'pkgdir': cwd}
 
     idx = MetadataIndex(cwd, opts)
-    for fn in idx.basenodes.keys():
-        a,b,c, = idx.getNodes(fn)
-        a.serialize()
-        b.serialize()
-        c.serialize()
-        idx.freeNodes(fn)
+    for fn in idx.pkg_tups_by_path:
+        po = idx.getNodes(fn)
+        print po.xml_dump_primary_metadata()
+        print po.xml_dump_filelists_metadata()
+        print po.xml_dump_other_metadata()
+


More information about the Rpm-metadata mailing list