[Yum-devel] quick & dirty pysqlite2 experiments

Pekka Pietikainen pp at ee.oulu.fi
Wed Feb 8 23:07:08 UTC 2006


After being dared to submit patches on #fedora-devel I decided to quickly
do just that. These patches will eat your dog etc. and need quite a bit of
thought, but just to get some ideas out...

Cache creation time went down from 20 to 15 secs with this, the SELECT stuff
did slow down a bit, but my benchmark was probably pretty bad (just a time
yum list), went 6 to 9 secs for whatever reason.

TODO: (at least)

- Check all the select statements. In general no sql escaping needs to be done
  manually, just use ? in the SQL and give a tuple of parameters to the
  execute.

- For mass inserts, executemany() could be worth a try. Something like:

    def insertHashMany(self,table,keys, values_iterator, cursor):
        """Insert several hashes into a database table"""
        query = "INSERT INTO %s (%s) VALUES (%s)" % (table,','.join(keys),",".join(len(keys)*'?'))
        cursor.executemany(query, values_iterator())
        return cursor.lastrowid

        keys=('pkgKey','name','flags','epoch','version','release')
        for ptype in package.prco:
            def prco_iterator():
                for entry in package.prco[ptype]:
                    yield((pkgKey, entry.get('name'), 
                           entry.get('flags'), entry.get('epoch'),
                           entry.get('ver'), entry.get('rel')))
            self.insertHashMany(ptype,keys, prco_iterator, cur)
        
(didn't seem to make things much faster, maybe 14.5 vs. 15 secs for a 4000
package insert, I didn't convert all of the code though.

- Just make the db2class stuff a row factory, and drop that hacky foo-class :D

- I just noticed the LIKE '%%?%%' changes I did are totally fubar :D
Not sure if you can do 

execute('SELECT ... WHERE x LIKE ?', ('%FOO%'))

if that works it's the most robust way, definately.

All in all, an upgrade should allow for cleaner and faster code, but it
will require some work and probably refactoring too.

Please Cc: on replies, I'm not on the list

diff -urN orig/sqlitecache.py pysqlite2/sqlitecache.py
--- orig/sqlitecache.py	2006-02-09 00:03:00.000000000 +0200
+++ pysqlite2/sqlitecache.py	2006-02-09 00:15:22.000000000 +0200
@@ -19,7 +19,8 @@
 # - Add support for multiple checksums per rpm (not required)
 
 import os
-import sqlite
+#import sqlite
+from pysqlite2 import dbapi2 as sqlite
 import time
 import mdparser
 from sqlitesack import encodefiletypelist,encodefilenamelist
@@ -46,6 +47,7 @@
         matches the required dbversion"""
         db = sqlite.connect(filename)
         cur = db.cursor()
+        cur.row_factory = sqlite.Row
         cur.execute("select * from db_info")
         info = cur.fetchone()
         # If info is not in there this is an incompelete cache file
@@ -195,23 +197,8 @@
 
         keys = hash.keys()
         values = hash.values()
-        query = "INSERT INTO %s (" % (table)
-        query += ",".join(keys)
-        query += ") VALUES ("
-        # Quote all values by replacing None with NULL and ' by ''
-        for x in values:
-            if (x == None):
-              query += "NULL,"
-            else:
-              try:
-                query += "'%s'," % (x.replace("'","''"))
-              except AttributeError:
-                query += "'%s'," % x
-        # Remove the last , from query
-        query = query[:-1]
-        # And replace it with )
-        query += ")"
-        cursor.execute(query.encode('utf8'))
+        query = "INSERT INTO %s (%s) VALUES (%s)" % (table,','.join(keys),",".join(len(keys)*'?'))
+        cursor.execute(query, tuple(values))
         return cursor.lastrowid
              
     def makeSqliteCacheFile(self, filename, cachetype):
@@ -319,6 +306,7 @@
 
         # We start be removing the old db_info, as it is no longer valid
         cur = db.cursor()
+        cur.row_factory = sqlite.Row
         cur.execute("DELETE FROM db_info") 
 
         # First create a list of all pkgIds that are in the database now
@@ -372,7 +360,7 @@
         for table in deltables:
             cur.execute("DELETE FROM "+table+ " where pkgKey in %s" % delpkgs)
 
-        cur.execute("INSERT into db_info (dbversion,checksum) VALUES (%s,%s)",
+        cur.execute("INSERT into db_info (dbversion,checksum) VALUES (?,?)",
                 (dbversion,checksum))
         db.commit()
         self.log(2, "Added %s new packages, deleted %s old in %.2f seconds" % (
diff -urN orig/sqlitesack.py pysqlite2/sqlitesack.py
--- orig/sqlitesack.py	2006-02-09 00:02:53.000000000 +0200
+++ pysqlite2/sqlitesack.py	2006-02-09 00:15:22.000000000 +0200
@@ -25,7 +25,20 @@
 import repos
 from packages import YumAvailablePackage
 from repomd import mdUtils, mdErrors
+from pysqlite2 import dbapi2 as sqlite
 
+class foo:
+    def __init__(self):
+      pass
+    def __getitem__(self, item):
+      return getattr(self, item)    
+      
+def class_factory(cursor, row):
+    d = foo()
+    for idx, col in enumerate(cursor.description):
+       setattr(d,col[0],row[idx])
+    return d
+                    
 # Simple subclass of YumAvailablePackage that can load 'simple headers' from
 # the database when they are requested
 class YumAvailablePackageSqlite(YumAvailablePackage):
@@ -128,7 +141,7 @@
             cur = cache.cursor()
             cur.execute("select changelog.date as date,\
                 changelog.author as author,\
-                changelog.changelog as changelog from packages,changelog where packages.pkgId = %s and packages.pkgKey = changelog.pkgKey",pkgId)
+                changelog.changelog as changelog from packages,changelog where packages.pkgId = ? and packages.pkgKey = changelog.pkgKey",pkgId)
             for ob in cur.fetchall():
                 result.append(( ob['date'],
                                 ob['author'],
@@ -144,10 +157,10 @@
         for (rep, cache) in self.primarydb.items():
             cur = cache.cursor()
             for prco in result.keys():
-                cur.execute("select %s.name as name, %s.version as version,\
-                    %s.release as release, %s.epoch as epoch, %s.flags as flags\
-                    from packages,%s\
-                    where packages.pkgId = %s and packages.pkgKey = %s.pkgKey", prco, prco, prco, prco, prco, prco, pkgId, prco)
+                cur.execute("select ?.name as name, ?.version as version,\
+                    ?.release as release, ?.epoch as epoch, ?.flags as flags\
+                    from packages,?\
+                    where packages.pkgId = ? and packages.pkgKey = ?.pkgKey", prco, prco, prco, prco, prco, prco, pkgId, prco)
                 for ob in cur.fetchall():
                     name = ob['name']
                     version = ob['version']
@@ -166,7 +179,7 @@
             cur.execute("select filelist.dirname as dirname,\
                 filelist.filetypes as filetypes,\
                 filelist.filenames as filenames from packages,filelist\
-                where packages.pkgId = %s and packages.pkgKey = filelist.pkgKey", pkgId)
+                where packages.pkgId = ? and packages.pkgKey = filelist.pkgKey", pkgId)
             for ob in cur.fetchall():
                 found = True
                 dirname = ob['dirname']
@@ -192,7 +205,7 @@
         quotename = name.replace("'","''")
         for (rep,cache) in self.primarydb.items():
             cur = cache.cursor()
-            cur.execute("select DISTINCT packages.pkgId as pkgId from provides,packages where provides.name LIKE '%%%s%%' AND provides.pkgKey = packages.pkgKey" % quotename)
+            cur.execute("select DISTINCT packages.pkgId as pkgId from provides,packages where provides.name LIKE '%%%s%%' AND provides.pkgKey = packages.pkgKey" % quotename)
             for ob in cur.fetchall():
                 if (self.excludes[rep].has_key(ob['pkgId'])):
                     continue
@@ -211,9 +224,9 @@
                 filelist.filetypes as filetypes,\
                 filelist.filenames as filenames \
                 from packages,filelist where \
-                (filelist.dirname LIKE '%%%s%%' \
-                OR (filelist.dirname LIKE '%%%s%%' AND\
-                filelist.filenames LIKE '%%%s%%'))\
+                (filelist.dirname LIKE '%%?%%' \
+                OR (filelist.dirname LIKE '%%?%%' AND\
+                filelist.filenames LIKE '%%?%%'))\
                 AND (filelist.pkgKey = packages.pkgKey)" % (quotename,dir,filename))
             for ob in cur.fetchall():
                 # Check if it is an actual match
@@ -235,6 +248,7 @@
         obsoletes = {}
         for (rep,cache) in self.primarydb.items():
             cur = cache.cursor()
+            cur.row_factory = sqlite.Row
             cur.execute("select packages.name as name,\
                 packages.pkgId as pkgId,\
                 packages.arch as arch, packages.epoch as epoch,\
@@ -262,7 +276,7 @@
     def getPackageDetails(self,pkgId):
         for (rep,cache) in self.primarydb.items():
             cur = cache.cursor()
-            cur.execute("select * from packages where pkgId = %s",pkgId)
+            cur.execute("select * from packages where pkgId = ?",pkgId)
             for ob in cur.fetchall():
                 pkg = self.db2class(ob)
                 return pkg
@@ -272,7 +286,7 @@
         results = []
         for (rep,cache) in self.primarydb.items():
             cur = cache.cursor()
-            cur.execute("select * from %s where name = %s" , (prcotype, name))
+            cur.execute("select * from ? where name = ?" , (prcotype, name))
             prcos = cur.fetchall()
             for res in prcos:
                 cur.execute("select * from packages where pkgKey = %s" , (res['pkgKey']))
@@ -371,7 +385,7 @@
                     'value': db.checksum_value }
       y.time = {'build': db.time_build, 'file': db.time_file }
       y.size = {'package': db.size_package, 'archive': db.size_archive, 'installed': db.size_installed }
-      y.info = {'summary': db.summary, 'description': db['description'],
+      y.info = {'summary': db.summary, 'description': db.description,
                 'packager': db.rpm_packager, 'group': db.rpm_group,
                 'buildhost': db.rpm_buildhost, 'sourcerpm': db.rpm_sourcerpm,
                 'url': db.url, 'vendor': db.rpm_vendor, 'license': db.rpm_license }
@@ -383,11 +397,12 @@
         for (rep,cache) in self.primarydb.items():
             if (repoid == None or repoid == rep):
                 cur = cache.cursor()
+                cur.row_factory=sqlite.Row
                 cur.execute("select pkgId,name,epoch,version,release,arch from packages")
                 for pkg in cur.fetchall():
-                    if (self.excludes[rep].has_key(pkg.pkgId)):
+                    if (self.excludes[rep].has_key(pkg['pkgId'])):
                         continue                        
-                    simplelist.append((pkg.name, pkg.arch, pkg.epoch, pkg.version, pkg.release)) 
+                    simplelist.append((pkg['name'], pkg['arch'], pkg['epoch'], pkg['version'], pkg['release'])) 
                     
         return simplelist
 
@@ -402,6 +417,7 @@
         allpkg = []
         for (rep,cache) in self.primarydb.items():
             cur = cache.cursor()
+            cur.row_factory=class_factory
             cur.execute("select pkgId,name,epoch,version,release,arch from packages where name=%s and arch=%s",naTup)
             for x in cur.fetchall():
                 if (self.excludes[rep].has_key(x.pkgId)):
@@ -423,6 +439,7 @@
         allpkg = []
         for (rep,cache) in self.primarydb.items():
             cur = cache.cursor()
+            cur.row_factory = class_factory
             cur.execute("select pkgId,name,epoch,version,release,arch from packages where name=%s", name)
             for x in cur.fetchall():
                 if (self.excludes[rep].has_key(x.pkgId)):
@@ -440,6 +457,7 @@
         for (rep,cache) in self.primarydb.items():
             if (repoid == None or repoid == rep):
                 cur = cache.cursor()
+                cur.row_factory = class_factory
                 cur.execute("select pkgId,name,epoch,version,release,arch from packages")
                 for x in cur.fetchall():
                     if (self.excludes[rep].has_key(x.pkgId)):
@@ -472,6 +490,7 @@
         # Search all repositories            
         for (rep,cache) in self.primarydb.items():
             cur = cache.cursor()
+            cur.row_factory = class_factory
             #cur.execute("select * from packages WHERE name = %s AND epoch = %s AND version = %s AND release = %s AND arch = %s" , (name,epoch,ver,rel,arch))
             cur.execute(q)
             for x in cur.fetchall():
@@ -490,6 +509,7 @@
         querystring = tmpstring[:last]
         for (rep, cache) in self.primarydb.items():
             cur = cache.cursor()
+            cur.row_factory = class_factory
             cur.execute(querystring)
             for x in cur.fetchall():
                 obj = self.pc(self.db2class(x), rep)




More information about the Yum-devel mailing list