From f7ff2b07e8f74c420fd4b0c658d9a38274e75303 Mon Sep 17 00:00:00 2001
From: Per Cederqvist <ceder@lysator.liu.se>
Date: Sun, 20 Apr 2003 21:26:44 +0000
Subject: [PATCH] (scan): Performance fix: don't do a select per file we
 examine. 	Insert all of the files into a temporary table instead, and do
 	a left join to find out which of them are new.
 (insert_tmp_files): New function.

---
 scanner.py | 70 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 22 deletions(-)

diff --git a/scanner.py b/scanner.py
index 9d0904b..1e7c8ea 100644
--- a/scanner.py
+++ b/scanner.py
@@ -52,6 +52,7 @@ class xreadchunks:
 
 def scan(DBH, dir_id):
     cursor = DBH.cursor()
+    inserter = DBH.cursor()
 
     cursor.execute("SELECT dir_name FROM base"
                    " WHERE dir_id = %d" % (dir_id, ))
@@ -61,34 +62,59 @@ def scan(DBH, dir_id):
                     "-printf '%TY-%Tm-%Td %TT %s %P\\0'",
                     "r")
 
+    cursor.execute("CREATE TEMPORARY TABLE cur_files ("
+                   " filename varchar(255) not null,"
+                   " mtime datetime not null,"
+                   " size bigint not null,"
+                   " dir_id bigint not null);")
+
+    vals = []
     for line in xreadchunks(find, "\0"):
 	mtime = line[0:19]
 	[filesize, filename] = line[20:].split(" ", 2)
 
-        cursor.execute("LOCK TABLES file WRITE")
-
-        cursor.execute("SELECT count(*) FROM file"
-                       " WHERE filename = %s"
-                       " AND dir_id = %s"
-                       " AND mtime = %s"
-                       " AND size = %s",
-                       (filename, dir_id, mtime, filesize))
-        count = cursor.fetchone()[0]
-        if count == 0:
-            h = file_hash(os.path.join(dir_name, filename))
-            cursor.execute("INSERT INTO file"
-                           " (filename, dir_id, mtime, size,"
-                           "  md5sum, sha1sum, verified, broken)"
-                           " VALUES"
-                           " (%s, %s, %s, %s,"
-                           "  %s, %s, NOW(), %s)",
-                           (filename, dir_id, mtime, filesize,
-                            h.md5, h.sha1, 0))
-
-        cursor.execute("UNLOCK TABLES")
-
+        vals.append((filename, mtime, filesize, dir_id))
+
+        if len(vals) > 100:
+            insert_tmp_files(cursor, vals)
+            vals = []
+    insert_tmp_files(cursor, vals)
+
+    cursor.execute("LOCK TABLES file WRITE, base READ")
+
+    cursor.execute("SELECT cur_files.filename, cur_files.size,"
+                   "    cur_files.mtime"
+                   " FROM cur_files"
+                   " LEFT JOIN file"
+                   " USING (filename, mtime, size, dir_id)"
+                   " WHERE file.filename IS NULL")
+    while 1:
+        res = cursor.fetchmany()
+        if len(res) == 0:
+            break
+        for [filename, filesize, mtime] in res:
+            fn = os.path.join(dir_name, filename)
+            h = file_hash(fn)
+            inserter.execute("INSERT INTO file"
+                             " (filename, dir_id, mtime, size,"
+                             "  md5sum, sha1sum, verified, broken)"
+                             " VALUES"
+                             " (%s, %s, %s, %s,"
+                             "  %s, %s, NOW(), %s)",
+                             (filename, dir_id, mtime, filesize,
+                              h.md5, h.sha1, 0))
+
+    cursor.execute("UNLOCK TABLES")
+    cursor.execute("DROP TABLE cur_files")
+
+    inserter.close()
     cursor.close()
 
+def insert_tmp_files(cursor, vals):
+    cursor.executemany("INSERT INTO cur_files (filename, mtime, size, dir_id)"
+                       " VALUES (%s, %s, %s, %s)",
+                       vals)
+
 def scan_all(DBH):
     outer = DBH.cursor()
     inner = DBH.cursor()
-- 
GitLab