From f7ff2b07e8f74c420fd4b0c658d9a38274e75303 Mon Sep 17 00:00:00 2001 From: Per Cederqvist <ceder@lysator.liu.se> Date: Sun, 20 Apr 2003 21:26:44 +0000 Subject: [PATCH] (scan): Performance fix: don't do a select per file we examine. Insert all of the files into a temporary table instead, and do a left join to find out which of them are new. (insert_tmp_files): New function. --- scanner.py | 70 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/scanner.py b/scanner.py index 9d0904b..1e7c8ea 100644 --- a/scanner.py +++ b/scanner.py @@ -52,6 +52,7 @@ class xreadchunks: def scan(DBH, dir_id): cursor = DBH.cursor() + inserter = DBH.cursor() cursor.execute("SELECT dir_name FROM base" " WHERE dir_id = %d" % (dir_id, )) @@ -61,34 +62,59 @@ def scan(DBH, dir_id): "-printf '%TY-%Tm-%Td %TT %s %P\\0'", "r") + cursor.execute("CREATE TEMPORARY TABLE cur_files (" + " filename varchar(255) not null," + " mtime datetime not null," + " size bigint not null," + " dir_id bigint not null);") + + vals = [] for line in xreadchunks(find, "\0"): mtime = line[0:19] [filesize, filename] = line[20:].split(" ", 2) - cursor.execute("LOCK TABLES file WRITE") - - cursor.execute("SELECT count(*) FROM file" - " WHERE filename = %s" - " AND dir_id = %s" - " AND mtime = %s" - " AND size = %s", - (filename, dir_id, mtime, filesize)) - count = cursor.fetchone()[0] - if count == 0: - h = file_hash(os.path.join(dir_name, filename)) - cursor.execute("INSERT INTO file" - " (filename, dir_id, mtime, size," - " md5sum, sha1sum, verified, broken)" - " VALUES" - " (%s, %s, %s, %s," - " %s, %s, NOW(), %s)", - (filename, dir_id, mtime, filesize, - h.md5, h.sha1, 0)) - - cursor.execute("UNLOCK TABLES") - + vals.append((filename, mtime, filesize, dir_id)) + + if len(vals) > 100: + insert_tmp_files(cursor, vals) + vals = [] + insert_tmp_files(cursor, vals) + + cursor.execute("LOCK TABLES file WRITE, base READ") + + cursor.execute("SELECT 
cur_files.filename, cur_files.size," + " cur_files.mtime" + " FROM cur_files" + " LEFT JOIN file" + " USING (filename, mtime, size, dir_id)" + " WHERE file.filename IS NULL") + while 1: + res = cursor.fetchmany() + if len(res) == 0: + break + for [filename, filesize, mtime] in res: + fn = os.path.join(dir_name, filename) + h = file_hash(fn) + inserter.execute("INSERT INTO file" + " (filename, dir_id, mtime, size," + " md5sum, sha1sum, verified, broken)" + " VALUES" + " (%s, %s, %s, %s," + " %s, %s, NOW(), %s)", + (filename, dir_id, mtime, filesize, + h.md5, h.sha1, 0)) + + cursor.execute("UNLOCK TABLES") + cursor.execute("DROP TABLE cur_files") + + inserter.close() cursor.close() +def insert_tmp_files(cursor, vals): + cursor.executemany("INSERT INTO cur_files (filename, mtime, size, dir_id)" + " VALUES (%s, %s, %s, %s)", + vals) + def scan_all(DBH): outer = DBH.cursor() inner = DBH.cursor() -- GitLab