CVS User Account cvsuser
Mon Mar 14 23:25:53 PST 2005
Log Message:
-----------
Changes to address problems found when running multiple Slony-I
clusters against a single backend...

The problem is that if two slons are invoked at almost exactly the same
time, then the cleanup threads will be very nearly in phase with one
another, which gives a (regrettably) excellent chance that both will try
doing an ANALYZE of pg_catalog.pg_listener concurrently. They'll both
try to commit statistics for it, and one slon will therefore fail.

1.  At startup time, a random "bias" is calculated, and the slon
    sleeps up to an extra 100000ms (i.e., 100s) between iterations

2.  A further random "fuzz" of up to 100s is added on each iteration

3.  The 7 tables are vacuumed in 7 separate queries

4.  If any of the vacuums fails, this is treated as a SLON_ERROR, not
    as a SLON_FATAL problem requiring that the slon fall over.

Modified Files:
--------------
    slony1-engine/src/slon:
        cleanup_thread.c (r1.21 -> r1.22)

-------------- next part --------------
Index: cleanup_thread.c
===================================================================
RCS file: /usr/local/cvsroot/slony1/slony1-engine/src/slon/cleanup_thread.c,v
retrieving revision 1.21
retrieving revision 1.22
diff -Lsrc/slon/cleanup_thread.c -Lsrc/slon/cleanup_thread.c -u -w -r1.21 -r1.22
--- src/slon/cleanup_thread.c
+++ src/slon/cleanup_thread.c
@@ -32,8 +32,20 @@
  * ---------- Global data ----------
  */
 int			vac_frequency = SLON_VACUUM_FREQUENCY;
+static int vac_bias = 0;
 static unsigned long earliest_xid = 0;
 static unsigned long get_earliest_xid (PGconn *dbconn);
+
+/* The list of tables that need to be vacuumed by Slony-I */
+static char *table_list [] = {"%s.sl_event",
+			      "%s.sl_confirm",
+			      "%s.sl_setsync",
+			      "%s.sl_log_1",
+			      "%s.sl_log_2",
+			      "%s.sl_seqlog",
+			      "pg_catalog.pg_listener"};
+static char tstring[255];
+
 /*
  * ---------- cleanupThread_main
  *
@@ -60,6 +72,13 @@
 
 	slon_log(SLON_DEBUG1, "cleanupThread: thread starts\n");
 
+	/* Want the vacuum time bias to be between 0 and 100 seconds,
+	 * hence between 0 and 100000 */
+	if (vac_bias == 0) {
+		vac_bias = rand() % 100000;
+	}
+	slon_log(SLON_DEBUG4, "cleanupThread: bias = %d\n", vac_bias);
+
 	/*
 	 * Connect to the local database
 	 */
@@ -80,8 +99,13 @@
 
 	/*
 	 * Loop until shutdown time arrived
+	 *
+	 * Note the introduction of vac_bias and an up-to-100s random
+	 * "fuzz"; this reduces the likelihood that having multiple
+	 * slons hitting the same cluster will run into conflicts due
+	 * to trying to vacuum pg_listener concurrently
 	 */
-	while (sched_wait_time(conn, SCHED_WAIT_SOCK_READ, SLON_CLEANUP_SLEEP * 1000) == SCHED_STATUS_OK)
+	while (sched_wait_time(conn, SCHED_WAIT_SOCK_READ, SLON_CLEANUP_SLEEP * 1000 + vac_bias + (rand() % 100000)) == SCHED_STATUS_OK)
 	{
 		/*
 		 * Call the stored procedure cleanupEvent()
@@ -185,39 +209,24 @@
 			 * and event tables
 			 */
 			dstring_init(&query3);
+			gettimeofday(&tv_start, NULL);
+			for (t=0; t < 8; t++) {
+				sprintf(tstring, table_list[t], rtcfg_namespace);
 			slon_mkquery(&query3,
-				     "%s %s.sl_event; "
-				     "%s %s.sl_confirm; "
-				     "%s %s.sl_setsync; "
-				     "%s %s.sl_log_1; "
-				     "%s %s.sl_log_2;"
-				     "%s %s.sl_seqlog;"
-				     "%s pg_catalog.pg_listener;",
-				     vacuum_action,
-				     rtcfg_namespace,
-				     vacuum_action,
-				     rtcfg_namespace,
+					     "%s %s;",
 				     vacuum_action,
-				     rtcfg_namespace,
-				     vacuum_action,
-				     rtcfg_namespace,
-				     vacuum_action,
-				     rtcfg_namespace,
-				     vacuum_action,
-				     rtcfg_namespace,
-				     vacuum_action
-				     );
+					     tstring);
 
-			gettimeofday(&tv_start, NULL);
 			res = PQexec(dbconn, dstring_data(&query3));
 			if (PQresultStatus(res) != PGRES_COMMAND_OK)
 			{
-				slon_log(SLON_FATAL,
+				slon_log(SLON_ERROR,
 						 "cleanupThread: \"%s\" - %s",
 						 dstring_data(&query3), PQresultErrorMessage(res));
 				PQclear(res);
-				slon_abort();
-				break;
+				/* slon_abort();
+				   break; */
+			}
 			}
 			PQclear(res);
 			gettimeofday(&tv_end, NULL);


More information about the Slony1-commit mailing list