CVS User Account cvsuser
Wed Apr 13 22:21:28 PDT 2005
Log Message:
-----------
Address a synchronization problem where a MOVE_SET event may
reach cascaded subscribers *AFTER* the new origin has started
generating SYNCs. The subscriber ignores those SYNCs since it doesn't
yet know the origin has moved; this can lead to data loss.

The ACCEPT_SET event is added as a remedy: when the MOVE_SET is
handled, an ACCEPT_SET event is also raised on the new origin.  That
event will necessarily precede any SYNCs coming from the new origin.

Nodes that receive the ACCEPT_SET event will wait until they have also
received the MOVE_SET event before proceeding, which protects against
data loss.

Tags:
----
REL_1_0_STABLE

Modified Files:
--------------
    slony1-engine/src/slon:
        remote_worker.c (r1.55.2.7 -> r1.55.2.8)
    slony1-engine/src/backend:
        slony1_funcs.sql (r1.15.2.14 -> r1.15.2.15)

-------------- next part --------------
Index: remote_worker.c
===================================================================
RCS file: /usr/local/cvsroot/slony1/slony1-engine/src/slon/remote_worker.c,v
retrieving revision 1.55.2.7
retrieving revision 1.55.2.8
diff -Lsrc/slon/remote_worker.c -Lsrc/slon/remote_worker.c -u -w -r1.55.2.7 -r1.55.2.8
--- src/slon/remote_worker.c
+++ src/slon/remote_worker.c
@@ -744,6 +744,66 @@
 						rtcfg_namespace,
 						trig_tabid, trig_tgname);
 			}
+			else if (strcmp(event->ev_type, "ACCEPT_SET") == 0)
+			{
+			    /* If we're a remote node, and haven't yet
+			     * received the MOVE_SET event from the
+			     * new origin, then we'll need to sleep a
+			     * bit...  This avoids a race condition
+			     * where new SYNCs take place on the new
+			     * origin, and are ignored on some
+			     * subscribers (and their children)
+			     * because the MOVE_SET wasn't yet
+			     * received and processed  */
+				
+				int		set_id = (int) strtol(event->ev_data1, NULL, 10);
+				int		old_origin = (int) strtol(event->ev_data2, NULL, 10);
+				int		new_origin = (int) strtol(event->ev_data3, NULL, 10);
+			    PGresult   *res;
+
+			    if ((rtcfg_nodeid != old_origin) && (rtcfg_nodeid != new_origin)) {
+					slon_mkquery(&query1, 
+								 "select 1 from %s.sl_event accept "
+								 "where "
+								 "   accept.ev_type = 'ACCEPT_SET' and "
+								 "   accept.ev_origin = %d and "
+								 "   accept.ev_data1 = %d and "
+								 "   accept.ev_data2 = %d and "
+								 "   accept.ev_data3 = %d and "
+								 "   not exists  "
+								 "   (select 1 from %s.sl_event move "
+								 "    where "
+								 "      accept.ev_origin = move.ev_data3 and "
+								 "      move.ev_type = 'MOVE_SET' and "
+								 "      move.ev_data1 = accept.ev_data1 and "
+								 "      move.ev_data2 = accept.ev_data2 and "
+								 "      move.ev_data3 = accept.ev_data3 and "
+								 "      move.ev_seqno = accept.ev_data4); ",
+								 
+								 rtcfg_namespace, 
+								 old_origin, set_id, old_origin, new_origin, 
+								 rtcfg_namespace);
+					res = PQexec(local_dbconn, dstring_data(&query1));
+					while (PQntuples(res) > 0) {
+						int sleeptime = 15;
+				    int sched_rc;
+				    slon_log(SLON_WARN, "remoteWorkerThread_%d: "
+							 "accept set: node has not yet received MOVE_SET event "
+							 "for set %d old origin %d new origin - sleep %d seconds\n",
+							 rtcfg_nodeid, set_id, old_origin, new_origin, sleeptime);
+				    sched_rc = sched_msleep(node, sleeptime * 1000);
+				    if (sched_rc != SCHED_STATUS_OK) {
+						event_ok = false;
+						break;
+				    } else {
+						if (sleeptime < 60)
+							sleeptime *= 2;
+				    }
+				    if (query_execute(node, local_dbconn, &query1) < 0)
+						slon_abort();
+					}
+			    }
+			}
 			else if (strcmp(event->ev_type, "MOVE_SET") == 0)
 			{
 				int		set_id = (int) strtol(event->ev_data1, NULL, 10);
@@ -758,6 +818,7 @@
 				 * chain. To catch up with that, we need to execute
 				 * it now and select the resulting provider for us.
 				 */
+
 				slon_appendquery(&query1,
 						"select %s.moveSet_int(%d, %d, %d); ",
 						rtcfg_namespace,
Index: slony1_funcs.sql
===================================================================
RCS file: /usr/local/cvsroot/slony1/slony1-engine/src/backend/slony1_funcs.sql,v
retrieving revision 1.15.2.14
retrieving revision 1.15.2.15
diff -Lsrc/backend/slony1_funcs.sql -Lsrc/backend/slony1_funcs.sql -u -w -r1.15.2.14 -r1.15.2.15
--- src/backend/slony1_funcs.sql
+++ src/backend/slony1_funcs.sql
@@ -1657,14 +1657,28 @@
 		end loop;
 	end if;
 
+	-- On the new origin, raise an event - ACCEPT_SET
+	if v_local_node_id = p_new_origin then
+		-- Find the event number from the origin
+		select max(ev_seqno) as seqno into v_sub_row 
+			from @NAMESPACE@.sl_event
+			where ev_type = ''MOVE_SET'' and
+			  ev_data1 = p_set_id and
+			  ev_data2 = p_old_origin and
+			  ev_data3 = p_new_origin and
+			  ev_origin = p_old_origin;
+		
+		perform @NAMESPACE@.createEvent(''_@CLUSTERNAME@'', ''ACCEPT_SET'', 
+			p_set_id, p_old_origin, p_new_origin, v_sub_row.seqno);
+	end if;
+
 	-- ----
 	-- Next we have to reverse the subscription path
 	-- ----
 	v_sub_last = p_new_origin;
 	select sub_provider into v_sub_node
 			from @NAMESPACE@.sl_subscribe
-			where sub_set = p_set_id
-			and sub_receiver = p_new_origin;
+			where sub_receiver = p_new_origin;
 	if not found then
 		raise exception ''Slony-I: subscription path broken in moveSet_int'';
 	end if;
@@ -3451,6 +3465,7 @@
 				''Slony-I: set provider and receiver cannot be identical'';
 	end if;
 
+
 	-- ---
 	-- Check to see if the set contains any tables - gripe if not - bug #1226
 	-- ---


More information about the Slony1-commit mailing list