Glyn Astill glynastill at yahoo.co.uk
Mon Mar 1 08:40:00 PST 2010
--- On Mon, 1/3/10, Lawrence Giam <lawrenceg at globalitcreations.com> wrote:
> Hi 
> All,
>  
> I am trying to setup 
> a monitoring system using nagios to monitor the Slony
> replication. I did some 
> changes to the psql_replication_check.pl script but I
> am not getting the 
> correct response back from the script. The script is
> supposed to check the 
> threshold but after shutting down the slon daemon on the
> slave, the result 
> returned is still positive.
>  
> I hereby include the 
> part of the script that I have changed.
>  
> my $query = 'SELECT 
> * FROM _abc.sl_status' ;
>  
> # Get the 
> results
> ## Update to use sl_status
> ## tuple[0] : st_origin
> ## tuple[1] 
> : st_received
> ## tuple[2] : st_last_event
> ## tuple[3] : 
> st_last_event_ts
> ## tuple[4] : st_last_received
> ## tuple[5] : 
> st_last_received_ts
> ## tuple[6] : st_last_received_event_ts
> ## tuple[7] : 
> st_lag_num_events
> ## tuple[8] : st_lag_time
> @tuple = 
> $res->fetchrow;
>  
> # Debugging
> # 
> Uncomment the below to swap the minute for seconds. 
> This is to 
> simulate
> # crazy replication times for when replication is not
> falling 
> behind.
> #$rep_time[1] = $rep_time[2]
>  
> # Check for a 
> warning
> if ($tuple[8] >= $threshold_warning and $tuple[8] < 
> $threshold_critical)
> {
>        
> print("WARNING: ST_Origin $tuple[0], ST_Received
> $tuple[1], Behind $tuple[8] 
> minutes\n");
>        
> exit(1);
> }
> # Or for a 
> critical
> elsif ($tuple[8] >= 
> $threshold_critical)
> {
>        
> print("CRITICAL: ST_Origin $tuple[0], ST_Received
> $tuple[1], Behind $tuple[8] 
> minutes\n");
>        
> exit(2);
> }
> # Otherwise, 
> everything is ok
> else
> {
>         
> printf("OK: ST_Origin $tuple[0], ST_Received
> $tuple[1], Behind $tuple[8] 
> minute%s\n",$tuple[8] == 1 ? "" :
> "s" 
> );
>         
> exit(0);
> }
>  
> I am trying to use 
> the sl_status st_lag_time to check the lag difference but
> somehow the script is 
> not right. Can anyone help me change the
> script?
>

I have a similar script (see below). You might also be interested in checking out check_postgres.pl, as they've just implemented a Slony lag check too.

Glyn

--------------

#!/usr/bin/perl
# $Id: test.pl,v 1.0 2008-01-30 12:00:30 Glyn Astill Exp $#

use DBI;
use strict;

use Getopt::Long qw/GetOptions/;
Getopt::Long::Configure('no_ignore_case');

# Working state shared by the rest of the script.
my $dbh;              # DBI database handle
my $sth;              # DBI statement handle for the sl_status query
my @node;             # the sl_status row currently being examined
my $query;            # SQL text sent to the server
my $result = '';      # accumulated per-node report text
my $problems = 0;     # number of thresholds exceeded across all rows
my $USAGE = '-h <host> -p <port> -db <database> -u <username> (not recommended -P <password>) -c <cluster> -e <lag events> -t <lag seconds>';

##
## Command line options
##

##http://www.perl.com/doc/manual/html/lib/Getopt/Long.html
# Parsed options, keyed by the primary name of each option below.
our %opt;

my $options_ok = GetOptions(\%opt,
		   'host|H=s',
		   'port=s',
		   'dbname|db=s',
		   'dbuser|u=s',
		   'dbpass|P=s',
		   'cluster|c=s',
		   'events|e:i',
		   'lagtime|t:i',
		   );

# Reject the invocation when parsing failed, when no options were given at
# all, when stray arguments are left over, or when the cluster name (needed to
# build the schema name that is queried) is missing.  The error is reported
# the same way as every other outcome of this script: one status line on
# stdout plus an explicit exit code, rather than a die() message on stderr.
if (   !$options_ok
    or scalar(keys %opt) == 0
    or @ARGV
    or !defined $opt{cluster}
    or $opt{cluster} eq '')
{
   print "POSTGRES_REPLICATION_LAG UNKNOWN: Usage: $0 $USAGE\n";
   exit(3);
}

# Build the DSN from only those connection options that were actually given,
# so an omitted option is left out instead of being passed as an empty value.
my @dsn_parts;
foreach my $key (qw(dbname host port)) {
   next unless defined $opt{$key} && length $opt{$key};
   push @dsn_parts, "$key=$opt{$key}";
}
my $dsn = 'DBI:Pg:' . join(';', @dsn_parts);

#This should use a pgpass file automatically if password not specified
$dbh = DBI->connect($dsn, $opt{dbuser}, $opt{dbpass});

if (!$dbh) {
   print "POSTGRES_REPLICATION_LAG UNKNOWN: Cannot connect to Postgres server: $DBI::errstr\n";
   exit(3);
}

# Print the status text, release the statement and database handles, then
# exit with the given status code.  Every exit path after a successful
# connect goes through here so the handles are always cleaned up.
sub report_and_exit {
   my ($code, $message) = @_;
   print $message;
   $sth->finish     if $sth;
   $dbh->disconnect if $dbh;
   exit($code);
}

# Thresholds; an option that was not given (or given as 0) disables its check.
my $max_events  = $opt{events}  || 0;
my $max_lagtime = $opt{lagtime} || 0;

# The schema is "_<cluster>".  Quote it as an identifier through the driver
# instead of wrapping it in double quotes by hand, so unusual characters in
# the cluster name cannot break out of the identifier.
my $cluster = defined $opt{cluster} ? $opt{cluster} : '';
my $schema  = $dbh->quote_identifier('_' . $cluster);

# Columns selected: origin, receiver, event lag, time lag rounded to seconds.
$query = 'SELECT st_origin, st_received, st_lag_num_events, round(extract(epoch from st_lag_time)) from ' . $schema . '.sl_status';

$sth = $dbh->prepare($query);
if (!defined($sth)) {
   report_and_exit(2, "POSTGRES_REPLICATION_LAG CRITICAL: Cannot prepare $DBI::errstr\n");
}
if (!$sth->execute) {
   report_and_exit(2, "POSTGRES_REPLICATION_LAG CRITICAL: Cannot execute $DBI::errstr\n");
}

$result = '' unless defined $result;

# One report line per sl_status row; each exceeded threshold counts as a problem.
while (@node = $sth->fetchrow_array) {
   $result .= "Subscriber " . $node[1] . " on Origin " . $node[0] . " : Event lag=" . $node[2];
   if (($max_events > 0) && ($max_events < $node[2])) {
      $result .= " (behind " . ($node[2] - $max_events) . ") ";
      $problems++;
   }
   $result .= " Time lag=" . $node[3] . "s";
   if (($max_lagtime > 0) && ($max_lagtime < $node[3])) {
      $result .= " (behind " . ($node[3] - $max_lagtime) . "s) ";
      $problems++;
   }
   $result .= "\n";
}

# fetchrow_array returns an empty list both at end of data and on error, so
# the loop ending is not proof that every row was read.
if ($sth->err) {
   report_and_exit(2, "POSTGRES_REPLICATION_LAG CRITICAL: Cannot fetch $DBI::errstr\n");
}

# NOTE(review): when sl_status returns no rows at all, nothing was checked but
# the outcome below is still OK (unchanged from the original behaviour).
# Confirm whether that case should be reported as UNKNOWN instead.
if ($problems > 0) {
   report_and_exit(2, "POSTGRES_REPLICATION_LAG CRITICAL: " . $result . "\n");
}
report_and_exit(0, "POSTGRES_REPLICATION_LAG OK: " . $result . "\n");

__END__


      


More information about the Slony1-general mailing list