snap_checkpoint #3

  • //
  • guest/
  • michael_mirman/
  • snap_checkpoint/
  • snap_checkpoint
  • View
  • Commits
  • Open Download .zip Download (36 KB)
#!/usr/local/bin/perl
#
# This script supports Veritas snapshotting.  It was derived from
# the Stephen Vance's version of Richard Geiger's script.
# References:
# //guest/stephen_vance/utils/snap_checkpoint/snap_checkpoint#5
# http://public.perforce.com/guest/richard_geiger/utils/snap_checkpoint/snap_checkpoint.html
# --Michael Mirman, The MathWorks, Inc., March 2008.

#  This script is intended both as an illustration, and, potentially,
#  as an actual tool.
#
#  NEITHER THE AUTHOR, THE MATHWORKS, INC., VERITAS, NOR PERFORCE SOFTWARE
#  MAKE ANY WARRANTY, EXPLICIT OR IMPLIED, AS TO THE CORRECTNESS,
#  FITNESS FOR ANY APPLICATION, NOR THE SAFETY OF THE snap_checkpoint SOFTWARE.
#
#  The directory structure that is assumed by this script is highly likely
#  NOT the structure you have.  It is your responsibility to verify what
#  works and what does not work in your environment.
#
#  First, you ought to run "snap_checkpoint -port YOURPORT -n" to see
#  "reasonable" output.
#  Then, you should verify that the p4d_lock() function works
#  correctly at your site. You can do this by executing
#  "snap_checkpoint -port YOURPORT -lockcheck".
#  If things are working right, the script will report that the database
#  has been locked.
#  You can then (from second login shell) attempt to execute a p4 command
#  ("p4 user -o' is fine).  This should block until the lock is released.
#  Here's what you should see:
#
# [batscm@bat240sol64 ws1]$ snap_checkpoint -lockcheck -port 1681
# 2008-03-17 14:30:30 -0400 snap_checkpoint>> requesting lock on all db.* tables...
# 2008-03-17 14:30:30 -0400 snap_checkpoint>> 38 files in /export/db/perforce/1681 were locked.
# 2008-03-17 14:30:30 -0400 snap_checkpoint>> press return to unlock
#
#      (At this point all commands to your Perforce server should block,
#       until you press return)
#
# 2008-03-17 14:30:38 -0400 snap_checkpoint>> /export/db/perforce/1681 were unlocked.
#

use strict;
use warnings;

use Carp;
use Cwd            qw(abs_path);
use Fcntl ':flock'; # import LOCK_* constants
use File::Basename qw(basename dirname);
use File::Copy     qw();
use FileHandle     qw();
use Getopt::Long   qw(GetOptions);
use IO::Handle;
use Mail::Sendmail qw(sendmail);
use Memoize        qw(memoize);
use Pod::Usage     qw(pod2usage);
use POSIX          qw(:sys_wait_h);
use Sys::Hostname  qw(hostname);
use Time::Local;

# no buffering
*STDOUT->autoflush();
*STDERR->autoflush();

my $Mydir  = abs_path(dirname($0));
my $Myname = basename($0);
my $user = $ENV{USER} || getpwuid($>) || getlogin || "uid$>";
my $HOST = hostname;

# Configuration Settings
#
# REVIEW THESE SETTINGS, AND ADJUST THEM AS NECESSARY FOR USE IN YOUR
# ENVIRONMENT:
#

# The presumed directory structure is as follows:
#    /export/checkpoint/perforce/PORT
#    /export/data/perforce/PORT
#    /export/db/perforce/PORT
#    /export/journal/perforce/PORT
#    /export/logs/perforce/PORT
# The p4 root is /export/db/perforce/PORT where we have symlinks for depots.
# For example:
#   /export/db/perforce/1680/depot -> /export/data/perforce/1680/depot/
#
# Checkpoint files are collected in /export/checkpoint/perforce/PORT.
# Volumes
#    /export/data
#    /export/db
#    /export/journal
# are snapshotted separately.
#
# Due to the assumptions above, when Veritas snapshots are taken, they
# contain all Perforce instances (i.e., for all ports).
# THOSE SNAPSHOTS GET REMOVED AT THE END OF THE SCRIPT.
# Therefore, there must be only one instance of this script running at any
# point in time.  However, it can handle creating a backup of more than
# one Perforce instance per run.
#

# Multiple servers.
#
# If you have multiple servers that share a single mounted file system,
# this script will allow you to create a single snapshot and use the
# same snapshot to create checkpoints for every server.
# The example of the directory structure is:
# /export/db/perforce/1680 -- root of the server #1
# /export/db/perforce/1681 -- root of the server #2
# /export/db/perforce/1682 -- root of the server #3
# /export/journal/perforce/1680 -- directory for the journal #1
# ...
#
# Single invocation of the script to create checkpoints of all three servers:
#   snap_checkpoint -port 1680 -port 1681 -port 1682
#

#
# Local path to rsync
my $RSYNC0 = $^O eq 'solaris' ? '/usr/local/bin/rsync' : '/usr/bin/rsync';
# Note. We'll use ssh with the assumption that it does not prompt for
# the password.  In order to achieve this, the public key must be
# generated on the local host (ssh-keygen -t dsa), and the key must be
# appended to the /home/$USER/.ssh/authorized_keys2 .
my $RSH = 'ssh';
my $RSYNC = "$RSYNC0 -av --rsync-path=$RSYNC0 --rsh=$RSH";

#
# Top directory of the mount points
my $TOP = '/export';

#
# Top where snapshots will be mounted
my $SNAPMOUNT = '/backup';

# Several functions returning values, which depend on the port (first arg)
memoize('P4');
memoize('P4CHECKPOINT');
memoize('P4ROOT');
sub P4 {
    my ($port) = @_;
    my $P4 = P4ROOT($port) . '/bin/p4';
    die "Error: $P4 does not exist\n" if ! -x $P4;
    return $P4;
} # P4
sub P4CHECKPOINT { return "$TOP/checkpoint/perforce/$_[0]" }
sub P4D          { return P4ROOT($_[0]) . '/bin/p4d' }
sub P4DATA       { return "$TOP/data/perforce/$_[0]" }
sub P4JOURNAL    { return "$TOP/journal/perforce/$_[0]/journal" }
sub P4LOGDIR     { return "$TOP/logs/perforce/$_[0]" }
sub P4ROOT       { return "$TOP/db/perforce/$_[0]" }
sub SNAPJOURNAL  { return "$SNAPMOUNT/journal/perforce/$_[0]/journal" }

# The name of the journal counter to use
my $P4COUNTER     = 'snap_journal';

# Path to the host's "gzip" command
#
my $GZIP 	  = "/usr/local/bin/gzip";

# Command to create Veritas snapshots
my $CREATE_SNAPSHOT = '/usr/local/bin/mk_vxfs_snapshot';
# Command to remove Veritas snapshots
my $REMOVE_SNAPSHOT = '/usr/local/bin/rm_vxfs_snapshot';

my @orig_args = @ARGV;
my $domain = 'mathworks.com';
my (%Handle, $HoursFromGMT);
my ($logfile, $preview_only, $lockcheck, @ports, %warm_standby);
my $notify = '';
my $verbose = 0;
my $make_checkpoint = 1;
my $max_to_keep = 0;  # keep all snapshots

#
# Parse arguments
#
GetOptions(
	   'help'         => sub { pod2usage( -verbose => 2, -exit => 0 ) },
           'lockcheck'    => \$lockcheck,
	   'log'          => \$logfile,
           'mail=s'       => \$notify,
	   'no_checkpoint' => sub { $make_checkpoint = 0 },
           'notify=s'     => \$notify, # alias
           'max=i'        => \$max_to_keep,
           'n'            => \$preview_only,
           'port=s'       => \@ports,
           'warm=s'       => sub { %warm_standby = map { ($_ => 1) }
                                                       split /,/, $_[1] },
           'v'            => \$verbose,
          )
    or die "$Myname: Error parsing arguments\n";

if ( ! @ports ) {
    print "No ports are specified.\n";
    pod2usage( -verbose => 2,
	       -exitval => 2 );
}

if ( $logfile ) {
    $logfile = P4LOGDIR($ports[0]) . "/backup.log";
    #print "Redirecting output to $logfile\n";
    open STDOUT, '>>', $logfile
	or die "Cannot append to $logfile: $!\n";
    open STDERR, ">&STDOUT"
	or warn "Cannot redirect STDERR to STDOUT: $!\n";
}

print "\n     * * * * *\n";
msg(0, "$0 @orig_args\n");

$SIG{__DIE__} = sub {
    sendmsg("PERFORCE checkpoint $HOST:@ports: FAILURE",
            join('', "We were running:\n   $0 @orig_args\n",
                 "and process $$ eventually crashed:\n\n", @_,
		 "For more information see ", ($logfile || 'stdout'),
                 "\n\n--$user\n"));
} if $notify;

$SIG{HUP} = $SIG{QUIT} = $SIG{INT} = $SIG{TERM} = sub {
    my ($sig, @arg) = @_;
    die "Signal $sig received\n", @arg;
};

my @warm_standby = sort grep { is_warming_ok($_) } keys %warm_standby;

#
# Time to do the job
#

if ( $lockcheck ) {
    msg(0, "requesting lock on all db.* tables...\n");
    p4d_lock($_) for ( @ports );
    msg(0, "press return to unlock ");
    my $ans = <>;
    p4d_unlock($_) for ( @ports );
    exit 0;
}

p4d_snap_checkpoint();

sendmsg("PERFORCE checkpoint of $HOST:@ports: SUCCESS <eom>");
msg(0, "$0: successful exit.\n");
exit 0;

#
# Copy (and compress) Perforce journal from a snapshot
#
sub copy_journal {
    my ($port, $tstamp, $old_counter) = @_;
    # The path to the directory where the checkpoint should be written
    #
    my $P4CHECKPOINT = P4CHECKPOINT($port);

    # Copy the journal (from the snapshot).
    #
    my $journaln = "$P4CHECKPOINT/$tstamp.jnl.$old_counter";

    my $SNAPJOURNAL = SNAPJOURNAL($port);

    msg(0, ($preview_only ? 'NOT ' : ''),
	"Copying journal ($SNAPJOURNAL to $journaln)\n");
    # syscopy is expected to preserve the timestamp (like cp -p)
    File::Copy::syscopy($SNAPJOURNAL, $journaln);

    # Compress the saved journal segment...
    # (Ignore errors - they can be dealt with later)
    #
    run_cmd("$GZIP $journaln");
    return;
} # copy_journal

#
# Create a handle name from a file name
#
sub file2handle {
    my ($file) = @_;
    if ( ! defined $Handle{$file} ) {
        $Handle{$file} = FileHandle->new();
    }
    return $Handle{$file};
} # file2handle

#
# Return the current counter for a given Perforce instance.
# Input:  127.0.0.1:$port
# Return: journal counter for this server
#
sub old_counter {
    my ($port) = @_;
    my $P4PORT = "127.0.0.1:$port";
    my $P4 = P4($port);    # The path to the "p4" client to be used
    my $p4 = "$P4 -p $P4PORT";

    # Next, look up the journal sequence number counter...  Logically,
    # we'd prefer to do this with the database locked, but the danger of
    # a rogue checkpoint -jc happening seems tolerable...
    #
    # Note. We use `` here rather than run_cmd, so the command would run
    # even in case of the -n option.
    my $output = `$p4 counters`;

    #
    # The difference between running 'counters' and "counters $P4COUNTER"
    # is that in the latter case we rely on the integer coming back as
    # the valid number.
    # It's more reliable, however, to verify that we get back the
    # output with the counter name we expect.
    #
    my ($journal_counter)
      = map { /^$P4COUNTER = (\d+)/ ? $1 : () } split /\n/, $output;

    if ( ! defined $journal_counter ) {
        die
          "Can't find set $P4COUNTER counter, nothing done for port=$port.\n",
          "If this is indeed your first run against this server, you need to run\n",
          "    $p4 counter $P4COUNTER 0\n";
    }
    return $journal_counter;
} # old_counter

#
# Lock database files
#
sub p4d_lock
{
    my ($port) = @_;
    # $P4ROOT for the server you wish to checkpoint
    #
    my $P4ROOT = P4ROOT($port);

    my $nlocked = 0;

    # With r01.1 (at least), empty tables don't have files yet. Real
    # servers almost certainly will have all the db.* files, but
    # just in case, we'll grep for existing files only
    foreach my $file ( grep { -f } map { "$P4ROOT/$_" } dbfiles($port) ) {
        my $handle = file2handle($file);

        msg(1, "Locking $file...\n");
        if ( ! $preview_only ) {
            #  Note: Solaris seems to need the "+<" open mode in order to all
            #  LOCK_EX locks to be placed.
            #
            if (! open $handle, '+<', $file) {
                msg(0, "can't open $file: $!\n");
                exit 5;
            }
            if (! flock($handle, LOCK_EX)) {
                msg(0, "can't lock $file: $!\n");
                exit 6;
            }
        }
        $nlocked++;
    }

    if ($nlocked <= 0) {
        die "no tables were locked!";
    }
    msg(0, "$nlocked files in $P4ROOT were ", ($preview_only ? 'NOT ' : ''),
        "locked.\n");
    return;
} # p4d_lock

#
# Unlock database files
#
sub p4d_unlock {
    my ($port) = @_;
    # $P4ROOT for the server you wish to checkpoint
    #
    my $P4ROOT = P4ROOT($port);

    foreach my $file ( grep { -f }
                            map { "$P4ROOT/$_" } reverse(dbfiles($port)) ) {
        my $handle = file2handle($file);
        msg(1, "Unlocking $file...\n");
        if ( ! $preview_only ) {
            close $handle
              or msg(0, "Error closing $file: $!\n");
        }
    }
    msg(0, "$P4ROOT ",
        ($preview_only ? 'did NOT have to be' : 'was'), " unlocked.\n");
    return;
} # p4d_unlock

#
# Print a message in a canonical format assuming the first arg (requested
# verbose level) is less or equal $verbose
#
sub msg {
    my $verbose_level = shift;
    if ( $verbose >= $verbose_level ) {
        print msg_prefix(), " $Myname>> ", @_;
    }
    return;
} # msg

sub msg_prefix {
    my $time = shift || time;
    $HoursFromGMT ||= do {
	my $t = time;
	my $d = $t - Time::Local::timelocal(gmtime($t));
	sprintf($d >= 0 ? "+%02d00" : "%03d00", $d / 3600);
    };
    my ($sec,$min,$hour,$mday,$mon,$year) = localtime($time);
    return sprintf "%4d-%02d-%02d %02d:%02d:%02d $HoursFromGMT",
                   $year+1900, $mon+1, $mday, $hour, $min, $sec;
} # msg_prefix

#  Run a command, returning status and output; terminate on any error.
#
sub run_cmd {
    my ($arg) = @_;
    my ($cmd, $dont_croak, $verbose);
    if ( ref($arg) eq 'HASH' ) {
	$cmd        = $arg->{'cmd'};
	$dont_croak = $arg->{'dont_croak'};
	$verbose    = $arg->{'verbose'};
    }
    $cmd        = $arg if ! defined $cmd;
    $dont_croak = 0    if ! defined $dont_croak;
    $verbose    = 1    if ! defined $verbose;

    msg(0, ($preview_only ? 'NOT ' : ''), "Running: $cmd\n");
    return (0, "Command was not run\n") if $preview_only;

    # Note. We will die here even if dont_croak is set because we expect
    # that the pipe should be openable in any case.
    open my $CMD, '-|', "$cmd 2>&1"
        or die "can't open \"$cmd 2>&1 |\": $!";

    my $output = '';
    while ( <$CMD> ) {
        print ": $_" if $verbose;
        $output .= $_;
    }
    close $CMD;

    if (my $sts = $?) {
	my $sig = $sts & 0x0f;
	$sts = $sts >> 8;
	my $msg = "'$cmd' exited with " . ($sig ? "signal $sig " : '')
	        . "status $sts\n";
	if ( $dont_croak ) {
	    msg(1, $msg);
	}
	else {
	    die $msg;
	}
    }
    return $output;
} # run_cmd

#
# return a time stamp
#
sub ts {
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
    return sprintf("%04d%02d%02d%02d%02d%02d",
                   1900+$year, $mon+1, $mday, $hour, $min, $sec);
} # ts

#
# Main function that does everything
#
sub p4d_snap_checkpoint {
    my @vols = qw(db journal);
    # Snapshot the data volume if we want to rsync it to the warm standby host
    push @vols, 'data' if @warm_standby;
    # First, delete the previous snapshots (if they exist.)
    for my $vol ( grep { -d "$SNAPMOUNT/$_/perforce" } @vols ) {
        run_cmd("$REMOVE_SNAPSHOT $vol");
    }

    # do one port at a time so the database of one port would not stay
    # locked while we working on another port.
    # this causes more snapshot creating, but it accomplishes
    # the independence between different servers.
    for my $port ( @ports ) {
	# get the counter early. if the counter does not exist, dont even
	# create snapshots - exit early.
        my $old_counter = old_counter( $port );

        # Next, we lock down the entire database(s)
        p4d_lock($port);

        my $tstamp = ts();

	# Now, we snapshot the database filesystem...
	#
        for my $vol ( @vols ) {
            run_cmd("$CREATE_SNAPSHOT $vol");
        }

	if ( $make_checkpoint ) {
	    # OK, now we are confident that we have a good snapshot. We can
	    # proceed on the assumption that that the "off line" operations of
	    # copying the journal and "p4 -jd" will work. All we need to do here
	    # is to truncate the journal, and put the server back on-line by
	    # unlocking it...

	    truncate_journal($port);
	}
	else {
	    print "We are not creating a checkpoint => Journal is not truncated\n";
	}

        # Now we can release the lock...
        #
        p4d_unlock($port);

        # At this point, the system is online for users.

	if ( $make_checkpoint ) {
	    copy_journal($port, $tstamp, $old_counter);
	}
	else {
	    print "We are not creating a checkpoint => Journal is not copied\n";
	}

        # The path to the "p4d" server to be used
        #
        my $P4D = P4D($port);

        # Do this checkpoint from the snapshot we just took...
        #
        my $SNAPROOT = "$SNAPMOUNT/db/perforce/$port";

        #
        # @child is an array of refs to [pid, host]
        # where pid is the pid of the update process, and
        # the host is the host name of the warm standby
        my @child  # copy the data to the slave(s)
            = map {
		print "Calling update_data($port, $_)\n";
		if ( my $pid = update_data($port, $_) ) {
		    print "Adding to child processes to wait: [$pid, $_]\n";
		    [$pid, $_];
		}
		else {
		    print "Update data on $_ failed. No chld process to wait.\n";
		    ();
		}
              } @warm_standby;

	my $P4 = P4($port);
	my $P4CHECKPOINT = P4CHECKPOINT($port);
	my ($checkpt, $journal, $new_counter);
	if ( $make_checkpoint ) {
	    # The path to the directory where the checkpoint should be written
	    #
	    $new_counter = $old_counter + 1;
	    $checkpt = "$P4CHECKPOINT/$tstamp.ckp.$new_counter.gz";
            $journal = "$P4CHECKPOINT/$tstamp.jnl.$old_counter.gz";

	    #
	    # This command creates a checkpoint
	    run_cmd("$P4D -r $SNAPROOT -p 127.0.0.1:$port -z -jd $checkpt");

	    # Now increment the counter - after we created the checkpoint
	    #
	    my $output = run_cmd("$P4 -p 127.0.0.1:$port"
				 . " counter -f $P4COUNTER $new_counter");

	    if ( ! $preview_only
		 &&
		 $output !~ /^Counter $P4COUNTER set\.$/) {
		die "couldn't increment $P4COUNTER counter:\n$output\n";
	    }
	}
	else {
	    $new_counter = $old_counter;
	    # Take the checkpoint for the old_counter
	    ($checkpt) = glob "$P4CHECKPOINT/*.ckp.$old_counter.gz"
		or die
                  "Cannot find a checkpoint in $P4CHECKPOINT for the counter ",
		       $old_counter;
            ($journal) = glob("$P4CHECKPOINT/*.jnl." . ($old_counter-1) . ".gz")
		or die
                  "Cannot find a journal in $P4CHECKPOINT for the counter ",
		       $old_counter-1;
	}

        # Copy this checkpoint to the warm standby(s)
        for my $warm_standby ( @warm_standby ) {
	    if ( my $pid = fork_it("$RSYNC $checkpt $journal "
                                   . "$warm_standby:$P4CHECKPOINT/") ) {
		msg(0, "Started process $pid to sync $checkpt and $journal",
                    " to $warm_standby\n");
		push @child, [$pid, $warm_standby];
	    }
	    my $P4ROOT = P4ROOT($port);
	    # Note. What cannot be copied:
	    # - bin should not be rsync'ed because:
	    #   - platforms may be different;
	    #   - p4d version may be different.
	    # - license cannot be copied because it's a different host.
	    my @what = grep {
		! m{ /(db \. \w+ | bin (\.\S+)? | license \S* | core) $ }x
		} glob("$P4ROOT/*");
	    if ( my $pid = fork_it("$RSYNC @what $warm_standby:$P4ROOT") ) {
		msg(0, "Started process $pid",
		    " to copy admin.support, bin, lib, triggers to $warm_standby\n");
		push @child, [$pid, $warm_standby];
	    }
	}

	my %done;
	# Wait for all the children
	while ( @child ) {
	    msg(0, "Waiting for child process(es) to finish: ",
		join(' ', map { $_->[0] } @child), "\n");
	    my $kid = wait;
	    if ( $kid > 0 ) {
		my ($host) = map { $kid == $_->[0] ? $_->[1] : () } @child;
		if ( $host ) {
		    msg(0, "Child process $kid finished warming up $host",
			$? ? " (FAILED with code=$?)" : '', "\n");
		    @child = grep { $kid != $_->[0] } @child;
		    $done{$host} = $?
			if ( $host && ! defined $done{$host} || $done{$host} < $?);
		}
		else {
		    msg(0, "Child process $kid finished, but it was not on the list of the processes we waited for.\n");
		}
	    }
	    else {
		msg(0, "wait returned $kid ($!)\n");
		last;
	    }
	}

        # Exclude failed hosts from @warm_standby
        # It can be done more efficiently with hashes, but with the number
        # of elements to be usually 1 or 2, it doesn't matter.
        for my $warm_standby ( grep { $done{$_} } keys %done ) {
            msg(0, $warm_standby,
	      " will not be used as a warm standby because of the errors above.\n");
            @warm_standby = grep { $warm_standby ne $_ } @warm_standby;
        }

        #
        # Now we don't need any snapshots anymore
	# don't croak when we run these commands - try to get to the end
        for my $vol ( @vols ) {
            run_cmd({cmd        => "$REMOVE_SNAPSHOT $vol",
		     dont_croak => 1,
		    });
        }

        my $xtra_del = $max_to_keep
            ? "$Mydir/delete_extra_checkpoints -port $port -max $max_to_keep"
              . ( $preview_only ? ' -n' : '' )
            : 'echo no checkpoint cleaning is done';

        for my $warm_standby ( @warm_standby ) {
            msg(0, "Restoring warm standby $warm_standby...\n");
            #
            # At this time, the data have been updated on the warm
            # standby and the checkpoint has been copied there.
            # We now need to restore db files and restart the server.
            my $P4ROOT = P4ROOT($port);
            my $cmd = "$RSH -n $warm_standby 'cd $P4ROOT"
                  . "; mkdir tmp.$new_counter; mv db.* tmp.$new_counter"
                  . "; $P4D -r $P4ROOT -z -jr $checkpt; echo Status=\$?'";
            my $output = run_cmd({cmd        => $cmd,
                                  dont_croak => 1,
                                 });
            if ( $preview_only
                 ||
                 ( $? == 0  # everything was OK in the previous command
                   && $output !~ /error/i
                   && $output =~ /Status=0\b/ ) ) {
                for my $cmd (map { "$RSH -n $warm_standby '$_'" }
                             "/etc/init.d/p4d.$port start >/dev/null 2>&1 &",
               "cd $P4ROOT; rm tmp.$new_counter/db.*; rmdir tmp.$new_counter",
                             $xtra_del,
                            ) {
                    run_cmd($cmd);
                }
            }
            else {
                msg(0, "Not starting p4d on $warm_standby:$port",
                    " because of errors above\n");
            }
        } # restoring warm standby

        # Delete extra checkpoint and journal files
        run_cmd($xtra_del) if $xtra_del;
    } # for each port (Perforce instance)

    return;
} # p4d_snap_checkpoint

#
# Fork a command and return pid
sub fork_it {
    my ($cmd) = @_;
    my $pid = fork;
    if ( $pid ) { # parent
	return $pid;
    }
    if ( defined $pid ) { # child
	sleep 2; # give the parent a couple of seconds to breathe
	run_cmd($cmd);
	msg(0, "PID=$$ finished.\n");
	exit;
    }
    else {
	msg(0, "ERROR: Cant sync checkpt: fork failed: $!\n");
	return;
    }
} # fork_it

#
# Truncate Perforce journal
#
sub truncate_journal {
    my ($port) = @_;
    # The path to the journal file
    #
    my $P4JOURNAL = P4JOURNAL($port);
    if ( ! $preview_only ) {        # Truncate the journal
        open my $J, '>', $P4JOURNAL
            or do {
                msg(0, "couldn't truncate \"$P4JOURNAL\": $!\n");
                exit 7;
            };
        close $J;
    }
    msg(0, "$P4JOURNAL was ", ($preview_only ? 'NOT ' : ''), "truncated.\n");
    return;
} # truncate_journal

# verify several things regarding a warm standby
sub is_warming_ok {
    my ($warm_standby) = @_;
    my ($warm_name, $warm_aliases, $warm_addrtype, $warm_length, @warm_addrs)
         = gethostbyname($warm_standby);
    my ($this_name, $this_aliases, $this_addrtype, $this_length, @this_addrs)
         = gethostbyname($HOST);
    my $err;
    if ( ! -x $RSYNC0 ) {
        $err = "$RSYNC0 was not found => warm standby cannot be maintained";
    } elsif ( $warm_standby eq 'localhost' ) {
        $err = "You cannot use localhost as a warm standby host";
    } elsif ( ! $warm_name ) {
        $err = "Warm standby host $warm_standby is invalid: $!";
    }
    elsif ( ! $this_name ) {
        $err = "Unexpectedly, we cannot resolve address of this host "
             . "$HOST:\n $!";
    }
    else { # make sure $warm_standby is not the local host
        for my $this ( @this_addrs ) {
            if ( grep { $_ eq $this } @warm_addrs ) {
                my ($a,$b,$c,$d) = unpack('C4', $this);
                $err = "Host $warm_standby cannot be used as a warm standby host because its address is the same as of $this_name: $a.$b.$c.$d";
                last;
            }
        }
    }
    if ( $err ) {
        warn "$err\n";
        return;
    }
    return 1; # OK
} # is_warming_ok

#
# Return the list of db files in the order to be locked
#
sub dbfiles {
    my ($port) = @_;
    my $p4d = P4D($port);
    # Perforce server version:
    my @p4dV = `$p4d -V`;
    my ($p4dVers) = map { m{^Rev\. \S+/(\d\d\d\d\.\d)/\d+} ? $1 : () } @p4dV;

    if ( ! $p4dVers ) {
        die "Cannot determine the p4d version from p4d -V output:\n", @p4dV,
          "<<<-- You may need to adjust the regex in parsing the output above";
    }

    #  The locking order of the db.* files, good for
    my @order
      = ($p4dVers =~ /^2000\.[12]$/)    # 2000.1 and 2000.2
        #  per information supplied by Perforce Software:
        #
        #    http://www.perforce.com/perforce/doc.001/schema/index.html
        #    http://www.perforce.com/perforce/doc.002/schema/index.html
        #
        ? qw(
                db.counters
                db.user
                db.group
                db.depot
                db.domain
                db.view
                db.review
                db.have
                db.integ
                db.locks
                db.rev
                db.revcx
                db.working
                db.change
                db.desc
                db.job
                db.jobpend
                db.jobdesc
                db.fix
                db.fixrev
                db.boddate
                db.bodtext
                db.ixdate
                db.ixtext
                db.protect
                db.trigger
           )
        :
        ($p4dVers =~ /^2001\.1$/)    # 2001.1
          #  per information supplied by Perforce Software:
          #
          #    http://www.perforce.com/perforce/doc.011/schema/index.html
          #
        ? qw(
                db.counters
                db.logger
                db.user
                db.group
                db.depot
                db.domain
                db.view
                db.review
                db.have
                db.label
                db.integ
                db.integed
                db.resolve
                db.locks
                db.rev
                db.revcx
                db.working
                db.change
                db.desc
                db.job
                db.jobpend
                db.jobdesc
                db.fix
                db.fixrev
                db.boddate
                db.bodtext
                db.ixdate
                db.ixtext
                db.protect
                db.trigger
           )
        :
        ($p4dVers =~ /^2002\.1$/)  # 2002.1
          #  per information supplied by Perforce Software:
          #
          #    http://www.perforce.com/perforce/doc.021/schema/index.html
        ? qw(
                db.counters
                db.logger
                db.user
                db.group
                db.depot
                db.domain
                db.view
                db.review
                db.have
                db.label
                db.integ
                db.integed
                db.resolve
                db.locks
                db.rev
                db.revcx
                db.working
                db.change
                db.changex
                db.desc
                db.job
                db.jobpend
                db.jobdesc
                db.fix
                db.fixrev
                db.boddate
                db.bodtext
                db.ixdate
                db.ixtext
                db.protect
                db.trigger
           )
        :
        ($p4dVers =~ /^(2002.2|2003.1)$/) # 2002.2, 2003.1
          #  per information supplied by Perforce Software:
          #
          #    http://www.perforce.com/perforce/doc.022/schema/index.html
          #    http://www.perforce.com/perforce/doc.031/schema/index.html
          #
        ? qw(
                db.counters
                db.logger
                db.user
                db.group
                db.depot
                db.domain
                db.view
                db.review
                db.have
                db.label
                db.integ
                db.integed
                db.resolve
                db.locks
                db.rev
                db.revcx
                db.working
                db.change
                db.changex
                db.desc
                db.job
                db.jobpend
                db.jobdesc
                db.fix
                db.fixrev
                db.boddate
                db.bodtext
                db.ixdate
                db.ixtext
                db.protect
                db.trigger
                db.message
           )
        :
        ($p4dVers =~ /^2003\.2|2004\.1$/)    # 2003.2 and 2004.1
          #  per information supplied by Perforce Software:
          #
          #    http://www.perforce.com/perforce/doc.032/schema/index.html
          #    http://www.perforce.com/perforce/doc.041/schema/index.html
          #
        ? qw(
                db.counters
                db.logger
                db.user
                db.group
                db.depot
                db.domain
                db.view
                db.review
                db.have
                db.label
                db.integ
                db.integed
                db.resolve
                db.locks
                db.rev
                db.revcx
                db.working
                db.change
                db.changex
                db.desc
                db.job
                db.jobpend
                db.jobdesc
                db.fix
                db.fixrev
                db.boddate
                db.bodtext
                db.ixdate
                db.ixtext
                db.protect
                db.trigger
                db.message
                db.monitor
           )
        : ($p4dVers =~ /^2004\.2$/)  # 2004.2
          #  per information supplied by Perforce Software:
          #
          #    http://www.perforce.com/perforce/doc.042/schema/index.html
        ? qw(
                db.counters
                db.logger
                db.user
                db.group
                db.depot
                db.domain
                db.view
                db.review
                db.have
                db.label
                db.integ
                db.integed
                db.resolve
                db.locks
                db.rev
                db.revcx
                db.revpx
                db.working
                db.traits
                db.trigger
                db.change
                db.changex
                db.desc
                db.job
                db.jobpend
                db.jobdesc
                db.fix
                db.fixrev
                db.boddate
                db.bodtext
                db.ixdate
                db.ixtext
                db.protect
                db.message
                db.monitor
           )
        : ($p4dVers =~ /^(2007\.[23]|2008\.[12])$/)  # 2007.2 .. 2008.2
        # this information taken from
        # http://www.perforce.com/perforce/doc.072/schema/index.html
        # http://www.perforce.com/perforce/doc.073/schema/index.html
        # http://www.perforce.com/perforce/doc.081/schema/index.html
        # http://www.perforce.com/perforce/doc.082/schema/index.html
        ? qw(
                db.counters
                db.logger
                db.user
                db.group
                db.depot
                db.domain
                db.view
                db.review
                db.integ
                db.integed
                db.resolve
                db.have
                db.label
                db.locks
                db.archive
                db.archmap
                db.rev
                db.revcx
                db.revdx
                db.revhx
                db.revpx
                db.revsx
                db.working
                db.traits
                db.trigger
                db.change
                db.changex
                db.desc
                db.job
                db.jobpend
                db.jobdesc
                db.fix
                db.fixrev
                db.boddate
                db.bodtext
                db.ixdate
                db.ixtext
                db.protect
                db.message
                db.monitor
           )
       :
         ();

    if ( ! @order ) {
        print <<EOM;

Unrecognized Perforce server version "$p4dVers". You will need to confirm
the db.* locking order for this version of the Perforce server, and modify
this script to recognize it before proceeding. See, e.g.,

  http://www.perforce.com/perforce/doc.072/schema/index.html
  http://www.perforce.com/perforce/doc.073/schema/index.html

For older p4d versions see
  //guest/stephen_vance/utils/snap_checkpoint/snap_checkpoint#5 from
  http://public.perforce.com/guest/stephen_vance

EOM
        exit 2;
    }

    return @order;
} # dbfiles

#
# Send mail message
sub sendmsg {
    my ($subject, $message) = @_;
    if ( my $recipients = join ',',
	 map { /\@/ ? $_ : "$_\@$domain"} split /[\s,]+/, $notify ) {
        my %mail = (From    => "$user\@$domain",
                    To      => $recipients,
                    Subject => $subject,
                   );
        $mail{'Message'} = $message if $message;
        sendmail(%mail)
          or print "Error sending mail: $Mail::Sendmail::error\n";
    }
    return;
} # sendmsg

#
# Update versioned files on the warm standby
sub update_data {
    my ($port, $warm_standby) = @_;
    local($_);  # someone inside this sub stumps on $_

    # Stop p4d on the slave - dont fail here
    my $P4 = P4($port);
    if ( p4d_live($warm_standby, $port) ) {
	run_cmd({cmd        => "$P4 -p $warm_standby:$port admin stop",
		 dont_croak => 1});
	if ( p4d_live($warm_standby, $port) ) {
	    msg(0, "Server $warm_standby:$port is alive => we cannot use this host as a warm standby\n");
	    return;
	}
    }

    # Go to the background to update the data
    my $pid = fork;
    if ( $pid ) { # parent
        msg(0,
	    "Started process $pid to update the data on $warm_standby:$port\n");
    }
    elsif ( defined $pid ) { # child
        sleep 2; # give the parent a couple of seconds to breathe
        # TODO: redirect rsync output elsewhere
        my $cmd = "$RSYNC $SNAPMOUNT/data/perforce/$port/ $warm_standby:"
	    . P4DATA($port);
	my $msg = "Child process $$ will" . ($preview_only ? ' NOT' : '')
	    . " update data on $warm_standby:\n    $cmd\n";
        msg(0, $msg);
	if ( ! $preview_only ) {
	    system $cmd
		and warn "ERROR: We could not exec rsync: $!";
	}
	# It's nice to see the time stamp when this is done
	msg(0, "Finished updating data on $warm_standby\n");
        exit;
    }
    else {
        msg(0, "Cant update data on $warm_standby:$port: fork failed: $!\n");
        return;
    }

    return $pid;
} # update_data

#
# Check if the given server is alive
sub p4d_live {
    my ($host, $port) = @_;
    my $P4 = P4($port);
    my $info = "$P4 -p $host:$port info";
    my $out = run_cmd({cmd        => $info,
		       dont_croak => 1,
		       verbose    => 0});
    msg(1, "<$info>:\n$out\n");
    if ( $out =~ /Server uptime:/ ) {
	return 1;
    }
    if ( $out =~ /connect: $host:$port: Connection refused/ ) {
	return 0;
    }
    msg(0, "Cannot determine whether $host:$port",
	" server is running based on this output:\n", $out);
    return;
} # p4d_live

=head1 NAME

snap_checkpoint

=head1 SYNOPSIS

snap_checkpoint -help

snap_checkpoint -port NNNN [-port NNNN] [-lockcheck] [-max N] [-n] [-v]

=head1 DESCRIPTION

This script creates a new Perforce checkpoint using snapshots.

Options:
   -n         preview: shows what would be done
   -lockcheck is to testing whether the server is indeed locked
   -max N     keep maximum N backups and delete the oldest (default: keep all)
   -log       log activity in backup.log in the logs directory
   -mail id   send summary about this run to the given address
   -port NNNN determines which server needs to be checkpointed
   -warm host1[,host2,...]
              Use specified host(s) as a warm stand-by.
              This causes remote copy and other extra actions.
              Specified hosts cannot be this host.
   -v         increases the verbosity level.

=head1 EXAMPLES

=cut
# Change User Description Committed
#3 7246 Michael Mirman multiple improvements, including (but not limited to)
- added option -log to log all output in a file;
- added option -mail to send the summary of the run to the given address(es);
- support multiple standbys;
- sync versioned files
#2 6341 Michael Mirman This version creates snapshot using Veritas file system, then
       unlocks the database and proceeds with checkpointing.
       This is a copy of the TMW production
       version //sandbox/batscm/triggers/admin/snap_checkpoint#6, which
       keeps the database locked for about 8 seconds and produces
       a zipped up checkpoint file ~1G.
#1 6340 Michael Mirman copy of //guest/stephen_vance/utils/snap_checkpoint/snap_checkpoint#5
       - starting point working on Veritas file system snapshots
//guest/stephen_vance/utils/snap_checkpoint/snap_checkpoint
#5 4852 Stephen Vance Parameterize things relative to SNAPMOUNT and simplify script accordingly.
#4 4851 Stephen Vance Finish parameterizing snapshot mount point.
#3 4850 Stephen Vance Fix hardcoded reference to volume.
Parameterize snapshot name. Start to parameterize snapshot mount point.
#2 4849 Stephen Vance Updated locking order for newer versions.
Fixed some typos. Added -f to the counter command so that "journal" can be used again, but this introduces a backward compatibility issue.
#1 4848 Stephen Vance Branch snap_checkpoint to update and customize.
//guest/richard_geiger/utils/snap_checkpoint/snap_checkpoint
#9 1544 Richard Geiger Update to reflect changes in p4d 2002.1:
  a) The change in the locking order, due to db.changex
  b) The fix for job006497
#8 942 Richard Geiger Use $VOLUME, too!
#7 941 Richard Geiger Use the $GZIP variable instead of the literal path.
#6 920 Richard Geiger add 2001.1 locking order; correct open mode ("+<") for Solaris;
some notes; and the "lockcheck" option.
#5 437 Richard Geiger Hack to handle r2000.1's newfound reluctance to do
"p4 counter journal NNNN".
#4 248 Richard Geiger The main change here is to move the copying of the journal file to done
from the checkpoint, outside of the region where the server is locked.
This can make the whole thing go much faster when the journal is sizable
enought that the copy takes a significant amoutn of time to
happen.
#3 246 Richard Geiger Update the script such that we use, verbatim, the p4d_snap_checkpoint
function from "p4d_admin", which the version we're finally really
deploying. This should make it much easier to maintain in the
future. Also update the html doc to match.
#2 239 Richard Geiger - Use LOCK_SH when locking the database
- Use ALL CAPS when shunning all responsibility for the thing
  (Warranty disclaimer)
#1 238 Richard Geiger Sample script illustrating how to use Data ONTAP snapshots for
a "fast checkpoint", plus accompanying notes