#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Helix Management System (hms), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce_software-hms/view/main/LICENSE
#------------------------------------------------------------------------------
set -u
#==============================================================================
# Declarations and Environment
# P4U_HOME can be overridden by the caller; it is set only when testing P4U
# scripts. Every setting below keeps a value inherited from the environment.
CBIN=${CBIN:-/p4/common/bin}
P4U_HOME=${P4U_HOME:-$CBIN}
P4U_LIB=${P4U_LIB:-/p4/common/lib}
P4U_ENV=$P4U_LIB/p4u_env.sh
P4U_LOG="/tmp/p4failover.$(date +'%Y%m%d-%H%M%S').log"
export CBIN P4U_HOME P4U_LIB P4U_ENV P4U_LOG

# Environment isolation. For stability and security reasons, put the directory
# holding the known-good scripts (P4U_HOME) at the front of PATH so that those
# executables are found first. ~/bin and the current directory are appended.
PATH=$P4U_HOME:$PATH:~/bin:.
export PATH
P4CONFIG=${P4CONFIG:-.p4config}
export P4CONFIG

# The environment file is mandatory; give up right away if it is unreadable.
if [[ ! -r "$P4U_ENV" ]]; then
   echo -e "\nError: Cannot load environment from: $P4U_ENV\n\n"
   exit 1
fi

# Source the environment file first, then the two shell libraries.
declare BASH_LIBS="$P4U_ENV $P4U_LIB/libcore.sh $P4U_LIB/libp4u.sh"
for bash_lib in $BASH_LIBS; do
   source $bash_lib
done

declare Version=1.0.0
declare -i SilentMode=0
#==============================================================================
# Local Functions
#------------------------------------------------------------------------------
# Function: terminate
function terminate
{
   # Drop the handlers first so the exit below cannot re-enter this function.
   trap - EXIT SIGINT SIGTERM

   # Don't leave trash behind.
   cleanTrash

   vvmsg "$THISSCRIPT: EXITCODE: $OverallReturnStatus"

   # Close the log, unless logging was switched off.
   if [[ "${P4U_LOG}" != off ]]; then
      stoplog
   fi

   # The trap is gone, so this is a plain exit with the accumulated status.
   exit $OverallReturnStatus
}
# Configuration file locations and the instance selection.
declare CentralCfgFile="$CBIN/p4failover.cfg"
declare InstanceCfgFile="" InstanceList="" Instance=""

# Failover mode/type chosen on the command line, plus arguments that are
# forwarded to the remote failover scripts.
declare Mode="" Type="" PassThruArgs="" Log=""

# Check-mode repair switches (-fix / -FIX).
declare Fix="false" ForceFix="false"

# Bookkeeping for the concurrently running per-instance failover processes;
# entries at the same index belong to the same process.
declare -a ProcessForInstance ProcessLogs ProcessState ProcessExitCode
declare -i ProcessCount=0 SilentMode=0
#==============================================================================
# Local Functions
#------------------------------------------------------------------------------
# Check that hosts noted in the p4failover.cfg file are reachable. This
# is a basic sanity check for the failover system.
#------------------------------------------------------------------------------
# Function: check_hosts
#
# For every instance in InstanceList (default: P4F_ALL_INSTANCES) this:
#   * verifies ssh access to each host whose P4F_* entry ends in ':Active',
#   * reads the last line of p4.master and the SDP state on master and backup,
#   * probes with 'p4 info' whether Perforce answers on master and backup,
#   * compares the local p4failover.<i>.cfg with the copy on each of the two
#     hosts, and
#   * reports what the local p4broker.cfg says.
# With Fix=true, mismatches are repaired where that is considered safe;
# ForceFix=true overrides the "Perforce is up on the backup" safety stop.
#
# Globals read:    InstanceList, P4F_ALL_INSTANCES, Fix, ForceFix, CBIN, H,
#                  HOSTNAME, and the P4F_*/P4* values loaded per instance.
# Globals written: InstanceList (when empty), GARBAGE (temp files to remove).
# Returns: 0 if every check passed for every instance, else 1.
function check_hosts
{
   declare host=""
   declare i=""
   declare master=""
   declare masterCheck=""
   declare masterP4Status=""
   declare masterSDPState=""
   declare backup=""
   declare backupCheck=""
   declare backupP4Status=""
   declare backupSDPState=""
   declare brokerCfgFile=""
   declare brokerTarget=""
   declare -i allOK=1
   declare -i canAttemptFix=1

   [[ -z "$InstanceList" ]] && InstanceList=${P4F_ALL_INSTANCES}

   for i in ${InstanceList//,/ }; do
      # Whether a fix may be attempted is decided per instance; a problem
      # seen on an earlier instance must not block fixing this one.
      canAttemptFix=1

      msg "${H}\nChecking status for instance ${i}.\n"
      load_instance_cfg ${i}

      # P4F_* entries have the form <host>:<state>; keep only the host part.
      master=${P4F_MASTER%%:*}
      backup=${P4F_BACKUP%%:*}

      for host in ${P4F_BROKER} ${P4F_MASTER} ${P4F_BACKUP} ${P4F_DR}; do
         # Skip hosts marked as inactive on the config file.  Mainly for
         # testing with an incomplete set of machines.
         [[ ${host##*:} == Active ]] || continue
         host=${host%%:*}

         if [[ "$(ssh $host echo SystemLoginTest)" == *"SystemLoginTest"* ]]; then
            msg "Host $host is accessible."
         else
            errmsg "Host $host is NOT accessible."
            allOK=0
         fi
      done

      # Load SDP environment for this instance.
      source $CBIN/p4_vars ${i}

      masterCheck=$(ssh $master tail -1 ${P4HOME}/bin/p4.master)
      masterSDPState=$(ssh $master $CBIN/get_sdp_state.sh -s -i ${i})
      backupCheck=$(ssh $backup tail -1 ${P4HOME}/bin/p4.master)
      backupSDPState=$(ssh $backup $CBIN/get_sdp_state.sh -s -i ${i})

      # Check to see if Perforce is up where we think it should be.
      if [[ "$($P4BIN -s -p $master:${P4PORT##*:} info 2>&1)" == *"Server address:"* ]]; then
         masterP4Status="UP"
      else
         masterP4Status="Down"
      fi

      if [[ "$($P4BIN -s -p $backup:${P4PORT##*:} info 2>&1)" == *"Server address:"* ]]; then
         backupP4Status="UP"
      else
         backupP4Status="Down"
      fi

      msg "p4.master on Master $master: [$masterCheck].
SDP state on Master $master: [$masterSDPState].
p4.master on Backup $backup: [$backupCheck].
SDP state on Backup $backup: [$backupSDPState]."

      if [[ "$masterCheck" == *"P4_${P4INSTANCE}_MASTER=true"* ]]; then
         msg "The p4.master on master host ${master} shows true, as expected."
         msg "Perforce instance ${i} is $masterP4Status on ${master}."
      else
         errmsg "The p4.master on master host ${master} shows [$masterCheck], NOT true as expected!"
         allOK=0
      fi

      # If Perforce is up even if the master file indicates it's down,
      # we are in a strange state.
      if [[ "$backupP4Status" == UP ]]; then
         errmsg "Perforce instance ${i} is unexpectedly up on the backup server $backup!"
         allOK=0
         canAttemptFix=0
      else
         msg "Perforce instance ${i} is down on ${backup}, as expected."
      fi

      if [[ "$backupCheck" == *"P4_${P4INSTANCE}_MASTER=false"* ]]; then
         msg "Master file on backup host ${backup} shows false, as expected."
      else
         errmsg "Master file on backup host ${backup} shows [$backupCheck], NOT false as expected!"
         allOK=0
         if [[ "$Fix" == true ]]; then
            if [[ "$canAttemptFix" -eq 1 || $ForceFix == true ]]; then
               msg "Fixing p4failover.cfg for instance ${i}."
               update_failover_cfg "${i}"
            else
               errmsg "Fix cannot be attempted because Perforce is unexpectedly up on the backup server $backup.  If it is safe to shutdown Perforce on $backup, do so manually and run with '-fix' again.  Or, adjust the P4F_* settings in the p4failover.${i}.cfg file."
               allOK=0
            fi
         else
            msg "Specify '-fix' to make p4failover.${i}.cfg match the master files."
         fi
      fi

      # Compare the local instance config with the copy held by each host.
      # The fetched copies are registered in GARBAGE for later removal.
      for host in $master $backup; do
         scp $host:/p4/$i/bin/p4failover.$i.cfg /tmp/p4failover.$i.$host.cfg
         GARBAGE+=" /tmp/p4failover.$i.$host.cfg"

         # Diff against the copy fetched from THIS host, not always the
         # master's copy.
         runCmd "diff /p4/$i/bin/p4failover.$i.cfg /tmp/p4failover.$i.$host.cfg" \
            "Confirming that local p4failover.$i.cfg files agrees with $host."

         if [[ $CMDEXITCODE -eq 0 ]]; then
            msg "Local p4failover.$i.cfg file matches $host."
         elif [[ $Fix == true ]]; then
            msg "Local p4failover.$i.cfg does not match $host.  Fixing!"
            runCmd "ssh $host chmod +w /p4/$i/bin/p4failover.$i.cfg"
            runCmd "scp /p4/$i/bin/p4failover.$i.cfg $host:/p4/$i/bin/."
         else
            errmsg "Local p4failover.$i.cfg does NOT match $host.  Use -fix to fix."
            allOK=0
         fi
      done

      # Report on broker configuration.
      brokerCfgFile=${P4HOME}/bin/p4broker.cfg
      if [[ -f $brokerCfgFile ]]; then
         if [[ -n "$(grep action $brokerCfgFile 2>/dev/null)" ]]; then
            msg "Broker indicates Down For Maintenance for instance ${i} on $HOSTNAME."
         elif [[ -n "$(grep target $brokerCfgFile 2>/dev/null)" ]]; then
            # Keep the last word of the 'target' line, minus a trailing ';'.
            brokerTarget=$(grep target $brokerCfgFile)
            brokerTarget=${brokerTarget##* }
            brokerTarget=${brokerTarget%;}
            msg "Broker target is [$brokerTarget] for instance ${i} on $HOSTNAME."
         else
            errmsg "Broker misconfigured for instance ${i} on $HOSTNAME."
            allOK=0
         fi
      else
         msg "Broker not configured for instance ${i} on $HOSTNAME."
      fi
   done

   [[ $allOK -eq 0 ]] && return 1
   return 0
}
#------------------------------------------------------------------------------
# Load and verify the central config file.
# This routine is do-or-die -- we fail if the central config cannot be loaded.
function load_central_cfg
{
   declare cfgDataOK=1
   declare cfgVar=""

   # No readable central config means there is nothing to work with.
   if [[ ! -r $CentralCfgFile ]]; then
      bail "Missing central config file [$CentralCfgFile]!  Aborting failover."
      return 0
   fi

   msg "Loading Central Configuration Data from [$CentralCfgFile]."

   # Start from empty values so that only what the file defines counts.
   export P4F_EXPECTED_DBCOUNT=""
   export P4F_ALL_INSTANCES=""
   export P4F_SIMULATE_SAN_DURING_HA_FAILOVER=""
   . $CentralCfgFile

   # Each required setting must have come out of the file non-empty.
   for cfgVar in P4F_SIMULATE_SAN_DURING_HA_FAILOVER P4F_EXPECTED_DBCOUNT P4F_ALL_INSTANCES; do
      if [[ -z "${!cfgVar}" ]]; then
         errmsg "Missing def'n for ${cfgVar}!"
         cfgDataOK=0
      fi
   done

   if [[ $cfgDataOK == 0 ]]; then
      bail "Faulty central configuration data in $CentralCfgFile."
   fi

   msg "Central Config data loaded and syntactically verified."
   return 0
}
#------------------------------------------------------------------------------
# manage_ha_failover
# Short: Orchestrate all steps related to HA failover for a given instance.
#
# Input:
# $1 - instance number
#
# Display exit code rather than returning it, for easy capture when running
# multiple processes concurrently.
#------------------------------------------------------------------------------
# Function: manage_ha_failover
#
# Loads the instance-specific failover config and then runs
# execute_ha_host_failover for that instance.
#
# Input:  $1 - instance number
# Output: a 'manage_ha_failover: PEXITCODE: <0|1>' line on stdout; this is
#         how the outcome is communicated (see the header comment above).
# Returns: 1 if the instance config could not be loaded; otherwise the
#          status of the final echo.
function manage_ha_failover
{
   declare i=$1

   if ! load_instance_cfg "${i}"; then
      errmsg "Skipping failover of instance ${i} because load of instance-specific failover cfg file failed."
      OverallReturnStatus=1
      echo "manage_ha_failover: PEXITCODE: 1"
      return 1
   fi

   msg "Failover: Instance ${i}, $P4F_MASTER_HOST --> $P4F_BACKUP_HOST."

   if ! execute_ha_host_failover "${i}"; then
      errmsg "HA Failover for Instance ${i} failed."
      OverallReturnStatus=1
      echo "manage_ha_failover: PEXITCODE: 1"
      return
   fi

   msg "HA Failover for Instance ${i} succeeded."

   # There used to be a second '$?' test here for the failover cfg update.
   # That update is invoked from within execute_ha_host_failover, so '$?' at
   # this point only reflected the msg call above; the test could never
   # report a cfg update failure and has been removed.
   echo "manage_ha_failover: PEXITCODE: 0"
   return
}
#------------------------------------------------------------------------------
# Load and verify an instance-specific config file.
function load_instance_cfg
{
   declare i=$1
   declare iCfgFile="/p4/${i}/bin/p4failover.${i}.cfg"
   declare -i cfgDataOK=1
   declare cfgVar=""
   # Settings every instance config has to define, in reporting order.
   declare requiredVars="P4F_MASTER P4F_MASTER_HOST P4F_BACKUP P4F_BACKUP_HOST P4F_DR P4F_DR_HOST P4F_BROKER P4F_BROKER_HOST"

   # An unreadable instance config is reported to the caller, not fatal here.
   if [[ ! -r $iCfgFile ]]; then
      errmsg "Missing instance config file [$iCfgFile]!  Aborting failover."
      return 1
   fi

   msg "Loading Instance Configuration Data from [$iCfgFile]."

   # Export each setting as empty so that stale values cannot survive.
   for cfgVar in $requiredVars; do
      export "${cfgVar}="
   done

   . $iCfgFile

   # Anything still empty after sourcing was not defined by the file.
   for cfgVar in $requiredVars; do
      if [[ -z "${!cfgVar}" ]]; then
         errmsg "Missing def'n for ${cfgVar}!"
         cfgDataOK=0
      fi
   done

   if [[ $cfgDataOK -eq 0 ]]; then
      bail "Faulty instance configuration data in $iCfgFile."
   fi

   msg "Data for instance ${i} syntactically OK.  MASTER=${P4F_MASTER_HOST}, BACKUP=${P4F_BACKUP_HOST}.\n"
   return 0
}
#------------------------------------------------------------------------------
# Update the instance-specific failover config file that lives on the broker
# machine as /p4/x/bin/p4failover.x.cfg.
#
# Input: $1 - Instance number.
# Return 0 to indicate Success, else 1 to indicate failure.
#------------------------------------------------------------------------------
# Function: update_failover_cfg
#
# Swaps the P4F_MASTER and P4F_BACKUP values in the local
# /p4/<i>/bin/p4failover.<i>.cfg and pushes the file to the old master
# (P4F_MASTER_HOST) and the new master (P4F_BACKUP_HOST).
#
# Input:   $1 - Instance number.
# Globals: NO_OP (non-zero: only report what would be done locally),
#          P4F_MASTER, P4F_BACKUP, P4F_MASTER_HOST, P4F_BACKUP_HOST,
#          CMDEXITCODE (read after each runCmd).
# Returns: 0 on success; 1 if the local rewrite or either push failed.
function update_failover_cfg
{
   declare i=$1
   declare iCfgFile="/p4/${i}/bin/p4failover.${i}.cfg"
   declare -i allOK=1

   runCmd "chmod +w ${iCfgFile}"

   if [[ $NO_OP -eq 0 ]]; then
      msg "Swapping host roles in ${iCfgFile}."
      # The two expressions run in order on every line, exactly like two
      # chained sed processes would.
      sed -e "s/P4F_MASTER=${P4F_MASTER}/P4F_MASTER=${P4F_BACKUP}/g" \
          -e "s/P4F_BACKUP=${P4F_BACKUP}/P4F_BACKUP=${P4F_MASTER}/g" \
          "${iCfgFile}" > "${iCfgFile}.new"

      # Show the change in the output.
      diff "${iCfgFile}" "${iCfgFile}.new"

      /bin/mv -f "${iCfgFile}.new" "${iCfgFile}" || allOK=0
   else
      msg "NO_OP: Would swap host roles in ${iCfgFile}."
   fi

   # chmod +w first, then scp.
   [[ $NO_OP -eq 0 ]] && ssh ${P4F_MASTER_HOST} chmod +w $iCfgFile > /dev/null 2>&1

   runCmd "scp -pq ${iCfgFile} ${P4F_MASTER_HOST}:${iCfgFile}" \
      "Pushing p4failover.${i}.cfg to old master server ${P4F_MASTER_HOST}."

   if [[ $CMDEXITCODE != 0 ]]; then
      errmsg "Failed to push p4failover.${i}.cfg to ${P4F_MASTER_HOST}."
      warnmsg "BE SURE TO MANUALLY ADJUST ${iCfgFile} on ${P4F_MASTER_HOST} LATER!"
      allOK=0
   fi

   # chmod +w first, then scp.
   [[ $NO_OP -eq 0 ]] && ssh ${P4F_BACKUP_HOST} chmod +w $iCfgFile > /dev/null 2>&1

   runCmd "scp -pq ${iCfgFile} ${P4F_BACKUP_HOST}:${iCfgFile}" \
      "Pushing p4failover.${i}.cfg to new master server ${P4F_BACKUP_HOST}."

   if [[ $CMDEXITCODE != 0 ]]; then
      errmsg "Failed to push p4failover.${i}.cfg to ${P4F_BACKUP_HOST}."
      allOK=0
   fi

   [[ $allOK -eq 0 ]] && return 1
   return 0
}
#------------------------------------------------------------------------------
# Execute failover on a single target host by calling the p4failover-ha.sh
# script on that host.
#
# Features:
# - Insulates against foreign shell (e.g. /bin/tcsh rather than /bin/bash).
# - Captures shell stdout & stderr.
# - Returns exit status of remote command.
#
# Note that P4F_MASTER_HOST refers to the host that was the master at the
# start of the failover process, which is about to become the backup host.
# Likewise, P4F_BACKUP_HOST refers to the backup host prior to failover
# processing, which is the machine about to become the new master server.
#------------------------------------------------------------------------------
# Function: execute_ha_host_failover
#
# Input:   $1 - instance number
# Globals: CBIN, Mode, PassThruArgs, NO_OP, USER, P4F_MASTER_HOST,
#          P4F_BACKUP_HOST, plus CLUSTERING_AND_SAN, CHECKPOINTS, P4HOME and
#          LOGDIR, which are expected from the SDP files sourced below.
# Returns: 0 if every step succeeded, else 1.  A failed update of the
#          failover cfg file also counts as a failure.
function execute_ha_host_failover
{
   declare i=$1
   declare rCmd=""
   declare rLog=""
   declare -i exitCode=0

   msg "Loading SDP environment for instance ${i}."
   # 'set -u' is lifted while the SDP files are sourced.
   set +u
   source $CBIN/p4_vars ${i}
   source $CBIN/backup_functions.sh
   check_vars
   set_vars
   source $CBIN/state_engine_functions.sh
   set -u

   if [[ "${CLUSTERING_AND_SAN}" == false ]]; then
      rCmd="rsync -avz ${P4F_MASTER_HOST}:${CHECKPOINTS} ${P4HOME}/."
      runRemoteCmd "${P4F_BACKUP_HOST}" "$rCmd" \
         "SIM: Copying checkpoints to simulate adjusting SAN mounts for HA Failover."
      if [[ $RCMDEXITCODE -ne 0 ]]; then
         errmsg "Failed to transfer checkpoints."
         exitCode=1
      fi
   fi

   # Attempt to disable crontab on the current master server, the one we're failing
   # over from, even in an unscheduled failover, to try to avoid a split-brain
   # scenario.
   runCmd "ssh ${P4F_MASTER_HOST} crontab -r" \
      "Safety: Disabling crontab for ${USER}@${P4F_MASTER_HOST}."

   # When checking status for the 'crontab -r' command, silently ignore
   # 'no crontab for user' messages, but record the error on others.
   if [[ $CMDEXITCODE -ne 0 ]]; then
      if [[ "$CMDOUTPUT" != *"no crontab for"* ]]; then
         errmsg "Failed to disable crontab for $USER on ${P4F_MASTER_HOST}!"
         exitCode=1
      fi
   fi

   runCmd "ssh ${P4F_BACKUP_HOST} crontab -r" \
      "Safety: Disabling crontab for ${USER}@${P4F_BACKUP_HOST}."

   if [[ $CMDEXITCODE -ne 0 ]]; then
      if [[ "$CMDOUTPUT" != *"no crontab for"* ]]; then
         errmsg "Failed to disable crontab for $USER on ${P4F_BACKUP_HOST}!"
         exitCode=1
      fi
   fi

   # In Scheduled HA failover, we assume the master server is up.  We go
   # there, shut it down nicely, and adjust the p4.master and p4failover.cfg
   # files to indicate that it's now the backup machine.
   if [[ $Mode == Scheduled ]]; then
      rLog="${LOGDIR}/p4failover.HA.BACKUP.${P4F_MASTER_HOST}.$(date +'%Y%m%d-%H%M%S').log"
      rCmd="$CBIN/p4failover-ha.sh -role BACKUP -i ${i} -M ${P4F_MASTER_HOST} -B ${P4F_BACKUP_HOST} -si -L ${rLog} ${PassThruArgs} -v5"
      runRemoteCmd ${P4F_MASTER_HOST} "$rCmd" \
         "Requesting host ${P4F_MASTER_HOST} to be the BACKUP server." 0 0

      # Fetch the remote log and show it in this script's output.
      runCmd "scp -pq ${P4F_MASTER_HOST}:${rLog} ${rLog}"
      [[ $NO_OP -eq 0 ]] && cat ${rLog}

      if [[ $RCMDEXITCODE -eq 0 ]]; then
         msg "Host ${P4F_MASTER_HOST} is now the BACKUP."
      else
         errmsg "Request to make ${P4F_MASTER_HOST} become the BACKUP returned non-zero exit status (${RCMDEXITCODE})!"
         exitCode=1
      fi
   fi

   rLog="${LOGDIR}/p4failover.HA.MASTER.${P4F_BACKUP_HOST}.$(date +'%Y%m%d-%H%M%S').log"
   rCmd="$CBIN/p4failover-ha.sh -role MASTER -i ${i} -M ${P4F_MASTER_HOST} -B ${P4F_BACKUP_HOST} -si -L ${rLog} ${PassThruArgs} -v5"
   runRemoteCmd ${P4F_BACKUP_HOST} "$rCmd" \
      "Requesting host ${P4F_BACKUP_HOST} to be the MASTER server." 0 0

   runCmd "scp -pq ${P4F_BACKUP_HOST}:${rLog} ${rLog}"
   [[ $NO_OP -eq 0 ]] && cat ${rLog}

   if [[ $RCMDEXITCODE -eq 0 ]]; then
      msg "Host ${P4F_BACKUP_HOST} is now the MASTER."
   else
      errmsg "Request to make ${P4F_BACKUP_HOST} become the MASTER returned non-zero exit status (${RCMDEXITCODE})!"
      exitCode=1
   fi

   # Even if failover was not successful, it is best to update the config
   # file.  A failed update must not go unnoticed, so it is reported and
   # turns the overall result into a failure.
   if ! update_failover_cfg "${i}"; then
      errmsg "Update of instance-specific failover cfg file failed for Instance ${i}.  FIX THIS MANUALLY TO AVOID A POSSIBLE SPLIT BRAIN SCENARIO."
      exitCode=1
   fi

   if [[ $exitCode -eq 0 ]]; then
      return 0
   else
      return 1
   fi
}
#------------------------------------------------------------------------------
# Execute failover on a single target host by calling the p4failover-local.sh
# script on that host.
#
# Features:
# - Insulates against foreign shell (e.g. /bin/tcsh rather than /bin/bash).
# - Captures shell stdout & stderr.
# - Display exit code rather than returning it, for easy capture when running
# multiple processes concurrently.
#
# Note that P4F_MASTER_HOST refers to the host that was the master at the
# start of the failover process, which is about to become the backup host.
# Likewise, P4F_BACKUP_HOST refers to the backup host prior to failover
# processing, which is the machine about to become the new master server.
#------------------------------------------------------------------------------
# Function: manage_local_failover
#
# Input:   $1 - instance number
# Globals: CBIN, PassThruArgs, NO_OP, USER, and LOGDIR (expected from the SDP
#          files sourced below); P4F_MASTER_HOST and P4F_BACKUP_HOST are
#          loaded from the instance config here.
# Output:  a 'manage_local_failover: PEXITCODE: <0|1>' line on stdout.
# Returns: 1 if the instance config could not be loaded; otherwise the
#          status of the final echo.
function manage_local_failover
{
   declare i=$1
   declare rCmd=""
   declare rLog=""

   # The P4F_* host settings come from the instance config.  Load it here,
   # as manage_ha_failover does; otherwise they are unbound under 'set -u'
   # and this function would die before printing a PEXITCODE line.
   if ! load_instance_cfg "${i}"; then
      errmsg "Skipping failover of instance ${i} because load of instance-specific failover cfg file failed."
      echo "manage_local_failover: PEXITCODE: 1"
      return 1
   fi

   msg "Loading SDP environment for instance ${i}."
   # 'set -u' is lifted while the SDP files are sourced.
   set +u
   source $CBIN/p4_vars ${i}
   source $CBIN/backup_functions.sh
   check_vars
   set_vars
   source $CBIN/state_engine_functions.sh
   set -u

   runCmd "ssh ${P4F_MASTER_HOST} crontab -r" \
      "Safety: Disabling crontab for ${USER}@${P4F_MASTER_HOST}."

   rLog="${LOGDIR}/p4failover.LOCAL.${P4F_MASTER_HOST}.$(date +'%Y%m%d-%H%M%S').log"
   rCmd="$CBIN/p4failover-local.sh -i ${i} -M ${P4F_MASTER_HOST} -B ${P4F_BACKUP_HOST} -si -L ${rLog} ${PassThruArgs} -v5"
   runRemoteCmd "${P4F_MASTER_HOST}" "$rCmd" \
      "Requesting local failover of instance ${i} on ${P4F_MASTER_HOST}." 0 0

   # Fetch the remote log and show it in this function's output.
   runCmd "scp -pq ${P4F_MASTER_HOST}:${rLog} ${rLog}"
   [[ $NO_OP -eq 0 ]] && cat ${rLog}

   if [[ $RCMDEXITCODE -eq 0 ]]; then
      msg "Local Failover for instance ${i} successful on ${P4F_MASTER_HOST}."
      echo "manage_local_failover: PEXITCODE: 0"
   else
      errmsg "Local Failover failed for instance ${i} on ${P4F_MASTER_HOST}."
      echo "manage_local_failover: PEXITCODE: 1"
   fi
}
#------------------------------------------------------------------------------
# Function: manage_concurrent_processes
#
# Wait for processes to stop logging, as indicated by their associated log
# file containing "PEXITCODE:" followed by the exit code.
#
# The array ProcessLogs contains the log file associated with each pid.
# The array ProcessState indicates the state, 1=running, 0=done.
# Array indexes associate a process with its pid and state.
# Input:   $1 - title used when the per-process logs are displayed.
# Globals: ProcessCount, ProcessLogs, ProcessState, ProcessForInstance and
#          VERBOSITY are read; ProcessState and ProcessExitCode are updated;
#          OverallReturnStatus is set to 1 if any process reported non-zero.
function manage_concurrent_processes
{
   declare title=$1
   declare -i i=0
   declare ec=""

   msg "Waiting for $ProcessCount processes to finish."

   if [[ $VERBOSITY -gt 3 ]]; then
      for ((i=0; i < $ProcessCount; i+=1)); do
         vmsg "Log[$i]=${ProcessLogs[$i]}."
      done
   fi

   # Poll every two seconds until no process is marked as running.
   while true; do
      sleep 2
      for ((i=0; i < $ProcessCount; i+=1)); do
         [[ ${ProcessState[$i]} -eq 0 ]] && continue
         vvmsg "Checking for exit code in log [${ProcessLogs[$i]}]."
         ec=$(grep -a "PEXITCODE:" "${ProcessLogs[$i]}")
         if [[ $? -eq 0 ]]; then
            ProcessState[$i]=0
            # Keep only the first word after the last 'PEXITCODE: '.
            ec=${ec##*PEXITCODE: }
            ec=${ec%% *}
            ProcessExitCode[$i]=$ec
            [[ "${ec}" != 0 ]] && OverallReturnStatus=1
         fi
      done

      # Exit the loop only if there are no running processes.
      vvmsg "Process state check: ${ProcessState[@]}"
      [[ "${ProcessState[@]}" == *"1"* ]] || break
   done

   msg "Concurrent processes have completed."

   # Display log files to incorporate them into the main script log.
   for ((i=0; i < $ProcessCount; i+=1)); do
      msg "$title log for instance ${ProcessForInstance[$i]}:\n"
      cat "${ProcessLogs[$i]}"
      # Report the code recorded for THIS process; 'ec' merely holds
      # whatever the polling loop extracted last.
      msg "Exit Code from ${ProcessLogs[$i]} is ${ProcessExitCode[$i]}."
   done
}
#------------------------------------------------------------------------------
# Function: usage (required function)
#
# Input:
# $1 - style, either -h (for short form) or -man (for man-page like format).
#------------------------------------------------------------------------------
# Prints the usage synopsis and, for '-man', the full manual text, then
# exits with status 1 in every case.
# Globals read: THISSCRIPT, CentralCfgFile, P4U_LOG.
# The help text lines start at column 0 because their leading whitespace is
# part of what gets printed.
function usage
{
   declare style=${1:--h}

   echo "USAGE:
$THISSCRIPT {-s|-u} {-ha|-dr|-local} -i <#> [-L <log>] [-si] [-v<n>] [-D]
or
$THISSCRIPT -check [-fix|-FIX] [-i <#>] [-L <log>] [-v<n>] [-D]
or
$THISSCRIPT [-h|-man]
"
   if [[ $style == -man ]]; then
      echo -e "
OPTIONS: Check Mode
-check Specifies 'check' mode. In this mode, no failover will be
executed. Instead, ssh checks are done to ensure that
machines that could be involved in failover are accessible.
Check mode also reports on whether the 'p4.master' files
on the machines are in the expected state, i.e. 'true' on
the master server and 'false' on the backup.
-fix If '-fix' is specified and the 'p4.master' values are not
as expected based on the p4failover.cfg file, then the
p4failover.cfg file is updated to match the p4.master files.
If Perforce server instances are up and running on the
\"wrong\" machine according to the instance's p4failover.cfg
file, the '-fix' option will refuse to operate. To force it,
use '-FIX' instead.
-i Specify a comma-delimited list of Perforce server instances to
check. Or, specify the special value 'ALL' (or 'all') to
check all instances specified by P4F_ALL_INSTANCES in
${CentralCfgFile}. The '-i' argument is optional in Check
Mode; omitting it is equivalent to specifying '-i ALL'.
Regardless of the instance specified, the main log for
p4failover.sh appears in $(dirname ${P4U_LOG}).
Supplemental logs may appear under specific instance log
directories.
OPTIONS: Failover Mode
-s Specifies a Scheduled Failover.
-u Specifies an Unscheduled Failover.
-ha Specifies an HA failover from the primary server to the HA backup
server.
-dr Specifies a DR failover from the primary server to the DR backup
server. Implies '-u'.
THE -dr DISASTER RECOVER IS NOT YET IMPLEMENTED!
-local Specifies a failover on the primary server, but using Offline
databases. Implies '-u'.
-i Specify a comma-delimited list of Perforce server instances to
failover. Or, specify the special value 'ALL' (or 'all') to
failover all instances specified by P4F_ALL_INSTANCES in
${CentralCfgFile}. Only instances mastered on the
current machine (with a \"true\" value in /p4/x/bin/p4.master)
are failed over. The '-i' argument is required in Failover
Mode.
OPTIONS: General (applicable to Check and Failover modes)
-v<n> Set verbosity 1-5 (-v1 = quiet, -v5 = highest).
-L <log>
Specify the path to a log file, or the special value 'off' to disable
logging. By default, all output (stdout and stderr) goes to:
${P4U_LOG}.
NOTE: This script is self-logging. That is, output displayed on the
screen is simultaneously captured in the log file. Do not run this
script with redirection operators like '> log' or '2>&1',
and do not use it with 'tee.'
-si Operate silently. All output (stdout and stderr) is redirected to
the log only; no output appears on the terminal. This cannot be
used with '-L off'.
-D Set extreme debugging verbosity.
HELP OPTIONS:
-h Display short help message
-man Display man-style help message
DESCRIPTION:
Failover is about executing a transition of the Perforce service,
to minimize downtime of Perforce. There are different types and
modes of failover which apply in different failure scenarios.
The goals are generally to minimize both downtime and data loss in
a variety of failure scenarios.
Failover Modes:
There are two failover modes: Scheduled and Unscheduled.
In Scheduled failover, all server machines are assumed to be online
and operating normally, and all Perforce databases healthy. Perforce
is shut down and the service is transitioned smoothly.
An Unscheduled failover is the result of something going wrong, such as
power failures that might corrupt databases, a hardware failure on the
primary server machine, or a disaster scenario that affects an
entire site.
Failover Types:
There are three types of Failover: Local, HA, and DR.
A Local failover keeps the Perforce service on the same machine,
and simply makes use of offline databases. For example, say there is
a sudden power failure that shuts down both the Master and Backup
server machines. Power is restored 10 minutes later. There is no
reason to suspect that primary server hardware is damaged, but there
is risk the live databases might be corrupt due to the sudden power
loss. A Local Failover can be executed in this case.
In Local Failover, live databases are moved aside, and the offline
databases are moved into the live directory and refreshed with the
latest journal file prior to starting p4d.
In a High Availability (HA) Failover, Perforce is restarted on the
Backup server machine. This can be Scheduled (e.g. for planned
maintenance, like adding RAM) or Unscheduled (e.g. CPU failure).
So long as all hardware is in good working order, it is possible to
execute HA Failover bi-directionally between the Primary server
and the Backup server, which reverses roles
after an HA failover.
In a Disaster Recovery (DR) Failover, Perforce service is transitioned
to a Disaster Recovery site. This can be Scheduled or Unscheduled.
A DR failover (NOT IMPLEMENTED) is a one-way transition. Returning
service after a DR failover is a manual process, outside the scope of
this $THISSCRIPT command.
CONFIG FILES:
Instance specific p4failover.n.cfg file define the roles of various hosts
in the deployment architecture for that instance. A global p4failover.cfg
in /p4/common/bin/ file defines configuration common to all instances.
This script uses a central configuration, $CentralCfgFile, and a set of
instance-specific config files in /p4/x/bin/p4failover.cfg.
The central config files defines global settings. One setting is the
definition of which Perforce server instances are affected
when '-i ALL' is specified.
UNDER THE HOOD:
This script operates from the broker machine, and uses ssh commands to
communicate with other Perforce-related servers.
USAGE EXAMPLES:
Basic system sanity check.
$THISSCRIPT -check
Planned HA Failover to allow hardware maintenance on the primary server:
$THISSCRIPT -s -ha -i all
After a power out, if the primary server hardware is deemed OK,
a Local Failover should be done due to risk of potential data
corruption:
$THISSCRIPT -u -local -i all
After a CPU failure on the Primary server machine, an Unscheduled HA
Failover can be initiated:
$THISSCRIPT -u -ha -i all
After decision to execute a DR failover (assuming the broker
machine survives the calamity):
$THISSCRIPT -u -dr -i all **** NOT IMPLEMENTED ****
SYSTEM DEPENDENCIES:
Following are dependencies of this system:
* The Perforce Server Deployment Package (SDP) must be configured for HA
installed on key machines.
* A system to keep the broker.conf files updated on the primary and backup
broker machines. Note that the backup broker is usually powered down.
* SSH keys for the 'perforce' user must exist between any machines involved in
the failover. This includes:
** Master Perforce Server
** Backup Perforce Server
** Perforce Broker Server
** Perforce DR Server
"
   fi

   exit 1
}
#------------------------------------------------------------------------------
# Function: terminate
# Exit handler, installed via 'trap terminate EXIT SIGINT SIGTERM'.
# Cleans up, terminates leftover ssh processes started by this script,
# stops logging and exits with OverallReturnStatus.
function terminate
{
   # Disable signal trapping.
   trap - EXIT SIGINT SIGTERM

   # Don't litter.
   cleanTrash

   declare mySSHPids=""

   # Do a kill if needed to make sure the 'ssh' processes started by this
   # script don't hang after completion.  Ignore the return status of the kill.
   # The PID column of 'ps -f' is padded with a varying number of spaces, so
   # take it by field with awk ('cut -d " "' returns an empty field there).
   # The PIDs are joined with single spaces to form one 'kill' command line.
   mySSHPids=$(ps -u $USER -f | grep -v grep | grep "/tmp/$USER.runRemoteCmd.$$" | awk '{printf "%s ", $2}')
   mySSHPids=${mySSHPids% }
   [[ -n "$mySSHPids" ]] && runCmd "kill -TERM ${mySSHPids}"

   vvmsg "$THISSCRIPT: EXITCODE: $OverallReturnStatus"

   # Stop logging.
   [[ "${P4U_LOG}" == off ]] || stoplog

   # With the trap removed, exit.
   exit $OverallReturnStatus
}
#==============================================================================
# Command Line Processing
# shiftArgs counts the extra words an option consumed (its value), so that
# they can be shifted away after the option itself.
declare -i shiftArgs=0

# Unset variables are tolerated while the arguments are examined.
set +u
while [[ $# -gt 0 ]]; do
   case $1 in
      (-h) usage -h;;
      (-man) usage -man;;
      (-check) Mode="HostCheck";;
      (-fix) Fix="true";;
      (-FIX) ForceFix="true"; Fix="true";;
      (-c) CfgFile=$2; shiftArgs=1;;
      (-s) export Mode="Scheduled";;
      (-u) export Mode="Unscheduled";;
      (-ha) export Type="HA";;
      (-dr) export Type="DR"; export Mode="Unscheduled";;
      (-local) export Type="LOCAL"; export Mode="Unscheduled";;
      # -v1 through -v5: the digit is the verbosity; the flag is forwarded.
      (-v[1-5]) export VERBOSITY=${1#-v}; PassThruArgs+=" $1";;
      (-i) InstanceList=$2; shiftArgs=1;;
      (-L) export P4U_LOG=$2; shiftArgs=1;;
      (-si) SilentMode=1;;
      (-n) export NO_OP=1; PassThruArgs+=" $1";;
      (-D) set -x; PassThruArgs+=" $1";; # Debug; use 'set -x' mode.
      (*) usageError "Unknown arg ($1).";;
   esac

   # Drop the option itself, then any value word(s) it consumed.
   shift
   while (( shiftArgs > 0 )); do
      if [[ $# -eq 0 ]]; then
         usageError "Bad usage."
      fi
      shiftArgs=shiftArgs-1
      shift
   done
done

if [[ $SilentMode -eq 1 && $P4U_LOG == off ]]; then
   usageError "Cannot use '-si' with '-L off'."
fi
set -u
#==============================================================================
# Main Program
# From here on every exit path goes through terminate.
trap terminate EXIT SIGINT SIGTERM
declare -i OverallReturnStatus=0

if [[ "${P4U_LOG}" != off ]]; then
   touch ${P4U_LOG} || bail "Couldn't touch log file [${P4U_LOG}]."

   # Redirect stdout and stderr to a log file; unless running silently,
   # the output is also passed on through tee.
   if [[ $SilentMode -ne 0 ]]; then
      exec >${P4U_LOG}
   else
      exec > >(tee ${P4U_LOG})
   fi
   exec 2>&1

   initlog
fi

bail "This requires review and customization for operation in the Epic Games environment."

load_central_cfg
#------------------------------------------------------------------------------
# Command line integrity/safety checks.
case ${Type} in
   ('') ;;  # No failover type given; nothing to verify here.
   (HA)
      if [[ "$Mode" != Scheduled && "$Mode" != Unscheduled ]]; then
         bail "Usage error: With -ha, you must specify '-s' or '-u'."
      fi
      if [[ -z "$InstanceList" ]]; then
         bail "Usage error: With -ha|-local|-dr, you must specify '-i <#>'."
      fi
   ;;
   (LOCAL|DR)
      if [[ -z "$InstanceList" ]]; then
         bail "Usage error: With -ha|-local|-dr, you must specify '-i <#>'."
      fi
   ;;
   (*) bail "Aborting: Unhandled Failover Type [$Type]." ;;
esac

# Without an explicit mode, the script performs a host check.
case ${Mode} in
   ('') Mode="HostCheck" ;;
   (HostCheck|Scheduled|Unscheduled) ;;
   (*) bail "Aborting: Unhandled Failover Mode [$Mode]." ;;
esac

# 'ALL' and 'all' both stand for the instances named in the central config.
case ${InstanceList} in
   (ALL|all) InstanceList=${P4F_ALL_INSTANCES} ;;
esac

# Only the single-digit instances 1 through 9 are accepted.
for Instance in ${InstanceList//,/ }; do
   case $Instance in
      ([1-9]) ;;
      (*) OverallReturnStatus=1; errmsg "Invalid instance [$Instance] specified." ;;
   esac
done

if [[ $OverallReturnStatus -ne 0 ]]; then
   bail "Aborting: Invalid instance specified."
fi

# Don't do anything beyond command line checking if not on the broker machine.
if [[ "$HOSTNAME" != "p4broxy"* ]]; then
   bail "Error: Run this only from the broker machine."
fi
#------------------------------------------------------------------------------
# This is a sanity check that the failover system is installed ok.
if [[ $Mode == HostCheck ]]; then
   check_hosts || OverallReturnStatus=1

# Types: LOCAL, HA, DR.
# LOCAL: Failover to offline databases local to the master server machine.
# HA: High Availability. Failover to the backup machine, using the
# offline databases maintained by the Server Deployment Package on that
# machine.
# DR: Disaster Recovery. Failover to a remote data center.
# Modes: Scheduled, Unscheduled.
# For Unscheduled, the master Perforce server machine may not be available.
elif [[ $Type == LOCAL || $Type == HA ]]; then
   msg "${H}\nExecuting $Mode $Type Failover for instances: ${InstanceList}.\n"

   # Start concurrent processes to failover all specified instances together.
   ProcessCount=0
   for Instance in ${InstanceList//,/ }; do
      ProcessForInstance[$ProcessCount]=$Instance
      Log="/p4/${Instance}/logs/p4failover.tmp.${Mode}_${Type}.log"

      # Begin a fresh log with a start line; the '>' truncation also removes
      # any PEXITCODE line left behind by an earlier run.
      echo "Starting $Mode $Type failover of Instance ${Instance} at $(date)" > ${Log}

      # The worker appends ('>>') so that the start line written above is
      # kept rather than being truncated away immediately.
      if [[ $Type == LOCAL ]]; then
         ( manage_local_failover ${Instance} >> ${Log} 2>&1 < /dev/null & )
      else
         ( manage_ha_failover ${Instance} >> ${Log} 2>&1 < /dev/null & )
      fi

      ProcessLogs[$ProcessCount]=${Log}
      ProcessState[$ProcessCount]=1
      ProcessCount+=1;
   done

   manage_concurrent_processes "$Type Failover"

elif [[ $Type == DR ]]; then
   warnmsg "${H}\nThe DR Failover mode has not yet been implemented.${H}\n"
   bail "DR Failover not attempted."
fi

if [[ $OverallReturnStatus -eq 0 ]]; then
   msg "${H}\nAll processing completed successfully."
else
   msg "${H}\nProcessing completed, but with errors.  Scan above output carefully."
fi

# Illustrate using $SECONDS to display runtime of a script.
msg "That took about $(($SECONDS/3600)) hours $(($SECONDS%3600/60)) minutes $(($SECONDS%60)) seconds."

# See the terminate() function, which is really where this script exits.
exit $OverallReturnStatus