#!/bin/bash
#------------------------------------------------------------------------------
# Treat any reference to an unset variable as an error.
set -u

#------------------------------------------------------------------------------
# Global declarations, grouped by purpose.

# Script identity and the command line exactly as the user invoked it.
declare Version="1.1.13"
declare ThisScriptRaw="$0"
declare ThisScript="${ThisScriptRaw##*/}"
declare ThisService="${ThisScript%.sh}"
declare Args="$*"
declare CmdLine="$ThisScriptRaw $Args"
declare LaunchCmdLine=""

# Delays, in seconds (integer-typed).
declare -i DelayPolling=2 DelayP4DOffline=300 DelayAfterFix=120

# Integer flags and counters.
declare -i Debug=0 NoOp=0 SilentMode=0
declare -i ErrorCount=0 PullFixesTried=0

# Operating mode, plus user and host identity (filled in later).
declare OpMode="" RunUser="" ThisUser="" ThisHost=""

# Working data for the archive pull fix pass.
declare PullErrorsFile="" PullFilesFile="" VerifyCmd="" File=""
declare TransferStatusData="" ActiveTransfers="" TotalTransfers=""

# Process tracking. The pid file lives in $LOGS if that is set, else /tmp.
declare Pid=""
declare PidFile="${LOGS:-/tmp}/${ThisService}.pid"

# Logging.
declare Log="" UserLog="" OldLogTimestamp="" OldLog=""

# SDP environment. 'Unset' is a sentinel meaning no instance is known yet.
declare SDPInstance="${SDP_INSTANCE:-Unset}"
declare SDPEnvFile="/p4/common/bin/p4_vars"
declare SDPInstanceFile=""

# Horizontal rules for log output; the leading \n is expanded by 'echo -e'.
declare H1="\\n=============================================================================="
declare H2="\\n------------------------------------------------------------------------------"
#==============================================================================
# Local Functions
#------------------------------------------------------------------------------
# msg () Message to stdout.
# dbg () Debug message using msg; displayed only if Debug is non-zero.
# errmsg () Error message using msg, increment ErrorCount.
# bail () Error message using errmsg, then exit.
# Print all arguments, joined into a single string, on stdout. Backslash
# escapes such as \n in the text are interpreted.
function msg ()
{
   local text="$*"
   echo -e "$text"
}
# Print a message prefixed with 'DEBUG: ' via msg, but only when the global
# Debug flag is non-zero. Silent otherwise.
function dbg ()
{
   if [[ "$Debug" -ne 0 ]]; then
      msg "DEBUG: $*"
   fi
}
# Print an error message via msg, framed by blank lines, and add one to the
# global ErrorCount. The text defaults to 'Unknown Error' if $1 is missing.
function errmsg ()
{
   local text="${1:-Unknown Error}"
   msg "\\nError: ${text}\\n"
   ErrorCount+=1
}
# Report an error via errmsg, then exit the script.
#   $1 - error text (default: 'Unknown Error')
#   $2 - exit code (default: 1)
function bail ()
{
   local exitCode="${2:-1}"
   errmsg "${1:-Unknown Error}"
   exit "$exitCode"
}
#------------------------------------------------------------------------------
# Function: terminate()
#
# Shutdown handler. After the service is launched, the EXIT/SIGINT/SIGTERM
# signals are trapped, and receipt of any of them results in this function
# being called.
#
# Reports the accumulated ErrorCount, reminds the operator where the log is
# (when a log file is in use), removes the PidFile, and exits with ErrorCount
# as the exit status.
#------------------------------------------------------------------------------
function terminate
{
   # Disable signal trapping first, so the 'exit' below does not re-enter
   # this handler through the EXIT trap.
   trap - EXIT SIGINT SIGTERM

   msg "$ThisScript: EXIT_CODE: $ErrorCount"

   # Show the log location only when there is a log to point at: skip it if
   # logging was disabled with '-L off' or no log was ever selected.
   [[ -z "${Log}" || "${Log}" == "off" ]] || msg "\\nLog is: $Log"

   # Blast the PidFile as we exit; 'stop' mode waits for it to disappear.
   rm -f "$PidFile"

   # With the trap removed, exit.
   exit "$ErrorCount"
}
#------------------------------------------------------------------------------
# Function: usage (required function)
#
# Displays the usage message and exits. This function never returns; it
# always exits with status 1, including when help was explicitly requested.
#
# Input:
# $1 - style, either -h (for short form) or -man (for man-page like format).
# The default is -h.
#
# $2 - error message (optional). Specify this if usage() is called due to
# user error, in which case the given message is displayed first, followed by
# the standard usage message (short or long depending on $1). If displaying an
# error, usually $1 should be -h so that the longer usage message doesn't
# obscure the error message.
#
# Sample Usage:
# usage
# usage -h
# usage -man
# usage -h "Incorrect command line usage."
#------------------------------------------------------------------------------
function usage {
   declare style=${1:--h}
   declare errorMessage=${2:-Unset}

   # 'Unset' is the sentinel for "no error message was given".
   if [[ "$errorMessage" != Unset ]]; then
      msg "\\n\\nUsage Error:\\n\\n$errorMessage\\n\\n"
   fi

   # The short synopsis is shown for both styles.
   echo "USAGE for $ThisScript v$Version:
$ThisScript start [-dp #] [-do #] [-df #] [-i <SDPInstance>] [-L <log>] [-si] [-n] [-d|-D]
or
$ThisScript status [-d|-D]
or
$ThisScript stop [-d|-D]
or
$ThisScript [-h|-man|-V]
"

   # The man style adds the description and per-option detail. The option
   # names here must match those accepted by the command line parser; the
   # delay-after-fix option is '-df'.
   if [[ $style == -man ]]; then
      echo -e "
DESCRIPTION:
This script is an interim workaround for a bug in p4d, identified as:
job107572 - 'p4 sync' on commit server of in-flight background transfer
fails with 'open for read' error.
This script runs as a service.
This activates when 'p4 pull -ls' indicates no active archive transfers
are in progress, and then only if there are persistent failures with
archive transfers.
After any actual attempt at fixing anything, it will sleep before waking
up again. Otherwise, between fixes, it polls frequently.
If p4d is unreachable, it silently ignores it and patiently waits, so
this service does not need to be stopped during maintenance for p4d.
OPTIONS:
-dp #
Specify the polling delay in seconds. The default is $DelayPolling.
-do #
Specify the delay in seconds to wait after p4d is found to be offline.
The default is $DelayP4DOffline.
-df #
Specify the delay in seconds after attempting to fix archive pull issues.
The default is $DelayAfterFix
-i <N>
Specify the SDP instance name. This is required unless the SDP_INSTANCE
environment variable is set, e.g. if the SDP environment has already been
loaded.
-L <log>
Specify the path to a log file, or the special value 'off' to disable
logging. By default, all output (stdout and stderr) goes to
${LOGS:-/tmp}/${ThisService}.log
NOTE: This script is self-logging. That is, output displayed on the screen
is simultaneously captured in the log file. Do not run this script with
redirection operators like '> log' or '2>&1', and do not use 'tee.'
-si Operate silently. All output (stdout and stderr) is redirected to the log
only; no output appears on the terminal. This cannot be used with '-L off'.
This is useful when running from cron, as it prevents automatic email from
being sent by cron directly, as cron does when a script called from cron
generates any output.
-n No-Op. Prints data-affecting commands instead of running them.
-d Enable debugging verbosity.
-D Enable extreme debugging verbosity, using bash 'set -x' mode.
HELP OPTIONS:
-h Display short help message
-man Display man-style help message
"
   fi

   exit 1
}
#==============================================================================
# Command Line Processing
# Number of extra positional parameters to shift after the current one, for
# options that take a value.
declare -i shiftArgs=0

# Function: require_numeric_value <option> <args-remaining> <value>
#
# Validates the value of a numeric option BEFORE it is stored. The delay
# variables are declared with the integer attribute, so assigning a
# non-numeric string to them is evaluated arithmetically rather than
# preserved, which would make any later "is it numeric?" test pass on a
# garbage value.
#
# $1 - the option being processed, e.g. -dp (used in the error message).
# $2 - count of positional parameters remaining, including the option itself.
# $3 - the candidate value.
#
# Exits via usage() if the value is missing, or via bail() if it is not a
# string of decimal digits.
function require_numeric_value ()
{
   [[ "$2" -ge 2 ]] || usage -h "Incorrect number of arguments."
   [[ "$3" =~ ^[0-9]+$ ]] || bail "The value specified with '$1' must be numeric, not $3."
}

# Unset-variable checking is relaxed while walking the positional parameters.
set +u
while [[ $# -gt 0 ]]; do
   case $1 in
      (-h|-V) usage -h;;
      (-man) usage -man;;
      (-i) SDPInstance="${2:-}"; shiftArgs=1;;
      # The '10#' prefix forces base 10, so a value such as 08 is not
      # rejected as invalid octal by the integer assignment.
      (-dp) require_numeric_value "$1" "$#" "${2:-}"; DelayPolling="10#$2"; shiftArgs=1;;
      (-do) require_numeric_value "$1" "$#" "${2:-}"; DelayP4DOffline="10#$2"; shiftArgs=1;;
      (-df) require_numeric_value "$1" "$#" "${2:-}"; DelayAfterFix="10#$2"; shiftArgs=1;;
      (-L) UserLog="${2:-}"; shiftArgs=1;;
      (-si) SilentMode=1;; # Silent mode; send all output to the log only.
      (-n) NoOp=1;;
      (-d) Debug=1;; # Enable debug mode.
      (-D) Debug=1; set -x;; # Extreme debug; use bash 'set -x' mode.
      # If more than one mode word is given, the last one wins.
      (start|stop|status|_launch) OpMode="$1";;
      (-*) usage -h "Unknown option ($1).";;
      (*) usage -h "Unknown arg ($1)." ;;
   esac

   # Shift (modify $#) the appropriate number of times: once for the
   # parameter just handled, plus once more for each value it consumed.
   shift
   while [[ $shiftArgs -gt 0 ]]; do
      [[ $# -eq 0 ]] && usage -h "Incorrect number of arguments."
      shiftArgs=$((shiftArgs - 1))
      shift
   done
done
set -u
#==============================================================================
# Command Line Verification
# A log path given with '-L' overrides the (initially empty) default.
if [[ -n "$UserLog" ]]; then
   Log="$UserLog"
fi

# Silent mode sends everything to the log, so it needs a log to exist.
if [[ "$Log" == "off" && "$SilentMode" -eq 1 ]]; then
   bail "The '-si' option cannot be used with '-L off'."
fi

if [[ -n "$OpMode" ]]; then
   # The start, stop, and status OpMode values are documented.
   # The back-end '_launch' mode is undocumented but allowed.
   [[ "$OpMode" =~ ^(start|stop|status|_launch)$ ]] || \
      bail "Invalid mode specified [$OpMode]; it should be one of : start, stop, status."
else
   usage -h "No mode specified; specify one of: start, stop, status."
fi

# 'Unset' is the sentinel left in place when neither '-i' nor the
# SDP_INSTANCE environment variable supplied an instance.
if [[ "$SDPInstance" == Unset ]]; then
   bail "The SDP environment is not defined. Add the '-i <SDPInstance>' option to specify."
fi

SDPInstanceFile="/p4/common/config/p4_${SDPInstance}.vars"
[[ -e "$SDPInstanceFile" ]] || \
   bail "Missing SDP instance file [$SDPInstanceFile].\\nIs the instance value [$SDPInstance] correct?"

# Sanity check the delay values. The option names in these messages must
# match the options accepted on the command line (-dp, -do, -df).
[[ "$DelayPolling" =~ ^[0-9]+$ ]] || bail "The value specified with '-dp' must be numeric, not $DelayPolling."
[[ "$DelayP4DOffline" =~ ^[0-9]+$ ]] || bail "The value specified with '-do' must be numeric, not $DelayP4DOffline."
[[ "$DelayAfterFix" =~ ^[0-9]+$ ]] || bail "The value specified with '-df' must be numeric, not $DelayAfterFix."
#==============================================================================
# Main Program
# Load the SDP environment for the selected instance, or give up if the
# environment file is not there.
[[ -e "$SDPEnvFile" ]] || bail "No SDP Environment file [$SDPEnvFile] found. Aborting."
dbg "Loading SDP Environment with: source $SDPEnvFile $SDPInstance"
# shellcheck disable=SC1090
source "$SDPEnvFile" "$SDPInstance"

# Work out who and where we are. The short hostname is everything before the
# first dot. OSUSER is presumably provided by the SDP environment loaded
# above; a placeholder is used if it is not set.
ThisUser=$(whoami)
ThisHost=${HOSTNAME%%.*}
RunUser=${OSUSER:-UnsetOSUSER}

# Safety Preflight Checks
# Bail early if we're executed as the wrong user.
[[ "$ThisUser" == "$RunUser" ]] || bail "Run this as $RunUser, not $ThisUser."
dbg "Verified: Running as $RunUser."

# This script should only be run on edge servers. We make an assumption that
# the hostname of an edge server will contain the string 'edge'.
case "$ThisHost" in
   (*edge*)
      dbg "Verified: Running on a host that appears to be an edge server."
      ;;
   (*)
      bail "This host [$ThisHost] does not have 'edge' in the name. Run this only on an edge server."
      ;;
esac
# 'stop' mode: confirm that the pid recorded in the PidFile really belongs to
# this script, then send it a 'kill'. The live service handles the signal in
# its terminate() function, which removes the PidFile on the way out; that
# removal is what we wait for below.
if [[ "$OpMode" == "stop" ]]; then

   # No PidFile: look for an orphaned service process instead, report what
   # was found, and leave.
   if [[ ! -e "$PidFile" ]]; then
      # shellcheck disable=SC2009
      Pid=$(ps -fu "$RunUser" | grep "$ThisScript _launch" | grep -v grep)
      if [[ -n "$Pid" ]]; then
         errmsg "Even though the PidFile [$PidFile] does not exist, this process appears to be associated with this $ThisService service:\\n$Pid"
      else
         msg "The $ThisService service was not running. Stop not needed.\\n"
      fi
      exit "$ErrorCount"
   fi

   Pid=$(cat "$PidFile")

   # Stale PidFile: nothing is running with that pid.
   if ! ps -f --pid "$Pid" > /dev/null 2>&1; then
      errmsg "Pid $Pid in the PidFile [$PidFile] is not running. Removing the PidFile."
      rm -f "$PidFile" || bail "Could not remove file: $PidFile"
      exit 1
   fi

   # The pid is alive but belongs to some other command; do not kill it.
   if [[ "$(ps --pid "$Pid" -o command=)" != *"$ThisScript _launch"* ]]; then
      errmsg "Pid $Pid in the PidFile [$PidFile] is running, but it is NOT for $ThisScript! Removing the PidFile."
      rm -f "$PidFile" || bail "Could not remove file: $PidFile"
      exit 1
   fi

   # The pid in the PidFile is verified as ours, so it is safe to kill.
   msg "Killing Pid: $Pid"
   kill "$Pid"

   # The service can take a few minutes to shut down. In theory this loop
   # could go infinite if the kill signal isn't handled, but that's OK.
   # If it has a problem shutting down, better to know about it than not.
   echo -ne "Shutting down service $ThisService ..."
   while [[ -e "$PidFile" ]]; do
      echo -n .
      sleep 1
   done
   msg "\\nVerified: Service shutdown complete."

   exit "$ErrorCount"
fi
# In 'status' mode, display process status and exit. The exit status is the
# number of problems found, so it is zero only when the service is verified
# as running or is cleanly stopped.
if [[ "$OpMode" == "status" ]]; then
   if [[ -e "$PidFile" ]]; then
      Pid=$(cat "$PidFile")
      msg "The $ThisService service pid is: $Pid\\nProcess Status:"
      # Display 'ps -f' info for the pid. If a process with that pid is
      # running, double check that the pid is associated with this script.
      if ps -f --pid "$Pid"; then
         if [[ "$(ps --pid "$Pid" -o command=)" == *"$ThisScript _launch"* ]]; then
            msg "\\nVerified: Pid $Pid is running for $ThisScript."
         else
            errmsg "Pid $Pid is running, but it is NOT for $ThisScript! Perhaps the PidFile [$PidFile] should be removed?"
         fi
      else
         # A PidFile with no live process behind it means the service died
         # without cleaning up. Report it instead of exiting with success.
         errmsg "No process is running with the pid in the PidFile, $Pid. The $ThisService service is not running; perhaps the PidFile [$PidFile] should be removed?"
      fi
   else
      # No PidFile. Check for a service process that is running without one.
      # shellcheck disable=SC2009
      Pid=$(ps -fu "$RunUser" | grep "$ThisScript _launch" | grep -v grep)
      if [[ -z "$Pid" ]]; then
         msg "The $ThisService service is not running."
      else
         errmsg "Even though the PidFile [$PidFile] does not exist, this process appears to be associated with this $ThisService service:\\n$Pid"
      fi
   fi
   exit "$ErrorCount"
fi
# In 'start' (or back-end '_launch') mode, check for the PID file. There are
# three possible outcomes when one exists:
#  * Its pid is a live instance of this service: refuse to start a second
#    one, and exit.
#  * Its pid is a live process for some other command: exit, leaving the
#    PidFile for the operator to investigate.
#  * Its pid is not running: the PidFile is stale. Remove it and carry on.
#    This is a recovery, so it is not counted as an error.
if [[ -e "$PidFile" ]]; then
   Pid=$(cat "$PidFile")
   msg "PidFile [$PidFile] exists.\\nChecking process status of pid [$Pid] from the PidFile:"
   if ps -f --pid "$Pid"; then
      if [[ "$(ps --pid "$Pid" -o command=)" == *"$ThisScript _launch"* ]]; then
         bail "Verified: Pid $Pid is running for $ThisScript, so refusing to start another service."
      else
         bail "Pid $Pid is running, but it is NOT for $ThisScript! Perhaps the PidFile [$PidFile] should be removed?"
      fi
   else
      msg "No process is running with the pid in the PidFile, $Pid. Removing the pid file [$PidFile] and moving on."
      rm -f "$PidFile" || bail "Failed to remove PidFile."
   fi
fi
# At this point, we know there is no PidFile.
# In 'start' mode, launch the service in the background with 'nohup', then
# confirm that it came up.
if [[ "$OpMode" == "start" ]]; then
   # Change 'start' to '_launch', and ensure the '_launch' is the first
   # parameter (to guarantee 'ps -C' checks work as expected).
   #
   # The argument string is padded with a space on each side so that 'start'
   # is matched as a whole word wherever it appears, including as the very
   # first argument. If 'start' were left in place, the launched process
   # would see both '_launch' and 'start', end up in 'start' mode, and spawn
   # yet another process instead of becoming the service.
   LaunchCmdLine=" $Args "
   LaunchCmdLine="${LaunchCmdLine/ start / }"
   LaunchCmdLine="$ThisScriptRaw _launch $LaunchCmdLine"

   # This should never happen ...
   [[ "$LaunchCmdLine" =~ _launch ]] || \
      bail "Malformed launch command: $LaunchCmdLine"

   # Display the command line provided by the user with 'start', not
   # LaunchCmdLine with '_launch', so as not to expose the undocumented/back-end
   # '_launch' OpMode.
   msg "Starting service $ThisService from command line:\\n$CmdLine"

   # LaunchCmdLine is deliberately unquoted so it splits back into words.
   # shellcheck disable=SC2086
   nohup $LaunchCmdLine < /dev/null > /dev/null 2>&1 &

   # Give the service a moment to write its PidFile, then check on it.
   sleep 1
   if [[ -e "$PidFile" ]]; then
      Pid=$(cat "$PidFile")
      if ps -f --pid "$Pid"; then
         msg "The process started OK."
      else
         errmsg "The PidFile [$PidFile] was written, but pid $Pid is not running. Check the service log."
      fi
   else
      errmsg "Could not confirm service start. Check again in a bit with:\\n\\t$ThisScript status"
   fi
   exit "$ErrorCount"
fi
# Every other mode has exited by now, so only the back-end '_launch' mode
# should reach this point.
[[ "$OpMode" == "_launch" ]] || bail "Internal error: OpMode [$OpMode] unexpected at this point in the $ThisScript."

if [[ "$Log" != "off" ]]; then
   # If Log was not set with '-L <log>' on the command line, use the default.
   [[ -n "$Log" ]] || Log="${LOGS:-/tmp}/${ThisService}.log"

   # If the log specified already exists, move it aside, injecting the
   # modification timestamp into the old log's file name.
   if [[ -e "$Log" ]]; then
      # Ask 'date' for the file's modification time directly rather than
      # cutting a field out of 'ls -l' output, which shifts if the owner or
      # group name contains a space.
      OldLogTimestamp=$(date -r "$Log" +'%Y%m%d-%H%M') ||\
         bail "Failed to get the modification time of old log [$Log]."
      OldLog="${LOGS:-/tmp}/$ThisService.$OldLogTimestamp.log"
      mv "$Log" "$OldLog" ||\
         bail "Failed to move old log [$Log] aside to: [$OldLog]."
   fi

   touch "$Log" || bail "Couldn't touch log file [$Log]."

   # Redirect stdout and stderr to the log file. Unless in silent mode, 'tee'
   # is used so that output also goes to the original stdout.
   if [[ "$SilentMode" -eq 0 ]]; then
      exec > >(tee "$Log")
      exec 2>&1
   else
      exec >"$Log"
      exec 2>&1
   fi

   msg "${H1}\\nLog is: $Log\\n"
fi
# From here on, any exit or SIGINT/SIGTERM runs terminate(), which removes
# the PidFile.
trap terminate EXIT SIGINT SIGTERM

#==============================================================================
# Service Start and Operation.

msg "${H2}\\nService Started with $ThisScript v$Version as $ThisUser@$ThisHost on $(date) as:\\n$CmdLine"

# Record our pid. The 'stop' and 'status' modes locate the service through
# this file, so a service that cannot write it must not keep running.
echo $$ > "$PidFile" || bail "Could not write PidFile [$PidFile]."
msg "PidFile: $PidFile"

# Read the pid back from the file, so what is displayed is what was stored.
Pid=$(cat "$PidFile")
msg "Pid: $Pid"
msg "${H2}\\nService startup complete."
# Main service loop. Each pass:
#  1. Sleeps for the polling delay.
#  2. Backs off for a longer delay if p4d cannot be reached.
#  3. Reads the active and total archive transfer counts from 'p4 pull -ls'.
#  4. Does nothing more while transfers are active, or when none are pending.
#  5. Otherwise lists the failed transfers with 'p4 pull -l', reduces that
#     list to unique depot paths, and runs 'p4 verify -qt' on each path.
#  6. Sleeps for the after-fix delay before polling again.
# The loop only ends when the process is signalled, which runs terminate().
while true; do
   sleep "$DelayPolling"

   # If we can't talk to p4d, just wait patiently until we can.
   if ! p4 -s info -s > /dev/null 2>&1; then
      dbg "Cannot talk to p4d. Napping for $DelayP4DOffline seconds."
      sleep "$DelayP4DOffline"
      continue
   fi

   # The output is formatted as <active>:<total>; split it on the colon.
   TransferStatusData="$(p4 -ztag -F %replicaTransfersActive%:%replicaTransfersTotal% pull -ls)"
   ActiveTransfers="${TransferStatusData%%:*}"
   TotalTransfers="${TransferStatusData##*:}"
   dbg "AT: $ActiveTransfers TT: $TotalTransfers"

   # If there are active transfers in flight, wait until it is quiet. An
   # empty value (no usable output) is treated the same way.
   if [[ -z "$ActiveTransfers" || "$ActiveTransfers" != "0" ]]; then
      dbg "Active transfers. Waiting."
      continue
   fi

   if [[ -z "$TotalTransfers" || "$TotalTransfers" == "0" ]]; then
      dbg "No persistent pull errors. Waiting."
      continue
   fi

   msg "\\nThere are $TotalTransfers persistent pull failures. Attempting to fix them."

   if ! PullErrorsFile=$(mktemp "${P4TMP:-/tmp}/pef.XXXXXXXXX.txt"); then
      errmsg "Failed to create a temporary file in [${P4TMP:-/tmp}]."
      sleep "$DelayAfterFix"
      continue
   fi

   # The processed list sits beside the raw list, with 'pff' in place of
   # 'pef' in the file name. Only the file name is rewritten, so a directory
   # that happens to start with 'pef' is left alone.
   PullFilesFile="${PullErrorsFile%/*}/pff${PullErrorsFile##*/pef}"

   # Note that we intentionally don't capture stderr. If there is any, it will
   # go to this script's log, not into the data file.
   PullFixesTried=0
   if p4 pull -l > "$PullErrorsFile"; then
      if [[ -s "$PullErrorsFile" ]]; then
         msg "The 'p4 pull -l' output lists $(wc -l < "$PullErrorsFile") errors."

         # Strip everything from the revision field onward, leaving only the
         # path at the start of each line, then de-duplicate. The perl
         # program is single-quoted so the shell passes it through untouched.
         if perl -e 'while(<>) { s/ \d{1}\.\d+ .*$//; print; } ' < "$PullErrorsFile" | sort -u > "$PullFilesFile"; then
            msg "Running 'p4 verify' on list of $(wc -l < "$PullFilesFile") unique depot paths."

            # 'IFS=' keeps leading and trailing whitespace in a path intact.
            while IFS= read -r File; do
               PullFixesTried+=1

               # Build the command as an array so a path containing spaces
               # stays a single argument, with no need for 'eval'.
               VerifyCmd=(p4 -s verify -qt "$File")

               if [[ "$NoOp" -eq 0 ]]; then
                  msg "Executing: ${VerifyCmd[*]}"
                  # Give the command its own stdin so that it cannot consume
                  # the list of paths this loop is reading.
                  "${VerifyCmd[@]}" < /dev/null
               else
                  msg "NO_OP: Would execute: ${VerifyCmd[*]}"
               fi
            done < "$PullFilesFile"

            msg "Attempted to fix $PullFixesTried files on this pass."
         else
            errmsg "Failed to get list off files from perl parsing of 'p4 pull -l' list output."
         fi
      else
         msg "Output from 'p4 pull -l' was empty."
      fi
   else
      errmsg "Failed to get list of files from 'p4 pull -l'."
   fi

   # In Debug mode, keep the generated files for inspection and show where
   # they are. Otherwise clean them up.
   if [[ "$Debug" -ne 0 ]]; then
      dbg "PullErrorsFile (raw): $PullErrorsFile"
      dbg "PullFilesFiles (processed): $PullFilesFile"
   else
      rm -f "$PullErrorsFile" "$PullFilesFile"
   fi

   dbg "Napping for $DelayAfterFix seconds after attempting archive transfer fixes."
   sleep "$DelayAfterFix"
done

# Not reached in normal operation; the loop above has no exit of its own.
exit "$ErrorCount"