sdp_health_check.sh #3

  • //
  • p4-sdp/
  • dev_rebrand/
  • Server/
  • Unix/
  • p4/
  • common/
  • bin/
  • sdp_health_check.sh
  • View
  • Commits
  • Open Download .zip Download (25 KB)
#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://workshop.perforce.com/view/p4-sdp/main/LICENSE
#------------------------------------------------------------------------------

#==============================================================================
# Declarations and Environment

# Version ID Block. The '$Id$' and '$Change$' keywords below are expanded by
# the version control system (this relies on the +k filetype modifier), so the
# VersionID line itself must not be hand-edited.
#------------------------------------------------------------------------------
# shellcheck disable=SC2016
declare VersionID='$Id: //p4-sdp/dev_rebrand/Server/Unix/p4/common/bin/sdp_health_check.sh#3 $ $Change: 31617 $'

# Stream name: strip through the leading '//', strip the first path element,
# then keep only the next path element.
declare VersionStream="${VersionID#*//}"
VersionStream="${VersionStream#*/}"
VersionStream="${VersionStream%%/*}"

# Changelist: the text after the last ': ', up to the next space.
declare VersionCL="${VersionID##*: }"
VersionCL="${VersionCL%% *}"

# Version is '<stream>.<changelist>'; it is upper-cased unless the stream name
# starts with 'r'.
declare Version="${VersionStream}.${VersionCL}"
if [[ "$VersionStream" != r* ]]; then
   Version="${Version^^}"
fi

# Prefix global vars with HC_ to avoid name collisions.

# Script identity and run-time state; the empty values are filled in later.
declare ThisScript="${0##*/}"
declare ThisUser=
declare Log=
declare DirList=

# Fixed SDP paths used by this script.
declare HC_SDP_P4CBIN="/p4/common/bin"
declare HC_SDP_P4CCFG="/p4/common/config"
declare HC_SDP_ENV="${HC_SDP_P4CBIN}/p4_vars"
declare HC_SDP_MRUN="${HC_SDP_P4CBIN}/p4master_run"
declare HC_SDP_VSDP="${HC_SDP_P4CBIN}/verify_sdp.sh"
declare HC_SDP_P4LOGIN="${HC_SDP_P4CBIN}/p4login"

declare SDPInstanceList=
declare SDP_341_URL="https://swarm.workshop.perforce.com/jobs/SDP-341"

declare SDPOwner=

# Running totals, incremented by errmsg() and warnmsg().
declare -i ErrorCount=0
declare -i WarningCount=0

# Files whose contents are captured in the report. KeyFileCount always holds
# the next free index, so later code can append with KeyFiles[KeyFileCount]=...
declare -a KeyFiles=(
   "${HC_SDP_P4CBIN}/p4_vars"
   "${HC_SDP_P4CBIN}/backup_functions.sh"
)
declare -i KeyFileCount="${#KeyFiles[@]}"

# Log files (by name, looked up in each instance's logs directory) that are
# expected to be small enough to include in the report.
declare -a SmallLogFiles=(
   "checkpoint.log"
   "sync_replica.log"
   "replica_cleanup.log"
   "replica_status.log"
   "refresh_P4ROOT_from_offline_db.log"
   "recreate_offline_db.log"
)
declare -i SmallLogCount="${#SmallLogFiles[@]}"

# A "small" log at or above this many lines is only partially captured.
declare MaxSmallLogLines=2500

# Horizontal rules used to separate report sections.
declare H1="=============================================================================="
declare H2="------------------------------------------------------------------------------"

# Choose the log file name.  When 'date' is available the name carries a
# YYYYMMDD-HHMMSS datestamp; otherwise a fixed name is used.
# Note: the seconds field must be '%S' (00-60).  The lower-case '%s' is a
# non-portable extension meaning seconds since the epoch, which produced a
# malformed stamp.
if command -v date > /dev/null 2>&1; then
   Log="/tmp/sdp_health_check.$(date +'%Y%m%d-%H%M%S').log"
else
   Log=/tmp/sdp_health_check.log
fi

#==============================================================================
# Local Functions

# Note: This script does not use SDP library files, as its purpose is to
# verify the integrity of an SDP installation.  Thus, it has its own
# self-contained versions of some functions that would normally be
# sourced in from files like /p4/common/lib/libcore.sh.

# Micro-functions: tiny helpers kept local to this script to avoid external
# dependencies.

# msg: Display the arguments as a single line, interpreting backslash escapes.
function msg () {
   echo -e "$*"
}

# errmsg: Display an error message (default 'Unknown Error') and increment
# the ErrorCount.
function errmsg () {
   local text="${1:-Unknown Error}"

   msg "\\nError: ${text}\\n"
   ErrorCount+=1
}

# warnmsg: Display a warning message (default 'Unknown Warning') and increment
# the WarningCount.
function warnmsg () {
   local text="${1:-Unknown Warning}"

   msg "\\nWarning: ${text}\\n"
   WarningCount+=1
}

# bail: Report an error via errmsg() and exit with the code given in $2
# (default 1).  This is only used for critical errors that prevent this script
# from gathering output, and should only be called early in processing.
function bail () {
   local text="${1:-Unknown Error}"

   errmsg "$text"
   exit "${2:-1}"
}

#------------------------------------------------------------------------------
# Function: run ($cmd, $desc, $showOutput)
#
# Runs a command line, capturing its stdout and stderr in a temp file.
#
# Input:
# $1 - cmd, the command line to run, given as a single string.  It is
#      evaluated by the shell, so it may contain pipes, globs and quoting.
#      Default is 'echo'.
# $2 - desc, optional text displayed before the command is run.
# $3 - showOutput, default 1.  If 1, the captured output is displayed,
#      followed by an 'EXIT_CODE: <n>' line.
#
# Returns the exit code of the command, or 1 if the temp file could not be
# created.
#------------------------------------------------------------------------------
function run () {
   local cmd="${1:-echo}"
   local desc="${2:-}"
   local -i showOutput="${3:-1}"
   local tmpLog=
   local -i exitCode=0

   [[ -n "$desc" ]] && msg "$desc"
   msg "Executing: $cmd"

   if ! tmpLog=$(mktemp); then
      msg "\\nError: Could not create a temp file to capture command output.\\n"
      return 1
   fi

   # The command string is passed to eval quoted, so that it is parsed exactly
   # once.  Unquoted, it would be word-split and glob-expanded before eval saw
   # it, which collapses whitespace inside quoted arguments.
   eval "$cmd" > "$tmpLog" 2>&1
   exitCode=$?

   if [[ "$showOutput" -eq 1 ]]; then
      echo "EXIT_CODE: $exitCode" >> "$tmpLog"
      cat "$tmpLog"
   fi

   /bin/rm -f "$tmpLog"
   return "$exitCode"
}

#------------------------------------------------------------------------------
# Function: copy_jd_table ($TableName, $RootDir)
#
# Copies the specified table to a new temp dir, so that it can be dumped from
# there; this avoids the locks taken by p4d -jd if done against P4ROOT.
#
# The caller must ensure the specified table exists in the specified root dir.
#
# Input:
# $1 - TableName (required)
# $2 - RootDir (required, root or offline_db)
#
# Exports JDTmpDir, the temp dir containing the copy.
#
# If the temp dir cannot be created or the copy fails, the script exits by
# way of bail().
#------------------------------------------------------------------------------
function copy_jd_table () {
   local TableName=${1:-Unset}
   local RootDir=${2:-Unset}

   JDTmpDir=$(mktemp -d 2>/dev/null)
   if [[ ! -d "$JDTmpDir" ]]; then
      # Second attempt with an explicit directory and template.  P4TMP may not
      # be defined, since this script does not load the SDP environment
      # itself; an unguarded reference would abort the script while 'set -u'
      # is in effect, so fall back to /tmp.
      JDTmpDir=$(mktemp -d -p "${P4TMP:-/tmp}" -t 'tmp_jdtmpdir.XXXXXXXX')
   fi

   if [[ ! -d "$JDTmpDir" ]]; then
      bail "Could not initialize JDTmpDir [$JDTmpDir]"
   fi

   export JDTmpDir

   # Use bail() rather than die(): die() belongs to the SDP libraries, which
   # this script deliberately does not load.
   if ! cp "$RootDir/$TableName" "$JDTmpDir"; then
      # Best effort: do not leave the temp dir behind if nothing was copied.
      rmdir "$JDTmpDir" 2>/dev/null
      bail "Failed to copy $RootDir/$TableName to $JDTmpDir"
   fi
}

#------------------------------------------------------------------------------
# Function: remove_jd_tables ()
#
# Cleanup the JDTmpDir, with extra precautions to avoid doing an 'rm -rf' on the
# wrong path: the directory is removed only if it exists and its path looks
# like one of the two temp dir forms created by copy_jd_table().  Otherwise
# nothing is done.
#------------------------------------------------------------------------------
function remove_jd_tables () {
   # P4TMP may not be defined, since this script does not load the SDP
   # environment itself.  Referencing it unguarded would abort the script
   # while 'set -u' is in effect.
   local p4Tmp="${P4TMP:-}"

   [[ -n "${JDTmpDir:-}" && -d "$JDTmpDir" ]] || return 0

   if [[ "$JDTmpDir" =~ ^/tmp/tmp. ]] ||
      [[ -n "$p4Tmp" && "$JDTmpDir" == "$p4Tmp/tmp_jdtmpdir."* ]]; then
      rm -rf "${JDTmpDir:-/tmp/does_not_exist}/"
   fi
}

#------------------------------------------------------------------------------
# Function: usage (required function)
#
# Input:
# $1 - style, either -h (for short form) or -man (for man-page like format).
# The default is -h.
#
# $2 - error message (optional).  Specify this if usage() is called due to
# user error, in which case the given message displayed first, followed by the
# standard usage message (short or long depending on $1).  If displaying an
# error, usually $1 should be -h so that the longer usage message doesn't
# obscure the error message.
#
# Sample Usage:
# usage 
# usage -man
# usage -h "Incorrect command line usage."
#
# This last example generates a usage error message followed by the short
# '-h' usage summary.
#------------------------------------------------------------------------------
function usage
{
   declare style=${1:--h}
   declare errorMessage=${2:-Unset}

   if [[ $errorMessage != Unset ]]; then
      msg "\\n\\nUsage Error:\\n\\n$errorMessage\\n\\n" >&2
   fi

   msg "USAGE for sdp_health_check.sh version $Version:

sdp_health_check.sh

   or

sdp_health_check.sh -h|-man
"
   if [[ $style == -man ]]; then
      echo -e "DESCRIPTION:

	This script does a health check of the SDP. It generates a
	report log, which can be emailed to [email protected].
	It identifies SDP instances and reports on general SDP health.

	It must be run as the OS user who owns the $HC_SDP_P4CBIN
	directory.  This should be the user account which runs the
	p4d process, and which owns the /p4/common/bin directory
	(often 'perforce' or 'p4admin').

	Characteristics of this script:
	* It is always safe to run.  It does only analysis and reporting.
	* It does only fast checks, and has no interactive prompts.
	  Some log files are captured such as checkpoint.log, but not
	  potentially large ones such as the p4d server log.
	* It requires no command line arguments.
	* It works for any and all UNIX/Linux SDP version since 2007.
 
	Assumptions:
	* The SDP has always used $HC_SDP_ENV as the shell
	  environment file.  This is consistent across all SDP versions.

OPTIONS:
 -D     Set extreme debugging verbosity.

HELP OPTIONS:
 -h	Display short help message
 -man	Display man-style help message

EXAMPLES:
	This script is typically called with no arguments.

LOGGING:
	This script generates a log file and also displays it to stdout at the
	end of processing.  By default, the log is:
	
	/tmp/sdp_health_check.<datestamp>.log

	or

	/tmp/sdp_health_check.log

	The exception is usage errors, which result an error being sent to
	stderr followed usage info on stdout, followed by an immediate exit.

EXIT CODES:
	An exit code of 0 indicates no errors or warnings were encountered.
"
   fi

   exit 1
}

#------------------------------------------------------------------------------
# Function: do_341_check ()
#
# Checks scripts and libraries in $HC_SDP_P4CBIN for susceptibility to the
# issue tracked as SDP-341, by looking for text that identifies known-safe
# and known-broken versions of:
# * recreate_db_checkpoint.sh
# * recreate_db_sync_replica.sh
# * backup_functions.sh (or, if that does not exist, sdp_functions.sh)
#
# Findings are reported with msg(), warnmsg() and errmsg().
#
# Returns 0 if no known-broken version was detected, 1 otherwise.
#------------------------------------------------------------------------------
function do_341_check () {

   local -i returnCode=0
   local atRiskScripts=
   local lib1="$HC_SDP_P4CBIN/backup_functions.sh"
   local lib2="$HC_SDP_P4CBIN/sdp_functions.sh"
   local script1="$HC_SDP_P4CBIN/recreate_db_checkpoint.sh"
   local script2="$HC_SDP_P4CBIN/recreate_db_sync_replica.sh"

   msg "$H2\\nChecking for susceptibility to SDP-341.\\n"

   if [[ -r "$script1" ]]; then
      if grep -q 'SAVEDIR since we know' "$script1"; then
         msg "Verified: Known-safe version exists for: $script1"
      elif grep -q 'save directory since we know' "$script1"; then
         errmsg "At risk due to existing known-broken version of script $script1\\nSuggested actions and background information are provided here: $SDP_341_URL"
         returnCode=1
      else
         warnmsg "Unknown version of this script exists: $script1"
      fi
   else
      msg "Verified: This does not exist: $script1"
   fi

   if [[ -r "$script2" ]]; then
      if grep -q 'rm -f rdb.lbr' "$script2"; then
         msg "Verified: Known-safe version exists for: $script2"
      elif grep -q 'RsyncCmd=' "$script2"; then
         errmsg "At risk due to existing known-broken version of script $script2\\nSuggested actions and background information are provided here: $SDP_341_URL"
         returnCode=1
      else
         warnmsg "Unknown version of this script exists: $script2"
      fi
   else
      msg "Verified: This does not exist: $script2"
   fi

   if [[ -r "$lib1" ]]; then
      if grep -q 'OLDBLNK' "$lib1"; then
         # Look for callers of switch_db_files() among the files in
         # $HC_SDP_P4CBIN, regardless of the current directory.  The library
         # itself is excluded.  Errors (e.g. for subdirectories) are ignored.
         atRiskScripts="$(grep -l switch_db_files "$HC_SDP_P4CBIN"/* 2>/dev/null | grep -v backup_functions.sh)"
         if [[ -n "$atRiskScripts" ]]; then
            errmsg "A library file contains a known-broken version of switch_db_files(), that may be called by other scripts.\\nLibrary file is: $lib1\\nCalling scripts are:\\n$atRiskScripts\\n"
            returnCode=1
         else
            warnmsg "A library file contains a known-broken function, switch_db_files(). However, that function is not called by any scripts in $HC_SDP_P4CBIN, and thus is not an issue.  An upgrade of the SDP will replace the library entirely, but it should not be removed now.\\n"
         fi
      else
         msg "Verified: Known-safe version exists for: $lib1"
      fi
   elif [[ -r "$lib2" ]]; then
      msg "Verified: Known-safe version exists for: $lib2"
   else
      # NOTE(review): the e-mail address below reads '[email protected]', which
      # looks like an obfuscated/garbled address rather than a real one;
      # confirm and restore the intended support address.
      errmsg "Missing files.  One of these two files should exist:\\n\\t$lib1\\nor\\n\\t$lib2\\n\\nSuggested action: Contact Perforce Support <[email protected]> and request an SDP health check.\\n"
   fi

   if [[ "$returnCode" -eq 0 ]]; then
      msg "Verified: There is no susceptibility to SDP-341."
   else
      errmsg "Known-broken version of one or more key scripts detected."
   fi

   return "$returnCode"
}

#------------------------------------------------------------------------------
# Function: do_preflight_checks ()
#
# Does sanity checks that must pass before any other checks are attempted:
# 1. The basic tools this script relies on are in the PATH.
# 2. A 'cd' to $HC_SDP_P4CBIN works.  On success, that directory remains the
#    current directory.
# 3. The current user ($ThisUser) owns $HC_SDP_P4CBIN.
#
# Sets SDPOwner to the owner of $HC_SDP_P4CBIN.
#
# Returns 0 if all checks pass.  Returns 1 as soon as one of the checks
# fails; later checks are then not attempted.
#------------------------------------------------------------------------------
function do_preflight_checks () {

   local -i exitCode=0
   local tool=

   # Tools used unconditionally by this script.  'awk' is needed by the
   # ownership check below; the others are used when gathering the report.
   local toolsList="awk cat date grep head id ls mktemp wc"

   msg "$H2\\nDoing preflight sanity checks."
   msg "Preflight Check 1: Ensuring basic tools are in the PATH."

   # Report every missing tool, rather than stopping at the first one.
   for tool in $toolsList; do
      if [[ -z "$(command -v "$tool")" ]]; then
         errmsg "Required tool '$tool' not found in PATH."
         exitCode=1
      fi
   done

   [[ "$exitCode" -ne 0 ]] && return 1

   msg "Verified: Essential tools are in the PATH."

   msg "Preflight Check 2: cd $HC_SDP_P4CBIN"

   if ! cd "$HC_SDP_P4CBIN"; then
      errmsg "Could not cd to: $HC_SDP_P4CBIN"
      return 1
   fi

   msg "Verified: cd works to: $HC_SDP_P4CBIN"

   msg "Preflight Check 3: Checking current user owns $HC_SDP_P4CBIN"
   # The owner is the third field of the long listing of the directory.
   # shellcheck disable=SC2012
   SDPOwner=$(ls -ld . | awk '{print $3}')

   if [[ "$ThisUser" == "$SDPOwner" ]]; then
      msg "Verified: Current user [$ThisUser] owns $HC_SDP_P4CBIN"
   else
      errmsg "Current user [$ThisUser] does not own $HC_SDP_P4CBIN."
      return 1
   fi

   return 0
}

#------------------------------------------------------------------------------
# Function: get_sdp_instances ([$BaseDir])
#
# Gets the list of SDP instances.  An entry in the base directory is taken to
# be an SDP instance if a readable root/db.counters file exists under it.
#
# Input:
# $1 - BaseDir (optional), the directory containing the instance directories.
#      The default is /p4.
#
# Sets SDPInstanceList to a space-delimited list of instance names, with no
# leading or trailing space; it is empty if no instances are found.
#
# The base directory becomes the current directory.  If it cannot be entered,
# the script exits by way of bail().
#------------------------------------------------------------------------------
function get_sdp_instances () {
   local baseDir="${1:-/p4}"
   local e=

   SDPInstanceList=
   cd "$baseDir" || bail "Could not cd to $baseDir."

   # If the directory is empty, the glob is left as a literal '*', which
   # fails the readability test below and is thus ignored.
   for e in *; do
      if [[ -r "./$e/root/db.counters" ]]; then
         SDPInstanceList+=" $e"
      fi
   done

   # Trim the leading space left by the appends above.  (Echoing the quoted
   # value, as was done previously, does not remove it.)
   SDPInstanceList="${SDPInstanceList# }"
}

#------------------------------------------------------------------------------
# Function: check_sdp_instance ($instance)
#
# This checks various things about a given SDP instance, displaying what it
# finds and calling errmsg()/warnmsg() for problems:
# * Login status, service definition, 'p4 info', configurables, triggers,
#   servers, server.id, db.config settings, journals, server specs, shell
#   environment, and SDP counters.
# * Listings of key directories.
# * Contents of the log files named in SmallLogFiles, which are also scanned
#   for error messages in known formats.
# * Directory structure, and verify_sdp.sh if it is available.
#
# Input:
# $1 - instance, the SDP instance name (required).
#
# Returns 1 if no instance is given.  Otherwise returns 0; problems found are
# reflected in ErrorCount and WarningCount rather than in the return code.
#------------------------------------------------------------------------------
function check_sdp_instance () {
   local instance="${1:-UnsetSDPInstance}"
   local -i startErrorCount="$ErrorCount"
   local superUser=
   local p4AsSuper="p4"
   local svcName=
   local server=
   local DirList=
   local d=
   local log=
   local logPath=
   local logLines=

   if [[ "$instance" == "UnsetSDPInstance" ]]; then
      errmsg "Invalid call to check_sdp_instance(), no instance parameter. Skipping further checks for this instance."
      return 1
   fi

   # Support SDP installations that define P4SUPER.

   # Some SDP environments define the P4SUPER variable that, if set, is
   # distinct from the P4USER (the default super user).  If P4SUPER is
   # defined, use it instead of P4USER to run commands that require
   # super access, such as 'p4 configure' or 'p4 journals'.
   superUser=$("$HC_SDP_MRUN" "$instance" p4 set -q P4SUPER)
   superUser=${superUser##*=}
   [[ -n "$superUser" ]] && p4AsSuper="p4 -u $superUser"

   if [[ -x "${HC_SDP_P4LOGIN}" ]]; then
      if [[ -n "$superUser" ]]; then
         run "$HC_SDP_MRUN $instance p4 -u $superUser login -s" \
            "Checking login for super user [$superUser]." ||\
            errmsg "Super user $superUser does not have valid ticket."
      else
         run "$HC_SDP_MRUN $instance $HC_SDP_P4LOGIN $instance -v" \
            "$H2\\nDoing 'p4login' for instance $instance." ||\
            errmsg "p4 login reported an error."
      fi
   else
      warnmsg "No '/p4/common/bin/p4login' script found."
   fi

   if [[ -n "$(command -v systemctl)" ]]; then
      svcName="p4d_${instance}"
      run "systemctl cat $svcName" "Showing systemd service file." ||\
         warnmsg "Could not cat Systemd unit file for service: $svcName."
   else
      run "ls -lArt /etc/init.d/p4*" "Listing SysV p4 init files." ||\
         errmsg "Could not list SysV init files."
   fi

   run "$HC_SDP_MRUN $instance p4 -ztag info" \
      "$H2\\nChecking p4 -ztag info for instance $instance." ||\
      errmsg "p4 info did not respond."

   run "$HC_SDP_MRUN $instance $p4AsSuper configure show allservers" \
      "$H2\\nChecking p4 configure show allservers." ||\
      errmsg "p4 configure show allservers reported an error."

   run "$HC_SDP_MRUN $instance p4 triggers -o | grep -v ^#" \
      "$H2\\nChecking triggers for instance $instance." ||\
      errmsg "Could not do 'p4 triggers -o' for instance $instance."

   run "$HC_SDP_MRUN $instance p4 servers -J" \
      "$H2\\nChecking p4 servers -J" ||\
      errmsg "p4 servers -J reported an error."

   if [[ -e "/p4/$instance/root/server.id" ]]; then
      run "cat /p4/$instance/root/server.id" \
         "Contents of /p4/$instance/root/server.id:" ||\
         errmsg "Could not display contents of server.id file."
   else
      errmsg "Expected ServerID file is missing: /p4/$instance/root/server.id"
   fi

   if [[ -e "/p4/$instance/bin/p4d_$instance" ]]; then
      if [[ -r "/p4/$instance/root/db.config" ]]; then
         # Work on a copy of db.config in a temp dir, which is removed again
         # once the settings have been displayed.
         copy_jd_table "db.config" "/p4/$instance/root/"
         run "/p4/$instance/bin/p4d_$instance -r $JDTmpDir -cshow" \
            "Contents of /p4/$instance/root -cshow:" ||\
            errmsg "Could not display contents of -cshow."
         remove_jd_tables
      else
         errmsg "Cannot do '-cshow'; No readable db.config found in /p4/$instance/root."
      fi
   else
      errmsg "Expected p4d_$instance is missing: /p4/$instance/bin/p4d_$instance"
   fi

   run "$HC_SDP_MRUN $instance $p4AsSuper journals -m 100" \
      "$H2\\nChecking journal data:" ||\
      errmsg "p4 journals -m 100 reported an error."

   for server in $("$HC_SDP_MRUN" "$instance" p4 -ztag -F %ServerID% servers); do
      run "$HC_SDP_MRUN $instance p4 server -o $server" \
         "$H2\\nChecking p4 server spec for server $server" ||\
         errmsg "p4 server -o $server reported an error."
   done

   run "$HC_SDP_MRUN $instance env" \
      "$H2\\nChecking shell environment for instance $instance." ||\
      errmsg "Shell environment did not load for instance $instance"

   run "$HC_SDP_MRUN $instance p4 counters -e \"SDP*\"" \
      "$H2\\nChecking SDP version counters for instance $instance." ||\
      errmsg "Error checking SDP version counters for instance $instance."

   run "$HC_SDP_MRUN $instance p4 counters -e \"LastSDPCheckpoint*\"" \
      "$H2\\nChecking SDP checkpoint counters for instance $instance." ||\
      errmsg "Error checking SDP checkpoint counters for instance $instance."

   # In the directory values, always include the trailing '/', or the 'ls'
   # may not give desired results.
   msg "${H1}\\nListing key directories."
   DirList="/p4/ /p4/$instance/ /p4/$instance/bin/ /p4/$instance/logs/ /p4/$instance/checkpoints/"

   # For edges and some replicas, a checkpoints.* directory may exist; if so
   # add it to the list of directories checked.  If nothing matches, the glob
   # is left as a literal pattern, which fails the directory test.
   for d in /p4/"$instance"/checkpoints.*; do
      [[ -d "$d" ]] && DirList+=" $d/"
   done

   for d in $DirList; do
      run "ls -lArt $d" "Listing: $d" ||\
         errmsg "Failed to list dir: $d"
   done

   # Good to list P4ROOT and offline_db in db size order
   DirList="/p4/$instance/root/ /p4/$instance/offline_db/"
   for d in $DirList; do
      run "ls -lAhS $d" "Listing: $d" ||\
         errmsg "Failed to list dir: $d"
   done

   msg "${H1}\\nListing small log files."
   for log in "${SmallLogFiles[@]}"; do
      logPath="/p4/$instance/logs/$log"
      if [[ -e "$logPath" ]]; then
         logLines=$(wc -l "$logPath"|awk '{print $1}')
         if [[ -n "$logLines" && "$logLines" -ge "$MaxSmallLogLines" ]]; then
            errmsg "Log [$log] is $logLines lines, larger than expected max of $MaxSmallLogLines lines. Displaying only first $MaxSmallLogLines lines."
            if run "head -n $MaxSmallLogLines $logPath" "$H2\\nCapturing first $MaxSmallLogLines lines of file $logPath:"; then
               msg "\\n=== END first $MaxSmallLogLines lines of log $logPath ===\\n"

               # The whole log is scanned for errors, not just the part shown.
               if grep -q -E '(^Error:|: ERROR\!\!\!)' "$logPath"; then
                  errmsg "Found one or more errors in: $logPath"
               fi
            else
               errmsg "Error showing contents of log: $logPath"
            fi
         else
            if run "cat $logPath" "$H2\\nCapturing contents of log file $logPath:"; then
               msg "\\n=== END contents of log $logPath ===\\n"

               # After catting small log files, check to see if they have error
               # messages in known formats.  The die() function in the SDP
               # backup_functions.sh library used in several SDP scripts
               # writes critical errors with ': ERROR!!!'. Other scripts
               # report '^Error:'. This regex avoids false-positives with scripts
               # that have the word error, e.g. "NO ERRORS".
               if grep -q -E '(^Error:|: ERROR\!\!\!)' "$logPath"; then
                  errmsg "Found one or more errors in: $logPath"
               fi
            else
               errmsg "Error showing contents of log: $logPath"
            fi
         fi
      else
         msg "Log $log does not exist here."
      fi
   done

   msg "$H2\\nChecking structure."
   if [[ -L "/p4/$instance" ]]; then
      errmsg "Instance $instance uses old-style symlink structure and should be upgraded."
   fi

   if [[ -x "$HC_SDP_VSDP" ]]; then
      # If the SDP's verify_sdp.sh script is new enough to have the '-csec' option,
      # add that option to the command line.
      if grep -q '(-csec)' "$HC_SDP_VSDP"; then
         run "$HC_SDP_VSDP $instance -L off -csec" "${H1}\\nRunning verify_sdp.sh with -csec." ||\
            errmsg "SDP Verify with '-csec' failed for instance $instance."
      else
         run "$HC_SDP_VSDP $instance -L off" "${H1}\\nRunning verify_sdp.sh" ||\
            errmsg "SDP Verify failed for instance $instance."
      fi
   else
      msg "Note: $HC_SDP_VSDP is not available to execute."
   fi

   # The instance is OK if no errors were added since this function started.
   if [[ "$ErrorCount" -eq "$startErrorCount" ]]; then
      msg "SDP instance $instance seems OK."
   else
      errmsg "SDP instance $instance has issues - see above."
   fi
}

#------------------------------------------------------------------------------
# Function: terminate
#
# Displays the error and warning totals, the overall exit code and the log
# file name, then exits the script.
#
# The overall exit code is:
# 0 - all clean
# 1 - errors and maybe warnings
# 2 - warnings, but no errors.
#------------------------------------------------------------------------------
# shellcheck disable=SC2317
function terminate
{
   local -i overallExitCode=0

   # Disable signal trapping.
   trap - EXIT SIGINT SIGTERM

   msg "$H1\\nErrors detected: $ErrorCount"
   msg "Warnings detected: $WarningCount"

   # Errors take precedence over warnings.
   if [[ "$ErrorCount" -ne 0 ]]; then
      overallExitCode=1
   elif [[ "$WarningCount" -ne 0 ]]; then
      overallExitCode=2
   fi

   msg "$ThisScript: EXITCODE: $overallExitCode"
   msg "$H2\\nLog file: $Log"

   exit "$overallExitCode"
}
#==============================================================================
# Command Line Processing

# Number of extra arguments to consume after the current flag.  It stays 0,
# as none of the flags below takes a value.
declare -i shiftArgs=0

# Unset-variable checking is off while the command line is parsed, and is
# turned on for the remainder of the script.
set +u
while [[ "$#" -gt 0 ]]; do
   case "$1" in
      -h)
         usage -h
         ;;
      -man)
         usage -man
         ;;
      -D)
         # Debug; use 'set -x' mode.
         set -x
         ;;
      -*)
         usage -h "Unknown command line flag ($1)."
         ;;
      *)
         usage -h "Unknown command line fragment ($1)."
         ;;
   esac

   # Shift (modify $#) the appropriate number of times.
   shift
   while [[ "$shiftArgs" -gt 0 ]]; do
      if [[ "$#" -eq 0 ]]; then
         usage -h "Incorrect number of arguments."
      fi
      shiftArgs=$shiftArgs-1
      shift
   done
done
set -u

#==============================================================================
# Main Program

# From here on, every exit path (including SIGINT and SIGTERM) goes through
# terminate(), which displays the summary and sets the exit code.
trap terminate EXIT SIGINT SIGTERM

# Start Logging.  Everything written to stdout or stderr from here on is
# displayed and also written to the log file.
exec > >(tee "${Log}")
exec 2>&1

# Determine the current user, preferring 'id' over the environment.
if command -v id > /dev/null; then
   ThisUser="$(id -n -u)"
else
   ThisUser="$USER"
fi

msg "$ThisScript version $Version as $ThisUser@${HOSTNAME%%.*}\\nStarting verification at $(date +'%a %Y-%m-%d %H:%M:%S %Z')."

msg "This log file is: $Log"

# Nothing else is attempted if the preflight checks fail.
do_preflight_checks || bail "Preflight checks failed. Aborting."

do_341_check || errmsg "Failed check for SDP-341."

get_sdp_instances

if [[ -z "$SDPInstanceList" ]]; then
   errmsg "No SDP instances detected."
else
   msg "$H2\\nList of SDP Instances to verify: $SDPInstanceList"
fi

msg "$H1\\nChecking each SDP instance."

# Check each instance, and queue its instance-specific config file for the
# key file checks done later.
for i in $SDPInstanceList; do
   check_sdp_instance "$i"
   KeyFiles[KeyFileCount]="${HC_SDP_P4CCFG}/p4_${i}.vars"
   KeyFileCount+=1
done

# General checks: information about the machine as a whole, rather than about
# any one SDP instance.
msg "$H1\\nGeneral Checks."

msg "OS Info:"

run "uname -a" "OS uname info for UNIX/Linux"

if [[ -r /etc/os-release ]]; then
   run "cat /etc/os-release" "$H2\\nList Linux distribution/version:" ||\
      errmsg "Could not read /etc/os-release file."
fi

# Use ThisUser rather than USER: USER is not guaranteed to be set, and an
# unset variable aborts the script now that 'set -u' is in effect.
run "crontab -l" "$H2\\nCrontab for $ThisUser:" ||\
   errmsg "Failed to gather crontab for $ThisUser."

run "ps -ef | grep p4" "$H2\\nChecking currently running p4 processes:" ||\
   errmsg "Error checking processes."

run "ls -lArt $HC_SDP_P4CBIN/" "$H2\\nListing files in $HC_SDP_P4CBIN:" ||\
   errmsg "Error listing files in: $HC_SDP_P4CBIN"

run "df -h" "$H2\\nChecking volumes and storage available."

# Key file checks: capture the contents of each file in KeyFiles.  A file
# that is missing or unreadable is an error.
msg "$H1\\nKey File Checks."

for file in "${KeyFiles[@]}"; do
   if [[ ! -r "$file" ]]; then
      errmsg "Expected file is missing: $file"
      continue
   fi

   if ! run "cat $file" "$H2\\nCapturing contents of file $file:"; then
      errmsg "Error showing contents of file: $file"
   else
      msg "\\n=== END contents of $file ===\\n"
   fi
done

msg "$H1\\nSDP Version Checks."

# The /p4/sdp/Version file may or may not exist, depending on how the SDP was
# installed on a given machine.  Its absence is noted, but is not an error.
if [[ ! -r /p4/sdp/Version ]]; then
   msg "File /p4/sdp/Version did not exist."
else
   run "cat /p4/sdp/Version" "Version from /p4/sdp/Version:" ||\
      errmsg "Could not cat /p4/sdp/Version file."
fi

# The SDP_VERSION setting in the shell environment file should be there; if
# not that is considered an error.
if ! grep -q 'SDP_VERSION=' "$HC_SDP_ENV"; then
   errmsg "No SDP_VERSION defined in $HC_SDP_ENV."
else
   run "grep 'SDP_VERSION=' $HC_SDP_ENV" "Version from $HC_SDP_ENV:" ||\
      errmsg "Failed to run: grep 'SDP_VERSION=' $HC_SDP_ENV"
fi

msg "$H1\\nSummary:"

# One summary line, worded according to whether errors and/or warnings were
# encountered.
if [[ "$ErrorCount" -ne 0 ]]; then
   msg "Encountered $ErrorCount errors and $WarningCount warnings."
elif [[ "$WarningCount" -ne 0 ]]; then
   msg "Encountered no errors and $WarningCount warnings."
else
   msg "No errors or warnings detected."
fi

# NOTE(review): the e-mail address below reads '[email protected]', which looks
# like an obfuscated/garbled address rather than a real one; confirm and
# restore the intended support address.
msg "\\nIf you have any questions about the output from this script, contact [email protected]"

# The EXIT trap set above runs terminate(), which is where this script
# actually exits and where the real exit code is set.
exit 0
# Change User Description Committed
#3 31617 C. Thomas Tyler Merged work from dev_c2s (development) stream to sibling dev_rebrand (sparsedev) stream.
#2 31615 C. Thomas Tyler First pass at rebranding changes, including:
* Changes to remove 'swarm.' from Workshop URLS, so swarm.workshop -> workshop.
* Changed URL for Copyright.
* Renamed get_helix_binaries.sh -> get_p4_binaries.sh, with associated directory and doc changes.
* Accounted for rename of HAS -> P4AS.
* Changed HMS references to P4MS.
* Replaced "Helix" and "Helix Core" references.
* Renamed variables to reduce tech debt buildup induced by rebranding.
* Changed default mount points:
/hxdepots[-1,N] -> /p4depots[-1,N]
/hxmetadata[1,2] -> /p4db[-1,2]
/hxlogs -> /p4logs

Also made some changes related to rebranding going out with r25.1.
#1 31591 C. Thomas Tyler Populate stream //p4-sdp/dev_rebrand from //p4-sdp/dev.
//p4-sdp/dev/Server/Unix/p4/common/bin/sdp_health_check.sh
#2 31574 C. Thomas Tyler Merged SDP 2024.2 Patch 4 from Classic to Streams.
p4 merge -b SDP_Classic_to_Streams
#1 31397 C. Thomas Tyler Populate -b SDP_Classic_to_Streams -s //guest/perforce_software/sdp/...@31368.
//guest/perforce_software/sdp/dev/Server/Unix/p4/common/bin/sdp_health_check.sh
#10 30979 C. Thomas Tyler Eliminated buildup of temp dirs, e.g.
/tmp/tmp.XXXXXXXXXX.

Added remove_jd_tables() function and calls to it to prevent buildup of new
cruft.

Modified remove_old_logs() to cleanup cruft created previously.

#review-30980 @robert_cowham
#9 30877 C. Thomas Tyler Refined check for SDP checkpoint counters in SDP Health Check.
#8 30826 C. Thomas Tyler Added check for LastSDPCheckpoint* counters.
#7 30305 C. Thomas Tyler Refined JDTmpDir implementation to handle case of empty data sets
consistently.

#review-30306 @robert_cowham
#6 30289 C. Thomas Tyler Added more small logs.
Refined output.
#5 30288 C. Thomas Tyler Added defense against small log files that turn out to be unexpectedly large.
Shellcheck compliance tweaks.
#4 30270 Robert Cowham Fix shellcheck warnings and use of copy_jd_table
#3 30267 Robert Cowham Copy files to be dumped via p4d -jd to tmp dir first
to avoid locks on P4ROOT (or offline_db)

SDP-1087
#2 29984 C. Thomas Tyler Added support for SDP installations that define P4SUPER.

Some SDP environments define the P4SUPER variable that, if set, is
distinct from the P4USER (the default super user).  If P4SUPER is
defined, use it instead of P4USER to run commands that require
super access, such as 'p4 configure' or 'p4 journals'.

See: https://swarm.workshop.perforce.com/reviews/29983
#1 29844 C. Thomas Tyler Added sdp_health_check to SDP package.

Updated docs in Guide and Release Notes to reflect this change.

Added more docs for this in the SDP Guide.

#review-29845 @vkanczes