#!/bin/bash
# =============================================================================
#  qa_summary  --  one-glance server hardware health card
# =============================================================================
#  Connects to the local server and runs a series of checks, then renders
#  a formatted summary box with PASS/WARN/FAIL badges for each item.
#
#  Exit codes: 0 = all pass, 1 = one or more warnings, 2 = one or more failures
#
#  Checks performed:
#    - BIOS firmware version (vs known-latest table below)
#    - IPMI firmware version (vs known-latest table below)
#    - RAID controller firmware + BBU/CacheVault health
#    - Fan mode (Heavy IO for PCIe add-in RAID/10G cards; Optimal for onboard)
#    - IPMI network IP configuration
#    - PSU status (count and health via ipmicfg)
#    - NIC link status and speed
#    - Bond interface health (member count, active slaves, mode)
#    - Per-drive SMART error counters and SSD wear
#    - NVMe health (% used, available spare, critical warnings via nvme-cli)
#    - Virtual drive table (RAID only)
#    - Block device map (lsblk tree)
#    - PCI device summary (RAID, NIC, HBA, GPU)
#    - SEL (System Event Log) -- severity-split: Critical=FAIL, Warning=WARN, Info=dim
#    - SDR (Sensor Data Repository) -- any non-OK sensor readings
# =============================================================================


# =============================================================================
#  CONFIGURATION  --  edit these values when firmware or thresholds change
# =============================================================================

# -- BIOS latest firmware versions --------------------------------------------
# Key   = baseboard model: the last whitespace-separated field of the
#         "Product Name" line from `dmidecode -t 2` (see BOARD_RAW below).
#         It is used as an exact, case-sensitive associative-array key.
# Value = latest known BIOS version string
# Boards that are missing from this table are reported as WARN
# ("latest unknown") rather than FAIL.
# Add new boards here as they are qualified.
# NOTE(review): the X13DDW-A value below is identical to its IPMI_LATEST
# entry -- confirm it was not pasted from the IPMI table by mistake.
declare -A BIOS_LATEST=(
    [H12SSW-AN6]="3.5"
    [X10DDW-i]="3.4"
    [X10DRH-CT]="3.4a"
    [X10DRi]="3.4a"
    [X10DRi-T]="3.4a"
    [X10SLM-F]="3.4"
    [X10SRi-F]="3.4"
    [X11DDW-NT]="4.6"
    [X11DPH-T]="4.7"
    [X11DPU]="4.3"
    [X11SCH-F]="2.8"
    [X11SSH-F]="3.5"
    [X11SSL-CF]="3.6"
    [X12DDW-A6]="2.5"
    [X12STH-SYS]="2.3"
    [X12STW-TF]="2.2"
    [X13DDW-A]="01.03.20"
)

# -- IPMI latest firmware versions --------------------------------------------
# Same key format as BIOS_LATEST above.
# Value = latest known IPMI firmware version. The running version it is
#         compared against is the first N.N.N match in `ipmicfg -ver` output,
#         so every value here should have three dot-separated numeric parts.
# Boards missing from this table are reported as WARN ("latest unknown").
declare -A IPMI_LATEST=(
    [H12SSW-AN6]="01.06.28"
    [X10DDW-i]="3.91.00"
    [X10DRH-CT]="3.94.00"
    [X10DRi]="3.91.00"
    [X10DRi-T]="3.91.00"
    [X10SLM-F]="3.88.00"
    [X10SRi-F]="3.93.00"
    [X11DDW-NT]="01.74.17"
    [X11DPH-T]="01.74.19"
    [X11DPU]="01.74.14"
    [X11SCH-F]="01.74.19"
    [X11SSH-F]="1.74.00"
    [X11SSL-CF]="1.78.00"
    [X12DDW-A6]="01.08.02"
    [X12STH-SYS]="01.06.14"
    [X12STW-TF]="01.08.08"
    [X13DDW-A]="01.03.20"
)

# -- RAID firmware manifest URL -----------------------------------------------
# Internal server that hosts a plain-text file of known-latest RAID FW versions.
# Format per line:  ModelNumber=FirmwareVersion   e.g.  9361=24.21.0-0152
# Set to "EOL" as the value to mark a controller as end-of-life.
# The file is downloaded with wget on every run. Keys are matched
# case-insensitively against each word of the controller's product name, then
# against the first 4-digit number in that name. If the download fails or no
# key matches, the RAID firmware row is reported as WARN (latest unknown).
RAID_FW_URL="http://216.104.40.250/raid/raid_versions"

# -- Timezone for the "Run:" timestamp shown in the header --------------------
# Passed to `date` through the TZ environment variable.
REPORT_TZ="America/Chicago"

# -- SMART error threshold ----------------------------------------------------
# Drives with cumulative SMART errors (reallocated + pending + offline +
# reported + UDMA CRC) at or above this number are marked FAIL.
SMART_ERR_THRESHOLD=75

# -- SSD wear threshold -------------------------------------------------------
# Drives with a wear indicator value below this percentage are marked FAIL.
# (Media_Wearout_Indicator and Wear_Leveling_Count attributes)
SSD_WEAR_THRESHOLD=25

# -- NVMe percentage used threshold -------------------------------------------
# NVMe drives reporting "Percentage Used" at or above this value are marked FAIL.
# nvme smart-log reports this directly; 100 = drive has consumed its rated life.
NVME_PCT_USED_THRESHOLD=90

# -- NVMe available spare threshold -------------------------------------------
# NVMe drives with available spare below this percentage are marked WARN.
# Available spare is the % of reserve blocks remaining for bad block replacement.
NVME_SPARE_WARN_THRESHOLD=10

# -- PSU count expectation ----------------------------------------------------
# How many power supplies are expected in this server.
# Set to 1 for single-PSU systems; 2 for redundant pairs.
# A mismatch between expected and detected count is flagged as WARN.
PSU_EXPECTED_COUNT=2

# -- Box width ----------------------------------------------------------------
# Total inner width of the output box in characters (excluding border chars).
# Rows are padded up to this width but never truncated (see bline), so:
#   - raise W if long rows push the right-hand border out of line;
#   - lower W if your terminal is narrower than W + 2 columns and rows wrap.
W=86

# -- Section toggles ----------------------------------------------------------
# Set any of these to 1 to skip that section entirely (useful for testing
# or on systems where the relevant hardware is not present).
# The checks visible in this file compare against the literal string "1";
# any other value leaves the section enabled.
SKIP_BIOS_IPMI_CHECK=0   # BIOS / IPMI firmware version check
SKIP_RAID_CHECK=0        # RAID controller + BBU/CacheVault
SKIP_FAN_CHECK=0         # Fan mode check (the IPMI network check shares this section)
SKIP_NIC_CHECK=0         # NIC speed / link status + bond check
SKIP_DRIVE_TABLE=0       # Per-drive SMART table
SKIP_NVME_HEALTH=0       # NVMe smart-log health table
SKIP_SEL_SDR=0           # SEL event log + SDR sensor readings
SKIP_VD_TABLE=0          # Virtual drive table (RAID only)
SKIP_PSU_CHECK=0         # Power supply status via ipmicfg
SKIP_LSPCI=0             # PCI device summary table

# =============================================================================
#  END CONFIGURATION
# =============================================================================


# =============================================================================
#  COLORS AND BOX DRAWING
# =============================================================================

# Attribute controls
RST=$'\033[0m'      # clear every attribute
BOLD=$'\033[1m'     # bold / bright

# Foreground colours
R=$'\033[1;31m'     # bright red     -- failures
Y=$'\033[1;33m'     # bright amber   -- warnings
G=$'\033[1;32m'     # bright green   -- passes
C=$'\033[1;36m'     # bright cyan    -- box borders and labels
M=$'\033[1;35m'     # bright magenta -- OS line
DIM=$'\033[0;37m'   # plain grey     -- secondary info
BLK=$'\033[0;30m'   # black          -- text drawn on a coloured badge

# Background colours (badge fills)
BGR=$'\033[41m'     # red   -- FAIL badge
BGY=$'\033[43m'     # amber -- WARN badge
BGG=$'\033[42m'     # green -- PASS badge

# Box-drawing primitives. Each glyph helper writes one UTF-8 encoded character
# (as raw bytes, via octal escapes) with no trailing newline.

# rep STRING COUNT
# Writes STRING to stdout COUNT times, with no separator and no newline.
rep() {
    local unit="$1" out='' n
    for (( n = $2; n > 0; n-- )); do
        out+="$unit"
    done
    printf '%s' "$out"
}

_TL()  { printf '\342\225\224'; }   # ╔  top-left corner
_EQ()  { printf '\342\225\220'; }   # ═  horizontal double line
_TR()  { printf '\342\225\227'; }   # ╗  top-right corner
_ML()  { printf '\342\225\240'; }   # ╠  mid-left T-junction
_MR()  { printf '\342\225\243'; }   # ╣  mid-right T-junction
_VB()  { printf '\342\225\221'; }   # ║  vertical border
_BL()  { printf '\342\225\232'; }   # ╚  bottom-left corner
_BR()  { printf '\342\225\235'; }   # ╝  bottom-right corner
_HD()  { printf '\342\224\200'; }   # ─  thin horizontal line (section dividers)

# hline
# Writes a double horizontal rule W characters wide.
hline() {
    rep "$(_EQ)" "$W"
}

# box_open TITLE [COLOR]
# Opens a box section: a blank line, the top border, one row holding TITLE
# centred in bold amber, then a mid-border. COLOR is the frame colour and
# defaults to $C. When the free space is odd, the extra column goes on the right.
box_open() {
    local heading="$1" frame="${2:-$C}"
    local spare=$(( W - ${#heading} - 2 ))   # columns left beside " TITLE "
    local left=$(( spare / 2 ))
    local right=$(( spare - left ))
    local bar edge
    bar=$(hline)
    edge=$(_VB)

    printf '\n'
    printf "%b%s%s%s%b\n" "$frame" "$(_TL)" "$bar" "$(_TR)" "$RST"
    # Title row, emitted in three pieces: left frame, title, right frame.
    printf "%b%s%s%b" "$frame" "$edge" "$(rep ' ' "$left")" "$RST"
    printf "%b%b %s %b" "$BOLD" "$Y" "$heading" "$RST"
    printf "%b%s%s%b\n" "$frame" "$(rep ' ' "$right")" "$edge" "$RST"
    printf "%b%s%s%s%b\n" "$frame" "$(_ML)" "$bar" "$(_MR)" "$RST"
}

# box_close [COLOR]
# Writes the bottom border that ends a box. COLOR defaults to $C.
box_close() {
    local frame="${1:-$C}"
    printf "%b%s%s%s%b\n" "$frame" "$(_BL)" "$(hline)" "$(_BR)" "$RST"
}

# bline TEXT [BORDER_COLOR]
# Writes one framed row: left border, TEXT, space padding up to W visible
# columns, right border. ANSI escape sequences in TEXT are removed before the
# visible length is measured. TEXT longer than W is printed as-is with no
# padding (it is never truncated). BORDER_COLOR defaults to $C.
bline() {
    local content="$1" frame="${2:-$C}"
    local plain fill edge
    plain=$(sed -E 's/\x1b\[[0-9;:]*[mGKHFJA-Za-z]//g' <<< "$content")
    fill=$(( W - ${#plain} ))
    if (( fill < 0 )); then
        fill=0
    fi
    edge=$(_VB)
    printf "%b%s%b%s%s%b%s%b\n" \
        "$frame" "$edge" "$RST" \
        "$content" "$(rep ' ' "$fill")" \
        "$frame" "$edge" "$RST"
}

# bdiv
# Writes a framed row holding a thin grey rule: two spaces of indent followed
# by W-2 thin-line glyphs. Used as a separator between groups inside a box.
bdiv() {
    local rule
    rule=$(rep "$(_HD)" $(( W - 2 )))
    bline "$(printf "  %b%s%b" "$DIM" "$rule" "$RST")"
}


# =============================================================================
#  VERDICT HELPERS
# =============================================================================

# Running tallies -- verdict_row bumps one of these for each PASS/WARN/FAIL row.
PASSES=0
WARNS=0
FAILS=0

# Collected findings for the summary table printed at the bottom of the box.
# Every entry has the form "Label|Reason".
WARN_ITEMS=()
FAIL_ITEMS=()

# Badge helpers -- each writes a six-column label (" PASS ", " FAIL ", ...)
# wrapped in its colour codes and followed by a reset, with no newline.
badge_pass() {
    printf '%b PASS %b' "${BGG}${BLK}${BOLD}" "$RST"   # black on green
}
badge_fail() {
    printf '%b FAIL %b' "${BGR}${BOLD}" "$RST"         # bold on red
}
badge_warn() {
    printf '%b WARN %b' "${BGY}${BLK}${BOLD}" "$RST"   # black on amber
}
badge_skip() {
    printf '%b SKIP %b' "$DIM" "$RST"                  # grey, no fill
}

# verdict_row LABEL DETAIL VERDICT
# Prints a full-width box line: left-aligned label+detail, right-aligned badge.
#   LABEL    short row name, padded to 10 columns
#   DETAIL   free text shown after the label, coloured by verdict
#   VERDICT  PASS | FAIL | WARN | SKIP | INFO
# PASS/FAIL/WARN also increment the matching global counter (PASSES/FAILS/
# WARNS), so call this in the current shell -- not inside $(...) -- or the
# count is lost. INFO (and any unrecognised verdict) prints the row with no
# badge and touches no counter.
verdict_row() {
    local label="$1" detail="$2" verdict="$3"
    local badge="" lc="" left plain pad
    case "$verdict" in
        PASS) badge=$(badge_pass); lc="$G";   PASSES=$(( PASSES+1 )) ;;
        FAIL) badge=$(badge_fail); lc="$R";   FAILS=$(( FAILS+1 ))   ;;
        WARN) badge=$(badge_warn); lc="$Y";   WARNS=$(( WARNS+1 ))   ;;
        SKIP) badge=$(badge_skip); lc="$DIM"  ;;
        INFO) badge=""; lc="$DIM" ;;
    esac

    left=$(printf "  %b%-10s%b %b%s%b" "$BOLD" "$label" "$RST" "$lc" "$detail" "$RST")

    if [ -z "$badge" ]; then
        bline "$left"
    else
        # Measure the visible width in characters, exactly as bline does.
        # Counting bytes instead would over-count multi-byte UTF-8 text and
        # pull the badge left of the column where every other badge sits.
        plain=$(printf '%s' "$left" | sed -E 's/\x1b\[[0-9;:]*[mGKHFJA-Za-z]//g')
        pad=$(( W - ${#plain} - 6 ))    # 6 = visible width of a badge
        if (( pad < 1 )); then
            pad=1                       # always keep a gap before the badge
        fi
        bline "$(printf "%s%${pad}s%s" "$left" "" "$badge")"
    fi
}

# cont_row TEXT
# Writes a badge-less follow-on row: TEXT in grey, indented so it lines up
# under the detail column of the row above (for example the second and later
# DIMM configuration lines).
cont_row() {
    local indent="             "
    bline "$(printf '%s%b%s%b' "$indent" "$DIM" "$1" "$RST")"
}

# fw_normalize VERSION
# Converts a firmware version into a string that sorts correctly with a plain
# lexicographic comparison ([[ a < b ]]).
#   - The first three dot-separated segments are zero-padded to 2 digits:
#       "3.9.1" -> "03.09.01",  "3.4" -> "03.04.00"
#     Missing or non-numeric segments count as 0.
#   - A trailing letter revision (digits immediately followed by letters at
#     the very end, e.g. the "a" in "3.4a") is lower-cased and appended:
#       "3.4a" -> "03.04.00a"
#     so 3.4 < 3.4a < 3.4b < 3.5. Without this, "3.4" and "3.4a" normalise to
#     the same string and a board still on 3.4 passes against a 3.4a target.
# Purely numeric versions produce the same output as before.
# Segments above 99 are not supported (they would no longer sort correctly).
fw_normalize() {
    printf '%s\n' "$1" | awk -F. '{
        rev = ""
        if (match($0, /[0-9][A-Za-z]+$/))
            rev = tolower(substr($0, RSTART + 1, RLENGTH - 1))
        printf "%02d.%02d.%02d%s", $1+0, $2+0, $3+0, rev
    }'
}


# =============================================================================
#  DATA GATHERING
#  All slow commands run here before any output is printed, so the box renders
#  in a single pass without gaps or interleaved output.
# =============================================================================

# -- System identity ----------------------------------------------------------
# Node name, a run timestamp rendered in REPORT_TZ, and the distribution's
# PRETTY_NAME (first match across /etc/*release, text between the first pair
# of double quotes).
HOSTNAME=$(uname -n)
RUN_TIME=$(TZ="$REPORT_TZ" date '+%Y-%m-%d %I:%M:%S %p %Z')
OS_NAME=$(grep PRETTY_NAME /etc/*release 2>/dev/null | sed -n '1p' | cut -d'"' -f2)

# Baseboard manufacturer and model, both read from DMI type 2.
# BOARD_RAW is the last whitespace-separated field of the first "Product Name"
# line (e.g. "X11SSH-F") and is the lookup key for the BIOS_LATEST and
# IPMI_LATEST tables.
_dmi_board=$(dmidecode -t 2 2>/dev/null)
BOARD_MFG=$(sed -n '/Manufacturer/{p;q;}' <<< "$_dmi_board" | cut -d: -f2 | xargs)
BOARD_RAW=$(awk '/Product Name/ { print $NF; exit }' <<< "$_dmi_board")
unset _dmi_board

# -- Network: IP and MAC ------------------------------------------------------
# Every IPv4 address bound to eth0, prefix length removed, with loopback
# (127.*) and link-local (169.254.*) addresses filtered out.
ALL_IPS=$(ip -4 -o addr show eth0 2>/dev/null \
    | awk '$3 == "inet" { split($4, cidr, "/"); print cidr[1] }' \
    | grep -vE '^127\.|^169\.254\.')
PRIMARY_IP=${ALL_IPS%%$'\n'*}            # first address in the list
IP_COUNT=$(grep -c . <<< "$ALL_IPS")     # number of non-empty lines

# Show the address count alongside the primary IP when eth0 carries several.
if [ "$IP_COUNT" -gt 1 ]; then
    IP_DISPLAY="${PRIMARY_IP}  (${IP_COUNT} IPs detected)"
else
    IP_DISPLAY="${PRIMARY_IP}"
fi

# Read the MAC address from eth0 when that interface exists. Otherwise fall
# back to another interface:
#   - if PRIMARY_IP is known, the interface that owns exactly that address;
#   - if PRIMARY_IP is empty (always the case when eth0 is missing, because
#     PRIMARY_IP is derived from eth0), the first interface carrying a
#     non-loopback, non-link-local IPv4 address.
# The address is compared as a literal string. A regex match would treat an
# empty PRIMARY_IP as "match anything" and select the loopback interface, and
# would also let 10.0.0.1 match 10.0.0.10.
# Sets ETH0_MAC (may stay empty) and MAC_IFACE (the interface it came from).
ETH0_MAC=""
MAC_IFACE=""
if [ -r /sys/class/net/eth0/address ]; then
    ETH0_MAC=$(cat /sys/class/net/eth0/address 2>/dev/null)
    MAC_IFACE="eth0"
else
    primary_iface=$(ip -4 -o addr show 2>/dev/null | awk -v want="$PRIMARY_IP" '
        $3 != "inet" || $2 == "lo"           { next }
        { split($4, cidr, "/"); addr = cidr[1] }
        addr ~ /^127\./ || addr ~ /^169\.254\./ { next }
        want == "" || addr == want           { print $2; exit }
    ')
    if [ -n "$primary_iface" ]; then
        ETH0_MAC=$(cat "/sys/class/net/${primary_iface}/address" 2>/dev/null)
        MAC_IFACE="$primary_iface"
    fi
fi

# -- CPU ----------------------------------------------------------------------
# Model name of the first CPU listed, with marketing noise removed:
# "(R)", "(TM)", the word "CPU", the " @ x.xGHz" suffix, and doubled spaces.
CPU_NAME=$(grep -m1 "model name" /proc/cpuinfo \
    | cut -d: -f2 \
    | xargs \
    | sed -E \
        -e 's/\(R\)//g' \
        -e 's/\(TM\)//g' \
        -e 's/CPU //' \
        -e 's/ @ [0-9.]+GHz//' \
        -e 's/  +/ /g')
# Sockets = number of distinct "physical id" lines; threads = logical CPUs.
CPU_COUNT=$(grep "physical id" /proc/cpuinfo | sort | uniq | wc -l)
CPU_THREADS=$(grep -c "model name" /proc/cpuinfo)
printf -v CPU_DETAIL '%sx %s (%st)' "$CPU_COUNT" "$CPU_NAME" "$CPU_THREADS"

# -- RAM / DIMM inventory -----------------------------------------------------
# Parse dmidecode type 17 (Memory Device) into pipe-delimited records, one
# line per populated slot:
#     size|type|speed|manufacturer|part-number
# How the awk program works:
#   - Each "Memory Device" header line, and end of input, flushes the record
#     collected so far and resets all five fields.
#   - The flush guard prints nothing when Size is empty or contains
#     "No Module" -- that is how empty slots are skipped.
#   - "Type Detail:" lines are not taken as the Type.
#   - Speed comes from the "Speed:" line; "Configured Memory Speed:" /
#     "Configured Clock Speed:" are used only when Speed is still empty or
#     contains "Unknown" at the point those lines are read.
#   - Trailing whitespace is trimmed from the part number only.
# DMI17 keeps the raw dmidecode output for reuse by the Registered-memory check.
# DIMM_COUNT is the number of non-empty records (populated slots).
DMI17=$(dmidecode -t 17 2>/dev/null)
DIMM_LINES=$(printf '%s' "$DMI17" | awk '
    function flush() { if(size!=""&&size!~/No Module/) print size"|"type"|"speed"|"mfg"|"part; size="";type="";speed="";mfg="";part="" }
    BEGIN{size="";type="";speed="";mfg="";part=""}
    /^Memory Device$/                        {flush()}
    /^[ \t]*Size:/                           {sub(/^[ \t]*Size:[ \t]*/,"");size=$0}
    /^[ \t]*Type:/&&!/Type Detail/           {sub(/^[ \t]*Type:[ \t]*/,"");type=$0}
    /^[ \t]*Speed:/&&!/Configured/           {sub(/^[ \t]*Speed:[ \t]*/,"");speed=$0}
    /^[ \t]*Configured Memory Speed:/        {if(speed==""||speed~/Unknown/){sub(/^[ \t]*Configured Memory Speed:[ \t]*/,"");speed=$0}}
    /^[ \t]*Configured Clock Speed:/         {if(speed==""||speed~/Unknown/){sub(/^[ \t]*Configured Clock Speed:[ \t]*/,"");speed=$0}}
    /^[ \t]*Manufacturer:/                   {sub(/^[ \t]*Manufacturer:[ \t]*/,"");mfg=$0}
    /^[ \t]*Part Number:/                    {sub(/^[ \t]*Part Number:[ \t]*/,"");sub(/[ \t]+$/,"");part=$0}
    END{flush()}
')
DIMM_COUNT=$(printf '%s\n' "$DIMM_LINES" | grep -cv '^$')

# Collapse identical DIMM records into "count|record" lines, one per distinct
# configuration. DIMM_UNIQUE is how many distinct configurations exist; a
# value above 1 means the sticks are not all the same.
DIMM_GROUPS=$(printf '%s\n' "$DIMM_LINES" \
    | sed '/^$/d' \
    | sort \
    | uniq -c \
    | awk '{ n = $1; $1 = ""; print n "|" substr($0, 2) }')
DIMM_UNIQUE=$(grep -c . <<< "$DIMM_GROUPS")

# ECC is read from DMI type 16 (Physical Memory Array); Registered is read
# from the "Type Detail" lines of the type 17 output captured earlier.
HAS_ECC="no"
if dmidecode -t 16 2>/dev/null \
    | grep -qiE "Error Correction Type:[[:space:]]*(Single|Multi|ECC)"; then
    HAS_ECC="yes"
fi
HAS_REG="no"
if grep -qiE "Type Detail:.*Registered" <<< "$DMI17"; then
    HAS_REG="yes"
fi

# Tag shown on the RAM line: "ECC", "REG", "ECC/REG", or empty.
case "${HAS_ECC}/${HAS_REG}" in
    yes/yes) RAM_TYPE_TAG="ECC/REG" ;;
    yes/no)  RAM_TYPE_TAG="ECC"     ;;
    no/yes)  RAM_TYPE_TAG="REG"     ;;
    *)       RAM_TYPE_TAG=""        ;;
esac

# Turn each "count|size|type|speed|mfg|part" group into one display string,
# for example "2x 32GB DDR4 PART_NUMBER @3200", collected in RAM_DISPLAY_LINES.
RAM_DISPLAY_LINES=()
while IFS='|' read -r count size type speed mfg part; do
    if [ -z "$count" ]; then
        continue
    fi
    size=${size// /}            # "32 GB" -> "32GB"
    speed=${speed/ MT\/s/}      # drop the unit, keep the number
    speed=${speed/ MHz/}
    # Prefer the part number; use the manufacturer when it is missing or
    # "Unknown"; show neither when both are unknown.
    case "$part" in
        ""|Unknown) part="$mfg" ;;
    esac
    if [ "$part" = "Unknown" ]; then
        part=""
    fi
    line="${count}x ${size} ${type}"
    line+="${part:+ ${part}}"
    if [ -n "$speed" ] && [ "$speed" != "Unknown" ]; then
        line+=" @${speed}"
    fi
    RAM_DISPLAY_LINES+=("$line")
done <<< "$DIMM_GROUPS"

# RAM verdict, first matching rule wins:
#   FAIL -- no populated DIMM slots were found
#   FAIL -- more than one distinct DIMM configuration (mismatched sticks)
#   WARN -- memory is neither ECC nor Registered
#   PASS -- anything else
# RAM_FAIL_REASON carries the explanation and is empty on PASS.
RAM_FAIL_REASON=""
if [ "$DIMM_COUNT" -eq 0 ]; then
    RAM_FAIL_REASON="no DIMMs detected"
elif [ "$DIMM_UNIQUE" -gt 1 ]; then
    RAM_FAIL_REASON="mismatched sticks"
elif [ "$HAS_ECC" = "no" ] && [ "$HAS_REG" = "no" ]; then
    RAM_FAIL_REASON="non-ECC memory"
fi

case "$RAM_FAIL_REASON" in
    "")               RAM_VERDICT="PASS" ;;
    "non-ECC memory") RAM_VERDICT="WARN" ;;
    *)                RAM_VERDICT="FAIL" ;;
esac

# =============================================================================
#  SECTION: BIOS / IPMI firmware check
#  Reads current versions via dmidecode and ipmicfg, then compares against
#  the BIOS_LATEST / IPMI_LATEST tables at the top of this script.
#
#  Sets BIOS_VERDICT/BIOS_DETAIL and IPMI_VERDICT/IPMI_DETAIL:
#    SKIP -- SKIP_BIOS_IPMI_CHECK=1
#    FAIL -- version unreadable, or older than the table entry
#    WARN -- board has no table entry (unknown boards are not blocked)
#    PASS -- version is at or above the table entry
#  FAIL and WARN outcomes are also appended to FAIL_ITEMS / WARN_ITEMS.
# =============================================================================
if [ "${SKIP_BIOS_IPMI_CHECK:-0}" = "1" ]; then
    BIOS_VERDICT="SKIP"; BIOS_DETAIL="skipped"
    IPMI_VERDICT="SKIP"; IPMI_DETAIL="skipped"
else
    CURRENT_BIOS=$(dmidecode -s bios-version 2>/dev/null | tr -d '[:space:]')
    CURRENT_IPMI=$(ipmicfg -ver 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1)

    # Only consult the tables when the board model is known: an empty key is
    # not a valid associative-array subscript and makes bash print
    # "bad array subscript".
    LATEST_BIOS=""
    LATEST_IPMI=""
    if [ -n "$BOARD_RAW" ]; then
        LATEST_BIOS="${BIOS_LATEST[$BOARD_RAW]:-}"
        LATEST_IPMI="${IPMI_LATEST[$BOARD_RAW]:-}"
    fi

    # BIOS comparison -- normalize both sides before lexicographic compare
    if [ -z "$CURRENT_BIOS" ]; then
        BIOS_VERDICT="FAIL"; BIOS_DETAIL="unreadable"
        # Report this as unreadable; it is not known to be outdated.
        FAIL_ITEMS+=("BIOS|Version unreadable")
    elif [ -z "$LATEST_BIOS" ]; then
        BIOS_VERDICT="WARN"; BIOS_DETAIL="${CURRENT_BIOS} (latest unknown)"
        WARN_ITEMS+=("BIOS|${BIOS_DETAIL}")
    else
        cn=$(fw_normalize "$CURRENT_BIOS"); ln=$(fw_normalize "$LATEST_BIOS")
        if [[ "$cn" < "$ln" ]]; then
            BIOS_VERDICT="FAIL"; BIOS_DETAIL="${CURRENT_BIOS} -> ${LATEST_BIOS}"
            FAIL_ITEMS+=("BIOS|Outdated: ${BIOS_DETAIL}")
        else
            BIOS_VERDICT="PASS"; BIOS_DETAIL="${CURRENT_BIOS}"
        fi
    fi

    # IPMI comparison -- same normalization logic as BIOS
    if [ -z "$CURRENT_IPMI" ]; then
        IPMI_VERDICT="FAIL"; IPMI_DETAIL="unreadable"
        FAIL_ITEMS+=("IPMI FW|Version unreadable")
    elif [ -z "$LATEST_IPMI" ]; then
        IPMI_VERDICT="WARN"; IPMI_DETAIL="${CURRENT_IPMI} (latest unknown)"
        WARN_ITEMS+=("IPMI FW|${IPMI_DETAIL}")
    else
        cn=$(fw_normalize "$CURRENT_IPMI"); ln=$(fw_normalize "$LATEST_IPMI")
        if [[ "$cn" < "$ln" ]]; then
            IPMI_VERDICT="FAIL"; IPMI_DETAIL="${CURRENT_IPMI} -> ${LATEST_IPMI}"
            FAIL_ITEMS+=("IPMI FW|Outdated: ${IPMI_DETAIL}")
        else
            IPMI_VERDICT="PASS"; IPMI_DETAIL="${CURRENT_IPMI}"
        fi
    fi
fi # END: BIOS / IPMI check


# =============================================================================
#  SECTION: RAID controller + BBU/CacheVault
#  Uses storcli to detect the controller, count physical drives, check BBU/CV
#  health, and compare firmware against the manifest at RAID_FW_URL.
# =============================================================================

# One-line-per-device lspci listing, captured once and kept in LSPCI.
LSPCI=$(lspci 2>/dev/null)

# HAS_RAID_10G decides which fan mode is expected:
#   1 = Heavy IO  -- a PCIe add-in RAID card or add-in 10GbE NIC is installed
#   0 = Optimal   -- only onboard controllers were found
#
# A device counts as an add-in card when `lspci -v` lists a "Physical Slot:"
# line for it. Onboard devices carry no such line, even on AMD EPYC where
# they show up on high bus numbers (43+) because everything is routed through
# the PCIe fabric -- so the slot test is more dependable than the bus number.
HAS_RAID_10G=0

# Bus addresses (BDFs) of every device that reports a physical slot.
_SLOTTED_BDFS=$(lspci -v 2>/dev/null | awk '
    /^[0-9a-f]/      { bdf = $1; slotted = 0 }
    /Physical Slot:/ { slotted = 1 }
    /^$/ && slotted  { print bdf }
')

# Device-description patterns that call for Heavy IO.
_raid_pat="RAID bus controller|MegaRAID"
_tengig_pat="10-Gigabit|10GbE|10GBASE|X520|X540|X550|X710|X722|XL710|XXV710|E810|82599|Connect[Xx]-[3-7]|57711|57810|57840|NetXtreme-E|BCM578"

# Stop at the first slotted device whose lspci line matches either pattern.
if [ -n "$_SLOTTED_BDFS" ]; then
    while IFS= read -r _bdf; do
        _dev_line=$(grep -i "^${_bdf}" <<< "$LSPCI")
        if grep -qiE "$_raid_pat" <<< "$_dev_line" \
            || grep -qiE "$_tengig_pat" <<< "$_dev_line"; then
            HAS_RAID_10G=1
            break
        fi
    done <<< "$_SLOTTED_BDFS"
fi
unset _SLOTTED_BDFS _bdf _dev_line _raid_pat _tengig_pat

# Read controller identity and physical drive count from storcli.
# `storcli /c0 show` is run once and the three fields are parsed from that one
# capture, so the slow tool is not called repeatedly and all three values
# describe the same snapshot.
#   CTRL_NAME      value of the "Product Name" line (empty when there is no
#                  controller or storcli is unavailable)
#   CTRL_FW        value of the "FW Package Build" line
#   RAID_PD_COUNT  value of the first "Physical Drives" line; forced to 0 when
#                  missing or not a plain non-negative integer
_C0_SHOW=$(storcli /c0 show 2>/dev/null)
CTRL_NAME=$(grep 'Product Name' <<< "$_C0_SHOW" | cut -d'=' -f2 | xargs)
CTRL_FW=$(grep 'FW Package Build' <<< "$_C0_SHOW" | cut -d'=' -f2 | xargs)
RAID_PD_COUNT=$(grep -i "Physical Drives" <<< "$_C0_SHOW" | head -1 \
    | awk -F'=' '{print $2}' | xargs)
unset _C0_SHOW
case "$RAID_PD_COUNT" in
    ''|*[!0-9]*) RAID_PD_COUNT=0 ;;
esac

# IS_REAL_RAID = controller present AND drives behind it.
# A controller with 0 drives is an HBA (pass-through mode) -- handled separately.
IS_REAL_RAID=false
if [ -n "$CTRL_NAME" ] && [ "$RAID_PD_COUNT" -gt 0 ]; then
    IS_REAL_RAID=true
fi

# -- BBU / CacheVault check ---------------------------------------------------
# Runs only for a real RAID controller (IS_REAL_RAID). Looks for a CacheVault
# module first, then a traditional battery (BBU), and sets:
#   BBU_CV_DETAIL   text for the row ("CacheVault <model>  <state>", ...)
#   BBU_CV_VERDICT  INFO when the state looks healthy, WARN otherwise
# Every WARN is also recorded in WARN_ITEMS. Both variables stay empty when
# the check does not run.
BBU_CV_DETAIL=""
BBU_CV_VERDICT=""
if $IS_REAL_RAID; then
    CV_OUT=$(storcli /c0/cv show 2>/dev/null)
    BBU_OUT=$(storcli /c0/bbu show 2>/dev/null)

    if grep -qiE "Cachevault|CacheVault Info" <<< "$CV_OUT"; then
        # CacheVault: the first data row after the "Model  State ..." table
        # header holds the model in column 1 and the state in column 2.
        _cv_row=$(awk '/Model[[:space:]]+State[[:space:]]/{found=1;next} found && /[A-Za-z0-9]/{print;exit}' <<< "$CV_OUT")
        read -r CV_MODEL CV_STATE _cv_rest <<< "$_cv_row"
        unset _cv_row _cv_rest
        CV_MODEL=${CV_MODEL:-"(unknown)"}
        CV_STATE=${CV_STATE:-unknown}
        BBU_CV_DETAIL="CacheVault ${CV_MODEL}  ${CV_STATE}"
        if grep -qi "Optimal" <<< "$CV_STATE"; then
            BBU_CV_VERDICT="INFO"
        else
            BBU_CV_VERDICT="WARN"
            WARN_ITEMS+=("BBU/CV|CacheVault state: ${CV_STATE}")
        fi

    elif grep -qiE "Battery State|Battery Pack" <<< "$BBU_OUT"; then
        # Battery-backed unit: model and state are "Key = Value" lines.
        BBU_MODEL=$(grep -iE -m1 "^Model" <<< "$BBU_OUT" | cut -d'=' -f2 | xargs)
        BBU_STATE=$(grep -iE -m1 "Battery State" <<< "$BBU_OUT" | cut -d'=' -f2 | xargs)
        BBU_MODEL=${BBU_MODEL:-"(unknown)"}
        BBU_STATE=${BBU_STATE:-unknown}
        BBU_CV_DETAIL="BBU ${BBU_MODEL}  ${BBU_STATE}"
        if grep -qi "Optimal\|Fully Charged\|Charging" <<< "$BBU_STATE"; then
            BBU_CV_VERDICT="INFO"
        else
            BBU_CV_VERDICT="WARN"
            WARN_ITEMS+=("BBU/CV|BBU state: ${BBU_STATE}")
        fi

    else
        # A RAID controller with neither CacheVault nor BBU.
        BBU_CV_DETAIL="none detected"
        BBU_CV_VERDICT="WARN"
        WARN_ITEMS+=("BBU/CV|No battery backup on RAID controller")
    fi
fi

# -- RAID controller firmware check -------------------------------------------

# _raid_fw_lookup MANIFEST KEY
# Prints the value of the first "KEY=value" line in MANIFEST, with all
# whitespace removed. KEY is compared case-insensitively as a literal string,
# so regex characters in a controller name cannot cause a false match.
# Prints nothing when KEY is empty or not present.
_raid_fw_lookup() {
    [ -n "$2" ] || return 0
    printf '%s\n' "$1" \
        | awk -F= -v key="$2" 'tolower($1) == tolower(key) { print $2; exit }' \
        | tr -d '[:space:]'
}

# _raid_fw_key VERSION
# Prints VERSION as a 20-digit sort key: dashes become dots, then each of the
# first four segments is zero-padded to 5 digits ("24.21.0-0152" ->
# "00024000210000000152"). Keys compare correctly with [[ a < b ]].
_raid_fw_key() {
    printf '%s\n' "$1" | tr '-' '.' \
        | awk -F. '{printf "%05d%05d%05d%05d",$1+0,$2+0,$3+0,$4+0}'
}

# Sets RAID_VERDICT / RAID_DETAIL:
#   SKIP -- no controller
#   PASS -- HBA / pass-through (no drives behind it), or firmware is current
#   WARN -- controller is EOL, or no manifest entry could be found
#   FAIL -- firmware is older than the manifest entry
if [ -z "$CTRL_NAME" ]; then
    # storcli returned nothing -- no controller present
    RAID_VERDICT="SKIP"; RAID_DETAIL="no controller"

elif ! $IS_REAL_RAID; then
    # Controller found but no drives -- HBA / pass-through mode, no FW check needed
    RAID_VERDICT="PASS"; RAID_DETAIL="${CTRL_NAME} HBA  FW ${CTRL_FW:-?}"

else
    # Fetch the known-latest FW manifest from the internal server. Bound the
    # wait: wget's defaults (long timeouts, 20 retries) would stall the whole
    # report for minutes when the server is unreachable.
    RAID_FW_LIST=$(wget -q --timeout=10 --tries=1 -O- "$RAID_FW_URL" 2>/dev/null)
    LATEST_RAID_FW=""

    if [ -n "$RAID_FW_LIST" ]; then
        # Try each word of the controller name as a manifest key,
        # e.g. "MegaRAID SAS 9361-8i" -> "MegaRAID", "SAS", "9361-8i".
        # read -a splits on whitespace without filename expansion.
        read -ra _ctrl_words <<< "$CTRL_NAME"
        for tok in "${_ctrl_words[@]}"; do
            LATEST_RAID_FW=$(_raid_fw_lookup "$RAID_FW_LIST" "$tok")
            [ -n "$LATEST_RAID_FW" ] && break
        done
        unset _ctrl_words
        # Fall back to the first 4-digit model number in the name
        if [ -z "$LATEST_RAID_FW" ]; then
            mod=$(echo "$CTRL_NAME" | grep -oE '[0-9]{4}' | head -1)
            LATEST_RAID_FW=$(_raid_fw_lookup "$RAID_FW_LIST" "$mod")
        fi
    fi

    if [ "$LATEST_RAID_FW" = "EOL" ]; then
        RAID_VERDICT="WARN"; RAID_DETAIL="${CTRL_NAME} EOL  FW ${CTRL_FW}"
        WARN_ITEMS+=("RAID|Controller EOL: ${CTRL_NAME}")
    elif [ -z "$LATEST_RAID_FW" ]; then
        RAID_VERDICT="WARN"; RAID_DETAIL="${CTRL_NAME}  FW ${CTRL_FW} (latest unknown)"
        WARN_ITEMS+=("RAID|FW version unknown for ${CTRL_NAME}")
    else
        cr=$(_raid_fw_key "$CTRL_FW")
        lr=$(_raid_fw_key "$LATEST_RAID_FW")
        if [[ "$cr" < "$lr" ]]; then
            RAID_VERDICT="FAIL"; RAID_DETAIL="${CTRL_NAME}  ${CTRL_FW} -> ${LATEST_RAID_FW}"
            FAIL_ITEMS+=("RAID|Outdated FW: ${CTRL_FW} -> ${LATEST_RAID_FW}")
        else
            RAID_VERDICT="PASS"; RAID_DETAIL="${CTRL_NAME}  FW ${CTRL_FW}"
        fi
    fi
fi


# =============================================================================
#  SECTION: Fan mode + IPMI network
#  Reads the current fan speed mode from ipmicfg and compares it to what's
#  expected based on whether a RAID card or 10GbE NIC was detected above.
#  Also verifies that the IPMI interface has an IPv4 address configured.
# =============================================================================

# _fan_ipmi_check
# Sets FAN_VERDICT/FAN_DETAIL and IPMINET_VERDICT/IPMINET_DETAIL (globals).
# With SKIP_FAN_CHECK=1 both pairs are set to SKIP/"skipped", so the IPMI
# network row always has a defined verdict even though it shares this toggle.
# Reads HAS_RAID_10G; appends to WARN_ITEMS / FAIL_ITEMS.
_fan_ipmi_check() {
    if [ "${SKIP_FAN_CHECK:-0}" = "1" ]; then
        FAN_VERDICT="SKIP";     FAN_DETAIL="skipped"
        IPMINET_VERDICT="SKIP"; IPMINET_DETAIL="skipped"
        return 0
    fi

    # Read raw fan mode string and normalize to a short canonical label
    FAN_MODE_RAW=$(ipmicfg -fan 2>/dev/null \
        | grep -i "Current Fan Speed Mode" | sed 's/.*:[[:space:]]*//' | tr -d '[]' | xargs)
    case "${FAN_MODE_RAW,,}" in
        *"heavy io"*) FAN_MODE="Heavy IO"      ;;
        *optimal*)    FAN_MODE="Optimal"       ;;
        *standard*)   FAN_MODE="Standard"      ;;
        *full*)       FAN_MODE="Full Speed"    ;;
        *)            FAN_MODE="$FAN_MODE_RAW" ;;
    esac

    # Expected mode: Heavy IO when RAID/10G is present, Optimal otherwise.
    # Standard is also acceptable when Optimal is expected (both are low-power modes).
    if [ "${HAS_RAID_10G:-0}" = "1" ]; then
        EXPECTED_FAN="Heavy IO"
    else
        EXPECTED_FAN="Optimal"
    fi

    if [ -z "$FAN_MODE" ]; then
        # NOTE(review): unlike the WARN branch, this FAIL adds nothing to
        # FAIL_ITEMS -- confirm the summary picks it up somewhere else.
        FAN_VERDICT="FAIL"; FAN_DETAIL="cannot read"
    elif [ "$EXPECTED_FAN" = "Heavy IO" ] && [ "$FAN_MODE" = "Heavy IO" ]; then
        FAN_VERDICT="PASS"; FAN_DETAIL="${FAN_MODE}"
    elif [ "$EXPECTED_FAN" = "Optimal" ] \
        && { [ "$FAN_MODE" = "Standard" ] || [ "$FAN_MODE" = "Optimal" ]; }; then
        FAN_VERDICT="PASS"; FAN_DETAIL="${FAN_MODE}"
    else
        FAN_VERDICT="WARN"; FAN_DETAIL="${FAN_MODE} -- want ${EXPECTED_FAN}"
        WARN_ITEMS+=("Fan|Mode is ${FAN_MODE}, expected ${EXPECTED_FAN}")
    fi

    # -- IPMI network check ---------------------------------------------------
    # Verify IPMI has a real IP configured (not 0.0.0.0 / missing).
    IPMI_IP=$(ipmicfg -summary 2>/dev/null | grep "IPv4 Address" | cut -d: -f2 | xargs)
    if [ -z "$IPMI_IP" ] || [ "$IPMI_IP" = "0.0.0.0" ]; then
        IPMINET_VERDICT="FAIL"; IPMINET_DETAIL="no IPMI IP"
        FAIL_ITEMS+=("IPMI Net|No IP configured")
    else
        IPMINET_VERDICT="PASS"; IPMINET_DETAIL="${IPMI_IP}"
    fi
}
_fan_ipmi_check
# END: Fan + IPMI network


# =============================================================================
#  SECTION: NIC speed and link status
#  Enumerates eth*, eno*, and bond* interfaces (excluding VLAN sub-interfaces)
#  and checks speed + link state via ethtool.
#  Result: NIC_LINES, one "iface|speed|up-or-down|colour-letter" entry each.
# =============================================================================
declare -a NIC_LINES=()

# _nic_record IFACE
# Queries ethtool once for IFACE and prints "IFACE|speed|label|colour":
#   speed   text after "Speed:", or "unknown" when ethtool reports none
#   label   "up" when "Link detected:" contains "yes", otherwise "down"
#   colour  "G" for up, "R" for down
_nic_record() {
    local iface="$1" info speed link link_label link_color
    info=$(ethtool "$iface" 2>/dev/null)
    speed=$(printf '%s\n' "$info" \
        | awk -F: '/Speed:/{gsub(/^[ \t]+/,"",$2); print $2}' | xargs)
    link=$(printf '%s\n' "$info" \
        | awk -F: '/Link detected:/{gsub(/^[ \t]+/,"",$2); print $2}' | xargs)
    [ -z "$speed" ] && speed="unknown"
    [ -z "$link"  ] && link="unknown"
    if printf '%s\n' "$link" | grep -qi "yes"; then
        link_label="up";   link_color="G"
    else
        link_label="down"; link_color="R"
    fi
    printf '%s|%s|%s|%s\n' "$iface" "$speed" "$link_label" "$link_color"
}

if [ "${SKIP_NIC_CHECK:-0}" != "1" ]; then

while IFS= read -r iface; do
    [ -z "$iface" ] && continue
    # `ip -o link` prints stacked/virtual links as "name@peer"; ethtool only
    # accepts the bare interface name.
    NIC_LINES+=("$(_nic_record "${iface%%@*}")")
done < <(ip -o link show 2>/dev/null \
    | awk -F': ' '{print $2}' \
    | grep -E '^(eth|eno|bond)' \
    | grep -v '\.'  \
    | sort)

fi # END: NIC check


# =============================================================================
#  SECTION: SEL and SDR (gathered early -- ipmicfg -sdr full is slow)
#  SEL = System Event Log  (historical events, e.g. reboots, intrusions)
#  SDR = Sensor Data Repository  (live sensor readings, e.g. voltage, temp)
#  Both variables stay empty when SKIP_SEL_SDR=1.
# =============================================================================
SEL_LINES=""
SDR_LINES=""
if [ "${SKIP_SEL_SDR:-0}" != "1" ]; then

    # SEL: drop lines matching any of these -- empty lines, header/"no
    # entries" lines, AC power-on records, dash separator rows, and
    # whitespace-only lines.
    SEL_LINES=$(ipmicfg -sel list 2>/dev/null | grep -vE \
        -e "^$|^SEL|No.*entries|List is empty|ACPowerOn|AC Power on|First AC" \
        -e "^[[:space:]]*-+[[:space:]]*$" \
        -e "^[[:space:]]*$")

    # SDR: drop blank lines, the column header, separator rows, rows whose
    # status column is OK, and rows whose readings are N/A. What is left is
    # the set of sensors that need attention.
    SDR_LINES=$(ipmicfg -sdr full 2>/dev/null | grep -vE \
        -e "^\s*$|^\s*Status\s*\||^\s*-+\s*\|" \
        -e "^\s*OK\s*\|" \
        -e "^\s*\|.*N/A.*N/A")

fi # END: SEL/SDR gather


# =============================================================================
#  SECTION: Drive SMART data collection
#  Iterates over all physical drives (via storcli in RAID mode, or lsblk in
#  direct-attached mode) and collects model, serial, size, SMART error counts,
#  and SSD wear indicators via smartctl.
# =============================================================================
# Parallel per-drive arrays: index N in each one describes the same drive.
declare -a DRV_DEV
declare -a DRV_SLOT
declare -a DRV_DID
declare -a DRV_DG
declare -a DRV_SIZE
declare -a DRV_STATE
declare -a DRV_ERR
declare -a DRV_WEAR
declare -a DRV_MODEL
declare -a DRV_SERIAL
declare -a DRV_STATUS
# Number of drives flagged as failing.
DRIVE_FAIL_COUNT=0

# collect_drive DEV SLOT DG MEGARAID_DID STATE
# Runs smartctl on one drive and appends one entry to each DRV_* array.
#
# Arguments:
#   DEV           block device path for direct-attached drives ("" in RAID mode)
#   SLOT          label stored in DRV_SLOT
#   DG            label stored in DRV_DG
#   MEGARAID_DID  device ID for smartctl's megaraid passthrough ("" for direct)
#   STATE         label stored in DRV_STATE
# Globals read:    SMART_ERR_THRESHOLD, SSD_WEAR_THRESHOLD
# Globals written: DRV_* arrays, DRIVE_FAIL_COUNT
collect_drive() {
    local dev="$1" slot="$2" dg="$3" megaraid_did="$4" state_in="$5"
    local smart_out model model_short serial size bytes gb

    if [ -n "$megaraid_did" ]; then
        smart_out=$(smartctl -a -d "megaraid,${megaraid_did}" /dev/bus/0 2>/dev/null)
    else
        smart_out=$(smartctl -a "$dev" 2>/dev/null)
    fi

    # Extract identity fields
    model=$(printf '%s' "$smart_out"  | grep -E "Device Model|Model Number" | head -1 | cut -d: -f2 | xargs)
    serial=$(printf '%s' "$smart_out" | grep "Serial Number"                | head -1 | cut -d: -f2 | xargs)
    size=$(printf '%s' "$smart_out"   | grep "User Capacity"                | head -1 | cut -d'[' -f2 | cut -d']' -f1 | xargs)
    [ -z "$model" ]  && model="(unknown)"
    [ -z "$serial" ] && serial="(unknown)"

    # When smartctl gave no usable capacity, ask lsblk -- only possible when a
    # block device path is known (not for megaraid passthrough).
    if [ -z "$size" ] || [[ "$size" == \?* ]]; then
        if [ -n "$dev" ]; then
            size=$(lsblk -dno SIZE "$dev" 2>/dev/null)
        fi
    fi
    [ -z "$size" ] && size="?"

    # A capacity line without a "[... GB]" suffix survives the cut calls whole;
    # reduce its byte count to a short GB/TB figure.
    if [[ "$size" == *bytes* ]]; then
        bytes=$(printf '%s\n' "$size" | tr -d ',' | grep -oE '[0-9]+' | head -1)
        if [ -n "$bytes" ]; then
            gb=$(( bytes / 1000000000 ))
            if [ "$gb" -ge 1000 ]; then
                size=$(awk -v g="$gb" 'BEGIN { printf "%.1fTB", g / 1000 }')
            else
                size="${gb}GB"
            fi
        fi
    fi

    # Strip common manufacturer prefixes for brevity in the drive table
    model_short=$(printf '%s\n' "$model" | sed -E \
        's/^(INTEL|SAMSUNG|SEAGATE|TOSHIBA|HITACHI|HGST|WDC|WD|MICRON|KINGSTON|CRUCIAL|SANDISK)[[:space:]]+//I')

    # Read key SMART attributes -- accumulate error counts and wear indicator.
    # Attribute rows are split as: ID NAME FLAG VALUE WORST THRESH TYPE UPDATED
    # WHEN_FAILED RAW...; only NAME, VALUE and RAW are used.
    local reallocated=0 pending=0 offline=0 reported=0 udma=0 wear=""
    local attr norm raw
    while read -r _ attr _ norm _ _ _ _ _ raw; do
        # RAW may carry trailing text (e.g. "2 (extra 5)"); keep only the
        # leading digits and force base 10 so the sum below cannot hit a
        # syntax error or an octal interpretation.
        raw=${raw%%[!0-9]*}
        raw=$(( 10#${raw:-0} ))
        case "$attr" in
            Reallocated_Sector_Ct)                       reallocated=$raw ;;  # remapped bad sectors
            Current_Pending_Sector)                      pending=$raw     ;;  # sectors waiting to be remapped
            Offline_Uncorrectable)                       offline=$raw     ;;  # errors found in offline scan
            Reported_Uncorrect)                          reported=$raw    ;;  # reported uncorrectable errors
            UDMA_CRC_Error_Count)                        udma=$raw        ;;  # cable/interface errors
            Media_Wearout_Indicator|Wear_Leveling_Count) wear="$norm"     ;;  # normalized VALUE column
        esac
    done < <(printf '%s' "$smart_out" | grep -E \
        "Reallocated_Sector|Current_Pending|Offline_Unco|Reported_Uncorrect|UDMA_CRC_Error|Media_Wearout|Wear_Leveling")

    local total_err=$(( reallocated + pending + offline + reported + udma ))
    local drv_status="PASS"
    if [ "$total_err" -ge "$SMART_ERR_THRESHOLD" ]; then
        drv_status="FAIL"
    fi
    if [ -n "$wear" ] && [ "$wear" -lt "$SSD_WEAR_THRESHOLD" ] 2>/dev/null; then
        drv_status="FAIL"
    fi

    DRV_DEV+=("$dev");           DRV_SLOT+=("$slot")
    DRV_DID+=("$megaraid_did");  DRV_DG+=("$dg")
    DRV_SIZE+=("$size");         DRV_STATE+=("$state_in")
    DRV_ERR+=("$total_err");     DRV_WEAR+=("${wear:--}")
    DRV_MODEL+=("$model_short"); DRV_SERIAL+=("$serial")
    DRV_STATUS+=("$drv_status")
    if [ "$drv_status" = "FAIL" ]; then
        DRIVE_FAIL_COUNT=$(( DRIVE_FAIL_COUNT + 1 ))
    fi
}

# Feed every physical drive to collect_drive. The source of the drive list
# depends on whether a real RAID controller is in use.
if ! $IS_REAL_RAID; then
    # Direct-attached mode: every lsblk entry of type "disk", sorted by path.
    while IFS= read -r dev; do
        if [ -b "$dev" ]; then
            collect_drive "$dev" "${dev##*/}" "-" "" "Direct"
        fi
    done < <(lsblk -dno NAME,TYPE 2>/dev/null \
                | awk '$2 == "disk" { printf "/dev/%s\n", $1 }' \
                | sort)
else
    # RAID mode: storcli supplies the list; smartctl reaches each drive through
    # megaraid passthrough, so no block device path is passed.
    while read -r eid_slt did state dg; do
        collect_drive "" "$eid_slt" "$dg" "$did" "$state"
    done < <(storcli /call /eall /sall show 2>/dev/null | awk '
        $0 ~ /[0-9]+:[0-9]+/ {
            if ($0 ~ /UGood|Onln|Offln|UBad|GHS|DHS|JBOD/) print $1, $2, $3, $4
        }')
fi

# Surface failing drives in the final FAIL/WARN summary table.
if [ "$DRIVE_FAIL_COUNT" -gt 0 ]; then
    FAIL_ITEMS+=("Drives|${DRIVE_FAIL_COUNT} drive(s) failing -- see drive table above")
fi

# -- Unformatted drive check --------------------------------------------------
# Warn if any direct-attached disk has no filesystem anywhere on it.
# Skipped in RAID mode since storcli manages those drives at a lower level.

# _uf_partition_count DEV
# Prints how many lsblk TYPE rows under DEV contain "part" (0 when none).
_uf_partition_count() {
    local n
    # grep -c already prints "0" when nothing matches (it merely exits 1).
    # Appending "|| echo 0" would produce "0<newline>0", which is not an
    # integer and made the -eq test below fail for partition-less disks.
    n=$(lsblk -no TYPE "$1" 2>/dev/null | grep -c "part")
    printf '%s\n' "${n:-0}"
}

if ! $IS_REAL_RAID; then
    while IFS= read -r _uf_dev; do
        [ -b "$_uf_dev" ] || continue
        # First non-empty FSTYPE on the disk or any of its children
        _uf_fstype=$(lsblk -no FSTYPE "$_uf_dev" 2>/dev/null | grep -v '^$' | head -1)
        [ -n "$_uf_fstype" ] && continue

        _uf_parts=$(_uf_partition_count "$_uf_dev")
        _uf_name=$(basename "$_uf_dev")
        if [ "$_uf_parts" -eq 0 ]; then
            WARN_ITEMS+=("${_uf_name}|Drive has no partitions and no filesystem")
        else
            WARN_ITEMS+=("${_uf_name}|Drive has partitions but no filesystem (unformatted)")
        fi
    done < <(lsblk -dno NAME,TYPE 2>/dev/null | awk '$2=="disk"{print "/dev/"$1}' | sort)
fi

# =============================================================================
#  SECTION: PSU (Power Supply) status
#  Parses per-PSU rows from SDR_PSU_LINES when that variable is non-empty and
#  falls back to `ipmicfg -summary` when no PSU row was found there.
#  A missing or failed PSU on a redundant system is silent without this check.
# =============================================================================

# _psu_parse_sdr_row LINE
# Splits one pipe-delimited SDR row such as
#   OK | (2148) PS1 Status | Power Supply | Presence detected |
# and prints "STATUS|NAME|READING": fields 1, 2 and 4 with surrounding
# blanks removed, and the "(digits)" sensor ID dropped from NAME.
_psu_parse_sdr_row() {
    printf '%s\n' "$1" | awk -F'|' '
        function trim(s) { gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
        {
            name = trim($2)
            gsub(/\([0-9]+\)[[:space:]]*/, "", name)
            printf "%s|%s|%s\n", trim($1), name, trim($4)
        }'
}

PSU_LINES=()
PSU_VERDICT="SKIP"; PSU_DETAIL="skipped"
if [ "${SKIP_PSU_CHECK:-0}" != "1" ]; then
    PSU_DETECTED=0
    PSU_FAIL_COUNT=0

    if [ -n "$SDR_PSU_LINES" ]; then
        while IFS= read -r psu_line; do
            [ -z "$psu_line" ] && continue
            # One awk pass per row instead of one per field
            IFS='|' read -r psu_stat psu_name psu_reading \
                < <(_psu_parse_sdr_row "$psu_line")
            [ -z "$psu_name" ] && continue
            PSU_DETECTED=$(( PSU_DETECTED + 1 ))
            if printf '%s\n' "$psu_stat" | grep -qi "^OK"; then
                PSU_LINES+=("${psu_name}|${psu_reading:-Presence detected}|PASS")
            else
                PSU_LINES+=("${psu_name}|${psu_reading:-${psu_stat}}|FAIL")
                PSU_FAIL_COUNT=$(( PSU_FAIL_COUNT + 1 ))
                FAIL_ITEMS+=("PSU|${psu_name}: ${psu_stat}")
            fi
        done <<< "$SDR_PSU_LINES"
    fi

    # Fall back to ipmicfg -summary if SDR had no PSU rows
    if [ "$PSU_DETECTED" -eq 0 ]; then
        while IFS= read -r psu_line; do
            [ -z "$psu_line" ] && continue
            psu_num=$(printf '%s\n' "$psu_line" | grep -oE '(Power Supply|PSU|PS)[[:space:]]*[0-9]+' | grep -oE '[0-9]+' | head -1)
            psu_status=$(printf '%s\n' "$psu_line" | cut -d: -f2 | xargs)
            [ -z "$psu_num" ] && continue
            PSU_DETECTED=$(( PSU_DETECTED + 1 ))
            if printf '%s\n' "$psu_status" | grep -qi "Power OK\|Present.*OK"; then
                PSU_LINES+=("PSU${psu_num}|${psu_status}|PASS")
            else
                PSU_LINES+=("PSU${psu_num}|${psu_status}|FAIL")
                PSU_FAIL_COUNT=$(( PSU_FAIL_COUNT + 1 ))
                # Same bookkeeping as the SDR path, so a failed PSU found via
                # the fallback also shows up in the final summary table.
                FAIL_ITEMS+=("PSU|PSU${psu_num}: ${psu_status}")
            fi
        done < <(ipmicfg -summary 2>/dev/null | grep -iE "Power Supply|PSU|^PS[[:space:]]+[0-9]")
    fi

    # Overall PSU verdict: nothing found -> SKIP, any failure -> FAIL,
    # fewer than expected -> WARN, otherwise PASS.
    if [ "$PSU_DETECTED" -eq 0 ]; then
        PSU_VERDICT="SKIP"; PSU_DETAIL="not reported by this board"
    elif [ "$PSU_FAIL_COUNT" -gt 0 ]; then
        PSU_VERDICT="FAIL"
        PSU_DETAIL="${PSU_DETECTED} PSU(s), ${PSU_FAIL_COUNT} not OK"
    elif [ "$PSU_DETECTED" -lt "$PSU_EXPECTED_COUNT" ] 2>/dev/null; then
        PSU_VERDICT="WARN"
        PSU_DETAIL="${PSU_DETECTED} of ${PSU_EXPECTED_COUNT} expected PSUs"
        WARN_ITEMS+=("PSU|Only ${PSU_DETECTED} of ${PSU_EXPECTED_COUNT} PSUs detected")
    else
        PSU_VERDICT="PASS"; PSU_DETAIL="${PSU_DETECTED} PSU(s) OK"
    fi
fi


# =============================================================================
#  SECTION: NVMe health via nvme-cli smart-log
#  smartctl misses NVMe-specific health fields. nvme smart-log provides:
#    - Percentage Used      (0-100+, manufacturer rated write endurance consumed)
#    - Available Spare      (% of reserve blocks remaining)
#    - Critical Warning     (bitmask: 0x00 = all clear)
#  Only runs if nvme-cli is installed and NVMe controllers are present.
# =============================================================================
declare -a NVME_SLOT NVME_MODEL NVME_PCT_USED NVME_SPARE NVME_WARN_BITS NVME_STATUS
NVME_FAIL_COUNT=0

# _nvme_warn_clear VALUE
# Succeeds when a critical_warning value is all zeros, whether it was printed
# as decimal ("0") or as hex of any width ("0x0", "0x00", ...).
_nvme_warn_clear() {
    local digits="${1#0x}"
    [[ "$digits" =~ ^0+$ ]]
}

if [ "${SKIP_NVME_HEALTH:-0}" != "1" ] && command -v nvme >/dev/null 2>&1; then
    for _nctrl in /dev/nvme[0-9]*; do
        # Only controller nodes (nvme0), not namespace nodes (nvme0n1)
        [[ "$(basename "$_nctrl")" =~ ^nvme[0-9]+$ ]] || continue
        [ -e "$_nctrl" ] || continue

        # Read model name from controller identify
        _nmodel=$(nvme id-ctrl "$_nctrl" 2>/dev/null \
            | awk -F: '/^mn /{gsub(/^[ \t]+/,"",$2); print $2; exit}' | xargs)
        [ -z "$_nmodel" ] && _nmodel="(unknown)"

        # Parse smart-log output for health fields. Each parser stops at the
        # first matching line so a repeated key cannot yield a multi-line value.
        _smart=$(nvme smart-log "$_nctrl" 2>/dev/null)
        _pct_used=$(printf '%s\n' "$_smart" | awk -F: '/percentage_used/{gsub(/[^0-9]/,"",$2); print $2+0; exit}')
        _spare=$(printf '%s\n' "$_smart"    | awk -F: '/available_spare[^_]/{gsub(/[^0-9]/,"",$2); print $2+0; exit}')
        _warn=$(printf '%s\n' "$_smart"     | awk -F: '/critical_warning/{gsub(/[^0-9a-fx]/,"",$2); print $2; exit}')

        [ -z "$_pct_used" ] && _pct_used="-"
        [ -z "$_spare"    ] && _spare="-"
        [ -z "$_warn"     ] && _warn="-"

        # Determine per-drive status
        _nstatus="PASS"
        if [ "$_pct_used" != "-" ] && [ "$_pct_used" -ge "$NVME_PCT_USED_THRESHOLD" ] 2>/dev/null; then
            _nstatus="FAIL"
        fi
        if [ "$_spare" != "-" ] && [ "$_spare" -lt "$NVME_SPARE_WARN_THRESHOLD" ] 2>/dev/null; then
            [ "$_nstatus" = "PASS" ] && _nstatus="WARN"
        fi
        # Critical warning bits: any non-zero value = alert.
        if [ "$_warn" != "-" ] && ! _nvme_warn_clear "$_warn"; then
            _nstatus="FAIL"
        fi

        NVME_SLOT+=("$_nctrl")
        NVME_MODEL+=("$_nmodel")
        NVME_PCT_USED+=("$_pct_used")
        NVME_SPARE+=("$_spare")
        NVME_WARN_BITS+=("$_warn")
        NVME_STATUS+=("$_nstatus")
        if [ "$_nstatus" = "FAIL" ]; then
            NVME_FAIL_COUNT=$(( NVME_FAIL_COUNT + 1 ))
        fi
    done
    if [ "$NVME_FAIL_COUNT" -gt 0 ]; then
        FAIL_ITEMS+=("NVMe|${NVME_FAIL_COUNT} NVMe drive(s) with health issues")
    fi
fi


# =============================================================================
#  SECTION: Bond interface check
#  For each bond* interface, verify both member NICs are active and report
#  the bonding mode, active slave count, and any failed members.
#  Bond health lives in /proc/net/bonding/<bond_name>.
# =============================================================================
declare -a BOND_LINES=()

# _bond_slave_stats PROC_FILE
# Prints "TOTAL|ACTIVE|DOWN_LIST" for one /proc/net/bonding/<bond> file:
#   TOTAL      number of "Slave Interface:" sections
#   ACTIVE     slave sections containing "MII Status: up"
#   DOWN_LIST  comma-joined names of slaves containing "MII Status: down"
# "MII Status" lines that appear before the first slave section describe the
# bond itself and are ignored, so the bond is never counted as its own slave.
_bond_slave_stats() {
    awk '
        /^Slave Interface:/ { iface = $NF; total++; next }
        iface == ""         { next }
        /MII Status: up/    { active++ }
        /MII Status: down/  { down = (down == "" ? iface : down "," iface) }
        END { printf "%d|%d|%s\n", total, active, down }
    ' "$1" 2>/dev/null
}

if [ "${SKIP_NIC_CHECK:-0}" != "1" ]; then
    for _bond_proc in /proc/net/bonding/bond*; do
        [ -r "$_bond_proc" ] || continue
        _bname=$(basename "$_bond_proc")

        # Parse bonding mode from the proc file
        _bmode=$(awk -F: '/^Bonding Mode:/{gsub(/^[ \t]+/,"",$2); print $2}' "$_bond_proc" | xargs)

        # Slave totals come from one awk pass; default to 0 so the integer
        # comparisons below always see a number.
        IFS='|' read -r _total_slaves _active_slaves _down_slaves \
            < <(_bond_slave_stats "$_bond_proc")
        _total_slaves=${_total_slaves:-0}
        _active_slaves=${_active_slaves:-0}

        # Determine bond verdict
        if [ "$_active_slaves" -eq 0 ]; then
            _bverd="FAIL"
            FAIL_ITEMS+=("${_bname}|all slaves down")
        elif [ -n "$_down_slaves" ]; then
            _bverd="WARN"
            WARN_ITEMS+=("${_bname}|slave(s) down: ${_down_slaves}")
        else
            _bverd="PASS"
        fi

        _bdetail="${_active_slaves}/${_total_slaves} slaves up  mode: ${_bmode:-unknown}"
        [ -n "$_down_slaves" ] && _bdetail="${_bdetail}  down: ${_down_slaves}"
        BOND_LINES+=("${_bname}|${_bdetail}|${_bverd}")
    done
fi


# =============================================================================
#  SECTION: lspci device summary
#  Extracts key PCI devices from $LSPCI: lines mentioning RAID/MegaRAID,
#  Ethernet, Fibre Channel or Non-Volatile (NVMe) controllers.
#  Provides a quick way to verify the installed card inventory matches
#  the build spec without logging into the IPMI or opening the chassis.
# =============================================================================

# _lspci_summarize
# Reads `lspci` text on stdin and prints one line per distinct device
# description, in first-seen order. Descriptions that occur more than once
# (e.g. both ports of a dual-port card) are printed once as "<N>x  <desc>".
#   - keeps lines matching the include list, then drops the exclude list
#     (audio/USB/bridges and the VGA/Display/ASPEED BMC video chip)
#   - strips the leading bus address and the "<class>: " prefix
#   - descriptions that end up empty are skipped
# No filtering by bus number is performed.
_lspci_summarize() {
    grep -iE "RAID|MegaRAID|Ethernet|Fibre Channel|Non-Volatile" \
        | grep -viE "Audio|USB|SMBus|ISA|PCI bridge|Host bridge|VGA|Display|ASPEED|AST[0-9]" \
        | sed -E 's/^[0-9a-f:.]+ //; s/^[^:]+:[[:space:]]*//' \
        | awk '
            NF {
                if (!($0 in seen)) order[++n] = $0
                seen[$0]++
            }
            END {
                for (i = 1; i <= n; i++) {
                    d = order[i]
                    if (seen[d] > 1) printf "%dx  %s\n", seen[d], d
                    else             print d
                }
            }'
}

LSPCI_SUMMARY_LINES=()
if [ "${SKIP_LSPCI:-0}" != "1" ]; then
    while IFS= read -r _pci_line; do
        [ -n "$_pci_line" ] && LSPCI_SUMMARY_LINES+=("$_pci_line")
    done < <(printf '%s\n' "$LSPCI" | _lspci_summarize)
    unset _pci_line
fi


# Record the RAM verdict in the matching summary list now that all data has
# been collected; any verdict other than FAIL or WARN adds nothing.
case "$RAM_VERDICT" in
    FAIL) FAIL_ITEMS+=("RAM|${RAM_FAIL_REASON}") ;;
    WARN) WARN_ITEMS+=("RAM|${RAM_FAIL_REASON}") ;;
esac


# =============================================================================
#  RENDER
#  Nothing above this line prints to stdout. Everything below is output.
# =============================================================================

box_open "SERVER SUMMARY -- ${HOSTNAME}" "$C"

# -- Header: identity / run info ----------------------------------------------
# Every identity row: bold 10-column label, two spaces, coloured value.
_id_fmt="  %b%-10s%b  %b%s%b"
bline "$(printf "$_id_fmt" "$BOLD" "IP:" "$RST" "$C" "$IP_DISPLAY" "$RST")"
if [ -n "$ETH0_MAC" ]; then
    # MAC row is only shown when a MAC address was collected
    bline "$(printf "$_id_fmt" "$BOLD" "${MAC_IFACE^^} MAC:" "$RST" "$DIM" "$ETH0_MAC" "$RST")"
fi
bline "$(printf "$_id_fmt" "$BOLD" "OS:" "$RST" "$M" "${OS_NAME:-unknown}" "$RST")"
bline "$(printf "$_id_fmt" "$BOLD" "Run:" "$RST" "$DIM" "$RUN_TIME" "$RST")"
bdiv

# -- Hardware summary rows ----------------------------------------------------
# Same layout as above but with a single space between label and value.
_hw_fmt="  %b%-10s%b %b%s%b"
bline "$(printf "$_hw_fmt" "$BOLD" "Board" "$RST" "$C" "${BOARD_MFG} ${BOARD_RAW}" "$RST")"
bline "$(printf "$_hw_fmt" "$BOLD" "CPU" "$RST" "$DIM" "$CPU_DETAIL" "$RST")"
unset _id_fmt _hw_fmt

# RAM: only the first DIMM config line carries the verdict badge. It is
# labelled MISMATCH when more than one distinct config exists, otherwise it
# gets the (MATCHING) tag. Every later line is a plain continuation row.
first_ram=true
for line in "${RAM_DISPLAY_LINES[@]}"; do
    if ! $first_ram; then
        cont_row "$line"
        continue
    fi
    first_ram=false

    if [ "$DIMM_UNIQUE" -gt 1 ]; then
        verdict_row "RAM" "MISMATCH  ${line}" "$RAM_VERDICT"
        continue
    fi

    _ram_match_tag="$(printf "%b%b(MATCHING)%b" "$G" "$BOLD" "$RST")"
    verdict_row "RAM" "$line${RAM_TYPE_TAG:+  $RAM_TYPE_TAG}  ${_ram_match_tag}" "$RAM_VERDICT"
done

# -- Firmware rows: append (LATEST) when on the newest known version ----------
# The tag is added only for a PASS verdict with a known latest version; for
# RAID the latest entry must additionally not be "EOL".
_fw_text="$BIOS_DETAIL"
if [[ "$BIOS_VERDICT" == "PASS" && -n "$LATEST_BIOS" ]]; then
    _fw_text="${BIOS_DETAIL}  (LATEST)"
fi
verdict_row "BIOS" "$_fw_text" "$BIOS_VERDICT"

_fw_text="$IPMI_DETAIL"
if [[ "$IPMI_VERDICT" == "PASS" && -n "$LATEST_IPMI" ]]; then
    _fw_text="${IPMI_DETAIL}  (LATEST)"
fi
verdict_row "IPMI FW" "$_fw_text" "$IPMI_VERDICT"

_fw_text="$RAID_DETAIL"
if [[ "$RAID_VERDICT" == "PASS" && -n "$LATEST_RAID_FW" && "$LATEST_RAID_FW" != "EOL" ]]; then
    _fw_text="${RAID_DETAIL}  (LATEST)"
fi
verdict_row "RAID" "$_fw_text" "$RAID_VERDICT"
unset _fw_text

# -- BBU/CacheVault row (RAID only) -------------------------------------------
# An INFO verdict is shown with a PASS badge; every other verdict value is
# shown as WARN and bumps the warning counter.
if $IS_REAL_RAID; then
    case "$BBU_CV_VERDICT" in
        INFO)
            verdict_row "BBU/CV" "$BBU_CV_DETAIL" "PASS"
            ;;
        *)
            verdict_row "BBU/CV" "$BBU_CV_DETAIL" "WARN"
            WARNS=$(( WARNS + 1 ))
            ;;
    esac
fi

# -- Fan mode row: append detection reason in parentheses ---------------------
if [ "$HAS_RAID_10G" -eq 1 ]; then
    _fan_why="(10G NIC / RAID DETECTED)"
else
    _fan_why="(NO RAID / 10G NIC)"
fi
verdict_row "Fan" "${FAN_DETAIL}  ${_fan_why}" "$FAN_VERDICT"
unset _fan_why

# IPMI network configuration row
verdict_row "IPMI Net" "$IPMINET_DETAIL" "$IPMINET_VERDICT"

# -- PSU rows: one per detected power supply ----------------------------------
# With per-PSU entries ("label|status|verdict") each one becomes its own row;
# without any, a single row carries the overall PSU detail and verdict.
if [ "${SKIP_PSU_CHECK:-0}" != "1" ]; then
    if [ "${#PSU_LINES[@]}" -gt 0 ]; then
        for _psu_entry in "${PSU_LINES[@]}"; do
            IFS='|' read -r _psu_label _psu_status _psu_v <<< "$_psu_entry"
            verdict_row "$_psu_label" "$_psu_status" "$_psu_v"
        done
    else
        verdict_row "PSU" "$PSU_DETAIL" "$PSU_VERDICT"
    fi
fi
unset _psu_entry _psu_label _psu_status _psu_v

# -- NIC rows: one per interface, down links flagged as FAIL ------------------
# Entry layout: name|speed|link|code -- a code of "G" marks a link that is up.
for nic_entry in "${NIC_LINES[@]}"; do
    IFS='|' read -r _nic _speed _link _lc <<< "$nic_entry"
    if [ "$_lc" != "G" ]; then
        verdict_row "$_nic" "$_speed  down" "FAIL"
        FAIL_ITEMS+=("${_nic}|link is down")
        FAILS=$(( FAILS + 1 ))
        continue
    fi
    verdict_row "$_nic" "$_speed  up" "PASS"
done
unset _nic _speed _link _lc

# -- Bond rows: one per bond interface ----------------------------------------
# Entry layout: name|detail|verdict, as stored in BOND_LINES.
for _bond_entry in "${BOND_LINES[@]}"; do
    IFS='|' read -r _bname _bdetail _bverd <<< "$_bond_entry"
    verdict_row "$_bname" "$_bdetail" "$_bverd"
done
unset _bond_entry _bname _bdetail _bverd

# =============================================================================
#  RENDER: Virtual drive table (RAID only)
#  Pulls the VD list from storcli and renders it as a simple table.
# =============================================================================
# _vd_table_render STORCLI_OUTPUT
# Renders the virtual drive table from the text of `storcli /call /vall show`:
# the first "DG/VD" line is the header, every line starting with
# "<digits>/<digits>" is a row. Prints nothing when there are no rows.
_vd_table_render() {
    local out="$1" hdr rows div row
    hdr=$(printf '%s\n' "$out" | grep -E "^DG/VD" | head -1)
    rows=$(printf '%s\n' "$out" | grep -E "^[0-9]+/[0-9]+")
    [ -n "$rows" ] || return 0

    div=${hdr//?/-}     # dashes as wide as the header line
    bdiv
    bline "$(printf "  %b%s%b" "$DIM"  "$div" "$RST")"
    bline "$(printf "  %b%s%b" "$BOLD" "$hdr" "$RST")"
    bline "$(printf "  %b%s%b" "$DIM"  "$div" "$RST")"
    # A here-string ends with a newline, so the final row is read too.
    # Feeding the loop from printf '%s' (no trailing newline) makes `read`
    # return non-zero on the last row and the loop skip it.
    while IFS= read -r row; do
        if [ -n "$row" ]; then
            bline "$(printf "  %b%s%b" "$C" "$row" "$RST")"
        fi
    done <<< "$rows"
    bline "$(printf "  %b%s%b" "$DIM"  "$div" "$RST")"
}

if $IS_REAL_RAID && [ "${SKIP_VD_TABLE:-0}" != "1" ]; then
    _vd_table_render "$(storcli /call /vall show 2>/dev/null)"
fi

# =============================================================================
#  RENDER: Physical drive table
#  PASS drives listed first; FAIL drives separated by a divider at the bottom.
#  Column layout differs between RAID mode (includes Slot/DG/State) and direct.
# =============================================================================
if [ "${SKIP_DRIVE_TABLE:-0}" != "1" ]; then
bdiv
# Column header -- RAID mode adds Slot/DG/State/Wear columns
if $IS_REAL_RAID; then
    bline "$(printf "  %b%-7s  %-3s  %-6s  %-5s  %-4s  %-4s  %-14s  %-19s  %s%b" \
        "$BOLD" "Slot" "DG" "Size" "State" "Err" "Wear" "Model" "Serial" "Status" "$RST")"
else
    bline "$(printf "  %b%-8s  %-7s  %-5s  %-16s  %-20s  %s%b" \
        "$BOLD" "Drive" "Size" "Err" "Model" "Serial" "Status" "$RST")"
fi
bdiv

# Sort drive indices: PASS first, everything else afterwards
pass_idx=(); fail_idx=()
for i in "${!DRV_STATUS[@]}"; do
    if [ "${DRV_STATUS[$i]}" = "PASS" ]; then
        pass_idx+=("$i")
    else
        fail_idx+=("$i")
    fi
done

# print_drive_row INDEX
# Renders the drive at INDEX of the DRV_* arrays as one table row, with the
# status badge (green for PASS, red otherwise) pushed to column W.
print_drive_row() {
    local i=$1
    local status="${DRV_STATUS[$i]}"
    local sc badge left vis pad
    if [ "$status" = "PASS" ]; then sc="$G"; else sc="$R"; fi
    badge=$(printf "%b%s%b" "$sc" "$status" "$RST")
    if $IS_REAL_RAID; then
        left=$(printf "  %b%-7s%b  %-3s  %-6s  %-5s  %-4s  %-4s  %-14s  %-19s" \
            "$DIM" "${DRV_SLOT[$i]}" "$RST" \
            "${DRV_DG[$i]}" "${DRV_SIZE[$i]}" "${DRV_STATE[$i]}" \
            "${DRV_ERR[$i]}" "${DRV_WEAR[$i]}" \
            "${DRV_MODEL[$i]}" "${DRV_SERIAL[$i]}")
    else
        left=$(printf "  %b%-8s%b  %-7s  %-5s  %-16s  %-20s" \
            "$DIM" "${DRV_SLOT[$i]}" "$RST" \
            "${DRV_SIZE[$i]}" "${DRV_ERR[$i]}" \
            "${DRV_MODEL[$i]}" "${DRV_SERIAL[$i]}")
    fi
    # Visible width = text with ANSI escape sequences removed. ${#vis} counts
    # characters, whereas `wc -c` counts bytes and over-measures non-ASCII text.
    vis=$(printf '%s' "$left" | sed -E 's/\x1b\[[0-9;:]*[mGKHFJA-Za-z]//g')
    pad=$(( W - ${#vis} - ${#status} ))
    [ "$pad" -lt 1 ] && pad=1      # always keep one space before the badge
    bline "$(printf "%s%${pad}s%s" "$left" "" "$badge")"
}

for i in "${pass_idx[@]}"; do print_drive_row "$i"; done
if [ "${#fail_idx[@]}" -gt 0 ]; then
    bdiv
    for i in "${fail_idx[@]}"; do print_drive_row "$i"; done
fi
fi # END: drive table

# =============================================================================
#  RENDER: NVMe health table
#  Shows nvme smart-log health fields per controller: % used, spare, warnings.
#  Only rendered when NVMe controllers are present and nvme-cli is installed.
# =============================================================================
# _nvme_print_row INDEX
# Renders the controller at INDEX of the NVME_* arrays as one table row with
# the status badge (green PASS, amber WARN, red otherwise) pushed to column W.
_nvme_print_row() {
    local i="$1"
    local st="${NVME_STATUS[$i]}"
    local color badge left vis pad
    case "$st" in
        PASS) color="$G" ;;
        WARN) color="$Y" ;;
        *)    color="$R" ;;
    esac
    badge=$(printf "%b%s%b" "$color" "$st" "$RST")
    left=$(printf "  %b%-12s%b  %-28s  %-8s  %-8s  %-10s" \
        "$DIM" "$(basename "${NVME_SLOT[$i]}")" "$RST" \
        "${NVME_MODEL[$i]}" \
        "${NVME_PCT_USED[$i]}" \
        "${NVME_SPARE[$i]}" \
        "${NVME_WARN_BITS[$i]}")
    # Strip complete ANSI sequences (ESC included) before measuring. A pattern
    # that omits the ESC byte leaves it behind, inflating the measured width,
    # and would also eat "[X" from ordinary bracketed text.
    vis=$(printf '%s' "$left" | sed -E 's/\x1b\[[0-9;:]*[mGKHFJA-Za-z]//g')
    pad=$(( W - ${#vis} - ${#st} ))
    [ "$pad" -lt 1 ] && pad=1
    bline "$(printf "%s%${pad}s%s" "$left" "" "$badge")"
}

if [ "${SKIP_NVME_HEALTH:-0}" != "1" ] && [ "${#NVME_SLOT[@]}" -gt 0 ]; then
    bdiv
    bline "$(printf "  %b%bNVMe Health (nvme smart-log):%b" "$BOLD" "$C" "$RST")"
    bdiv
    bline "$(printf "  %b%-12s  %-28s  %-8s  %-8s  %-10s  %s%b" \
        "$BOLD" "Device" "Model" "Used%" "Spare%" "CritWarn" "Status" "$RST")"
    bdiv
    for _ni in "${!NVME_SLOT[@]}"; do
        _nvme_print_row "$_ni"
    done
    unset _ni
fi

# =============================================================================
#  RENDER: Block device map
#  lsblk tree showing partitions, filesystems, and mount points.
#  Model names are read from sysfs (faster than smartctl per device).
# =============================================================================
bdiv
bline "$(printf "  %b%bBlock Devices:%b" "$BOLD" "$C" "$RST")"
bdiv

# _lb_devname LINE
# Prints the device name from one row of the lsblk tree: everything before
# the first alphanumeric character (tree-drawing glyphs, blanks) is dropped
# and the first whitespace-delimited word of the remainder is printed.
# Done with parameter expansion because `tr -d '\xe2\x94...'` does not work:
# tr has no \xHH escape, so that set deletes the letters and digits
# x, e, 2, 9, 4, c, 8, 0 from names (nvme0n1 -> nvmn1, sdc -> sd).
_lb_devname() {
    local rest="${1#"${1%%[[:alnum:]]*}"}"
    printf '%s' "${rest%%[[:space:]]*}"
}

# Build a NAME->MODEL lookup from lsblk in one pass (top-level disks only).
# `read` leaves the model empty when lsblk prints none.
declare -A _LB_MODELS
while read -r _lbm_name _lbm_model; do
    [ -n "$_lbm_name" ] && _LB_MODELS["$_lbm_name"]="$_lbm_model"
done < <(lsblk -dno NAME,MODEL 2>/dev/null)

# lsblk MODEL is often empty for NVMe -- fill in via nvme id-ctrl where missing
if command -v nvme >/dev/null 2>&1; then
    for _nctrl in /dev/nvme[0-9]*; do
        [[ "$(basename "$_nctrl")" =~ ^nvme[0-9]+$ ]] || continue
        [ -e "$_nctrl" ] || continue
        _ctrl_idx=${_nctrl##*nvme}
        # Only namespace 1 of each controller is filled in
        _ns_name="nvme${_ctrl_idx}n1"
        # Only fill in if lsblk left it empty
        if [ -z "${_LB_MODELS[$_ns_name]}" ]; then
            _nmodel=$(nvme id-ctrl "$_nctrl" 2>/dev/null \
                | awk -F: '/^mn /{gsub(/^[ \t]+/,"",$2); print $2; exit}' | xargs)
            [ -n "$_nmodel" ] && _LB_MODELS["$_ns_name"]="$_nmodel"
        fi
    done
fi

# Exclude loop devices (-e 7), print tree with filesystem and mount info.
# Top-level disk lines start with a letter; partition/child lines start with
# tree-drawing characters -- colour them differently.
while IFS= read -r _lb_line; do
    [ -z "$_lb_line" ] && continue
    _lb_dev=$(_lb_devname "$_lb_line")
    _lb_model=""
    # An empty key is an invalid associative-array subscript, so guard it
    [ -n "$_lb_dev" ] && _lb_model="${_LB_MODELS[$_lb_dev]:-}"
    if [[ "$_lb_line" =~ ^[a-zA-Z] ]]; then
        _lb_color="$C"
    else
        _lb_color="$DIM"
    fi
    bline "$(printf "  %b%-28s%b %b%s%b" "$_lb_color" "$_lb_line" "$RST" "$DIM" "$_lb_model" "$RST")"
done < <(lsblk -e 7 -o NAME,SIZE,TYPE,FSTYPE,MOUNTPOINT --noheadings 2>/dev/null)
unset _LB_MODELS _lb_line _lb_dev _lb_model _lb_color _lbm_name _lbm_model \
      _nctrl _ctrl_idx _ns_name _nmodel

# =============================================================================
#  RENDER: SEL / SDR sensor alerts (rendered after the PCI device summary)
#  SEL: any Critical event = FAIL; otherwise any remaining event = WARN.
#  SDR alerts mean a sensor is outside normal range -- any alert = FAIL.
#  Long lines are truncated to fit inside the box (max 71 visible chars).
# =============================================================================
# =============================================================================
#  RENDER: PCI device summary
#  Lists RAID controllers, NICs, HBAs, and GPUs detected by lspci.
#  Useful for verifying installed cards match the build spec.
# =============================================================================
# _pci_fit TEXT MAXLEN
# Prints TEXT unchanged when its visible length (ANSI escape sequences
# removed) is at most MAXLEN; otherwise prints the escape-free text cut to
# MAXLEN-3 characters followed by "...".
# The strip pattern includes the ESC byte so that ordinary bracketed names
# from lspci such as "[Invader]" are left alone.
_pci_fit() {
    local text="$1" max="$2" vis
    vis=$(printf '%s' "$text" | sed -E 's/\x1b\[[0-9;:]*[mGKHFJA-Za-z]//g')
    if [ "${#vis}" -gt "$max" ]; then
        printf '%s' "${vis:0:$(( max - 3 ))}..."
    else
        printf '%s' "$text"
    fi
}

if [ "${SKIP_LSPCI:-0}" != "1" ] && [ "${#LSPCI_SUMMARY_LINES[@]}" -gt 0 ]; then
    bdiv
    bline "$(printf "  %b%bPCI Devices:%b" "$BOLD" "$C" "$RST")"
    bdiv
    for _pci_entry in "${LSPCI_SUMMARY_LINES[@]}"; do
        # Truncate long lines to fit in box (W minus the 4 columns of margin)
        _pci_entry=$(_pci_fit "$_pci_entry" "$(( W - 4 ))")
        bline "$(printf "  %b%s%b" "$DIM" "$_pci_entry" "$RST")"
    done
    unset _pci_entry
fi

# _render_alert_lines COLOR TEXT
# Prints every non-empty line of TEXT as an indented box line in COLOR.
# Lines whose visible length (ANSI escape sequences removed) exceeds 71
# characters are replaced by their first 68 visible characters plus "...".
# The strip pattern includes the ESC byte, so bracketed text inside an event
# (e.g. "[A12]") is never mistaken for an escape sequence.
_render_alert_lines() {
    local color="$1" text="$2" line vis
    while IFS= read -r line; do
        [ -z "$line" ] && continue
        vis=$(printf '%s' "$line" | sed -E 's/\x1b\[[0-9;:]*[mGKHFJA-Za-z]//g')
        [ "${#vis}" -gt 71 ] && line="${vis:0:68}..."
        bline "$(printf "             %b%s%b" "$color" "$line" "$RST")"
    done <<< "$text"
}

if [ "${SKIP_SEL_SDR:-0}" != "1" ]; then
bdiv

if [ -z "$SEL_LINES" ]; then
    verdict_row "SEL" "no events" "PASS"
else
    # Overall SEL badge is FAIL if any Critical events exist, WARN otherwise.
    SEL_TOTAL=$(( SEL_CRITICAL_COUNT + SEL_WARN_COUNT + SEL_INFO_COUNT ))
    SEL_SUMMARY="${SEL_TOTAL} event(s)"
    [ "$SEL_CRITICAL_COUNT" -gt 0 ] && SEL_SUMMARY="${SEL_SUMMARY}  ${SEL_CRITICAL_COUNT} critical"
    [ "$SEL_WARN_COUNT"     -gt 0 ] && SEL_SUMMARY="${SEL_SUMMARY}  ${SEL_WARN_COUNT} warning"
    [ "$SEL_INFO_COUNT"     -gt 0 ] && SEL_SUMMARY="${SEL_SUMMARY}  ${SEL_INFO_COUNT} info"

    if [ "$SEL_CRITICAL_COUNT" -gt 0 ]; then
        verdict_row "SEL" "$SEL_SUMMARY" "FAIL"
        FAIL_ITEMS+=("SEL|${SEL_CRITICAL_COUNT} critical event(s) in system event log")
    else
        verdict_row "SEL" "$SEL_SUMMARY" "WARN"
        WARN_ITEMS+=("SEL|${SEL_TOTAL} event(s) in system event log")
    fi

    # Print Critical events in red
    if [ -n "$SEL_CRITICAL" ]; then
        bline "$(printf "             %b%bCRITICAL:%b" "$BOLD" "$R" "$RST")"
        _render_alert_lines "$R" "$SEL_CRITICAL"
    fi

    # Print Warning events in amber
    if [ -n "$SEL_WARN" ]; then
        bline "$(printf "             %b%bWARNING:%b" "$BOLD" "$Y" "$RST")"
        _render_alert_lines "$Y" "$SEL_WARN"
    fi

    # Print Info/OK events in dim -- least urgent, shown last
    if [ -n "$SEL_INFO" ]; then
        bline "$(printf "             %b%bINFO:%b" "$BOLD" "$DIM" "$RST")"
        _render_alert_lines "$DIM" "$SEL_INFO"
    fi
fi

# Any surviving SDR line is an alert
if [ -z "$SDR_LINES" ]; then
    verdict_row "SDR" "all sensors OK" "PASS"
else
    verdict_row "SDR" "sensor alerts detected" "FAIL"
    FAIL_ITEMS+=("SDR|sensor alert(s) detected")
    _render_alert_lines "$R" "$SDR_LINES"
fi
fi # END: SEL/SDR render

# =============================================================================
#  RENDER: Overall verdict + failure/warning summary table
# =============================================================================
# _summary_rows COLOR LABEL ENTRY...
# Prints one summary-table row per ENTRY. Each ENTRY is "item|reason": the
# item is the text before the first '|' and the reason is everything after
# that first '|', so a reason may itself contain '|' without being cut.
# An ENTRY with no '|' is printed as both item and reason.
_summary_rows() {
    local color="$1" label="$2" entry
    shift 2
    for entry in "$@"; do
        bline "$(printf "  %b%-6s%b  %b%-10s%b  %b%s%b" \
            "$color" "$label" "$RST" "$BOLD" "${entry%%|*}" "$RST" "$DIM" "${entry#*|}" "$RST")"
    done
}

bdiv

# Roll up FAIL/WARN/PASS counts into a single overall badge and exit code
if [ "$FAILS" -gt 0 ]; then
    ob=$(badge_fail); od="${FAILS} fail, ${WARNS} warn, ${PASSES} ok"; EXIT_CODE=2
elif [ "$WARNS" -gt 0 ]; then
    ob=$(badge_warn); od="${WARNS} warn, ${PASSES} ok"; EXIT_CODE=1
else
    ob="${BGG}${BLK}${BOLD} PASS ${RST}"; od="${PASSES} checks passed"; EXIT_CODE=0
fi
bline "$(printf "  %b%-10s%b %b%s%b  %b" "$BOLD" "OVERALL" "$RST" "$DIM" "$od" "$RST" "$ob")"

# Print a detail table listing every FAIL then every WARN with its reason.
# Only rendered when there is something to report.
if [ "${#FAIL_ITEMS[@]}" -gt 0 ] || [ "${#WARN_ITEMS[@]}" -gt 0 ]; then
    bdiv
    bline "$(printf "  %b%-6s  %-10s  %s%b" "$BOLD" "Status" "Item" "Reason" "$RST")"
    bdiv
    _summary_rows "$R" "FAIL" "${FAIL_ITEMS[@]}"
    _summary_rows "$Y" "WARN" "${WARN_ITEMS[@]}"
fi

# Close the summary box, then exit with the rolled-up code
# (0 when EXIT_CODE was never set).
box_close "${C}"
exit "${EXIT_CODE:-0}"