Release: monitoring, wiki, and infrastructure consolidation #9

Merged
jmiller merged 46 commits from dev into main 2026-05-16 13:49:51 +00:00
17 changed files with 939 additions and 238 deletions
+1 -1
View File
@@ -62,7 +62,7 @@ jobs:
API="${GITEA_URL}/api/v1"
# Platform/standards/infra repos to exclude
EXCLUDE="gitea-org-config org-profile gitea-private gitea-server-setup MokoStandards MokoStandards-API MokoTesting"
EXCLUDE="gitea-org-config org-profile gitea-private .mokogitea-private MokoStandards MokoStandards-API MokoTesting"
EXCLUDE="$EXCLUDE MokoStandards-Template-Client MokoStandards-Template-Dolibarr MokoStandards-Template-Generic MokoStandards-Template-Joomla MokoDoliProjTemplate"
if [ -n "${{ inputs.repos }}" ]; then
+1 -1
View File
@@ -61,7 +61,7 @@ jobs:
run: |
API="${GITEA_URL}/api/v1"
EXCLUDE="gitea-org-config org-profile gitea-private gitea-server-setup MokoStandards MokoStandards-API MokoTesting"
EXCLUDE="gitea-org-config org-profile gitea-private .mokogitea-private MokoStandards MokoStandards-API MokoTesting"
EXCLUDE="$EXCLUDE MokoStandards-Template-Client MokoStandards-Template-Dolibarr MokoStandards-Template-Generic MokoStandards-Template-Joomla MokoDoliProjTemplate"
if [ -n "${{ inputs.repos }}" ]; then
+678
View File
@@ -0,0 +1,678 @@
#!/usr/bin/env bash
# server-autoheal.sh - Auto-heal on restart + split backup management
#
# Copyright (C) 2026 Moko Consulting <hello@mokoconsulting.tech>
# SPDX-License-Identifier: GPL-3.0-or-later
#
# DEFGROUP: MokoStandards.Automation.ServerAutoheal
# INGROUP: MokoStandards.Automation
# REPO: https://git.mokoconsulting.tech/MokoConsulting/moko-platform
# PATH: /automation/server-autoheal.sh
# BRIEF: Server auto-heal on unclean restart + split system/content backups
#
# Usage:
# server-autoheal.sh <command> [options]
#
# Commands:
# boot-check Run at boot — auto-heals if no safe point exists
# set-safepoint Mark current state as safe (call before planned shutdown)
# clear-safepoint Remove the safe point marker
# backup-system Run a system backup (configs, packages, services)
# backup-content Run a content backup (site files, databases, uploads)
# cleanup Prune expired backups per retention policy
# status Show safe point and backup status
# install Install the example config, cron schedule and systemd shutdown hook
#
# Scheduling (cron):
# @reboot server-autoheal.sh boot-check
# 0 3 * * * server-autoheal.sh backup-system (daily at 3am)
# 0 */2 * * * server-autoheal.sh backup-content (every 2 hours)
# 30 */2 * * * server-autoheal.sh cleanup (30 min after content backup)
set -euo pipefail
# ──────────────────────────────────────────────
# Configuration — override via /etc/moko/autoheal.conf
# ──────────────────────────────────────────────
CONF_FILE="/etc/moko/autoheal.conf"
# The config file is sourced as shell code, so anything it assigns wins over
# the defaults below (every default uses the ${VAR:-fallback} form).
if [[ -f "$CONF_FILE" ]]; then
  # shellcheck source=/dev/null
  source "$CONF_FILE"
fi
BACKUP_ROOT="${BACKUP_ROOT:-/var/backups/moko}"
# The safe point is written at shutdown and read on the NEXT boot, so it has
# to live on persistent storage. /var/run (-> /run) is a tmpfs on systemd
# hosts and is emptied on every reboot, which made every boot look like a
# crash. BACKUP_ROOT is persistent and is already created by ensure_dirs.
SAFEPOINT_FILE="${SAFEPOINT_FILE:-${BACKUP_ROOT}/safepoint}"
LOG_FILE="${LOG_FILE:-/var/log/moko/autoheal.log}"
# Lock files SHOULD vanish on reboot, so the runtime directory is right here.
LOCK_DIR="${LOCK_DIR:-/var/run/moko}"
# System backup: configs, package lists, service state, cron
SYSTEM_BACKUP_DIR="${BACKUP_ROOT}/system"
SYSTEM_BACKUP_RETAIN="${SYSTEM_BACKUP_RETAIN:-7}" # keep 7 daily system backups
# Content backup: web roots, databases, uploads
CONTENT_BACKUP_DIR="${BACKUP_ROOT}/content"
CONTENT_BACKUP_RETAIN_HOURS="${CONTENT_BACKUP_RETAIN_HOURS:-24}" # 1 day of content backups
# Paths to back up — override these in /etc/moko/autoheal.conf
SYSTEM_PATHS="${SYSTEM_PATHS:-/etc/nginx /etc/php /etc/mysql /etc/cron.d /etc/systemd/system}"
CONTENT_PATHS="${CONTENT_PATHS:-/var/www}"
DB_NAMES="${DB_NAMES:-}" # space-separated list, empty = auto-detect all
# ──────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────
# Write one timestamped log line to stderr and append it to $LOG_FILE.
# Arguments: $1 - level (INFO/WARN/ERROR), remaining args - message text
# Outputs:   "[<UTC ISO-8601>] [<level>] <message>" on stderr and in $LOG_FILE
# Returns:   0 even when the log file cannot be written
log() {
  local level="$1"; shift
  local ts msg
  ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
  msg="[$ts] [$level] $*"
  printf '%s\n' "$msg" >&2
  # Appending to the file is best-effort: under `set -euo pipefail` a failed
  # write (missing directory, read-only disk) would otherwise abort whatever
  # operation was merely trying to report its progress.
  { printf '%s\n' "$msg" >> "$LOG_FILE"; } 2>/dev/null || true
}
# Create every directory the script writes into.
# Globals: SYSTEM_BACKUP_DIR, CONTENT_BACKUP_DIR, LOCK_DIR, LOG_FILE,
#          SAFEPOINT_FILE (all read)
ensure_dirs() {
  # The safe point's parent is included explicitly: SAFEPOINT_FILE can be
  # overridden independently of LOCK_DIR, and writing the marker fails if
  # its directory does not exist.
  mkdir -p "$SYSTEM_BACKUP_DIR" "$CONTENT_BACKUP_DIR" \
    "$LOCK_DIR" "$(dirname "$LOG_FILE")" "$(dirname "$SAFEPOINT_FILE")"
}
# Take a per-operation PID lock, or exit quietly if a live process holds it.
# Globals:   LOCK_DIR (read)
# Arguments: $1 - operation name, used in the lock file name
# Returns:   0 with the lock held (an EXIT trap removes it);
#            exits 0 if another live process owns the lock;
#            1 if the lock file cannot be created
acquire_lock() {
  local lockfile="${LOCK_DIR}/autoheal-${1}.lock"
  local pid attempt
  for attempt in 1 2; do
    # With noclobber the redirection fails if the file already exists, so
    # creating the lock and claiming it is a single atomic step. The previous
    # "test -f, then write" sequence let two concurrent runs both proceed.
    if ( set -o noclobber; echo "$$" > "$lockfile" ) 2>/dev/null; then
      trap "rm -f '$lockfile'" EXIT
      return 0
    fi
    pid=$(cat "$lockfile" 2>/dev/null) || pid=""
    if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null; then
      log WARN "Another $1 operation is running (PID $pid), skipping"
      exit 0
    fi
    # Owner is gone (or the file is garbage): drop the stale lock and retry.
    rm -f "$lockfile"
  done
  log ERROR "Could not acquire lock: $lockfile"
  return 1
}
# Print the current UTC time as YYYYMMDD_HHMMSS (used in backup file names).
timestamp() {
  date --utc +'%Y%m%d_%H%M%S'
}
# ──────────────────────────────────────────────
# Safe-point management
# ──────────────────────────────────────────────
# Write the safe point marker: a small key=value file recording when, where
# and by whom the current state was declared safe.
# Globals: SAFEPOINT_FILE (written), SUDO_USER (read, optional)
cmd_set_safepoint() {
  ensure_dirs
  local stamp
  stamp=$(timestamp)
  {
    printf 'timestamp=%s\n' "$stamp"
    printf 'hostname=%s\n' "$(hostname)"
    printf 'kernel=%s\n' "$(uname -r)"
    # `uptime -s` is not available everywhere; record "unknown" instead
    printf 'uptime=%s\n' "$(uptime -s 2>/dev/null || echo "unknown")"
    printf 'set_by=%s\n' "${SUDO_USER:-$(whoami)}"
  } > "$SAFEPOINT_FILE"
  log INFO "Safe point set at $stamp by ${SUDO_USER:-$(whoami)}"
}
# Delete the safe point marker (no error if it is already absent) and log it.
# Globals: SAFEPOINT_FILE (read)
cmd_clear_safepoint() {
  local marker="$SAFEPOINT_FILE"
  rm -f "$marker"
  log INFO "Safe point cleared"
}
# Succeed (status 0) when the safe point marker exists as a regular file.
# Globals: SAFEPOINT_FILE (read)
has_safepoint() {
  test -f "$SAFEPOINT_FILE"
}
# ──────────────────────────────────────────────
# System backup (daily)
# ──────────────────────────────────────────────
# Archive the configured system paths and write a manifest (package list,
# enabled services, crontabs) next to the archive, then prune old backups.
# Globals: SYSTEM_BACKUP_DIR, SYSTEM_PATHS, SYSTEM_BACKUP_RETAIN (read)
# Returns: 0 on success; 1 if no path exists or tar fails fatally
cmd_backup_system() {
  ensure_dirs
  acquire_lock "system-backup"
  local ts
  ts=$(timestamp)
  local archive="${SYSTEM_BACKUP_DIR}/system_${ts}.tar.gz"
  local manifest="${SYSTEM_BACKUP_DIR}/system_${ts}.manifest"
  log INFO "Starting system backup → $archive"

  # Collect existing paths only. SYSTEM_PATHS is a space-separated list, so
  # the unquoted expansion (word splitting) is intentional.
  local existing_paths=()
  local p
  for p in $SYSTEM_PATHS; do
    if [[ -e "$p" ]]; then
      existing_paths+=("$p")
    fi
  done
  if [[ ${#existing_paths[@]} -eq 0 ]]; then
    log WARN "No system paths found to back up"
    return 1
  fi

  # Archive configs and system files. GNU tar exits 1 when a file changed
  # while being read (archive still usable) and 2 on a fatal error; a fatal
  # error must not be reported as a completed backup.
  local tar_rc=0
  tar -czf "$archive" "${existing_paths[@]}" 2>/dev/null || tar_rc=$?
  if [[ $tar_rc -ge 2 ]]; then
    rm -f "$archive"
    log ERROR "System backup failed (tar exit $tar_rc): $archive"
    return 1
  elif [[ $tar_rc -eq 1 ]]; then
    log WARN "Some files changed while being archived (tar exit 1)"
  fi

  # Capture package list and service state as manifest
  local user_home
  {
    echo "=== PACKAGES ==="
    if command -v dpkg &>/dev/null; then
      dpkg --get-selections
    elif command -v rpm &>/dev/null; then
      rpm -qa --qf '%{NAME}\t%{VERSION}\n'
    fi
    echo ""
    echo "=== ENABLED SERVICES ==="
    if command -v systemctl &>/dev/null; then
      systemctl list-unit-files --state=enabled --no-pager 2>/dev/null || true
    fi
    echo ""
    echo "=== CRONTABS ==="
    for user_home in /var/spool/cron/crontabs/*; do
      [[ -f "$user_home" ]] && echo "--- $(basename "$user_home") ---" && cat "$user_home"
    done 2>/dev/null || true
  } > "$manifest"

  local size
  size=$(du -sh "$archive" 2>/dev/null | cut -f1)
  log INFO "System backup complete: $archive ($size)"

  # Prune old system backups (keep $SYSTEM_BACKUP_RETAIN), oldest first.
  # `cut -f2-` keeps the whole path, so directories with spaces survive.
  local count
  count=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' | wc -l)
  if [[ "$count" -gt "$SYSTEM_BACKUP_RETAIN" ]]; then
    local to_remove=$((count - SYSTEM_BACKUP_RETAIN))
    local f
    find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
      | sort | head -n "$to_remove" | cut -d' ' -f2- \
      | while IFS= read -r f; do
          rm -f "$f" "${f%.tar.gz}.manifest"
          log INFO "Pruned old system backup: $f"
        done
  fi
}
# ──────────────────────────────────────────────
# Content backup (every 2 hours)
# ──────────────────────────────────────────────
# Archive the content paths and dump the databases into CONTENT_BACKUP_DIR.
# Globals: CONTENT_BACKUP_DIR, CONTENT_PATHS, DB_NAMES (read)
# Returns: 0 on success; 1 if the file archive or the database dump failed
# Note:    dump failure detection relies on `set -o pipefail` (set at the top
#          of this script).
cmd_backup_content() {
  ensure_dirs
  acquire_lock "content-backup"
  local ts
  ts=$(timestamp)
  local archive="${CONTENT_BACKUP_DIR}/content_${ts}.tar.gz"
  local db_dump="${CONTENT_BACKUP_DIR}/content_${ts}.sql.gz"
  local failed=0
  log INFO "Starting content backup → $archive"

  # Back up web content / uploads. CONTENT_PATHS is a space-separated list,
  # so the unquoted expansion is intentional.
  local existing_paths=()
  local p
  for p in $CONTENT_PATHS; do
    if [[ -e "$p" ]]; then
      existing_paths+=("$p")
    fi
  done
  if [[ ${#existing_paths[@]} -gt 0 ]]; then
    # GNU tar: exit 1 = files changed while reading (archive usable),
    # exit 2 = fatal. A fatal error must not be logged as a good backup.
    local tar_rc=0
    tar -czf "$archive" "${existing_paths[@]}" 2>/dev/null || tar_rc=$?
    if [[ $tar_rc -ge 2 ]]; then
      rm -f "$archive"
      log ERROR "Content archive failed (tar exit $tar_rc): $archive"
      failed=1
    else
      if [[ $tar_rc -eq 1 ]]; then
        log WARN "Some files changed while being archived (tar exit 1)"
      fi
      local size
      size=$(du -sh "$archive" 2>/dev/null | cut -f1)
      log INFO "Content files archived: $archive ($size)"
    fi
  else
    log WARN "No content paths found to back up"
  fi

  # Database dump
  if command -v mysqldump &>/dev/null || command -v mariadb-dump &>/dev/null; then
    # Pick the dump tool and its matching client explicitly. Deriving the
    # client with ${dump_cmd%dump} turned "mariadb-dump" into "mariadb-",
    # which is not a command, so auto-detection found nothing on MariaDB.
    local dump_cmd="mysqldump"
    local client_cmd="mysql"
    if command -v mariadb-dump &>/dev/null; then
      dump_cmd="mariadb-dump"
      if command -v mariadb &>/dev/null; then
        client_cmd="mariadb"
      fi
    fi

    local databases=()
    if [[ -n "$DB_NAMES" ]]; then
      read -ra databases <<< "$DB_NAMES"
    else
      # Auto-detect: dump all databases except system ones
      local db
      while IFS= read -r db; do
        if [[ -n "$db" ]]; then
          databases+=("$db")
        fi
      done < <("$client_cmd" -N -e \
        "SELECT schema_name FROM information_schema.schemata
         WHERE schema_name NOT IN ('information_schema','performance_schema','mysql','sys')" \
        2>/dev/null || true)
    fi

    if [[ ${#databases[@]} -gt 0 ]]; then
      if "$dump_cmd" --single-transaction --routines --triggers \
          --databases "${databases[@]}" 2>/dev/null \
          | gzip > "$db_dump"; then
        local db_size
        db_size=$(du -sh "$db_dump" 2>/dev/null | cut -f1)
        log INFO "Database dump complete: $db_dump ($db_size)"
      else
        # Never leave a truncated dump behind: the restore path picks the
        # newest content_*.sql.gz and would load it.
        rm -f "$db_dump"
        log ERROR "Database dump failed — removed partial $db_dump"
        failed=1
      fi
    else
      log WARN "No databases found to dump"
    fi
  fi
  return "$failed"
}
# ──────────────────────────────────────────────
# Cleanup — prune content backups older than retention
# ──────────────────────────────────────────────
# Prune expired backups: content backups by age, system backups by count.
# Globals: CONTENT_BACKUP_DIR, CONTENT_BACKUP_RETAIN_HOURS,
#          SYSTEM_BACKUP_DIR, SYSTEM_BACKUP_RETAIN (read)
cmd_cleanup() {
  ensure_dirs
  local before_count after_count

  # Content: delete every content_* file older than the retention window
  before_count=$(find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f | wc -l)
  find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f \
    -mmin +$((CONTENT_BACKUP_RETAIN_HOURS * 60)) -delete 2>/dev/null || true
  after_count=$(find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f | wc -l)
  local removed=$((before_count - after_count))
  if [[ "$removed" -gt 0 ]]; then
    log INFO "Pruned $removed content backup(s) older than ${CONTENT_BACKUP_RETAIN_HOURS}h"
  fi

  # System: keep the N most recent archives (also done by backup-system).
  # Count archives, not files: counting archives and manifests together let
  # a missing manifest hide an excess archive, and could delete an archive
  # while keeping its manifest. Each archive goes together with its manifest.
  local archive_count
  archive_count=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -type f | wc -l)
  if [[ "$archive_count" -gt "$SYSTEM_BACKUP_RETAIN" ]]; then
    local excess=$((archive_count - SYSTEM_BACKUP_RETAIN))
    local stale
    find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -type f -printf '%T+ %p\n' \
      | sort | head -n "$excess" | cut -d' ' -f2- \
      | while IFS= read -r stale; do
          rm -f "$stale" "${stale%.tar.gz}.manifest"
        done
    log INFO "Pruned excess system backups"
  fi
  log INFO "Cleanup complete"
}
# ──────────────────────────────────────────────
# Boot check — the auto-heal entry point
# ──────────────────────────────────────────────
# Boot-time entry point: decide whether the last shutdown was clean and run
# the auto-heal sequence if it was not.
# Returns: 0 on a clean restart or a successful heal; auto_heal's non-zero
#          status when healing failed
cmd_boot_check() {
  ensure_dirs
  acquire_lock "boot-check"
  log INFO "=== Boot check started ==="
  log INFO "Hostname: $(hostname), Kernel: $(uname -r)"
  if has_safepoint; then
    log INFO "Safe point found — server was shut down cleanly"
    log INFO "Clearing safe point for next cycle"
    cmd_clear_safepoint
    log INFO "=== Boot check passed (clean restart) ==="
    return 0
  fi
  log WARN "NO safe point found — server restarted without clean shutdown"
  log WARN "Initiating auto-heal sequence..."

  # Capture the status with `||`: under `set -e` a bare failing call exits
  # the script on the spot, so the FAILED message below was never logged.
  # Side effect: errexit is suspended inside auto_heal, so one failing
  # sub-step no longer aborts the remaining heal phases.
  local rc=0
  auto_heal || rc=$?

  # No safe point is written after a heal. The marker means "the last
  # shutdown was clean"; writing it on a running server would make a crash
  # that follows the heal look like a clean restart on the next boot.
  if [[ $rc -eq 0 ]]; then
    log INFO "=== Boot check complete (healed successfully) ==="
  else
    log ERROR "=== Boot check FAILED — manual intervention required ==="
  fi
  return $rc
}
# ──────────────────────────────────────────────
# Auto-heal strategy
#
# This is the core decision point. The implementation below follows the
# hybrid strategy described under the trade-offs; adapt the individual
# heal sub-steps to match your server's architecture.
#
# Trade-offs to consider:
# - Restore-from-backup: safest, but content may be up to 2h stale
# - Service-restart-only: faster, keeps current data, but won't fix
# corrupted configs or broken filesystem state
# - Hybrid: restart services first, verify health, only restore if
# health checks fail — best of both worlds but more complex
#
# The function receives no arguments. Use the latest system + content
# backups to restore if needed. Return 0 on success, 1 on failure.
# ──────────────────────────────────────────────
# Run the heal phases in order: repair, conditional config restore, service
# restart, health check. A full restore from backup is attempted only when
# the first health check fails.
# Returns: 0 when the server is healthy at the end, 1 otherwise
auto_heal() {
  log INFO "Phase 1: Verify and repair filesystem"
  # Check for common post-crash issues
  repair_filesystem

  log INFO "Phase 2: Restore system configuration if corrupted"
  restore_system_if_needed

  log INFO "Phase 3: Restart core services"
  restart_services

  log INFO "Phase 4: Verify health"
  if verify_health; then
    log INFO "Auto-heal completed successfully"
    return 0
  fi

  # Services alone did not recover: fall back to the latest backups
  log WARN "Health check failed after service restart — restoring from backup"
  restore_from_backup
  restart_services
  if ! verify_health; then
    log ERROR "Health check still failing after restore — giving up"
    return 1
  fi

  log INFO "Auto-heal completed successfully"
  return 0
}
# ──────────────────────────────────────────────
# Heal sub-steps
# ──────────────────────────────────────────────
# Clean up leftovers of an unclean shutdown: PID files whose process is gone,
# and ownership/mode of a few runtime and temp directories.
repair_filesystem() {
  # PID files that can block a service from starting; the last entry is a
  # glob and may expand to several files (or stay literal if none match)
  local pid_globs=(
    /var/run/nginx.pid
    /var/run/mysqld/mysqld.pid
    /var/run/php-fpm.pid
    /var/lib/mysql/*.pid
  )
  local pattern pidfile owner_pid
  for pattern in "${pid_globs[@]}"; do
    for pidfile in $pattern; do
      [[ -f "$pidfile" ]] || continue
      owner_pid=$(<"$pidfile") 2>/dev/null || true
      # Keep the file when it is empty or its process is still alive
      if [[ -z "$owner_pid" ]] || kill -0 "$owner_pid" 2>/dev/null; then
        continue
      fi
      rm -f "$pidfile"
      log INFO "Removed stale PID file: $pidfile"
    done
  done

  # Fix permissions on critical dirs that may get mangled (best-effort)
  if [[ -d /var/run/mysqld ]]; then
    chown mysql:mysql /var/run/mysqld 2>/dev/null || true
  fi
  if [[ -d /var/lib/php/sessions ]]; then
    chmod 1733 /var/lib/php/sessions 2>/dev/null || true
  fi

  # Repair tmp/cache dirs
  local tmp_dir
  for tmp_dir in /tmp /var/tmp; do
    if [[ -d "$tmp_dir" ]]; then
      chmod 1777 "$tmp_dir" 2>/dev/null || true
    fi
  done
}
# Restore the newest system archive over / when a critical config file
# exists but is empty. Directories and missing paths never trigger a restore.
# Globals: SYSTEM_BACKUP_DIR (read)
# Returns: 0 when nothing had to be done or the restore worked; 1 when the
#          extraction failed
restore_system_if_needed() {
  # Newest archive by modification time
  local newest_archive
  newest_archive=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
    2>/dev/null | sort -r | head -1 | awk '{print $2}')
  if [[ -z "$newest_archive" ]]; then
    log WARN "No system backup available to verify against"
    return 0
  fi

  # Stop at the first critical config that is a zero-length regular file
  local damaged=false
  local candidate
  for candidate in "/etc/nginx/nginx.conf" "/etc/php" "/etc/mysql"; do
    if [[ -e "$candidate" && -f "$candidate" && ! -s "$candidate" ]]; then
      log WARN "Critical config is empty: $candidate"
      damaged=true
      break
    fi
  done

  if [[ "$damaged" != true ]]; then
    log INFO "System configs look intact — skipping restore"
  else
    log WARN "Restoring system config from $newest_archive"
    if ! tar -xzf "$newest_archive" -C / 2>/dev/null; then
      log ERROR "System restore failed from $newest_archive"
      return 1
    fi
    log INFO "System config restored"
  fi
}
# Restart every enabled service from a fixed list of database, web and PHP
# units. A failed restart is logged and does not stop the loop.
# Returns: 0 when systemctl is unavailable (nothing to do)
restart_services() {
  if ! command -v systemctl &>/dev/null; then
    log WARN "systemctl not available — skipping service restart"
    return 0
  fi
  local services=("mysql" "mariadb" "nginx" "apache2" "php-fpm" "php8.1-fpm" "php8.2-fpm" "php8.3-fpm")
  local svc
  for svc in "${services[@]}"; do
    # Units that are not enabled on this host are skipped silently
    systemctl is-enabled "$svc" &>/dev/null || continue
    log INFO "Restarting $svc..."
    # Explicit if/else: with `restart && log OK || log WARN` the WARN branch
    # also ran whenever the OK log call itself returned non-zero.
    if systemctl restart "$svc" 2>/dev/null; then
      log INFO "$svc restarted OK"
    else
      log WARN "$svc restart failed"
    fi
  done
}
# Check that enabled core services are active, the local web server answers
# and the database responds to a ping. Every failed check is logged.
# Returns: 0 when all checks pass, 1 when at least one failed
verify_health() {
  local failures=0
  local svc

  # Check critical services are running (only those enabled on this host)
  local services=("mysql" "mariadb" "nginx" "apache2")
  for svc in "${services[@]}"; do
    if systemctl is-enabled "$svc" &>/dev/null \
        && ! systemctl is-active "$svc" &>/dev/null; then
      log WARN "Service not running: $svc"
      # Plain assignment instead of ((failures++)): the arithmetic command
      # returns status 1 when the old value is 0, which terminates the script
      # under `set -e` whenever this function runs outside a condition.
      failures=$((failures + 1))
    fi
  done

  # Check if web server responds
  if command -v curl &>/dev/null; then
    if ! curl -sf -o /dev/null --max-time 10 "http://localhost/" 2>/dev/null; then
      log WARN "Local web server not responding"
      failures=$((failures + 1))
    fi
  fi

  # Check if database accepts connections
  if command -v mysqladmin &>/dev/null; then
    if ! mysqladmin ping --silent 2>/dev/null; then
      log WARN "Database not responding to ping"
      failures=$((failures + 1))
    fi
  fi

  [[ $failures -eq 0 ]]
}
# Restore the newest system archive, content archive and database dump, in
# that order. Each step is independent: a failure is logged and the next step
# still runs.
# Globals: SYSTEM_BACKUP_DIR, CONTENT_BACKUP_DIR (read)
restore_from_backup() {
  log WARN "=== Full restore from backup ==="

  # Restore system config (archives are extracted relative to /)
  local sys_archive
  sys_archive=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
    2>/dev/null | sort -r | head -1 | awk '{print $2}')
  if [[ -n "$sys_archive" ]]; then
    log INFO "Restoring system from $sys_archive"
    if ! tar -xzf "$sys_archive" -C / 2>/dev/null; then
      log ERROR "System restore failed"
    fi
  fi

  # Restore content
  local content_archive
  content_archive=$(find "$CONTENT_BACKUP_DIR" -name 'content_*.tar.gz' -printf '%T+ %p\n' \
    2>/dev/null | sort -r | head -1 | awk '{print $2}')
  if [[ -n "$content_archive" ]]; then
    log INFO "Restoring content from $content_archive"
    if ! tar -xzf "$content_archive" -C / 2>/dev/null; then
      log ERROR "Content restore failed"
    fi
  fi

  # Restore database: feed the newest dump to the SQL client
  local sql_dump
  sql_dump=$(find "$CONTENT_BACKUP_DIR" -name 'content_*.sql.gz' -printf '%T+ %p\n' \
    2>/dev/null | sort -r | head -1 | awk '{print $2}')
  if [[ -n "$sql_dump" ]]; then
    log INFO "Restoring database from $sql_dump"
    local sql_client="mysql"
    if command -v mariadb &>/dev/null; then
      sql_client="mariadb"
    fi
    if ! zcat "$sql_dump" | "$sql_client" 2>/dev/null; then
      log ERROR "Database restore failed"
    fi
  fi
}
# ──────────────────────────────────────────────
# Status
# ──────────────────────────────────────────────
# Print a human-readable report: safe point state, number and newest entry
# of the system and content backups, and disk usage of both backup dirs.
# Globals: SAFEPOINT_FILE, SYSTEM_BACKUP_DIR, SYSTEM_BACKUP_RETAIN,
#          CONTENT_BACKUP_DIR, CONTENT_BACKUP_RETAIN_HOURS (read)
cmd_status() {
  printf '%s\n' "=== Moko Server Auto-Heal Status ===" ""

  # Safe point
  if ! has_safepoint; then
    echo "Safe point: NOT SET (will auto-heal on next boot)"
  else
    echo "Safe point: SET"
    sed 's/^/ /' "$SAFEPOINT_FILE"
  fi
  echo ""

  # System backups
  echo "System backups (${SYSTEM_BACKUP_DIR}):"
  local system_total
  system_total=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' 2>/dev/null | wc -l)
  echo " Count: $system_total (retain $SYSTEM_BACKUP_RETAIN)"
  # "<mtime> <path>" of the newest archive, or empty when there is none
  local system_newest
  system_newest=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
    2>/dev/null | sort -r | head -1)
  if [[ -z "$system_newest" ]]; then
    echo " Latest: (none)"
  else
    echo " Latest: $(awk '{print $2}' <<< "$system_newest")"
    echo " Timestamp: $(awk '{print $1}' <<< "$system_newest")"
  fi
  echo ""

  # Content backups
  echo "Content backups (${CONTENT_BACKUP_DIR}):"
  local content_total
  content_total=$(find "$CONTENT_BACKUP_DIR" -name 'content_*.tar.gz' 2>/dev/null | wc -l)
  echo " Count: $content_total (retain ${CONTENT_BACKUP_RETAIN_HOURS}h)"
  local content_newest
  content_newest=$(find "$CONTENT_BACKUP_DIR" -name 'content_*.tar.gz' -printf '%T+ %p\n' \
    2>/dev/null | sort -r | head -1)
  if [[ -z "$content_newest" ]]; then
    echo " Latest: (none)"
  else
    echo " Latest: $(awk '{print $2}' <<< "$content_newest")"
    echo " Timestamp: $(awk '{print $1}' <<< "$content_newest")"
  fi
  echo ""

  # Disk usage
  echo "Backup disk usage:"
  du -sh "$SYSTEM_BACKUP_DIR" "$CONTENT_BACKUP_DIR" 2>/dev/null | sed 's/^/ /'
}
# ──────────────────────────────────────────────
# Install helper — sets up cron + systemd
# ──────────────────────────────────────────────
# Install helper: writes an example config (only if none exists), the cron
# schedule and a systemd unit whose ExecStop records the safe point on a
# clean shutdown. Needs write access to /etc and a working systemctl.
# Globals: CONF_FILE, LOG_FILE, LOCK_DIR (read)
cmd_install() {
  local script_path
  script_path=$(readlink -f "$0")
  echo "Installing Moko Auto-Heal..."

  # Create config directory
  mkdir -p /etc/moko "$(dirname "$LOG_FILE")" "$LOCK_DIR"

  # Write example config if none exists (quoted delimiter: no expansion)
  if [[ ! -f "$CONF_FILE" ]]; then
    cat > "$CONF_FILE" <<'CONF'
# /etc/moko/autoheal.conf — Server auto-heal configuration
# Uncomment and modify as needed
# BACKUP_ROOT="/var/backups/moko"
# SAFEPOINT_FILE="/var/run/moko/safepoint"
# LOG_FILE="/var/log/moko/autoheal.log"
# System backup paths (space-separated)
# SYSTEM_PATHS="/etc/nginx /etc/php /etc/mysql /etc/cron.d /etc/systemd/system"
# Content backup paths (space-separated)
# CONTENT_PATHS="/var/www"
# Database names (space-separated, empty = auto-detect all)
# DB_NAMES=""
# Retention
# SYSTEM_BACKUP_RETAIN=7 # daily backups to keep
# CONTENT_BACKUP_RETAIN_HOURS=24 # hours of content backups to keep
CONF
    echo " Created config: $CONF_FILE"
  fi

  # Install cron jobs (unquoted delimiter: ${script_path} is expanded)
  local cron_file="/etc/cron.d/moko-autoheal"
  cat > "$cron_file" <<CRON
# Moko Server Auto-Heal — managed by server-autoheal.sh install
SHELL=/bin/bash
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# Boot check — auto-heal if no safe point
@reboot root ${script_path} boot-check
# System backup — daily at 3:00 AM
0 3 * * * root ${script_path} backup-system
# Content backup — every 2 hours
0 */2 * * * root ${script_path} backup-content
# Cleanup expired backups — 30 min after each content backup
30 */2 * * * root ${script_path} cleanup
CRON
  echo " Installed cron: $cron_file"

  # Install shutdown hook to set safe point on clean shutdown.
  # The unit keeps systemd's default dependencies on purpose: they add
  # Conflicts=/Before=shutdown.target, which is what makes systemd stop the
  # unit (and so run ExecStop) during shutdown, and they order it after
  # basic.target so local filesystems are still mounted at that point.
  # With DefaultDependencies=no and no explicit Conflicts= the unit is not
  # part of the shutdown transaction and ExecStop is not run.
  local shutdown_hook="/etc/systemd/system/moko-safepoint.service"
  cat > "$shutdown_hook" <<UNIT
[Unit]
Description=Moko Safe Point — mark clean shutdown
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/true
ExecStop=${script_path} set-safepoint
[Install]
WantedBy=multi-user.target
UNIT
  systemctl daemon-reload
  # --now also starts the unit: ExecStop only runs for a unit that is active,
  # so a merely enabled unit would miss the first shutdown after install.
  systemctl enable --now moko-safepoint.service
  echo " Installed systemd hook: $shutdown_hook"
  echo ""
  echo "Done! Edit $CONF_FILE to configure paths for your server."
  echo "Run '${script_path} status' to verify."
}
# ──────────────────────────────────────────────
# Main dispatcher
# ──────────────────────────────────────────────
# Command dispatcher: maps the first CLI argument to its cmd_* function.
# Arguments: $1 - command name (defaults to "help")
# Returns:   the handler's status; exits 1 on an unknown command
main() {
  local action="${1:-help}"
  case "$action" in
    boot-check|set-safepoint|clear-safepoint|backup-system|backup-content|cleanup|status|install)
      # Handler name is the command with dashes turned into underscores
      local handler="cmd_${action//-/_}"
      "$handler"
      ;;
    help|--help|-h)
      # Print the header comment block of this script, then the command list
      sed -n '2,/^$/s/^# //p' "$0"
      printf '%s\n' \
        "" \
        "Commands: boot-check, set-safepoint, clear-safepoint," \
        " backup-system, backup-content, cleanup, status, install"
      ;;
    *)
      {
        echo "Unknown command: $action"
        echo "Run '$0 help' for usage"
      } >&2
      exit 1
      ;;
  esac
}

main "$@"
+186
View File
@@ -0,0 +1,186 @@
# Shared bridge network joined by the services that list `monitoring`
networks:
monitoring:
driver: bridge
# Named volumes holding Prometheus and Grafana data
volumes:
prometheus_data: null
grafana_data: null
services:
# Prometheus server; container port 9090 is published on host loopback 9091
prometheus:
image: prom/prometheus:latest
container_name: prometheus
restart: unless-stopped
ports:
- 127.0.0.1:9091:9090
volumes:
# config and targets are mounted read-only; data goes to the named volume
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./targets:/etc/prometheus/targets:ro
- prometheus_data:/prometheus
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
- --storage.tsdb.retention.time=90d
# NOTE(review): presumably enables the HTTP reload/quit endpoints — confirm it is needed
- --web.enable-lifecycle
extra_hosts:
# lets the container address the Docker host as host.docker.internal
- host.docker.internal:host-gateway
networks:
- monitoring
healthcheck:
test:
- CMD
- wget
- -qO-
- http://localhost:9090/-/healthy
interval: 30s
timeout: 5s
retries: 3
# Host metrics exporter; port 9100 is published on host loopback only
node-exporter:
image: prom/node-exporter:latest
container_name: node-exporter
restart: unless-stopped
ports:
- 127.0.0.1:9100:9100
volumes:
# host filesystems are mounted read-only and referenced by the --path.* flags below
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
- /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
- /var/lib/prometheus/node-exporter:/textfile:ro
command:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/rootfs
# `$$` is Compose's escape for a literal `$` (end-of-line anchor in these regexes)
- --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
- --collector.netclass.ignored-devices=^(veth.*|br-.*|docker.*)$$
- --collector.diskstats.device-exclude=^(ram|loop|fd|dm-)\d+$$
- --collector.systemd
- --collector.systemd.unit-include=.+
- --collector.textfile.directory=/textfile
pid: host
security_opt:
- apparmor:unconfined
networks:
- monitoring
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- http://localhost:9100/metrics
interval: 30s
timeout: 5s
retries: 3
# Container metrics (cAdvisor); container port 8080 is published on host loopback 8082
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
container_name: cadvisor
restart: unless-stopped
ports:
- 127.0.0.1:8082:8080
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
# NOTE(review): runs privileged with /dev/kmsg — confirm the full privilege set is required
privileged: true
devices:
- /dev/kmsg
networks:
- monitoring
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- http://localhost:8080/healthz
interval: 30s
timeout: 5s
retries: 3
# nginx metrics exporter; uses the host network and scrapes nginx's status page on 127.0.0.1:8888
nginx-exporter:
image: nginx/nginx-prometheus-exporter:latest
container_name: nginx-exporter
restart: unless-stopped
network_mode: host
command:
- --nginx.scrape-uri=http://127.0.0.1:8888/nginx_status
# NOTE(review): listens on all host interfaces, unlike the other services in this
# file which bind to 127.0.0.1 — confirm port 9113 is firewalled
- --web.listen-address=0.0.0.0:9113
# Grafana UI; container port 3000 is published on host loopback 3001.
# Secrets are no longer stored in this file: Compose substitutes them from the
# environment or an untracked `.env` file next to this compose file, and
# `${VAR:?msg}` makes `docker compose up` fail when one is missing.
# The values that used to be committed here remain in git history and must
# be rotated (Grafana admin password, Google OAuth client secret).
grafana:
  image: grafana/grafana:latest
  container_name: grafana
  restart: unless-stopped
  ports:
    - 127.0.0.1:3001:3000
  environment:
    - GF_SECURITY_ADMIN_USER=jmiller
    # Quote the value in .env if it contains `#` or `$`
    - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:?must be set in .env}
    - GF_SERVER_ROOT_URL=https://bench.mokoconsulting.tech/
    - GF_SERVER_DOMAIN=bench.mokoconsulting.tech
    - GF_AUTH_ANONYMOUS_ENABLED=true
    - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
    - GF_AUTH_ANONYMOUS_ORG_ID=1
    - GF_USERS_ALLOW_SIGN_UP=false
    - GF_USERS_ALLOW_ORG_CREATE=false
    - GF_SECURITY_COOKIE_SECURE=true
    - GF_SECURITY_STRICT_TRANSPORT_SECURITY=true
    - GF_SECURITY_X_CONTENT_TYPE_OPTIONS=true
    - GF_SECURITY_X_XSS_PROTECTION=true
    - GF_LOG_MODE=console
    # NOTE(review): debug logging is verbose — confirm it is intended here
    - GF_LOG_LEVEL=debug
    - GF_USERS_DEFAULT_THEME=dark
    - GF_BRANDING_APP_TITLE=Moko Bench
    - GF_BRANDING_LOGIN_TITLE=Moko Consulting
    - GF_BRANDING_LOGIN_SUBTITLE=Server Performance Dashboard
    - GF_DATE_FORMATS_FULL_DATE=YYYY-MM-DD HH:mm:ss
    - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-polystat-panel,yesoreyeram-infinity-datasource,natel-discrete-panel
    - GF_AUTH_GOOGLE_ENABLED=true
    - GF_AUTH_GOOGLE_CLIENT_ID=349391103517-oiq974b2gq4r3t9f9cf43im31gtruhml.apps.googleusercontent.com
    - GF_AUTH_GOOGLE_CLIENT_SECRET=${GF_AUTH_GOOGLE_CLIENT_SECRET:?must be set in .env}
    - GF_AUTH_GOOGLE_SCOPES=openid email profile
    - GF_AUTH_GOOGLE_AUTH_URL=https://accounts.google.com/o/oauth2/v2/auth
    - GF_AUTH_GOOGLE_TOKEN_URL=https://oauth2.googleapis.com/token
    - GF_AUTH_GOOGLE_ALLOWED_DOMAINS=mokoconsulting.tech
    - GF_AUTH_GOOGLE_ALLOW_SIGN_UP=true
    - GF_AUTH_GOOGLE_AUTO_LOGIN=false
    - GF_AUTH_GOOGLE_SKIP_ORG_ROLE_SYNC=true
    # NOTE(review): combined with Google sign-up above, every new account from
    # the allowed domain becomes an org Admin — confirm this is intended
    - GF_USERS_AUTO_ASSIGN_ORG_ROLE=Admin
  volumes:
    - grafana_data:/var/lib/grafana
    - ./grafana/provisioning:/etc/grafana/provisioning:ro
    - ./grafana/custom.ini:/etc/grafana/grafana.ini:ro
    - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
  networks:
    - monitoring
  depends_on:
    # start only after the Prometheus healthcheck passes
    prometheus:
      condition: service_healthy
  healthcheck:
    test:
      - CMD
      - wget
      - --spider
      - -q
      - http://localhost:3000/api/health
    interval: 30s
    timeout: 5s
    retries: 3
# MySQL/MariaDB metrics exporter; uses the host network and listens on
# loopback port 9104. The database password is substituted by Compose from
# the environment or an untracked `.env` file; `${VAR:?msg}` aborts
# `docker compose up` when it is missing. The password that used to be
# committed here remains in git history and must be rotated.
mysqld-exporter:
  image: prom/mysqld-exporter:latest
  container_name: mysqld-exporter
  restart: unless-stopped
  network_mode: host
  volumes:
    # client settings are read from this read-only mounted file
    - /opt/gitea-server-setup/docker/monitoring/.mysqld-exporter.cnf:/cfg/.my.cnf:ro
  environment:
    MYSQLD_EXPORTER_PASSWORD: "${MYSQLD_EXPORTER_PASSWORD:?must be set in .env}"
  command:
    - --config.my-cnf=/cfg/.my.cnf
    - --web.listen-address=127.0.0.1:9104
  healthcheck:
    test:
      - CMD-SHELL
      - wget -q --spider http://localhost:9104/metrics || exit 1
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 15s
@@ -79,6 +79,10 @@
"refId": "B"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "CPU Usage %",
"type": "timeseries"
}
@@ -22,6 +22,10 @@
"refId": "A"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "CPU Usage by Container",
"type": "timeseries"
}
@@ -22,6 +22,10 @@
"refId": "A"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "Memory Usage by Container",
"type": "timeseries"
}
@@ -38,6 +38,10 @@
"refId": "D"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "Memory Usage",
"type": "timeseries"
}
@@ -28,6 +28,10 @@
"refId": "C"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "Connections",
"type": "timeseries"
}
@@ -59,6 +59,10 @@
"refId": "B"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "Queries per Second",
"type": "timeseries"
}
@@ -69,6 +69,10 @@
"refId": "D"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "Network Traffic",
"type": "timeseries"
}
@@ -33,6 +33,10 @@
"refId": "D"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "Connections Over Time",
"type": "timeseries"
}
@@ -36,6 +36,10 @@
"refId": "A"
}
],
"options": {
"legend": { "displayMode": "list", "placement": "right", "calcs": [] },
"tooltip": { "mode": "multi" }
},
"title": "Request Rate",
"type": "timeseries"
}
@@ -15,7 +15,7 @@
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
"placement": "right"
},
"tooltip": {
"mode": "multi"
@@ -43,7 +43,7 @@
"min"
],
"displayMode": "table",
"placement": "bottom"
"placement": "right"
},
"tooltip": {
"mode": "multi",
@@ -37,7 +37,8 @@
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
"placement": "right",
"calcs": []
},
"tooltip": {
"mode": "single"
+33 -233
View File
@@ -85,7 +85,7 @@
"targets": [
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"expr": "probe_success{site_name=~\"$site\", job=\"blackbox-http\"} and on(site_name) label_replace(joomla_site_online{site=~\"$site\"} == 1, \"site_name\", \"$1\", \"site\", \"(.+)\")",
"expr": "probe_success{site_name=~\"$site\", job=\"blackbox-http\"}",
"instant": true, "format": "table", "refId": "STATUS"
},
{
@@ -100,7 +100,7 @@
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"expr": "label_replace(joomla_site_api_reachable{site=~\"$site\"}, \"site_name\", \"$1\", \"site\", \"(.+)\") and on(site_name) label_replace(joomla_site_online{site=~\"$site\"} == 1, \"site_name\", \"$1\", \"site\", \"(.+)\")",
"expr": "label_replace(joomla_site_api_reachable{site=~\"$site\"}, \"site_name\", \"$1\", \"site\", \"(.+)\")",
"instant": true, "format": "table", "refId": "API"
},
{
@@ -255,22 +255,18 @@
],
"transformations": [
{ "id": "joinByField", "options": { "byField": "site", "mode": "outer" } },
{ "id": "filterFieldsByName", "options": { "include": { "pattern": "^(site_url|version|Value #).*" } } },
{ "id": "filterFieldsByName", "options": { "include": { "pattern": "^(site_url|version|Value #(SYSTEM|EXTUPDATES|TOTAL|ENABLED|DISABLED))$" } } },
{
"id": "organize",
"options": {
"renameByName": {
"site_url": "Site",
"version": "Version",
"Value #VERSION": "v_hidden",
"Value #SYSTEM": "System",
"Value #EXTUPDATES": "Ext Updates",
"Value #TOTAL": "Total",
"Value #ENABLED": "Enabled",
"Value #DISABLED": "Disabled"
},
"excludeByName": {
"v_hidden": true
}
}
},
@@ -294,108 +290,32 @@
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"unit": "s"
}
},
"gridPos": {
"h": 8,
"w": 16,
"x": 0,
"y": 47
},
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": { "defaults": { "unit": "s" } },
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 47 },
"id": 20,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "probe_http_duration_seconds{site_name=~\"$site\", job=\"blackbox-http\", phase=\"transfer\"}",
"legendFormat": "{{site_name}} transfer"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "probe_http_duration_seconds{site_name=~\"$site\", job=\"blackbox-http\", phase=\"processing\"}",
"legendFormat": "{{site_name}} processing"
}
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "probe_http_duration_seconds{site_name=~\"$site\", job=\"blackbox-http\", phase=\"transfer\"}", "legendFormat": "{{site_name}} transfer" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "probe_http_duration_seconds{site_name=~\"$site\", job=\"blackbox-http\", phase=\"processing\"}", "legendFormat": "{{site_name}} processing" }
],
"title": "Response Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"max": 10,
"min": 0,
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 2
},
{
"color": "red",
"value": 5
}
]
},
"unit": "s"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 47
},
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": { "defaults": { "max": 10, "min": 0, "thresholds": { "steps": [{ "color": "green", "value": 0 }, { "color": "yellow", "value": 2 }, { "color": "red", "value": 5 }] }, "unit": "s" } },
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 47 },
"id": 22,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "probe_duration_seconds{site_name=~\"$site\", job=\"blackbox-http\"}",
"legendFormat": "{{site_name}}"
}
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "probe_duration_seconds{site_name=~\"$site\", job=\"blackbox-http\"}", "legendFormat": "{{site_name}}" }
],
"title": "Total Duration",
"type": "gauge"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 55
},
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 },
"id": 104,
"title": "Backup Status",
"type": "row"
@@ -403,78 +323,26 @@
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"noValue": "—",
"custom": { "align": "center", "cellOptions": { "type": "auto" } }
},
"defaults": { "noValue": "—", "custom": { "align": "center", "cellOptions": { "type": "auto" } } },
"overrides": [
{
"matcher": { "id": "byName", "options": "Site" },
"properties": [
{ "id": "custom.width", "value": 300 },
{ "id": "custom.align", "value": "left" },
{ "id": "links", "value": [{ "title": "Manage Backups", "url": "${__value.text}/administrator/index.php?option=com_akeebabackup&view=Manage", "targetBlank": true }] }
]
},
{
"matcher": { "id": "byName", "options": "Status" },
"properties": [
{ "id": "mappings", "value": [{ "options": { "0": { "color": "red", "text": "FAILED" }, "1": { "color": "green", "text": "OK" } }, "type": "value" }] },
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } },
{ "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "basic" } }
]
},
{
"matcher": { "id": "byName", "options": "Age" },
"properties": [
{ "id": "unit", "value": "s" },
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 172800 }, { "color": "red", "value": 604800 }] } },
{ "id": "custom.cellOptions", "value": { "type": "color-text" } },
{ "id": "mappings", "value": [{ "options": { "-1": { "text": "—" } }, "type": "value" }] }
]
},
{
"matcher": { "id": "byName", "options": "Records" },
"properties": [
{ "id": "mappings", "value": [{ "options": { "0": { "text": "—" } }, "type": "value" }] }
]
}
{ "matcher": { "id": "byName", "options": "Site" }, "properties": [{ "id": "custom.width", "value": 300 }, { "id": "custom.align", "value": "left" }, { "id": "links", "value": [{ "title": "Manage Backups", "url": "${__value.text}/administrator/index.php?option=com_akeebabackup&view=Manage", "targetBlank": true }] }] },
{ "matcher": { "id": "byName", "options": "Status" }, "properties": [{ "id": "mappings", "value": [{ "options": { "0": { "color": "red", "text": "FAILED" }, "1": { "color": "green", "text": "OK" } }, "type": "value" }] }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }, { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "basic" } }] },
{ "matcher": { "id": "byName", "options": "Age" }, "properties": [{ "id": "unit", "value": "s" }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 172800 }, { "color": "red", "value": 604800 }] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "mappings", "value": [{ "options": { "-1": { "text": "—" } }, "type": "value" }] }] },
{ "matcher": { "id": "byName", "options": "Records" }, "properties": [{ "id": "mappings", "value": [{ "options": { "0": { "text": "—" } }, "type": "value" }] }] }
]
},
"gridPos": { "x": 0, "y": 56, "w": 24, "h": 8 },
"id": 40,
"options": { "showHeader": true, "cellHeight": "sm", "footer": { "show": false } },
"targets": [
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"expr": "label_replace(max by (site, exported_instance) (joomla_backup_status{site=~\"$site\"}), \"site_url\", \"$1\", \"exported_instance\", \"(.+)\")",
"instant": true, "format": "table", "refId": "STATUS"
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"expr": "joomla_backup_age_seconds{site=~\"$site\"}",
"instant": true, "format": "table", "refId": "AGE"
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"expr": "joomla_backup_records_total{site=~\"$site\"}",
"instant": true, "format": "table", "refId": "RECORDS"
}
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "label_replace(max by (site, exported_instance) (joomla_backup_status{site=~\"$site\"}), \"site_url\", \"$1\", \"exported_instance\", \"(.+)\")", "instant": true, "format": "table", "refId": "STATUS" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "joomla_backup_age_seconds{site=~\"$site\"}", "instant": true, "format": "table", "refId": "AGE" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "joomla_backup_records_total{site=~\"$site\"}", "instant": true, "format": "table", "refId": "RECORDS" }
],
"transformations": [
{ "id": "joinByField", "options": { "byField": "site", "mode": "outer" } },
{ "id": "filterFieldsByName", "options": { "include": { "pattern": "^(site_url|Value #).*" } } },
{
"id": "organize",
"options": {
"renameByName": {
"site_url": "Site",
"Value #STATUS": "Status",
"Value #AGE": "Age",
"Value #RECORDS": "Records"
}
}
},
{ "id": "organize", "options": { "renameByName": { "site_url": "Site", "Value #STATUS": "Status", "Value #AGE": "Age", "Value #RECORDS": "Records" } } },
{ "id": "sortBy", "options": { "sort": [{ "field": "Site", "desc": false }] } }
],
"title": "Backup Status",
@@ -482,96 +350,31 @@
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 68
},
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 68 },
"id": 103,
"title": "Uptime History",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"lineWidth": 2
},
"max": 1,
"min": 0,
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "yellow",
"value": 0.95
},
{
"color": "green",
"value": 0.99
}
]
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 69
},
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": { "defaults": { "custom": { "fillOpacity": 10, "lineWidth": 2 }, "max": 1, "min": 0, "thresholds": { "steps": [{ "color": "red", "value": 0 }, { "color": "yellow", "value": 0.95 }, { "color": "green", "value": 0.99 }] }, "unit": "percentunit" } },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 69 },
"id": 30,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "single" } },
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "avg_over_time(probe_success{site_name=~\"$site\", job=\"blackbox-http\"}[1h])",
"legendFormat": "{{site_name}}"
}
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "avg_over_time(probe_success{site_name=~\"$site\", job=\"blackbox-http\"}[1h])", "legendFormat": "{{site_name}}" }
],
"title": "Availability (30d)",
"type": "timeseries"
}
],
"refresh": "5m",
"tags": [
"mokowaas",
"joomla",
"endpoints",
"monitoring"
],
"tags": ["mokowaas", "joomla", "endpoints", "monitoring"],
"templating": {
"list": [
{
"current": {
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"current": { "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"includeAll": true,
"label": "Site",
"multi": true,
@@ -583,10 +386,7 @@
}
]
},
"time": {
"from": "now-24h",
"to": "now"
},
"time": { "from": "now-24h", "to": "now" },
"timezone": "browser",
"title": "MokoWaaS",
"uid": "mokowaas",