From e01b072e794fb6186f653343e9fd5a46d3ddc90b Mon Sep 17 00:00:00 2001 From: Manohar Date: Wed, 10 Jun 2026 20:24:59 +0000 Subject: [PATCH] feat: layered system health on the dashboard - bridge GET /tiger/health/system checks each layer a message travels through: host memory/swap (/proc/meminfo), LLM gateway liveliness, OpenClaw container state, per-cron lastStatus. Rolls up to healthy/degraded/critical with human-readable issues. - dashboard /api/health/system proxy (bridge-down itself reported as critical) + HealthBanner on the homepage: invisible while healthy, amber/red expandable strip when not. Polls every 30s. Telegram should never again be the first place a failure shows up. --- bridge/src/routes/health-system.ts | 179 +++++++++++++++++++ dashboard/src/app/api/health/system/route.ts | 21 +++ dashboard/src/app/page.tsx | 4 + dashboard/src/components/health-banner.tsx | 70 ++++++++ 4 files changed, 274 insertions(+) create mode 100644 bridge/src/routes/health-system.ts create mode 100644 dashboard/src/app/api/health/system/route.ts create mode 100644 dashboard/src/components/health-banner.tsx diff --git a/bridge/src/routes/health-system.ts b/bridge/src/routes/health-system.ts new file mode 100644 index 0000000..1be75e4 --- /dev/null +++ b/bridge/src/routes/health-system.ts @@ -0,0 +1,179 @@ +/** + * health-system.ts — GET /tiger/health/system : layered self-diagnosis + * + * Born from a real incident: "⚠️ Tiger timed out or is offline" appeared on + * Telegram with no way to see WHY from the dashboard. This endpoint checks + * each layer a message travels through and names the broken one, so the + * dashboard can show the cause, not just the symptom. + * + * Layers checked (cheap, parallel; worst case bounded by the 4s gateway and 5s docker timeouts): + * memory host RAM + swap from /proc/meminfo (bridge runs on the host) + * gateway LiteLLM liveliness — if this is down, EVERY agent is down + * openclaw container running? 
(docker inspect) + * crons lastStatus of every job from cron/jobs.json + * + * Response: + * { ok, verdict: healthy|degraded|critical, issues: string[], checks: {...}, checkedAt } + * + * Thresholds (tuned to this 8GB host's observed failure modes): + * - MemAvailable < 800MB → agent turns crawl, cron timeouts follow + * - swap used > 50% → same, sustained + */ + +import { Router, Request, Response } from "express"; +import { readFileSync } from "fs"; +import { join } from "path"; +import { exec } from "child_process"; +import { promisify } from "util"; + +const execAsync = promisify(exec); +const router = Router(); +/** DATA_DIR is the directory cron/jobs.json is read from (OPENCLAW_DATA_DIR overrides the default volume path). GATEWAY_HEALTH_URL is LLM_GATEWAY_URL, or its default, with a trailing /v1 removed and /health/liveliness appended. */ +const DATA_DIR = + process.env.OPENCLAW_DATA_DIR || + "/var/lib/docker/volumes/tiger_tiger-config/_data"; +const GATEWAY_HEALTH_URL = + (process.env.LLM_GATEWAY_URL || "https://llm.manohargupta.com/v1").replace(/\/v1\/?$/, "") + + "/health/liveliness"; +/** Limits applied by the route handler: available memory below the floor, or swap usage above the ceiling, is added to issues. */ +const MEM_AVAILABLE_FLOOR_MB = 800; +const SWAP_USED_CEILING_PCT = 50; +/** Host memory figures in MB; swapUsedPct is a rounded percentage and is 0 when swapTotalMb is 0. */ +interface MemoryCheck { + totalMb: number; + availableMb: number; + swapTotalMb: number; + swapUsedMb: number; + swapUsedPct: number; +} +/** Parses /proc/meminfo, converting kB values to rounded MB. Returns null if anything in the read or parse throws. A key that does not match is reported as 0. NOTE(review): a missing MemAvailable line would therefore read as 0MB available and trip the low-memory issue in the handler. */ +function checkMemory(): MemoryCheck | null { + try { + const info = readFileSync("/proc/meminfo", "utf-8"); + const grab = (key: string): number => { + const m = info.match(new RegExp(`^${key}:\\s+(\\d+) kB`, "m")); + return m ? Math.round(parseInt(m[1], 10) / 1024) : 0; + }; + const totalMb = grab("MemTotal"); + const availableMb = grab("MemAvailable"); + const swapTotalMb = grab("SwapTotal"); + const swapFreeMb = grab("SwapFree"); + const swapUsedMb = swapTotalMb - swapFreeMb; + return { + totalMb, + availableMb, + swapTotalMb, + swapUsedMb, + swapUsedPct: swapTotalMb > 0 ? 
Math.round((swapUsedMb / swapTotalMb) * 100) : 0, + }; + } catch { + return null; + } +} +/** Resolves to res.ok for a GET of GATEWAY_HEALTH_URL. The request is aborted after 4000 ms, and any thrown error (abort included) resolves to false. NOTE(review): clearTimeout is not reached when fetch throws, so that timer stays pending until it fires. */ +async function checkGateway(): Promise { + try { + const controller = new AbortController(); + const t = setTimeout(() => controller.abort(), 4000); + const res = await fetch(GATEWAY_HEALTH_URL, { signal: controller.signal }); + clearTimeout(t); + return res.ok; + } catch { + return false; + } +} +/** Runs docker inspect against tiger-openclaw with a 5000 ms exec timeout. Resolves to true only when stdout trims to the string true; any exec error resolves to false. */ +async function checkContainer(): Promise { + try { + const { stdout } = await execAsync( + "docker inspect -f '{{.State.Running}}' tiger-openclaw", + { timeout: 5000 }, + ); + return stdout.trim() === "true"; + } catch { + return false; + } +} +/** Summary of one jobs.json entry, as returned under checks.crons. */ +interface CronCheck { + name: string; + lastStatus: string; + consecutiveErrors: number; +} +/** Reads cron/jobs.json under DATA_DIR and maps each job to a CronCheck. name falls back to id and then unknown; lastStatus is taken from state.lastStatus, then the top-level lastStatus, then unknown; consecutiveErrors defaults to 0. Returns [] when the file cannot be read or parsed. NOTE(review): that case is indistinguishable from having no jobs and raises no issue. */ +function checkCrons(): CronCheck[] { + try { + const raw = JSON.parse( + readFileSync(join(DATA_DIR, "cron", "jobs.json"), "utf-8"), + ) as { jobs?: Array> }; + return (raw.jobs ?? []).map((j) => ({ + name: String(j.name ?? j.id ?? "unknown"), + lastStatus: String(j.state?.lastStatus ?? j.lastStatus ?? "unknown"), + consecutiveErrors: Number(j.state?.consecutiveErrors ?? 
0), + })); + } catch { + return []; + } +} +/** Handler for GET on this router root. The memory, gateway and container checks are awaited together; cron state is read afterwards. containerUp or gatewayUp being false sets the verdict to critical. Memory and cron findings go through degrade(), which only moves healthy to degraded and never lowers critical. The JSON body always carries ok: true. */ +router.get("/", async (_req: Request, res: Response) => { + const [memory, gatewayUp, containerUp] = await Promise.all([ + Promise.resolve(checkMemory()), + checkGateway(), + checkContainer(), + ]); + const crons = checkCrons(); + + const issues: string[] = []; + let verdict: "healthy" | "degraded" | "critical" = "healthy"; + const degrade = () => { if (verdict === "healthy") verdict = "degraded"; }; + + if (!containerUp) { + verdict = "critical"; + issues.push("OpenClaw container is not running — Tiger is offline"); + } + if (!gatewayUp) { + verdict = "critical"; + issues.push("LLM gateway unreachable — every agent turn will fail"); + } + if (memory) { + if (memory.availableMb < MEM_AVAILABLE_FLOOR_MB) { + degrade(); + issues.push( + `Low memory: ${memory.availableMb}MB available (floor ${MEM_AVAILABLE_FLOOR_MB}MB) — expect slow turns and cron timeouts`, + ); + } + if (memory.swapUsedPct > SWAP_USED_CEILING_PCT) { + degrade(); + issues.push( + `Heavy swapping: ${memory.swapUsedMb}MB (${memory.swapUsedPct}%) of swap in use`, + ); + } + } else { + degrade(); + issues.push("Could not read /proc/meminfo"); + } + for (const c of crons) { + if (c.lastStatus === "error") { + degrade(); + issues.push( + `Cron "${c.name}" last run failed${c.consecutiveErrors > 1 ? ` (${c.consecutiveErrors} consecutive)` : ""}`, + ); + } + } + + res.json({ + ok: true, + verdict, + issues, + checks: { + memory, + gateway: { up: gatewayUp, url: GATEWAY_HEALTH_URL }, + openclaw: { running: containerUp }, + crons, + }, + checkedAt: new Date().toISOString(), + }); +}); + +export default router; diff --git a/dashboard/src/app/api/health/system/route.ts b/dashboard/src/app/api/health/system/route.ts new file mode 100644 index 0000000..041b7be --- /dev/null +++ b/dashboard/src/app/api/health/system/route.ts @@ -0,0 +1,21 @@ +/** /api/health/system — proxy for the bridge's layered self-diagnosis. 
*/ +import { NextResponse } from "next/server" +import { bridgeGet } from "@/lib/bridge" + +export const dynamic = "force-dynamic" +/** Returns whatever bridgeGet yields for /tiger/health/system as JSON. When bridgeGet throws, responds with a synthetic critical verdict (empty checks) instead of rethrowing. */ +export async function GET() { + try { + const data = await bridgeGet("/tiger/health/system") + return NextResponse.json(data) + } catch { + // Bridge itself unreachable IS a finding — surface it as critical. + return NextResponse.json({ + ok: true, + verdict: "critical", + issues: ["Bridge unreachable from dashboard — control plane is down"], + checks: {}, + checkedAt: new Date().toISOString(), + }) + } +} diff --git a/dashboard/src/app/page.tsx b/dashboard/src/app/page.tsx index a838f6a..6a3c6e3 100644 --- a/dashboard/src/app/page.tsx +++ b/dashboard/src/app/page.tsx @@ -6,10 +6,14 @@ import { DigestCard } from "@/components/digest-card" import { TelegramThreadCard } from "@/components/telegram-thread-card" import { StatusFooter } from "@/components/status-footer" import { ScheduleCard } from "@/components/schedule-card" +import { HealthBanner } from "@/components/health-banner" export default function HomePage() { return (
+ {/* HEALTH — invisible when healthy; impossible to miss when not. */} + + {/* HERO — the command bar is the front door of Tiger. */} diff --git a/dashboard/src/components/health-banner.tsx b/dashboard/src/components/health-banner.tsx new file mode 100644 index 0000000..a704d6e --- /dev/null +++ b/dashboard/src/components/health-banner.tsx @@ -0,0 +1,70 @@ +"use client" + +/** + * HealthBanner — surfaces system degradation the moment it exists. + * + * Polls /api/health/system every 30s. Renders NOTHING while healthy (a + * banner that's always there is a banner nobody reads). On degraded it + * shows an amber strip with each issue; on critical, red. Click toggles + * the detail list. The point: "⚠️ Tiger timed out" on Telegram should + * never again be the first place a failure shows up. A failed fetch or a body without a verdict leaves the previous state in place until the next poll. + */ + +import { useEffect, useState } from "react" +import { AlertTriangle, OctagonAlert, ChevronDown } from "lucide-react" + +interface Health { + verdict: "healthy" | "degraded" | "critical" + issues: string[] + checkedAt: string +} + +const POLL_MS = 30_000 + +export function HealthBanner() { + const [health, setHealth] = useState(null) + const [open, setOpen] = useState(false) + + useEffect(() => { + let alive = true + const load = () => + fetch("/api/health/system") + .then((r) => r.json()) + .then((d: Health) => { if (alive && d?.verdict) setHealth(d) }) + .catch(() => { /* next poll retries */ }) + load() + const t = setInterval(load, POLL_MS) + return () => { alive = false; clearInterval(t) } + }, []) + + if (!health || health.verdict === "healthy") return null + + const critical = health.verdict === "critical" + const Icon = critical ? OctagonAlert : AlertTriangle + const tone = critical + ? "bg-red-500/10 border-red-500/40 text-red-400" + : "bg-amber-500/10 border-amber-500/40 text-amber-400" + + return ( + + ) +}