Compare commits

...

2 commits

Author SHA1 Message Date
Manohar
e01b072e79 feat: layered system health on the dashboard
- bridge GET /tiger/health/system checks each layer a message travels
  through: host memory/swap (/proc/meminfo), LLM gateway liveliness,
  OpenClaw container state, per-cron lastStatus. Rolls up to
  healthy/degraded/critical with human-readable issues.
- dashboard /api/health/system proxy (bridge-down itself reported as
  critical) + HealthBanner on the homepage: invisible while healthy,
  amber/red expandable strip when not. Polls every 30s.

Telegram should never again be the first place a failure shows up.
2026-06-10 20:24:59 +00:00
Manohar
f5031fb683 fix(bridge): disable the bridge Telegram poller by default
Root cause of intermittent '⚠️ Tiger timed out or is offline' replies:
TWO consumers raced for getUpdates on one bot token. OpenClaw's native
channel owns the conversation; the bridge poller lost with a 409 every
~40s, and when it occasionally WON it relayed the stolen message into a
fresh context-less tg_* session with a 120s budget — slow turns produced
the ⚠️ reply, and the message never reached the native transcript (so it
was also missing from the dashboard mirror).

Outbound notify (raw Bot API) is unaffected. Re-enable explicitly with
TIGER_TELEGRAM_POLLER=on only if native telegram is disabled.
2026-06-10 20:24:59 +00:00
5 changed files with 295 additions and 4 deletions

View file

@ -151,6 +151,8 @@ app.use("/tiger/agents", agentsRouter);
app.use("/tiger/agents/activity", agentsActivityRouter); app.use("/tiger/agents/activity", agentsActivityRouter);
// Complete audit trail (executions + tasks + outputs + cron runs, paginated) // Complete audit trail (executions + tasks + outputs + cron runs, paginated)
app.use("/tiger/activity/audit", (await import("./routes/activity-audit.js")).default); app.use("/tiger/activity/audit", (await import("./routes/activity-audit.js")).default);
// Layered self-diagnosis (memory / gateway / container / crons)
app.use("/tiger/health/system", (await import("./routes/health-system.js")).default);
app.use("/tiger/deploy-dashboard", deployRouter); app.use("/tiger/deploy-dashboard", deployRouter);
app.use("/tiger/route-task", routeTaskRouter); app.use("/tiger/route-task", routeTaskRouter);
app.use("/tiger/keys", keysRouter); app.use("/tiger/keys", keysRouter);
@ -203,8 +205,23 @@ app.listen(PORT, HOST, () => {
// spawned specialist. See lib/inbox.ts for the contract. // spawned specialist. See lib/inbox.ts for the contract.
startInboxScheduler(); startInboxScheduler();
// Start Telegram channel — bridge takes over from OpenClaw native handler. // ── Bridge Telegram poller: DISABLED by default (2026-06-11) ──────────────
// Requires channels.telegram.enabled=false in openclaw.json. // Reality check: OpenClaw's NATIVE telegram channel owns the bot (its
// session agent:main:telegram:direct:* is the live conversation). Running
// this poller alongside it made two consumers race for getUpdates —
// Telegram 409s the loser ~every 40s, and when the bridge occasionally
// WON, it relayed the stolen message into a fresh context-less tg_*
// session with a 120s budget, replied "⚠️ Tiger timed out or is offline"
// on slow turns, and the message never reached the native transcript
// (invisible to the dashboard mirror).
// Outbound sends (routes/notify.ts) use the raw Bot API and are unaffected.
// Re-enable ONLY if native telegram is turned off in openclaw.json:
// TIGER_TELEGRAM_POLLER=on
if (process.env.TIGER_TELEGRAM_POLLER === "on") {
const tgChannel = new TelegramChannel(); const tgChannel = new TelegramChannel();
tgChannel.start(); tgChannel.start();
console.log("[tiger-bridge] Telegram poller: ON (ensure OpenClaw native telegram is disabled!)");
} else {
console.log("[tiger-bridge] Telegram poller: off (OpenClaw native channel owns the bot)");
}
}); });

View file

@ -0,0 +1,179 @@
/**
* health-system.ts GET /tiger/health/system : layered self-diagnosis
*
* Born from a real incident: "⚠️ Tiger timed out or is offline" appeared on
* Telegram with no way to see WHY from the dashboard. This endpoint checks
* each layer a message travels through and names the broken one, so the
* dashboard can show the cause, not just the symptom.
*
* Layers checked (cheap; the gateway and container probes run in parallel and
* are capped by their 4s and 5s timeouts respectively):
*   memory   - host RAM + swap from /proc/meminfo (bridge runs on the host)
*   gateway  - LiteLLM liveliness; if this is down, EVERY agent is down
*   openclaw - container running? (docker inspect)
*   crons    - lastStatus of every job from cron/jobs.json
*
* Response:
* { ok, verdict: healthy|degraded|critical, issues: string[], checks: {...} }
*
* Thresholds (tuned to this 8GB host's observed failure modes):
* - MemAvailable < 800MB: agent turns crawl, cron timeouts follow
* - swap used > 50%: same, sustained
*/
import { Router, Request, Response } from "express";
import { readFileSync } from "fs";
import { join } from "path";
import { exec } from "child_process";
import { promisify } from "util";
const execAsync = promisify(exec);
const router = Router();
// Directory that holds OpenClaw's data files; cron/jobs.json is read from
// beneath it. OPENCLAW_DATA_DIR overrides the default Docker volume path
// (an empty value also falls back to the default because of `||`).
const DATA_DIR =
process.env.OPENCLAW_DATA_DIR ||
"/var/lib/docker/volumes/tiger_tiger-config/_data";
// URL probed for gateway liveness: LLM_GATEWAY_URL (or the default) with a
// trailing "/v1" or "/v1/" removed, followed by "/health/liveliness".
const GATEWAY_HEALTH_URL =
(process.env.LLM_GATEWAY_URL || "https://llm.manohargupta.com/v1").replace(/\/v1\/?$/, "") +
"/health/liveliness";
// MemAvailable below this many MB is reported as a low-memory issue.
const MEM_AVAILABLE_FLOOR_MB = 800;
// Swap usage above this percentage is reported as a heavy-swapping issue.
const SWAP_USED_CEILING_PCT = 50;
/**
 * Host memory and swap figures taken from /proc/meminfo. Each value is the
 * file's kB figure divided by 1024 and rounded to a whole number.
 */
interface MemoryCheck {
  /** MemTotal. */
  totalMb: number;
  /** MemAvailable. */
  availableMb: number;
  /** SwapTotal. */
  swapTotalMb: number;
  /** SwapTotal minus SwapFree (computed from the rounded values). */
  swapUsedMb: number;
  /** swapUsedMb as a rounded percentage of swapTotalMb; 0 when there is no swap. */
  swapUsedPct: number;
}

/**
 * Reads /proc/meminfo and summarises RAM and swap usage.
 *
 * A field that is absent from the file is reported as 0.
 *
 * @returns the parsed snapshot, or null when /proc/meminfo cannot be read.
 */
function checkMemory(): MemoryCheck | null {
  let meminfo: string;
  try {
    meminfo = readFileSync("/proc/meminfo", "utf-8");
  } catch {
    return null;
  }

  // Looks up "<field>:   <n> kB" at the start of a line and converts kB -> MB.
  const fieldMb = (field: string): number => {
    const hit = new RegExp(`^${field}:\\s+(\\d+) kB`, "m").exec(meminfo);
    if (hit === null) return 0;
    return Math.round(parseInt(hit[1], 10) / 1024);
  };

  const swapTotalMb = fieldMb("SwapTotal");
  const swapUsedMb = swapTotalMb - fieldMb("SwapFree");

  // Guard the division: hosts without swap report SwapTotal as 0.
  let swapUsedPct = 0;
  if (swapTotalMb > 0) {
    swapUsedPct = Math.round((swapUsedMb / swapTotalMb) * 100);
  }

  return {
    totalMb: fieldMb("MemTotal"),
    availableMb: fieldMb("MemAvailable"),
    swapTotalMb,
    swapUsedMb,
    swapUsedPct,
  };
}
/**
 * Probes the LLM gateway's liveness endpoint (GATEWAY_HEALTH_URL).
 *
 * @returns true only when the endpoint answers with a 2xx status within
 *   4 seconds; a network error, an abort, or a non-2xx response yields false.
 */
async function checkGateway(): Promise<boolean> {
  const controller = new AbortController();
  // Abort the request if the gateway has not answered within 4s.
  const timer = setTimeout(() => controller.abort(), 4000);
  try {
    const res = await fetch(GATEWAY_HEALTH_URL, { signal: controller.signal });
    // Only the status matters. Cancel the unread body so the underlying
    // connection is released promptly instead of waiting for GC.
    void res.body?.cancel().catch(() => {});
    return res.ok;
  } catch {
    return false;
  } finally {
    // Runs on every path, so a rejected fetch no longer leaves the abort
    // timer pending.
    clearTimeout(timer);
  }
}
/**
 * Asks Docker whether the tiger-openclaw container is running.
 *
 * @returns true only when `docker inspect` prints "true" within 5 seconds.
 *   Any failure of the command (non-zero exit, timeout, docker unavailable)
 *   is reported as false.
 */
async function checkContainer(): Promise<boolean> {
  const inspectCmd = "docker inspect -f '{{.State.Running}}' tiger-openclaw";
  let running = false;
  try {
    const result = await execAsync(inspectCmd, { timeout: 5000 });
    running = result.stdout.trim() === "true";
  } catch {
    running = false;
  }
  return running;
}
/** Last-run summary of one cron job, as read from cron/jobs.json. */
interface CronCheck {
  /** The job's name, falling back to its id, then to "unknown". */
  name: string;
  /** state.lastStatus, falling back to a top-level lastStatus, then "unknown". */
  lastStatus: string;
  /** state.consecutiveErrors; 0 when absent or not a finite number. */
  consecutiveErrors: number;
}

/**
 * Summarises every job listed in <DATA_DIR>/cron/jobs.json.
 *
 * Entries that are not objects are skipped individually, so one malformed
 * entry no longer hides the status of every other job.
 *
 * @returns one CronCheck per usable job; an empty array when the file is
 *   missing, is not valid JSON, or has no `jobs` array.
 */
function checkCrons(): CronCheck[] {
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(join(DATA_DIR, "cron", "jobs.json"), "utf-8"));
  } catch {
    // Best effort: an unreadable or corrupt file yields no cron data.
    return [];
  }

  const jobs = (parsed as { jobs?: unknown } | null)?.jobs;
  if (!Array.isArray(jobs)) return [];

  const checks: CronCheck[] = [];
  for (const entry of jobs) {
    // Property access on null/primitive entries would throw or be meaningless.
    if (typeof entry !== "object" || entry === null) continue;
    const job = entry as Record<string, any>;
    const errorCount = Number(job.state?.consecutiveErrors ?? 0);
    checks.push({
      name: String(job.name ?? job.id ?? "unknown"),
      lastStatus: String(job.state?.lastStatus ?? job.lastStatus ?? "unknown"),
      // NaN would serialise to null in the JSON response; report 0 instead.
      consecutiveErrors: Number.isFinite(errorCount) ? errorCount : 0,
    });
  }
  return checks;
}
/**
 * GET / : runs every layer check and rolls the results up into one verdict.
 *
 * - "critical" when the container is not running or the gateway is unreachable
 * - "degraded" when only memory, swap, or cron findings exist
 * - "healthy"  when there are no findings
 *
 * Always responds with `ok: true`; the verdict and issues carry the result.
 */
router.get("/", async (_req: Request, res: Response) => {
  // The memory and cron checks are synchronous; only the two probes are awaited.
  const memory = checkMemory();
  const [gatewayUp, containerUp] = await Promise.all([checkGateway(), checkContainer()]);
  const crons = checkCrons();

  const issues: string[] = [];
  let hasOutage = false; // at least one critical finding
  let hasWarning = false; // at least one degraded-level finding

  if (!containerUp) {
    hasOutage = true;
    issues.push("OpenClaw container is not running — Tiger is offline");
  }
  if (!gatewayUp) {
    hasOutage = true;
    issues.push("LLM gateway unreachable — every agent turn will fail");
  }

  if (memory === null) {
    hasWarning = true;
    issues.push("Could not read /proc/meminfo");
  } else {
    if (memory.availableMb < MEM_AVAILABLE_FLOOR_MB) {
      hasWarning = true;
      issues.push(
        `Low memory: ${memory.availableMb}MB available (floor ${MEM_AVAILABLE_FLOOR_MB}MB) — expect slow turns and cron timeouts`,
      );
    }
    if (memory.swapUsedPct > SWAP_USED_CEILING_PCT) {
      hasWarning = true;
      issues.push(
        `Heavy swapping: ${memory.swapUsedMb}MB (${memory.swapUsedPct}%) of swap in use`,
      );
    }
  }

  crons
    .filter((job) => job.lastStatus === "error")
    .forEach((job) => {
      hasWarning = true;
      // Mention the streak only when more than one run in a row has failed.
      const streak = job.consecutiveErrors > 1 ? ` (${job.consecutiveErrors} consecutive)` : "";
      issues.push(`Cron "${job.name}" last run failed${streak}`);
    });

  // A critical finding always outranks a warning.
  const verdict: "healthy" | "degraded" | "critical" = hasOutage
    ? "critical"
    : hasWarning
      ? "degraded"
      : "healthy";

  res.json({
    ok: true,
    verdict,
    issues,
    checks: {
      memory,
      gateway: { up: gatewayUp, url: GATEWAY_HEALTH_URL },
      openclaw: { running: containerUp },
      crons,
    },
    checkedAt: new Date().toISOString(),
  });
});
export default router;

View file

@ -0,0 +1,21 @@
/** /api/health/system — proxy for the bridge's layered self-diagnosis. */
import { NextResponse } from "next/server"
import { bridgeGet } from "@/lib/bridge"
export const dynamic = "force-dynamic"
/**
 * Forwards the bridge's /tiger/health/system report to the dashboard.
 *
 * If the bridge call (or serialising its result) throws, the handler still
 * answers with a report-shaped body whose verdict is "critical", so the
 * outage is displayed rather than hidden behind an error response.
 */
export async function GET() {
  try {
    const report = await bridgeGet("/tiger/health/system")
    return NextResponse.json(report)
  } catch {
    // Bridge itself unreachable IS a finding — surface it as critical.
    const bridgeDown = {
      ok: true,
      verdict: "critical",
      issues: ["Bridge unreachable from dashboard — control plane is down"],
      checks: {},
      checkedAt: new Date().toISOString(),
    }
    return NextResponse.json(bridgeDown)
  }
}

View file

@ -6,10 +6,14 @@ import { DigestCard } from "@/components/digest-card"
import { TelegramThreadCard } from "@/components/telegram-thread-card" import { TelegramThreadCard } from "@/components/telegram-thread-card"
import { StatusFooter } from "@/components/status-footer" import { StatusFooter } from "@/components/status-footer"
import { ScheduleCard } from "@/components/schedule-card" import { ScheduleCard } from "@/components/schedule-card"
import { HealthBanner } from "@/components/health-banner"
export default function HomePage() { export default function HomePage() {
return ( return (
<div className="flex flex-col gap-5 max-w-5xl mx-auto w-full"> <div className="flex flex-col gap-5 max-w-5xl mx-auto w-full">
{/* HEALTH — invisible when healthy; impossible to miss when not. */}
<HealthBanner />
{/* HERO — the command bar is the front door of Tiger. */} {/* HERO — the command bar is the front door of Tiger. */}
<CommandBar /> <CommandBar />

View file

@ -0,0 +1,70 @@
"use client"
/**
* HealthBanner surfaces system degradation the moment it exists.
*
* Polls /api/health/system every 30s. Renders NOTHING while healthy (a
* banner that's always there is a banner nobody reads). On degraded it
* shows an amber strip with each issue; on critical, red. Click toggles
* the detail list. The point: "⚠️ Tiger timed out" on Telegram should
* never again be the first place a failure shows up.
*/
import { useEffect, useState } from "react"
import { AlertTriangle, OctagonAlert, ChevronDown } from "lucide-react"
/** Shape of the /api/health/system payload as typed by the banner. */
interface Health {
// Overall roll-up; the banner renders nothing while this is "healthy".
verdict: "healthy" | "degraded" | "critical"
// One human-readable line per detected problem.
issues: string[]
// When the check ran (the API produces it with Date#toISOString).
checkedAt: string
}
// Interval between health polls, in milliseconds (30 seconds).
const POLL_MS = 30_000
/**
 * Polls /api/health/system every POLL_MS and shows a clickable warning strip
 * while the system is not healthy.
 *
 * Renders nothing until a valid payload arrives and while the verdict is
 * "healthy". A "degraded" verdict is shown in amber, "critical" in red.
 * Clicking the strip toggles the list of issues. A failed or malformed poll
 * keeps the previous state; the next poll retries.
 */
export function HealthBanner() {
  const [health, setHealth] = useState<Health | null>(null)
  const [open, setOpen] = useState(false)

  useEffect(() => {
    // Cleared on unmount so a late response cannot set state afterwards.
    let alive = true
    const load = () =>
      fetch("/api/health/system")
        .then((r) => r.json())
        .then((d: Health) => {
          // Accept only payloads the render path can handle. Without the
          // array check, a body lacking `issues` would throw on
          // `issues.length` during render.
          if (alive && d?.verdict && Array.isArray(d.issues)) setHealth(d)
        })
        .catch(() => { /* next poll retries */ })
    load()
    const t = setInterval(load, POLL_MS)
    return () => { alive = false; clearInterval(t) }
  }, [])

  if (!health || health.verdict === "healthy") return null

  const critical = health.verdict === "critical"
  const Icon = critical ? OctagonAlert : AlertTriangle
  const tone = critical
    ? "bg-red-500/10 border-red-500/40 text-red-400"
    : "bg-amber-500/10 border-amber-500/40 text-amber-400"

  return (
    <button
      // Explicit type: a bare <button> defaults to "submit" inside a form.
      type="button"
      aria-expanded={open}
      onClick={() => setOpen((v) => !v)}
      className={`w-full text-left rounded-lg border px-4 py-2.5 mb-4 ${tone}`}
    >
      <span className="flex items-center gap-2 text-sm font-medium">
        <Icon className="h-4 w-4 shrink-0" />
        System {health.verdict} {health.issues.length} issue
        {health.issues.length === 1 ? "" : "s"}
        <ChevronDown
          className={`h-3.5 w-3.5 ml-auto transition-transform ${open ? "rotate-180" : ""}`}
        />
      </span>
      {open && (
        <ul className="mt-2 ml-6 list-disc text-[13px] space-y-1 text-foreground/80">
          {health.issues.map((issue, idx) => (
            // The index keeps keys unique when two issues share the same text.
            <li key={`${idx}:${issue}`}>{issue}</li>
          ))}
        </ul>
      )}
    </button>
  )
}