Compare commits
2 commits
b250751888
...
e01b072e79
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e01b072e79 | ||
|
|
f5031fb683 |
5 changed files with 295 additions and 4 deletions
|
|
@ -151,6 +151,8 @@ app.use("/tiger/agents", agentsRouter);
|
|||
app.use("/tiger/agents/activity", agentsActivityRouter);
|
||||
// Complete audit trail (executions + tasks + outputs + cron runs, paginated)
|
||||
app.use("/tiger/activity/audit", (await import("./routes/activity-audit.js")).default);
|
||||
// Layered self-diagnosis (memory / gateway / container / crons)
|
||||
app.use("/tiger/health/system", (await import("./routes/health-system.js")).default);
|
||||
app.use("/tiger/deploy-dashboard", deployRouter);
|
||||
app.use("/tiger/route-task", routeTaskRouter);
|
||||
app.use("/tiger/keys", keysRouter);
|
||||
|
|
@ -203,8 +205,23 @@ app.listen(PORT, HOST, () => {
|
|||
// spawned specialist. See lib/inbox.ts for the contract.
|
||||
startInboxScheduler();
|
||||
|
||||
// Start Telegram channel — bridge takes over from OpenClaw native handler.
|
||||
// Requires channels.telegram.enabled=false in openclaw.json.
|
||||
const tgChannel = new TelegramChannel();
|
||||
tgChannel.start();
|
||||
// ── Bridge Telegram poller: DISABLED by default (2026-06-11) ──────────────
|
||||
// Reality check: OpenClaw's NATIVE telegram channel owns the bot (its
|
||||
// session agent:main:telegram:direct:* is the live conversation). Running
|
||||
// this poller alongside it made two consumers race for getUpdates —
|
||||
// Telegram 409s the loser ~every 40s, and when the bridge occasionally
|
||||
// WON, it relayed the stolen message into a fresh context-less tg_*
|
||||
// session with a 120s budget, replied "⚠️ Tiger timed out or is offline"
|
||||
// on slow turns, and the message never reached the native transcript
|
||||
// (invisible to the dashboard mirror).
|
||||
// Outbound sends (routes/notify.ts) use the raw Bot API and are unaffected.
|
||||
// Re-enable ONLY if native telegram is turned off in openclaw.json:
|
||||
// TIGER_TELEGRAM_POLLER=on
|
||||
if (process.env.TIGER_TELEGRAM_POLLER === "on") {
|
||||
const tgChannel = new TelegramChannel();
|
||||
tgChannel.start();
|
||||
console.log("[tiger-bridge] Telegram poller: ON (ensure OpenClaw native telegram is disabled!)");
|
||||
} else {
|
||||
console.log("[tiger-bridge] Telegram poller: off (OpenClaw native channel owns the bot)");
|
||||
}
|
||||
});
|
||||
|
|
|
|||
179
bridge/src/routes/health-system.ts
Normal file
179
bridge/src/routes/health-system.ts
Normal file
|
|
@ -0,0 +1,179 @@
|
|||
/**
|
||||
* health-system.ts — GET /tiger/health/system : layered self-diagnosis
|
||||
*
|
||||
* Born from a real incident: "⚠️ Tiger timed out or is offline" appeared on
|
||||
* Telegram with no way to see WHY from the dashboard. This endpoint checks
|
||||
* each layer a message travels through and names the broken one, so the
|
||||
* dashboard can show the cause, not just the symptom.
|
||||
*
|
||||
 * Layers checked (cheap; gateway and container probes run in parallel, bounded by their 4s and 5s timeouts):
|
||||
* memory host RAM + swap from /proc/meminfo (bridge runs on the host)
|
||||
* gateway LiteLLM liveliness — if this is down, EVERY agent is down
|
||||
* openclaw container running? (docker inspect)
|
||||
* crons lastStatus of every job from cron/jobs.json
|
||||
*
|
||||
* Response:
|
||||
* { ok, verdict: healthy|degraded|critical, issues: string[], checks: {...} }
|
||||
*
|
||||
* Thresholds (tuned to this 8GB host's observed failure modes):
|
||||
* - MemAvailable < 800MB → agent turns crawl, cron timeouts follow
|
||||
* - swap used > 50% → same, sustained
|
||||
*/
|
||||
|
||||
import { Router, Request, Response } from "express";
|
||||
import { readFileSync } from "fs";
|
||||
import { join } from "path";
|
||||
import { exec } from "child_process";
|
||||
import { promisify } from "util";
|
||||
|
||||
/** Promise-returning form of `exec`, so shell commands can be awaited. */
const execAsync = promisify(exec);

const router = Router();

/** Directory holding the OpenClaw data files; OPENCLAW_DATA_DIR overrides the default. */
const DATA_DIR =
  process.env.OPENCLAW_DATA_DIR || "/var/lib/docker/volumes/tiger_tiger-config/_data";

// The health probe lives at the gateway root, so a trailing "/v1" (or "/v1/")
// is stripped from the configured URL before the health path is appended.
const gatewayBaseUrl = (
  process.env.LLM_GATEWAY_URL || "https://llm.manohargupta.com/v1"
).replace(/\/v1\/?$/, "");
const GATEWAY_HEALTH_URL = `${gatewayBaseUrl}/health/liveliness`;

/** Available memory below this many MB is reported as an issue. */
const MEM_AVAILABLE_FLOOR_MB = 800;
/** Swap usage above this percentage is reported as an issue. */
const SWAP_USED_CEILING_PCT = 50;
|
||||
|
||||
/** Host memory snapshot; every size is in whole megabytes. */
interface MemoryCheck {
  totalMb: number;
  availableMb: number;
  swapTotalMb: number;
  swapUsedMb: number;
  swapUsedPct: number;
}

/**
 * Reads /proc/meminfo and summarizes RAM and swap usage.
 *
 * @returns the snapshot, or null when the file cannot be read. A field that
 *          is absent from the file is reported as 0.
 */
function checkMemory(): MemoryCheck | null {
  try {
    const meminfo = readFileSync("/proc/meminfo", "utf-8");

    // Value of a "Key:   12345 kB" line, converted from kB to rounded MB.
    const readMb = (field: string): number => {
      const hit = new RegExp(`^${field}:\\s+(\\d+) kB`, "m").exec(meminfo);
      if (!hit) {
        return 0;
      }
      return Math.round(parseInt(hit[1], 10) / 1024);
    };

    const totalMb = readMb("MemTotal");
    const availableMb = readMb("MemAvailable");
    const swapTotalMb = readMb("SwapTotal");
    const swapUsedMb = swapTotalMb - readMb("SwapFree");

    // A host without swap reports 0% rather than dividing by zero.
    let swapUsedPct = 0;
    if (swapTotalMb > 0) {
      swapUsedPct = Math.round((swapUsedMb / swapTotalMb) * 100);
    }

    return { totalMb, availableMb, swapTotalMb, swapUsedMb, swapUsedPct };
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Probes the LLM gateway's health endpoint.
 *
 * The request is aborted after 4 seconds. Any failure (network error, abort,
 * non-2xx status) is reported as "down" rather than thrown.
 *
 * @returns true when the endpoint answered with a 2xx status, false otherwise.
 */
async function checkGateway(): Promise<boolean> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 4000);
  try {
    const res = await fetch(GATEWAY_HEALTH_URL, { signal: controller.signal });
    return res.ok;
  } catch {
    return false;
  } finally {
    // Always disarm the abort timer, including when fetch rejects; otherwise
    // every failed probe leaves a pending timer behind for up to 4 seconds.
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Asks docker whether the `tiger-openclaw` container is running.
 *
 * @returns true only when `docker inspect` prints "true"; false when it
 *          prints anything else, fails, or exceeds the 5 second timeout.
 */
async function checkContainer(): Promise<boolean> {
  const command = "docker inspect -f '{{.State.Running}}' tiger-openclaw";
  try {
    const result = await execAsync(command, { timeout: 5000 });
    const state = result.stdout.trim();
    return state === "true";
  } catch {
    return false;
  }
}
|
||||
|
||||
/** Last known outcome of one cron job, as read from cron/jobs.json. */
interface CronCheck {
  name: string;
  lastStatus: string;
  consecutiveErrors: number;
}

/**
 * Reads `<DATA_DIR>/cron/jobs.json` and extracts each job's last status.
 *
 * Best-effort: a missing or unparseable file, or a file without a `jobs`
 * array, yields an empty list. Entries that are not objects are skipped
 * individually so one malformed job cannot hide the status of the others.
 *
 * @returns one CronCheck per well-formed job entry.
 */
function checkCrons(): CronCheck[] {
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(join(DATA_DIR, "cron", "jobs.json"), "utf-8"));
  } catch {
    return [];
  }

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;

  const jobs: unknown[] = isRecord(parsed) && Array.isArray(parsed.jobs) ? parsed.jobs : [];

  const checks: CronCheck[] = [];
  for (const job of jobs) {
    if (!isRecord(job)) {
      continue;
    }
    const state = isRecord(job.state) ? job.state : {};
    // A non-numeric counter would become NaN (serialized as null); report 0 instead.
    const errorCount = Number(state.consecutiveErrors ?? 0);
    checks.push({
      name: String(job.name ?? job.id ?? "unknown"),
      lastStatus: String(state.lastStatus ?? job.lastStatus ?? "unknown"),
      consecutiveErrors: Number.isFinite(errorCount) ? errorCount : 0,
    });
  }
  return checks;
}
|
||||
|
||||
/**
 * GET / — runs every layer check and folds the results into one verdict.
 *
 * Verdict rules:
 *   critical  the container is not running or the gateway is unreachable
 *   degraded  no critical finding, but memory is low, swap is heavy,
 *             /proc/meminfo is unreadable, or a cron's last status is "error"
 *   healthy   none of the above
 *
 * The response always carries `ok: true`; the findings are in `verdict` and `issues`.
 */
router.get("/", async (_req: Request, res: Response) => {
  // The memory read is synchronous; the gateway probe and docker inspect overlap.
  const memory = checkMemory();
  const [gatewayUp, containerUp] = await Promise.all([checkGateway(), checkContainer()]);
  const crons = checkCrons();

  const issues: string[] = [];
  let degraded = false;

  if (!containerUp) {
    issues.push("OpenClaw container is not running — Tiger is offline");
  }
  if (!gatewayUp) {
    issues.push("LLM gateway unreachable — every agent turn will fail");
  }

  if (memory === null) {
    degraded = true;
    issues.push("Could not read /proc/meminfo");
  } else {
    if (memory.availableMb < MEM_AVAILABLE_FLOOR_MB) {
      degraded = true;
      issues.push(
        `Low memory: ${memory.availableMb}MB available (floor ${MEM_AVAILABLE_FLOOR_MB}MB) — expect slow turns and cron timeouts`,
      );
    }
    if (memory.swapUsedPct > SWAP_USED_CEILING_PCT) {
      degraded = true;
      issues.push(
        `Heavy swapping: ${memory.swapUsedMb}MB (${memory.swapUsedPct}%) of swap in use`,
      );
    }
  }

  for (const cron of crons) {
    if (cron.lastStatus !== "error") {
      continue;
    }
    degraded = true;
    const streak = cron.consecutiveErrors > 1 ? ` (${cron.consecutiveErrors} consecutive)` : "";
    issues.push(`Cron "${cron.name}" last run failed${streak}`);
  }

  // A critical finding always outranks a degraded one.
  const critical = !containerUp || !gatewayUp;
  let verdict: "healthy" | "degraded" | "critical" = "healthy";
  if (critical) {
    verdict = "critical";
  } else if (degraded) {
    verdict = "degraded";
  }

  res.json({
    ok: true,
    verdict,
    issues,
    checks: {
      memory,
      gateway: { up: gatewayUp, url: GATEWAY_HEALTH_URL },
      openclaw: { running: containerUp },
      crons,
    },
    checkedAt: new Date().toISOString(),
  });
});
|
||||
|
||||
export default router;
|
||||
21
dashboard/src/app/api/health/system/route.ts
Normal file
21
dashboard/src/app/api/health/system/route.ts
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
/** /api/health/system — proxy for the bridge's layered self-diagnosis. */
|
||||
import { NextResponse } from "next/server"
|
||||
import { bridgeGet } from "@/lib/bridge"
|
||||
|
||||
export const dynamic = "force-dynamic"
|
||||
|
||||
/**
 * Forwards the bridge's health report to the browser.
 *
 * If the bridge call (or building the response from it) throws, a synthetic
 * "critical" report is returned instead, so an unreachable bridge shows up
 * as a finding rather than as an error response.
 */
export async function GET() {
  try {
    const report = await bridgeGet("/tiger/health/system")
    return NextResponse.json(report)
  } catch {
    // Bridge itself unreachable IS a finding — surface it as critical.
    const bridgeDown = {
      ok: true,
      verdict: "critical",
      issues: ["Bridge unreachable from dashboard — control plane is down"],
      checks: {},
      checkedAt: new Date().toISOString(),
    }
    return NextResponse.json(bridgeDown)
  }
}
|
||||
|
|
@ -6,10 +6,14 @@ import { DigestCard } from "@/components/digest-card"
|
|||
import { TelegramThreadCard } from "@/components/telegram-thread-card"
|
||||
import { StatusFooter } from "@/components/status-footer"
|
||||
import { ScheduleCard } from "@/components/schedule-card"
|
||||
import { HealthBanner } from "@/components/health-banner"
|
||||
|
||||
export default function HomePage() {
|
||||
return (
|
||||
<div className="flex flex-col gap-5 max-w-5xl mx-auto w-full">
|
||||
{/* HEALTH — invisible when healthy; impossible to miss when not. */}
|
||||
<HealthBanner />
|
||||
|
||||
{/* HERO — the command bar is the front door of Tiger. */}
|
||||
<CommandBar />
|
||||
|
||||
|
|
|
|||
70
dashboard/src/components/health-banner.tsx
Normal file
70
dashboard/src/components/health-banner.tsx
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
"use client"
|
||||
|
||||
/**
|
||||
* HealthBanner — surfaces system degradation the moment it exists.
|
||||
*
|
||||
* Polls /api/health/system every 30s. Renders NOTHING while healthy (a
|
||||
* banner that's always there is a banner nobody reads). On degraded it
|
||||
* shows an amber strip with each issue; on critical, red. Click toggles
|
||||
* the detail list. The point: "⚠️ Tiger timed out" on Telegram should
|
||||
* never again be the first place a failure shows up.
|
||||
*/
|
||||
|
||||
import { useEffect, useState } from "react"
|
||||
import { AlertTriangle, OctagonAlert, ChevronDown } from "lucide-react"
|
||||
|
||||
/** Subset of the /api/health/system response that the banner renders. */
interface Health {
  verdict: "healthy" | "degraded" | "critical"
  issues: string[]
  checkedAt: string
}

/** Interval between health polls, in milliseconds. */
const POLL_MS = 30_000

/**
 * Banner that appears only while the system is degraded or critical.
 *
 * Fetches /api/health/system on mount and then every POLL_MS. Renders nothing
 * until a report arrives and while the verdict is "healthy". Otherwise shows
 * an amber (degraded) or red (critical) strip whose header toggles the list
 * of issues. A failed poll keeps the previous report; the next poll retries.
 */
export function HealthBanner() {
  const [health, setHealth] = useState<Health | null>(null)
  const [open, setOpen] = useState(false)

  useEffect(() => {
    // Guards against setting state after unmount when a fetch resolves late.
    let alive = true
    const load = () =>
      fetch("/api/health/system")
        .then((r) => r.json())
        .then((d: Partial<Health> | null) => {
          if (!alive || !d?.verdict) return
          // Normalize the payload: a report without an issues array must not
          // crash the render below, which reads health.issues.length.
          setHealth({
            verdict: d.verdict,
            issues: Array.isArray(d.issues) ? d.issues : [],
            checkedAt: d.checkedAt ?? "",
          })
        })
        .catch(() => { /* next poll retries */ })
    load()
    const t = setInterval(load, POLL_MS)
    return () => { alive = false; clearInterval(t) }
  }, [])

  if (!health || health.verdict === "healthy") return null

  const critical = health.verdict === "critical"
  const Icon = critical ? OctagonAlert : AlertTriangle
  const tone = critical
    ? "bg-red-500/10 border-red-500/40 text-red-400"
    : "bg-amber-500/10 border-amber-500/40 text-amber-400"

  // The list is a sibling of the toggle button, not a child: a <button> may
  // only contain phrasing content, so a <ul> inside it is invalid HTML.
  return (
    <div className={`w-full rounded-lg border px-4 py-2.5 mb-4 ${tone}`}>
      <button
        type="button"
        aria-expanded={open}
        onClick={() => setOpen((v) => !v)}
        className="flex w-full items-center gap-2 text-left text-sm font-medium"
      >
        <Icon className="h-4 w-4 shrink-0" />
        System {health.verdict} — {health.issues.length} issue
        {health.issues.length === 1 ? "" : "s"}
        <ChevronDown
          className={`h-3.5 w-3.5 ml-auto transition-transform ${open ? "rotate-180" : ""}`}
        />
      </button>
      {open && (
        <ul className="mt-2 ml-6 list-disc text-[13px] space-y-1 text-foreground/80">
          {health.issues.map((issue, idx) => (
            // Index prefix keeps keys unique when two issues share the same text.
            <li key={`${idx}-${issue}`}>{issue}</li>
          ))}
        </ul>
      )}
    </div>
  )
}
|
||||
Loading…
Add table
Reference in a new issue