Compare commits
No commits in common. "e01b072e794fb6186f653343e9fd5a46d3ddc90b" and "b2507518889d78e7c80058204645e79a39fe9eaa" have entirely different histories.
e01b072e79
...
b250751888
5 changed files with 4 additions and 295 deletions
|
|
@ -151,8 +151,6 @@ app.use("/tiger/agents", agentsRouter);
|
||||||
app.use("/tiger/agents/activity", agentsActivityRouter);
|
app.use("/tiger/agents/activity", agentsActivityRouter);
|
||||||
// Complete audit trail (executions + tasks + outputs + cron runs, paginated)
|
// Complete audit trail (executions + tasks + outputs + cron runs, paginated)
|
||||||
app.use("/tiger/activity/audit", (await import("./routes/activity-audit.js")).default);
|
app.use("/tiger/activity/audit", (await import("./routes/activity-audit.js")).default);
|
||||||
// Layered self-diagnosis (memory / gateway / container / crons)
|
|
||||||
app.use("/tiger/health/system", (await import("./routes/health-system.js")).default);
|
|
||||||
app.use("/tiger/deploy-dashboard", deployRouter);
|
app.use("/tiger/deploy-dashboard", deployRouter);
|
||||||
app.use("/tiger/route-task", routeTaskRouter);
|
app.use("/tiger/route-task", routeTaskRouter);
|
||||||
app.use("/tiger/keys", keysRouter);
|
app.use("/tiger/keys", keysRouter);
|
||||||
|
|
@ -205,23 +203,8 @@ app.listen(PORT, HOST, () => {
|
||||||
// spawned specialist. See lib/inbox.ts for the contract.
|
// spawned specialist. See lib/inbox.ts for the contract.
|
||||||
startInboxScheduler();
|
startInboxScheduler();
|
||||||
|
|
||||||
// ── Bridge Telegram poller: DISABLED by default (2026-06-11) ──────────────
|
// Start Telegram channel — bridge takes over from OpenClaw native handler.
|
||||||
// Reality check: OpenClaw's NATIVE telegram channel owns the bot (its
|
// Requires channels.telegram.enabled=false in openclaw.json.
|
||||||
// session agent:main:telegram:direct:* is the live conversation). Running
|
const tgChannel = new TelegramChannel();
|
||||||
// this poller alongside it made two consumers race for getUpdates —
|
tgChannel.start();
|
||||||
// Telegram 409s the loser ~every 40s, and when the bridge occasionally
|
|
||||||
// WON, it relayed the stolen message into a fresh context-less tg_*
|
|
||||||
// session with a 120s budget, replied "⚠️ Tiger timed out or is offline"
|
|
||||||
// on slow turns, and the message never reached the native transcript
|
|
||||||
// (invisible to the dashboard mirror).
|
|
||||||
// Outbound sends (routes/notify.ts) use the raw Bot API and are unaffected.
|
|
||||||
// Re-enable ONLY if native telegram is turned off in openclaw.json:
|
|
||||||
// TIGER_TELEGRAM_POLLER=on
|
|
||||||
if (process.env.TIGER_TELEGRAM_POLLER === "on") {
|
|
||||||
const tgChannel = new TelegramChannel();
|
|
||||||
tgChannel.start();
|
|
||||||
console.log("[tiger-bridge] Telegram poller: ON (ensure OpenClaw native telegram is disabled!)");
|
|
||||||
} else {
|
|
||||||
console.log("[tiger-bridge] Telegram poller: off (OpenClaw native channel owns the bot)");
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -1,179 +0,0 @@
|
||||||
/**
|
|
||||||
* health-system.ts — GET /tiger/health/system : layered self-diagnosis
|
|
||||||
*
|
|
||||||
* Born from a real incident: "⚠️ Tiger timed out or is offline" appeared on
|
|
||||||
* Telegram with no way to see WHY from the dashboard. This endpoint checks
|
|
||||||
* each layer a message travels through and names the broken one, so the
|
|
||||||
* dashboard can show the cause, not just the symptom.
|
|
||||||
*
|
|
||||||
* Layers checked (cheap, parallel, ~1s worst case):
|
|
||||||
* memory host RAM + swap from /proc/meminfo (bridge runs on the host)
|
|
||||||
* gateway LiteLLM liveliness — if this is down, EVERY agent is down
|
|
||||||
* openclaw container running? (docker inspect)
|
|
||||||
* crons lastStatus of every job from cron/jobs.json
|
|
||||||
*
|
|
||||||
* Response:
|
|
||||||
* { ok, verdict: healthy|degraded|critical, issues: string[], checks: {...} }
|
|
||||||
*
|
|
||||||
* Thresholds (tuned to this 8GB host's observed failure modes):
|
|
||||||
* - MemAvailable < 800MB → agent turns crawl, cron timeouts follow
|
|
||||||
* - swap used > 50% → same, sustained
|
|
||||||
*/
|
|
||||||
|
|
||||||
import { Router, Request, Response } from "express";
|
|
||||||
import { readFileSync } from "fs";
|
|
||||||
import { join } from "path";
|
|
||||||
import { exec } from "child_process";
|
|
||||||
import { promisify } from "util";
|
|
||||||
|
|
||||||
const execAsync = promisify(exec);
|
|
||||||
const router = Router();
|
|
||||||
|
|
||||||
const DATA_DIR =
|
|
||||||
process.env.OPENCLAW_DATA_DIR ||
|
|
||||||
"/var/lib/docker/volumes/tiger_tiger-config/_data";
|
|
||||||
const GATEWAY_HEALTH_URL =
|
|
||||||
(process.env.LLM_GATEWAY_URL || "https://llm.manohargupta.com/v1").replace(/\/v1\/?$/, "") +
|
|
||||||
"/health/liveliness";
|
|
||||||
|
|
||||||
const MEM_AVAILABLE_FLOOR_MB = 800;
|
|
||||||
const SWAP_USED_CEILING_PCT = 50;
|
|
||||||
|
|
||||||
interface MemoryCheck {
|
|
||||||
totalMb: number;
|
|
||||||
availableMb: number;
|
|
||||||
swapTotalMb: number;
|
|
||||||
swapUsedMb: number;
|
|
||||||
swapUsedPct: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
function checkMemory(): MemoryCheck | null {
|
|
||||||
try {
|
|
||||||
const info = readFileSync("/proc/meminfo", "utf-8");
|
|
||||||
const grab = (key: string): number => {
|
|
||||||
const m = info.match(new RegExp(`^${key}:\\s+(\\d+) kB`, "m"));
|
|
||||||
return m ? Math.round(parseInt(m[1], 10) / 1024) : 0;
|
|
||||||
};
|
|
||||||
const totalMb = grab("MemTotal");
|
|
||||||
const availableMb = grab("MemAvailable");
|
|
||||||
const swapTotalMb = grab("SwapTotal");
|
|
||||||
const swapFreeMb = grab("SwapFree");
|
|
||||||
const swapUsedMb = swapTotalMb - swapFreeMb;
|
|
||||||
return {
|
|
||||||
totalMb,
|
|
||||||
availableMb,
|
|
||||||
swapTotalMb,
|
|
||||||
swapUsedMb,
|
|
||||||
swapUsedPct: swapTotalMb > 0 ? Math.round((swapUsedMb / swapTotalMb) * 100) : 0,
|
|
||||||
};
|
|
||||||
} catch {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function checkGateway(): Promise<boolean> {
|
|
||||||
try {
|
|
||||||
const controller = new AbortController();
|
|
||||||
const t = setTimeout(() => controller.abort(), 4000);
|
|
||||||
const res = await fetch(GATEWAY_HEALTH_URL, { signal: controller.signal });
|
|
||||||
clearTimeout(t);
|
|
||||||
return res.ok;
|
|
||||||
} catch {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function checkContainer(): Promise<boolean> {
|
|
||||||
try {
|
|
||||||
const { stdout } = await execAsync(
|
|
||||||
"docker inspect -f '{{.State.Running}}' tiger-openclaw",
|
|
||||||
{ timeout: 5000 },
|
|
||||||
);
|
|
||||||
return stdout.trim() === "true";
|
|
||||||
} catch {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
interface CronCheck {
|
|
||||||
name: string;
|
|
||||||
lastStatus: string;
|
|
||||||
consecutiveErrors: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
function checkCrons(): CronCheck[] {
|
|
||||||
try {
|
|
||||||
const raw = JSON.parse(
|
|
||||||
readFileSync(join(DATA_DIR, "cron", "jobs.json"), "utf-8"),
|
|
||||||
) as { jobs?: Array<Record<string, any>> };
|
|
||||||
return (raw.jobs ?? []).map((j) => ({
|
|
||||||
name: String(j.name ?? j.id ?? "unknown"),
|
|
||||||
lastStatus: String(j.state?.lastStatus ?? j.lastStatus ?? "unknown"),
|
|
||||||
consecutiveErrors: Number(j.state?.consecutiveErrors ?? 0),
|
|
||||||
}));
|
|
||||||
} catch {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
router.get("/", async (_req: Request, res: Response) => {
|
|
||||||
const [memory, gatewayUp, containerUp] = await Promise.all([
|
|
||||||
Promise.resolve(checkMemory()),
|
|
||||||
checkGateway(),
|
|
||||||
checkContainer(),
|
|
||||||
]);
|
|
||||||
const crons = checkCrons();
|
|
||||||
|
|
||||||
const issues: string[] = [];
|
|
||||||
let verdict: "healthy" | "degraded" | "critical" = "healthy";
|
|
||||||
const degrade = () => { if (verdict === "healthy") verdict = "degraded"; };
|
|
||||||
|
|
||||||
if (!containerUp) {
|
|
||||||
verdict = "critical";
|
|
||||||
issues.push("OpenClaw container is not running — Tiger is offline");
|
|
||||||
}
|
|
||||||
if (!gatewayUp) {
|
|
||||||
verdict = "critical";
|
|
||||||
issues.push("LLM gateway unreachable — every agent turn will fail");
|
|
||||||
}
|
|
||||||
if (memory) {
|
|
||||||
if (memory.availableMb < MEM_AVAILABLE_FLOOR_MB) {
|
|
||||||
degrade();
|
|
||||||
issues.push(
|
|
||||||
`Low memory: ${memory.availableMb}MB available (floor ${MEM_AVAILABLE_FLOOR_MB}MB) — expect slow turns and cron timeouts`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
if (memory.swapUsedPct > SWAP_USED_CEILING_PCT) {
|
|
||||||
degrade();
|
|
||||||
issues.push(
|
|
||||||
`Heavy swapping: ${memory.swapUsedMb}MB (${memory.swapUsedPct}%) of swap in use`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
degrade();
|
|
||||||
issues.push("Could not read /proc/meminfo");
|
|
||||||
}
|
|
||||||
for (const c of crons) {
|
|
||||||
if (c.lastStatus === "error") {
|
|
||||||
degrade();
|
|
||||||
issues.push(
|
|
||||||
`Cron "${c.name}" last run failed${c.consecutiveErrors > 1 ? ` (${c.consecutiveErrors} consecutive)` : ""}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
res.json({
|
|
||||||
ok: true,
|
|
||||||
verdict,
|
|
||||||
issues,
|
|
||||||
checks: {
|
|
||||||
memory,
|
|
||||||
gateway: { up: gatewayUp, url: GATEWAY_HEALTH_URL },
|
|
||||||
openclaw: { running: containerUp },
|
|
||||||
crons,
|
|
||||||
},
|
|
||||||
checkedAt: new Date().toISOString(),
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
export default router;
|
|
||||||
|
|
@ -1,21 +0,0 @@
|
||||||
/** /api/health/system — proxy for the bridge's layered self-diagnosis. */
|
|
||||||
import { NextResponse } from "next/server"
|
|
||||||
import { bridgeGet } from "@/lib/bridge"
|
|
||||||
|
|
||||||
export const dynamic = "force-dynamic"
|
|
||||||
|
|
||||||
export async function GET() {
|
|
||||||
try {
|
|
||||||
const data = await bridgeGet("/tiger/health/system")
|
|
||||||
return NextResponse.json(data)
|
|
||||||
} catch {
|
|
||||||
// Bridge itself unreachable IS a finding — surface it as critical.
|
|
||||||
return NextResponse.json({
|
|
||||||
ok: true,
|
|
||||||
verdict: "critical",
|
|
||||||
issues: ["Bridge unreachable from dashboard — control plane is down"],
|
|
||||||
checks: {},
|
|
||||||
checkedAt: new Date().toISOString(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -6,14 +6,10 @@ import { DigestCard } from "@/components/digest-card"
|
||||||
import { TelegramThreadCard } from "@/components/telegram-thread-card"
|
import { TelegramThreadCard } from "@/components/telegram-thread-card"
|
||||||
import { StatusFooter } from "@/components/status-footer"
|
import { StatusFooter } from "@/components/status-footer"
|
||||||
import { ScheduleCard } from "@/components/schedule-card"
|
import { ScheduleCard } from "@/components/schedule-card"
|
||||||
import { HealthBanner } from "@/components/health-banner"
|
|
||||||
|
|
||||||
export default function HomePage() {
|
export default function HomePage() {
|
||||||
return (
|
return (
|
||||||
<div className="flex flex-col gap-5 max-w-5xl mx-auto w-full">
|
<div className="flex flex-col gap-5 max-w-5xl mx-auto w-full">
|
||||||
{/* HEALTH — invisible when healthy; impossible to miss when not. */}
|
|
||||||
<HealthBanner />
|
|
||||||
|
|
||||||
{/* HERO — the command bar is the front door of Tiger. */}
|
{/* HERO — the command bar is the front door of Tiger. */}
|
||||||
<CommandBar />
|
<CommandBar />
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,70 +0,0 @@
|
||||||
"use client"
|
|
||||||
|
|
||||||
/**
|
|
||||||
* HealthBanner — surfaces system degradation the moment it exists.
|
|
||||||
*
|
|
||||||
* Polls /api/health/system every 30s. Renders NOTHING while healthy (a
|
|
||||||
* banner that's always there is a banner nobody reads). On degraded it
|
|
||||||
* shows an amber strip with each issue; on critical, red. Click toggles
|
|
||||||
* the detail list. The point: "⚠️ Tiger timed out" on Telegram should
|
|
||||||
* never again be the first place a failure shows up.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import { useEffect, useState } from "react"
|
|
||||||
import { AlertTriangle, OctagonAlert, ChevronDown } from "lucide-react"
|
|
||||||
|
|
||||||
interface Health {
|
|
||||||
verdict: "healthy" | "degraded" | "critical"
|
|
||||||
issues: string[]
|
|
||||||
checkedAt: string
|
|
||||||
}
|
|
||||||
|
|
||||||
const POLL_MS = 30_000
|
|
||||||
|
|
||||||
export function HealthBanner() {
|
|
||||||
const [health, setHealth] = useState<Health | null>(null)
|
|
||||||
const [open, setOpen] = useState(false)
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
let alive = true
|
|
||||||
const load = () =>
|
|
||||||
fetch("/api/health/system")
|
|
||||||
.then((r) => r.json())
|
|
||||||
.then((d: Health) => { if (alive && d?.verdict) setHealth(d) })
|
|
||||||
.catch(() => { /* next poll retries */ })
|
|
||||||
load()
|
|
||||||
const t = setInterval(load, POLL_MS)
|
|
||||||
return () => { alive = false; clearInterval(t) }
|
|
||||||
}, [])
|
|
||||||
|
|
||||||
if (!health || health.verdict === "healthy") return null
|
|
||||||
|
|
||||||
const critical = health.verdict === "critical"
|
|
||||||
const Icon = critical ? OctagonAlert : AlertTriangle
|
|
||||||
const tone = critical
|
|
||||||
? "bg-red-500/10 border-red-500/40 text-red-400"
|
|
||||||
: "bg-amber-500/10 border-amber-500/40 text-amber-400"
|
|
||||||
|
|
||||||
return (
|
|
||||||
<button
|
|
||||||
onClick={() => setOpen((v) => !v)}
|
|
||||||
className={`w-full text-left rounded-lg border px-4 py-2.5 mb-4 ${tone}`}
|
|
||||||
>
|
|
||||||
<span className="flex items-center gap-2 text-sm font-medium">
|
|
||||||
<Icon className="h-4 w-4 shrink-0" />
|
|
||||||
System {health.verdict} — {health.issues.length} issue
|
|
||||||
{health.issues.length === 1 ? "" : "s"}
|
|
||||||
<ChevronDown
|
|
||||||
className={`h-3.5 w-3.5 ml-auto transition-transform ${open ? "rotate-180" : ""}`}
|
|
||||||
/>
|
|
||||||
</span>
|
|
||||||
{open && (
|
|
||||||
<ul className="mt-2 ml-6 list-disc text-[13px] space-y-1 text-foreground/80">
|
|
||||||
{health.issues.map((i) => (
|
|
||||||
<li key={i}>{i}</li>
|
|
||||||
))}
|
|
||||||
</ul>
|
|
||||||
)}
|
|
||||||
</button>
|
|
||||||
)
|
|
||||||
}
|
|
||||||
Loading…
Add table
Reference in a new issue