Include Argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

415
argus/loop.py Normal file
View File

@@ -0,0 +1,415 @@
"""Main orchestrator loop — capture → analyze → notify → execute.
Three concurrent tasks:
1. VLM loop: capture screenshots, analyze, push results
2. Notification handler: show cards when friction detected, wait for response
3. Input reader: read user's terminal input (1=accept, 0=dismiss)
Swift portability notes:
- The VLM loop becomes a Timer or DispatchSourceTimer
- The notification handler becomes SwiftUI state updates
- The input reader becomes button tap handlers
- The three tasks map to Swift's structured concurrency (async let / TaskGroup)
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
from argus.backend import send_analysis
from argus.buffer import HistoryBuffer
from argus.capture import capture_screenshot
from argus.config import BUFFER_MAX_LEN, CAPTURE_INTERVAL_S, VLM_CALL_EVERY_N
from argus.executor import execute
from argus.notification import NotificationManager
from argus.session import SessionManager
from argus.vlm import TaskContext, VLMResult, analyze_screenshot
log = logging.getLogger(__name__)
# Dedupe set of session-action keys ("type:session_id", see
# _session_action_key) already shown to the user, so the same card is never
# repeated within one process run. NOTE(review): never pruned — grows for
# the lifetime of the process, bounded only by distinct (type, session) pairs.
_handled_session_actions: set[str] = set()
def _session_action_key(sa) -> str:
return f"{sa.type}:{sa.session_id}"
def _show_session_card(title: str, body: str, options: list[str]) -> None:
"""Display a session-related card (not friction). Swift: native notification."""
print()
print("" + "" * 58 + "")
print(f"{title:<56}")
print("" + " " * 58 + "")
for line in body.split("\n"):
print(f"{line:<56}")
print("" + " " * 58 + "")
for i, opt in enumerate(options):
print(f"│ [{i + 1}] {opt:<53}")
print(f"│ [0] Not now{' ' * 44}")
print("" + "" * 58 + "")
print()
async def _wait_for_input() -> int:
"""Wait for a single integer input from stdin. Returns the number or 0."""
loop = asyncio.get_event_loop()
line = await loop.run_in_executor(None, sys.stdin.readline)
try:
return int(line.strip())
except (ValueError, EOFError):
return 0
async def _handle_session_action(sa, result, sessions: SessionManager, notifier: NotificationManager) -> None:
    """Handle VLM session_action output — show card and act on approval.

    Shows a terminal card for the action type ("resume", "start_new",
    "complete" or "switch"), waits for the user's numeric choice, and on
    approval mutates *sessions* accordingly. Each distinct (type, session_id)
    action is handled at most once per process (deduped via
    _handled_session_actions).

    Swift portability: becomes SwiftUI state changes + button handlers.

    Args:
        sa: VLM session action; reads .type, .session_id, .reason.
        result: Full VLM result; only .inferred_task is used ("start_new").
        sessions: Session state manager; mutated on approval.
        notifier: Unused in this function — kept for call-site symmetry.
    """
    # Skip if we already showed this exact action
    key = _session_action_key(sa)
    if key in _handled_session_actions:
        return
    _handled_session_actions.add(key)
    if sa.type == "resume":
        # Offer to resume a known session; fall back to sa.reason for text
        # when the session id isn't found locally.
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        checkpoint = session.checkpoint_note if session else ""
        task_title = session.task_title if session else sa.reason
        _show_session_card(
            f"📂 Resume: {task_title}",
            f"You left off: {checkpoint}" if checkpoint else sa.reason,
            ["Resume session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                # Real backend: fetch and print the resume-card details.
                resume_card = await sessions.get_resume_card(sa.session_id)
                if resume_card:
                    rc = resume_card.get("resume_card", {})
                    print(f" 💡 {rc.get('welcome_back', '')}")
                    print(f" {rc.get('you_were_doing', '')}")
                    print(f" {rc.get('next_step', '')}")
                    print(f" {rc.get('motivation', '')}")
            else:
                print(f" ✓ Resumed \"{task_title}\"")
                if checkpoint:
                    print(f" Last checkpoint: {checkpoint}")
            if session:
                # Mark active locally in both real and mock modes.
                session.status = "active"
                sessions._active_session = session
        else:
            print(" dismissed.")
    elif sa.type == "start_new":
        # Suggest starting a focus session for the task the VLM inferred.
        task = result.inferred_task
        _show_session_card(
            "🆕 New focus session",
            f"You're working on: {task}",
            ["Start focus session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                resp = await sessions.start_session(task)
                if resp:
                    print(f" ✓ Session started: {resp.get('id', '?')}")
            else:
                print(f" ✓ (mock) Session started for \"{task}\"")
        else:
            print(" dismissed.")
    elif sa.type == "complete":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "session"
        _show_session_card(
            f"✅ Complete: {task_title}",
            sa.reason,
            ["Complete session"],
        )
        choice = await _wait_for_input()
        # Only act when the VLM actually supplied a session id.
        if choice == 1 and sa.session_id:
            if not sessions._mock:
                ok = await sessions.end_session(sa.session_id)
                if ok:
                    print(f" ✓ Session completed")
            else:
                print(f" ✓ (mock) Session \"{task_title}\" completed")
            # Remove locally and clear the active pointer if it was this one.
            sessions._sessions = [s for s in sessions._sessions if s.session_id != sa.session_id]
            if sessions._active_session and sessions._active_session.session_id == sa.session_id:
                sessions._active_session = None
        else:
            print(" dismissed.")
    elif sa.type == "switch":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "another task"
        _show_session_card(
            f"🔄 Switch to: {task_title}",
            sa.reason,
            [f"Switch to \"{task_title}\""],
        )
        choice = await _wait_for_input()
        if choice == 1:
            # NOTE(review): switch is display-only here — no session state is
            # changed on approval; confirm whether that is intentional.
            print(f" ✓ Switched to \"{task_title}\"")
        else:
            print(" dismissed.")
def _is_valid_session_id(session_id: str | None, sessions: SessionManager) -> bool:
"""Check if a session_id from VLM output actually exists in our sessions."""
if not session_id:
return False
return any(s.session_id == session_id for s in sessions.sessions)
async def run_loop(
    ctx: TaskContext,
    *,
    api_key: str | None = None,
    vlm_backend: str | None = None,
    jwt: str | None = None,
    base_url: str | None = None,
    dry_run: bool = False,
    max_iterations: int | None = None,
    on_result: None | (callable) = None,
    auto_execute: bool = False,
    mock_sessions: list | None = None,
) -> None:
    """Run the Argus VLM loop with notification and execution support.

    Each tick: refresh sessions, capture one screenshot, and — once
    VLM_CALL_EVERY_N frames have accumulated — run the VLM, report to the
    backend, then (when auto_execute) surface session/friction cards.
    Per-iteration exceptions are logged and the loop keeps going.

    Args:
        ctx: Task/session context.
        api_key: Gemini API key override.
        vlm_backend: "ollama" or "gemini".
        jwt: Backend JWT override.
        base_url: Backend base URL override.
        dry_run: If True, skip sending to backend (payload is printed).
        max_iterations: Stop after N VLM iterations (None = forever).
        on_result: Optional callback(VLMResult) per analysis.
        auto_execute: If True, enable notification + executor flow.
        mock_sessions: If given, use these sessions instead of fetching from
            the backend; session refresh is disabled.
    """
    history = HistoryBuffer(image_maxlen=BUFFER_MAX_LEN)
    notifier = NotificationManager()
    sessions = SessionManager(jwt=jwt, base_url=base_url)
    iteration = 0  # counts VLM calls, not screen captures
    log.info(
        "Argus loop starting — interval=%ds, session=%s, task=%s, executor=%s",
        CAPTURE_INTERVAL_S,
        ctx.session_id,
        ctx.task_title,
        "on" if auto_execute else "off",
    )
    # Load sessions — mock or from backend
    use_mock = mock_sessions is not None
    if use_mock:
        sessions._sessions = mock_sessions
        sessions._active_session = next((s for s in mock_sessions if s.status == "active"), None)
        sessions._mock = True  # prevent refresh from overwriting
        log.info("Loaded %d mock sessions", len(mock_sessions))
    else:
        sessions._mock = False
        try:
            await sessions.fetch_open_sessions()
        except Exception:
            # Best-effort startup: a dead backend must not stop local monitoring.
            log.exception("Startup: failed to fetch sessions — continuing without session context")
    # Log attachment status so it's visible in the console
    if sessions.active:
        s = sessions.active
        log.info(
            "ATTACHED to session %s — task=%r last_app=%s checkpoint=%r",
            s.session_id[:8], s.task_title, s.last_app or "(unknown)", s.checkpoint_note or "(none)",
        )
    else:
        log.info("No active session found — running in monitoring-only mode (dry-run=%s)", dry_run)
    # Pending frames collected between VLM calls
    pending_frames: list[bytes] = []
    capture_count = 0
    try:
        while max_iterations is None or iteration < max_iterations:
            t0 = time.monotonic()
            try:
                # 0. Refresh sessions periodically (no-op while _mock is set)
                await sessions.maybe_refresh()
                # 1. Capture screenshot every interval
                screenshot = capture_screenshot()
                pending_frames.append(screenshot)
                capture_count += 1
                log.debug("Captured frame %d (%d pending)", capture_count, len(pending_frames))
                # 2. Only call VLM every N captures; otherwise sleep out the
                #    rest of the interval and capture again.
                if len(pending_frames) < VLM_CALL_EVERY_N:
                    elapsed = time.monotonic() - t0
                    sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
                    if sleep_for > 0:
                        await asyncio.sleep(sleep_for)
                    continue
                # ── VLM call with all pending frames ──
                iteration += 1
                # Push all pending frames into buffer (buffer keeps last BUFFER_MAX_LEN)
                for frame in pending_frames:
                    history.push(frame, "")  # summaries filled after VLM call
                pending_frames.clear()
                t_vlm = time.monotonic()
                result: VLMResult = await analyze_screenshot(
                    screenshot, ctx, history,
                    api_key=api_key,
                    backend=vlm_backend,
                    session_context=sessions.format_for_prompt(),
                )
                t_vlm_done = time.monotonic()
                payload = result.to_backend_payload(ctx.session_id)
                log.info(
                    "[%d] vlm=%.2fs frames=%d friction=%s summary=%s",
                    iteration,
                    t_vlm_done - t_vlm,
                    VLM_CALL_EVERY_N,
                    result.friction.type,
                    result.vlm_summary,
                )
                # 3. Update last entry's summary now that we have it
                history.set_last_output(payload)
                # 4. Print / send to backend
                if dry_run:
                    print(json.dumps(payload, indent=2))
                else:
                    resp = await send_analysis(
                        result, ctx.session_id, jwt=jwt, base_url=base_url
                    )
                    log.debug("Backend response: %s", resp)
                if auto_execute:
                    sa = result.session_action
                    # Validate session_action — check that VLM output relates to the session.
                    # Match against filename, file stem (no extension), or task title.
                    if sa.type in ("resume", "switch") and sa.session_id:
                        session = next(
                            (s for s in sessions.sessions if s.session_id == sa.session_id), None
                        )
                        if session:
                            context = (result.vlm_summary + " " + result.inferred_task + " " + sa.reason).lower()
                            matches = []
                            if session.last_file:
                                matches.append(session.last_file.lower())
                                # Also check stem without extension (e.g. "receipt" from "receipt.pdf")
                                stem = session.last_file.rsplit(".", 1)[0].lower()
                                if stem:
                                    matches.append(stem)
                            if session.task_title:
                                # Check key words from task title
                                for word in session.task_title.lower().split():
                                    if len(word) > 3:  # skip short words
                                        matches.append(word)
                            if matches and not any(m in context for m in matches):
                                log.debug(
                                    "Suppressing session_action=%s — none of %s found in context",
                                    sa.type, matches,
                                )
                                sa.type = "none"
                    # Also drop resume/switch actions that name an unknown session id.
                    if not _is_valid_session_id(sa.session_id, sessions) and sa.type in ("resume", "switch"):
                        sa.type = "none"
                    # 5. Session actions take priority — but only if not already handled
                    session_handled = False
                    if sa.type != "none":
                        key = _session_action_key(sa)
                        if key not in _handled_session_actions:
                            await _handle_session_action(sa, result, sessions, notifier)
                            session_handled = True
                    # 6. Friction notification + executor
                    if not session_handled and notifier.should_notify(payload):
                        card = notifier.create_card(payload)
                        notifier.show_card_terminal(card)
                        choice = await _wait_for_input()
                        action_idx = choice - 1
                        if choice > 0:
                            print(f"\n⚡ Executing action {action_idx + 1}...")
                            summary = await execute(
                                payload, action_idx,
                                history=history,
                                current_screenshot=screenshot,
                                api_key=api_key,
                            )
                            # Emit a parseable JSON block so Swift can surface the result
                            # in the HUD and clear the executing spinner.
                            print(json.dumps({"exec_summary": summary or ""}, indent=2), flush=True)
                            if summary:
                                history.set_last_execution(summary)
                        else:
                            print("dismissed.")
                    # 7. Nudge — VLM decided a nudge is appropriate (only if nothing else fired)
                    elif not session_handled and result.gentle_nudge:
                        print(f"\n💛 {result.gentle_nudge}")
                    # 8. Fallback: suggest new session if VLM didn't
                    elif not session_handled and (
                        sa.type == "none"
                        and result.on_task
                        and not sessions.active
                        and sessions.should_suggest_new_session(result.inferred_task)
                    ):
                        task = result.inferred_task
                        print(f"\n🆕 You've been working on \"{task}\" — start a focus session?")
                        print(f" [1] Start [0] Not now")
                        card = notifier.create_card({
                            "friction": {
                                "type": "none",
                                "confidence": 1.0,
                                "description": f"New task detected: {task}",
                                "proposed_actions": [{"label": "Start focus session", "action_type": "other", "details": task}],
                            }
                        })
                        notifier.show_card_terminal(card)
                        accepted, _ = await notifier.wait_for_response(timeout=60.0)
                        if accepted:
                            resp = await sessions.start_session(task)
                            if resp:
                                print(f" ✓ Session started: {resp.get('id', '?')}")
                # Clear execution context after 3 iterations
                if history.get_last_execution() and iteration % 3 == 0:
                    history.clear_last_execution()
                # 9. Callback
                if on_result:
                    on_result(result)
            except Exception:
                # Any per-iteration failure is logged; the loop keeps running.
                log.exception("Error in Argus loop iteration %d", iteration)
            # Sleep for remainder of capture interval
            elapsed = time.monotonic() - t0
            sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
            if sleep_for > 0:
                await asyncio.sleep(sleep_for)
    finally:
        # NOTE(review): placeholder — no shutdown/cleanup is performed yet.
        pass