Include Argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

415
argus/loop.py Normal file
View File

@@ -0,0 +1,415 @@
"""Main orchestrator loop — capture → analyze → notify → execute.
Three concurrent tasks:
1. VLM loop: capture screenshots, analyze, push results
2. Notification handler: show cards when friction detected, wait for response
3. Input reader: read user's terminal input (1=accept, 0=dismiss)
Swift portability notes:
- The VLM loop becomes a Timer or DispatchSourceTimer
- The notification handler becomes SwiftUI state updates
- The input reader becomes button tap handlers
- The three tasks map to Swift's structured concurrency (async let / TaskGroup)
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
from argus.backend import send_analysis
from argus.buffer import HistoryBuffer
from argus.capture import capture_screenshot
from argus.config import BUFFER_MAX_LEN, CAPTURE_INTERVAL_S, VLM_CALL_EVERY_N
from argus.executor import execute
from argus.notification import NotificationManager
from argus.session import SessionManager
from argus.vlm import TaskContext, VLMResult, analyze_screenshot
log = logging.getLogger(__name__)
# Dedupe set of session-action keys ("type:session_id", see
# _session_action_key) already shown to the user, so the same card is never
# repeated within one process run. NOTE(review): never pruned — grows for
# the lifetime of the process, bounded only by distinct (type, session) pairs.
_handled_session_actions: set[str] = set()
def _session_action_key(sa) -> str:
return f"{sa.type}:{sa.session_id}"
def _show_session_card(title: str, body: str, options: list[str]) -> None:
"""Display a session-related card (not friction). Swift: native notification."""
print()
print("" + "" * 58 + "")
print(f"{title:<56}")
print("" + " " * 58 + "")
for line in body.split("\n"):
print(f"{line:<56}")
print("" + " " * 58 + "")
for i, opt in enumerate(options):
print(f"│ [{i + 1}] {opt:<53}")
print(f"│ [0] Not now{' ' * 44}")
print("" + "" * 58 + "")
print()
async def _wait_for_input() -> int:
"""Wait for a single integer input from stdin. Returns the number or 0."""
loop = asyncio.get_event_loop()
line = await loop.run_in_executor(None, sys.stdin.readline)
try:
return int(line.strip())
except (ValueError, EOFError):
return 0
async def _handle_session_action(sa, result, sessions: SessionManager, notifier: NotificationManager) -> None:
    """Handle VLM session_action output — show card and act on approval.

    Shows a terminal card for the action type ("resume", "start_new",
    "complete" or "switch"), waits for the user's numeric choice, and on
    approval mutates *sessions* accordingly. Each distinct (type, session_id)
    action is handled at most once per process (deduped via
    _handled_session_actions).

    Swift portability: becomes SwiftUI state changes + button handlers.

    Args:
        sa: VLM session action; reads .type, .session_id, .reason.
        result: Full VLM result; only .inferred_task is used ("start_new").
        sessions: Session state manager; mutated on approval.
        notifier: Unused in this function — kept for call-site symmetry.
    """
    # Skip if we already showed this exact action
    key = _session_action_key(sa)
    if key in _handled_session_actions:
        return
    _handled_session_actions.add(key)
    if sa.type == "resume":
        # Offer to resume a known session; fall back to sa.reason for text
        # when the session id isn't found locally.
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        checkpoint = session.checkpoint_note if session else ""
        task_title = session.task_title if session else sa.reason
        _show_session_card(
            f"📂 Resume: {task_title}",
            f"You left off: {checkpoint}" if checkpoint else sa.reason,
            ["Resume session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                # Real backend: fetch and print the resume-card details.
                resume_card = await sessions.get_resume_card(sa.session_id)
                if resume_card:
                    rc = resume_card.get("resume_card", {})
                    print(f" 💡 {rc.get('welcome_back', '')}")
                    print(f" {rc.get('you_were_doing', '')}")
                    print(f" {rc.get('next_step', '')}")
                    print(f" {rc.get('motivation', '')}")
            else:
                print(f" ✓ Resumed \"{task_title}\"")
                if checkpoint:
                    print(f" Last checkpoint: {checkpoint}")
            if session:
                # Mark active locally in both real and mock modes.
                session.status = "active"
                sessions._active_session = session
        else:
            print(" dismissed.")
    elif sa.type == "start_new":
        # Suggest starting a focus session for the task the VLM inferred.
        task = result.inferred_task
        _show_session_card(
            "🆕 New focus session",
            f"You're working on: {task}",
            ["Start focus session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                resp = await sessions.start_session(task)
                if resp:
                    print(f" ✓ Session started: {resp.get('id', '?')}")
            else:
                print(f" ✓ (mock) Session started for \"{task}\"")
        else:
            print(" dismissed.")
    elif sa.type == "complete":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "session"
        _show_session_card(
            f"✅ Complete: {task_title}",
            sa.reason,
            ["Complete session"],
        )
        choice = await _wait_for_input()
        # Only act when the VLM actually supplied a session id.
        if choice == 1 and sa.session_id:
            if not sessions._mock:
                ok = await sessions.end_session(sa.session_id)
                if ok:
                    print(f" ✓ Session completed")
            else:
                print(f" ✓ (mock) Session \"{task_title}\" completed")
            # Remove locally and clear the active pointer if it was this one.
            sessions._sessions = [s for s in sessions._sessions if s.session_id != sa.session_id]
            if sessions._active_session and sessions._active_session.session_id == sa.session_id:
                sessions._active_session = None
        else:
            print(" dismissed.")
    elif sa.type == "switch":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "another task"
        _show_session_card(
            f"🔄 Switch to: {task_title}",
            sa.reason,
            [f"Switch to \"{task_title}\""],
        )
        choice = await _wait_for_input()
        if choice == 1:
            # NOTE(review): switch is display-only here — no session state is
            # changed on approval; confirm whether that is intentional.
            print(f" ✓ Switched to \"{task_title}\"")
        else:
            print(" dismissed.")
def _is_valid_session_id(session_id: str | None, sessions: SessionManager) -> bool:
"""Check if a session_id from VLM output actually exists in our sessions."""
if not session_id:
return False
return any(s.session_id == session_id for s in sessions.sessions)
async def run_loop(
    ctx: TaskContext,
    *,
    api_key: str | None = None,
    vlm_backend: str | None = None,
    jwt: str | None = None,
    base_url: str | None = None,
    dry_run: bool = False,
    max_iterations: int | None = None,
    on_result: None | (callable) = None,
    auto_execute: bool = False,
    mock_sessions: list | None = None,
) -> None:
    """Run the Argus VLM loop with notification and execution support.

    Each tick: refresh sessions, capture one screenshot, and — once
    VLM_CALL_EVERY_N frames have accumulated — run the VLM, report to the
    backend, then (when auto_execute) surface session/friction cards.
    Per-iteration exceptions are logged and the loop keeps going.

    Args:
        ctx: Task/session context.
        api_key: Gemini API key override.
        vlm_backend: "ollama" or "gemini".
        jwt: Backend JWT override.
        base_url: Backend base URL override.
        dry_run: If True, skip sending to backend (payload is printed).
        max_iterations: Stop after N VLM iterations (None = forever).
        on_result: Optional callback(VLMResult) per analysis.
        auto_execute: If True, enable notification + executor flow.
        mock_sessions: If given, use these sessions instead of fetching from
            the backend; session refresh is disabled.
    """
    history = HistoryBuffer(image_maxlen=BUFFER_MAX_LEN)
    notifier = NotificationManager()
    sessions = SessionManager(jwt=jwt, base_url=base_url)
    iteration = 0  # counts VLM calls, not screen captures
    log.info(
        "Argus loop starting — interval=%ds, session=%s, task=%s, executor=%s",
        CAPTURE_INTERVAL_S,
        ctx.session_id,
        ctx.task_title,
        "on" if auto_execute else "off",
    )
    # Load sessions — mock or from backend
    use_mock = mock_sessions is not None
    if use_mock:
        sessions._sessions = mock_sessions
        sessions._active_session = next((s for s in mock_sessions if s.status == "active"), None)
        sessions._mock = True  # prevent refresh from overwriting
        log.info("Loaded %d mock sessions", len(mock_sessions))
    else:
        sessions._mock = False
        try:
            await sessions.fetch_open_sessions()
        except Exception:
            # Best-effort startup: a dead backend must not stop local monitoring.
            log.exception("Startup: failed to fetch sessions — continuing without session context")
    # Log attachment status so it's visible in the console
    if sessions.active:
        s = sessions.active
        log.info(
            "ATTACHED to session %s — task=%r last_app=%s checkpoint=%r",
            s.session_id[:8], s.task_title, s.last_app or "(unknown)", s.checkpoint_note or "(none)",
        )
    else:
        log.info("No active session found — running in monitoring-only mode (dry-run=%s)", dry_run)
    # Pending frames collected between VLM calls
    pending_frames: list[bytes] = []
    capture_count = 0
    try:
        while max_iterations is None or iteration < max_iterations:
            t0 = time.monotonic()
            try:
                # 0. Refresh sessions periodically (no-op while _mock is set)
                await sessions.maybe_refresh()
                # 1. Capture screenshot every interval
                screenshot = capture_screenshot()
                pending_frames.append(screenshot)
                capture_count += 1
                log.debug("Captured frame %d (%d pending)", capture_count, len(pending_frames))
                # 2. Only call VLM every N captures; otherwise sleep out the
                #    rest of the interval and capture again.
                if len(pending_frames) < VLM_CALL_EVERY_N:
                    elapsed = time.monotonic() - t0
                    sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
                    if sleep_for > 0:
                        await asyncio.sleep(sleep_for)
                    continue
                # ── VLM call with all pending frames ──
                iteration += 1
                # Push all pending frames into buffer (buffer keeps last BUFFER_MAX_LEN)
                for frame in pending_frames:
                    history.push(frame, "")  # summaries filled after VLM call
                pending_frames.clear()
                t_vlm = time.monotonic()
                result: VLMResult = await analyze_screenshot(
                    screenshot, ctx, history,
                    api_key=api_key,
                    backend=vlm_backend,
                    session_context=sessions.format_for_prompt(),
                )
                t_vlm_done = time.monotonic()
                payload = result.to_backend_payload(ctx.session_id)
                log.info(
                    "[%d] vlm=%.2fs frames=%d friction=%s summary=%s",
                    iteration,
                    t_vlm_done - t_vlm,
                    VLM_CALL_EVERY_N,
                    result.friction.type,
                    result.vlm_summary,
                )
                # 3. Update last entry's summary now that we have it
                history.set_last_output(payload)
                # 4. Print / send to backend
                if dry_run:
                    print(json.dumps(payload, indent=2))
                else:
                    resp = await send_analysis(
                        result, ctx.session_id, jwt=jwt, base_url=base_url
                    )
                    log.debug("Backend response: %s", resp)
                if auto_execute:
                    sa = result.session_action
                    # Validate session_action — check that VLM output relates to the session.
                    # Match against filename, file stem (no extension), or task title.
                    if sa.type in ("resume", "switch") and sa.session_id:
                        session = next(
                            (s for s in sessions.sessions if s.session_id == sa.session_id), None
                        )
                        if session:
                            context = (result.vlm_summary + " " + result.inferred_task + " " + sa.reason).lower()
                            matches = []
                            if session.last_file:
                                matches.append(session.last_file.lower())
                                # Also check stem without extension (e.g. "receipt" from "receipt.pdf")
                                stem = session.last_file.rsplit(".", 1)[0].lower()
                                if stem:
                                    matches.append(stem)
                            if session.task_title:
                                # Check key words from task title
                                for word in session.task_title.lower().split():
                                    if len(word) > 3:  # skip short words
                                        matches.append(word)
                            if matches and not any(m in context for m in matches):
                                log.debug(
                                    "Suppressing session_action=%s — none of %s found in context",
                                    sa.type, matches,
                                )
                                sa.type = "none"
                    # Also drop resume/switch actions that name an unknown session id.
                    if not _is_valid_session_id(sa.session_id, sessions) and sa.type in ("resume", "switch"):
                        sa.type = "none"
                    # 5. Session actions take priority — but only if not already handled
                    session_handled = False
                    if sa.type != "none":
                        key = _session_action_key(sa)
                        if key not in _handled_session_actions:
                            await _handle_session_action(sa, result, sessions, notifier)
                            session_handled = True
                    # 6. Friction notification + executor
                    if not session_handled and notifier.should_notify(payload):
                        card = notifier.create_card(payload)
                        notifier.show_card_terminal(card)
                        choice = await _wait_for_input()
                        action_idx = choice - 1
                        if choice > 0:
                            print(f"\n⚡ Executing action {action_idx + 1}...")
                            summary = await execute(
                                payload, action_idx,
                                history=history,
                                current_screenshot=screenshot,
                                api_key=api_key,
                            )
                            # Emit a parseable JSON block so Swift can surface the result
                            # in the HUD and clear the executing spinner.
                            print(json.dumps({"exec_summary": summary or ""}, indent=2), flush=True)
                            if summary:
                                history.set_last_execution(summary)
                        else:
                            print("dismissed.")
                    # 7. Nudge — VLM decided a nudge is appropriate (only if nothing else fired)
                    elif not session_handled and result.gentle_nudge:
                        print(f"\n💛 {result.gentle_nudge}")
                    # 8. Fallback: suggest new session if VLM didn't
                    elif not session_handled and (
                        sa.type == "none"
                        and result.on_task
                        and not sessions.active
                        and sessions.should_suggest_new_session(result.inferred_task)
                    ):
                        task = result.inferred_task
                        print(f"\n🆕 You've been working on \"{task}\" — start a focus session?")
                        print(f" [1] Start [0] Not now")
                        card = notifier.create_card({
                            "friction": {
                                "type": "none",
                                "confidence": 1.0,
                                "description": f"New task detected: {task}",
                                "proposed_actions": [{"label": "Start focus session", "action_type": "other", "details": task}],
                            }
                        })
                        notifier.show_card_terminal(card)
                        accepted, _ = await notifier.wait_for_response(timeout=60.0)
                        if accepted:
                            resp = await sessions.start_session(task)
                            if resp:
                                print(f" ✓ Session started: {resp.get('id', '?')}")
                # Clear execution context after 3 iterations
                if history.get_last_execution() and iteration % 3 == 0:
                    history.clear_last_execution()
                # 9. Callback
                if on_result:
                    on_result(result)
            except Exception:
                # Any per-iteration failure is logged; the loop keeps running.
                log.exception("Error in Argus loop iteration %d", iteration)
            # Sleep for remainder of capture interval
            elapsed = time.monotonic() - t0
            sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
            if sleep_for > 0:
                await asyncio.sleep(sleep_for)
    finally:
        # NOTE(review): placeholder — no shutdown/cleanup is performed yet.
        pass