include argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

0
argus/__init__.py Normal file
View File

113
argus/__main__.py Normal file
View File

@@ -0,0 +1,113 @@
"""CLI entry point: python -m argus [options]"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import sys
from argus.config import CAPTURE_INTERVAL_S
from argus.loop import run_loop
from argus.vlm import StepInfo, TaskContext
def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
prog="argus",
description="Argus VLM — proactive focus assistant screen analyzer",
)
p.add_argument("--session-id", default="00000000-0000-0000-0000-000000000000")
p.add_argument("--task-title", default="(no task)")
p.add_argument("--task-goal", default="")
p.add_argument(
"--steps-json",
default=None,
help='JSON array of steps: [{"id":"...", "sort_order":1, "title":"...", "status":"pending"}]',
)
p.add_argument("--window-title", default="")
p.add_argument("--vlm", choices=["ollama", "gemini"], default=None, help="VLM backend (default: ollama)")
p.add_argument("--gemini-key", default=None, help="Override GEMINI_API_KEY env var")
p.add_argument("--jwt", default=None, help="Override BACKEND_JWT env var")
p.add_argument("--backend-url", default=None, help="Override BACKEND_BASE_URL env var")
p.add_argument("--dry-run", action="store_true", help="Print JSON instead of POSTing")
p.add_argument("--execute", action="store_true", help="Enable notification + executor flow")
p.add_argument("--mock-sessions", default=None, help='JSON array of mock sessions: [{"task_title":"...", "status":"interrupted", "last_app":"VS Code", "last_file":"solution.cpp", "checkpoint_note":"stuck on impl"}]')
p.add_argument("--iterations", type=int, default=None, help="Stop after N iterations")
p.add_argument("-v", "--verbose", action="store_true")
return p.parse_args()
def main() -> None:
    """CLI entry point: configure logging, build the task context, run the loop."""
    args = _parse_args()
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(levelname)-5s %(name)s %(message)s",
        datefmt="%H:%M:%S",
    )
    # httpx logs every request/response at INFO — suppress to WARNING to avoid noisy 404s
    for noisy_logger in ("httpx", "httpcore"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
    # Optional ordered step list supplied as a JSON array on the command line.
    steps: list[StepInfo] = []
    if args.steps_json:
        steps = [
            StepInfo(
                id=raw["id"],
                sort_order=raw["sort_order"],
                title=raw["title"],
                status=raw.get("status", "pending"),
                checkpoint_note=raw.get("checkpoint_note"),
            )
            for raw in json.loads(args.steps_json)
        ]
    ctx = TaskContext(
        task_title=args.task_title,
        task_goal=args.task_goal,
        steps=steps,
        window_title=args.window_title,
        session_id=args.session_id,
    )
    # Parse mock sessions if provided (for demos/tests without a backend).
    mock_sessions = None
    if args.mock_sessions:
        from argus.session import SessionInfo
        mock_sessions = [
            SessionInfo(
                session_id=raw.get("session_id", f"mock-{idx}"),
                task_id=raw.get("task_id"),
                task_title=raw.get("task_title", ""),
                task_goal=raw.get("task_goal", ""),
                status=raw.get("status", "interrupted"),
                last_app=raw.get("last_app", ""),
                last_file=raw.get("last_file", ""),
                checkpoint_note=raw.get("checkpoint_note", ""),
                started_at=raw.get("started_at", ""),
                ended_at=raw.get("ended_at"),
                minutes_ago=raw.get("minutes_ago", 30),
            )
            for idx, raw in enumerate(json.loads(args.mock_sessions))
        ]
    asyncio.run(
        run_loop(
            ctx,
            api_key=args.gemini_key,
            vlm_backend=args.vlm,
            jwt=args.jwt,
            base_url=args.backend_url,
            dry_run=args.dry_run,
            max_iterations=args.iterations,
            auto_execute=args.execute,
            mock_sessions=mock_sessions,
        )
    )


if __name__ == "__main__":
    main()

50
argus/backend.py Normal file
View File

@@ -0,0 +1,50 @@
"""Backend API client — sends VLM analysis results to the LockInBro server.
POST /distractions/analyze-result with the enriched JSON payload.
"""
from __future__ import annotations
import logging
import httpx
from argus.config import BACKEND_BASE_URL, BACKEND_JWT
from argus.vlm import VLMResult
log = logging.getLogger(__name__)
async def send_analysis(
    result: VLMResult,
    session_id: str,
    *,
    jwt: str | None = None,
    base_url: str | None = None,
) -> dict:
    """Post VLM analysis result to the backend.

    Args:
        result: The analysis to serialize via ``to_backend_payload``.
        session_id: Session the analysis belongs to.
        jwt: Optional bearer-token override (falls back to BACKEND_JWT).
        base_url: Optional base-URL override (falls back to BACKEND_BASE_URL).

    Returns the backend response body on success, or an error dict.
    """
    token = jwt or BACKEND_JWT
    url = f"{base_url or BACKEND_BASE_URL}/distractions/analyze-result"
    payload = result.to_backend_payload(session_id)
    # Only attach an Authorization header when we actually have a token.
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    log.info("Sending analysis to %s on_task=%s", url, result.on_task)
    log.debug("Payload: %s", payload)
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(url, json=payload, headers=headers)
            resp.raise_for_status()
            return resp.json()
    except httpx.HTTPStatusError as exc:
        # Server replied but with an error status — report code + body.
        log.error("Backend returned %s: %s", exc.response.status_code, exc.response.text)
        return {"error": str(exc), "status_code": exc.response.status_code}
    except httpx.RequestError as exc:
        # No response at all (DNS, connect, timeout, ...).
        log.error("Backend unreachable: %s", exc)
        return {"error": str(exc)}

123
argus/buffer.py Normal file
View File

@@ -0,0 +1,123 @@
"""Rolling history buffer for VLM screenshot analysis.
Two tiers:
- Image buffer: deque(maxlen=4) of recent screenshots sent as images
- Text history: deque(maxlen=12) of older VLM summaries + previous outputs
for extended context (what happened 30-60s ago) and self-refinement
"""
from __future__ import annotations
import time
from collections import deque
from dataclasses import dataclass, field
@dataclass
class BufferEntry:
jpeg: bytes
vlm_summary: str
timestamp: float = field(default_factory=time.time)
@dataclass
class TextEntry:
vlm_summary: str
timestamp: float
class HistoryBuffer:
def __init__(self, image_maxlen: int = 4, text_maxlen: int = 12):
self._images: deque[BufferEntry] = deque(maxlen=image_maxlen)
self._text_history: deque[TextEntry] = deque(maxlen=text_maxlen)
self._last_output: dict | None = None
self._last_execution: str | None = None
def push(self, jpeg: bytes, vlm_summary: str) -> None:
now = time.time()
# When an image entry gets evicted from the image buffer,
# it's already captured in text_history, so nothing extra needed.
self._images.append(BufferEntry(jpeg=jpeg, vlm_summary=vlm_summary, timestamp=now))
self._text_history.append(TextEntry(vlm_summary=vlm_summary, timestamp=now))
def set_last_output(self, output: dict) -> None:
"""Store the previous VLM JSON output for self-refinement."""
self._last_output = output
def set_last_execution(self, summary: str | None) -> None:
"""Store the result of the last executor action."""
self._last_execution = summary
def get_last_execution(self) -> str | None:
return self._last_execution
def clear_last_execution(self) -> None:
self._last_execution = None
def get_entries(self) -> list[BufferEntry]:
"""Return image entries oldest-first."""
return list(self._images)
def format_for_prompt(self) -> str:
"""Format the full timeline: text history + image labels."""
now = time.time()
lines: list[str] = []
# Text-only history (entries older than what's in the image buffer)
image_timestamps = {round(e.timestamp, 2) for e in self._images}
older = [
e for e in self._text_history
if round(e.timestamp, 2) not in image_timestamps
]
older = [e for e in older if e.vlm_summary] # skip empty summaries
if older:
lines.append("Older context (text only, no images):")
for entry in older:
ago = int(now - entry.timestamp)
lines.append(f" - [{ago}s ago] {entry.vlm_summary}")
lines.append("")
# Image timeline
n = len(self._images)
if n > 0:
lines.append(f"Recent screenshots ({n} prior + 1 current = {n + 1} images):")
for i, entry in enumerate(self._images):
ago = int(now - entry.timestamp)
lines.append(f" - Screenshot {i + 1}/{n + 1}: [{ago}s ago]")
lines.append(f" - Screenshot {n + 1}/{n + 1}: [now] (current)")
else:
lines.append("Screenshots:")
lines.append(" - Screenshot 1/1: [now] (current, first capture)")
return "\n".join(lines)
def format_last_output(self) -> str:
"""Format previous VLM output for self-refinement context."""
if not self._last_output:
return ""
import json
# Only include the key fields, not the full blob
prev = self._last_output
parts = [
f" on_task: {prev.get('on_task')}",
f" app: {prev.get('app_name')}",
f" friction: {prev.get('friction', {}).get('type')}",
f" summary: {prev.get('vlm_summary')}",
]
note = prev.get("checkpoint_note_update")
if note:
parts.append(f" checkpoint: {note}")
desc = prev.get("friction", {}).get("description")
if desc:
parts.append(f" friction_desc: {desc}")
return "\n".join(parts)
def __len__(self) -> int:
return len(self._images)
def clear(self) -> None:
self._images.clear()
self._text_history.clear()
self._last_output = None
self._last_execution = None

80
argus/capture.py Normal file
View File

@@ -0,0 +1,80 @@
"""Screenshot capture for macOS.
Reads a JPEG file written by the Swift host app (LockInBro.app), which holds
the Screen Recording TCC permission. The Swift app writes atomically to
/tmp/lockinbro_capture.jpg every ~5s using ScreenCaptureKit.
capture_screenshot() blocks up to 15 seconds waiting for a fresh file,
then raises RuntimeError if nothing arrives — so loop.py can handle it
gracefully without falling back to the screencapture CLI (which would fail
with a permissions error when called from a Python subprocess).
"""
from __future__ import annotations
import io
import os
import time
from PIL import Image
from argus.config import SCREENSHOT_JPEG_QUALITY, SCREENSHOT_MAX_WIDTH
_SWIFT_FRAME_PATH = "/tmp/lockinbro_capture.jpg"
_SWIFT_FRAME_MAX_AGE_S = 10.0 # treat as stale if older than this
_WAIT_TIMEOUT_S = 15.0 # how long to wait for Swift to write the first frame
_WAIT_POLL_S = 0.5 # polling interval while waiting
def _to_jpeg(img: Image.Image) -> bytes:
    """Convert a PIL Image to JPEG bytes, handling RGBA → RGB.

    Wide images are downscaled to SCREENSHOT_MAX_WIDTH, preserving the
    aspect ratio, before encoding at SCREENSHOT_JPEG_QUALITY.
    """
    width = img.width
    if width > SCREENSHOT_MAX_WIDTH:
        scale = SCREENSHOT_MAX_WIDTH / width
        target = (SCREENSHOT_MAX_WIDTH, int(img.height * scale))
        img = img.resize(target, Image.LANCZOS)
    if img.mode == "RGBA":
        # JPEG has no alpha channel — flatten first.
        img = img.convert("RGB")
    out = io.BytesIO()
    img.save(out, format="JPEG", quality=SCREENSHOT_JPEG_QUALITY)
    return out.getvalue()
def _read_swift_frame() -> bytes | None:
    """Return JPEG bytes from the Swift-provided file if it exists and is fresh.

    Best-effort by design: any failure (missing file, stale mtime, corrupt
    image data) returns None so the caller can simply keep polling.
    """
    try:
        mtime = os.path.getmtime(_SWIFT_FRAME_PATH)
        if time.time() - mtime >= _SWIFT_FRAME_MAX_AGE_S:
            return None  # frame is too old — Swift app may have stopped writing
        with open(_SWIFT_FRAME_PATH, "rb") as fh:
            raw = fh.read()
        img = Image.open(io.BytesIO(raw))
        img.load()  # force full decode now so a truncated file fails inside the try
        return _to_jpeg(img)
    except Exception:
        return None
def capture_screenshot() -> bytes:
    """Return JPEG bytes for the current screen.

    Polls the Swift app's frame file every _WAIT_POLL_S seconds for up to
    _WAIT_TIMEOUT_S seconds.

    Raises:
        RuntimeError: if no fresh frame arrives before the deadline.
    """
    deadline = time.time() + _WAIT_TIMEOUT_S
    while time.time() < deadline:
        if (frame := _read_swift_frame()) is not None:
            return frame
        time.sleep(_WAIT_POLL_S)
    raise RuntimeError(
        f"No screenshot received from LockInBro.app within {_WAIT_TIMEOUT_S}s. "
        "Make sure the app is running and has Screen Recording permission."
    )
def load_image_file(path: str) -> bytes:
    """Load an image from disk and normalize it to JPEG bytes (testing helper)."""
    img = Image.open(path)
    return _to_jpeg(img)

28
argus/config.py Normal file
View File

@@ -0,0 +1,28 @@
import os
from pathlib import Path
from dotenv import load_dotenv

# Load .env from the repository root (one level above this package).
# override=True means values in the file win over already-set shell variables.
load_dotenv(Path(__file__).resolve().parent.parent / ".env", override=True)

# Gemini (cloud VLM) credentials and endpoint.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-pro")
GEMINI_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent"
)
# Ollama (local VLM — default)
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen3.5:9b")
# VLM backend: "ollama" (default) or "gemini"
VLM_BACKEND = os.environ.get("VLM_BACKEND", "ollama")

# LockInBro backend API (analysis results are POSTed here).
BACKEND_BASE_URL = os.environ.get("BACKEND_BASE_URL", "https://wahwa.com/api/v1")
BACKEND_JWT = os.environ.get("BACKEND_JWT", "")

# Capture cadence and screenshot encoding knobs.
CAPTURE_INTERVAL_S = 2.5  # screenshot every 2.5s
BUFFER_MAX_LEN = 4  # 4 frames accumulated
VLM_CALL_EVERY_N = 4  # call VLM every 4 captures (= every 10s)
SCREENSHOT_JPEG_QUALITY = 50
SCREENSHOT_MAX_WIDTH = 1280

384
argus/executor.py Normal file
View File

@@ -0,0 +1,384 @@
"""Agentic executor — uses Gemini 2.5 Pro with tool use to complete actions.
When the user approves a proposed action, the executor gets:
- The recent screenshots (so it can read source content)
- Tool access: read_file, write_file, run_command
- A loop: Gemini proposes tool calls → we execute → feed results back → repeat
This is a full agent loop, not a single-shot LLM call.
Swift portability notes:
- The agent loop is the same HTTP pattern (Gemini function calling API)
- Tool implementations map to FileManager, NSTask, NSPasteboard
- The loop structure is identical in Swift async/await
"""
from __future__ import annotations
import asyncio
import base64
import logging
import os
import subprocess
import httpx
from argus.buffer import HistoryBuffer
from argus.config import GEMINI_API_KEY
log = logging.getLogger(__name__)
# Executor model/endpoint — configurable independently of the analyzer VLM.
EXECUTOR_MODEL = os.environ.get("EXECUTOR_MODEL", "gemini-2.5-pro")
EXECUTOR_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{EXECUTOR_MODEL}:generateContent"
)
# Hard cap on agent-loop iterations so a confused model cannot loop forever.
MAX_AGENT_STEPS = 10
# ── Tool definitions (Gemini function calling format) ────────────────────
# A single functionDeclarations group; each entry uses the JSON-schema-style
# parameter spec that the Gemini generateContent API expects.
TOOLS = [
    {
        "functionDeclarations": [
            {
                "name": "read_file",
                "description": "Read the contents of a file. Use this to inspect source files, code, configs, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Absolute or relative file path to read",
                        }
                    },
                    "required": ["path"],
                },
            },
            {
                "name": "output",
                "description": "Display content to the user as a sticky note. Use for: extracted text from PDFs/images, form fill suggestions, content the user needs to paste into binary formats (docx, ppt, websites). The user will copy/paste from this.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "title": {
                            "type": "string",
                            "description": "Short title (e.g. 'Extracted receipt', 'Form values')",
                        },
                        "content": {
                            "type": "string",
                            "description": "The full content to display",
                        },
                    },
                    "required": ["title", "content"],
                },
            },
            {
                "name": "write_file",
                "description": "Write content to an EXISTING plain text file (code, markdown, config, txt). Only use when: (1) you have confirmed the file path via read_file or mdfind, AND (2) the file is a plain text format you can write correctly. NEVER use for binary formats (docx, ppt, pdf, images).",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Absolute file path (must have been confirmed to exist first)",
                        },
                        "content": {
                            "type": "string",
                            "description": "Complete file content to write",
                        },
                    },
                    "required": ["path", "content"],
                },
            },
            {
                "name": "run_command",
                "description": "Run a shell command and return its output. Use for compiling, testing, listing files, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "command": {
                            "type": "string",
                            "description": "The shell command to execute",
                        }
                    },
                    "required": ["command"],
                },
            },
            {
                "name": "done",
                "description": "Signal that the task is complete. Call this AFTER using output() to present results. Summary should describe what was shown to the user, not file operations.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "summary": {
                            "type": "string",
                            "description": "Brief summary of what was done",
                        }
                    },
                    "required": ["summary"],
                },
            },
        ]
    }
]
# ── Tool implementations ─────────────────────────────────────────────────
def _exec_read_file(path: str) -> str:
resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path)
try:
with open(resolved, "r") as f:
content = f.read()
log.info(" read_file: %s (%d bytes)", resolved, len(content))
return content
except OSError as e:
return f"Error reading {resolved}: {e}"
def _exec_write_file(path: str, content: str) -> str:
resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path)
# Safety: only write to existing text files
if not os.path.exists(resolved):
return f"Error: {resolved} does not exist. Use output() instead for new content."
try:
with open(resolved, "r"):
pass # confirm it's readable as text
except (OSError, UnicodeDecodeError):
return f"Error: {resolved} is not a readable text file. Use output() instead."
try:
with open(resolved, "w") as f:
f.write(content)
log.info(" write_file: %s (%d bytes)", resolved, len(content))
return f"Successfully wrote {len(content)} bytes to {resolved}"
except OSError as e:
return f"Error writing {resolved}: {e}"
def _exec_output(title: str, content: str) -> str:
    """Display content to the user via terminal.
    Swift portability: becomes a sticky note / floating card UI.

    Returns a short confirmation string fed back to the agent.
    """
    # NOTE(review): the box-drawing characters were lost in a text conversion
    # ('"" * max(...)' draws nothing); restored with ─ │ ┌ ┐ └ ┘ rules.
    print()
    print(f"┌── 📋 {title} " + "─" * max(0, 50 - len(title)) + "┐")
    for line in content.split("\n"):
        print(f"│ {line}")
    print("└" + "─" * 58 + "┘")
    print()
    log.info(" output: %s (%d chars)", title, len(content))
    return f"Displayed '{title}' to user ({len(content)} chars)"
def _exec_run_command(command: str) -> str:
    """Tool impl: run a shell command; return combined stdout/stderr, capped at 4000 chars."""
    log.info(" run_command: %s", command[:80])
    try:
        proc = subprocess.run(
            command, shell=True, capture_output=True, text=True, timeout=30
        )
    except subprocess.TimeoutExpired:
        return "Error: command timed out after 30s"
    pieces = [proc.stdout]
    if proc.stderr:
        pieces.append("\nSTDERR:\n" + proc.stderr)
    if proc.returncode != 0:
        # Surface non-zero exits so the agent can react to failures.
        pieces.append(f"\n(exit code {proc.returncode})")
    return "".join(pieces)[:4000]  # cap output length
def _execute_tool(name: str, args: dict) -> str:
if name == "read_file":
return _exec_read_file(args["path"])
elif name == "output":
return _exec_output(args["title"], args["content"])
elif name == "write_file":
return _exec_write_file(args["path"], args["content"])
elif name == "run_command":
return _exec_run_command(args["command"])
elif name == "done":
return args.get("summary", "Done.")
else:
return f"Unknown tool: {name}"
# ── Agent loop ───────────────────────────────────────────────────────────
async def execute(
    vlm_payload: dict,
    action_index: int = 0,
    *,
    history: HistoryBuffer | None = None,
    current_screenshot: bytes | None = None,
    api_key: str | None = None,
) -> str | None:
    """Run the agentic executor loop.
    The agent can read files, write files, and run commands to complete
    the user's approved action. It loops until it calls done() or hits
    the step limit.
    Returns a summary of what was done, or None on failure.

    Args:
        vlm_payload: VLM analysis dict; actions live under
            payload["friction"]["proposed_actions"].
        action_index: Index of the proposed action the user approved.
        history: Optional screenshot history attached as images.
        current_screenshot: Optional JPEG bytes of the latest frame.
        api_key: Gemini key override (falls back to GEMINI_API_KEY).
    """
    friction = vlm_payload.get("friction", {})
    actions = friction.get("proposed_actions", [])
    if action_index >= len(actions):
        log.warning("Action index %d out of range", action_index)
        return None
    chosen = actions[action_index]
    key = api_key or GEMINI_API_KEY
    if not key:
        log.warning("No API key for executor")
        return None
    log.info("Agent executing: %s", chosen.get("label", "?")[:80])
    # Build initial message with screenshots + task context
    initial_parts = _build_initial_parts(vlm_payload, chosen, history, current_screenshot)
    # Conversation history for the agent loop
    contents = [{"role": "user", "parts": initial_parts}]
    for step in range(MAX_AGENT_STEPS):
        log.debug("Agent step %d/%d", step + 1, MAX_AGENT_STEPS)
        payload = {
            "contents": contents,
            "tools": TOOLS,
            "generationConfig": {"temperature": 0.2, "maxOutputTokens": 8192},
        }
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                # Retry up to 3 times on 429 rate limits with exponential backoff.
                for attempt in range(3):
                    resp = await client.post(f"{EXECUTOR_URL}?key={key}", json=payload)
                    if resp.status_code == 429:
                        wait = 2 ** attempt
                        log.warning("Executor 429, retrying in %ds...", wait)
                        await asyncio.sleep(wait)
                        continue
                    resp.raise_for_status()
                    break
                else:
                    # All retries were rate-limited — surface the last 429 as an error.
                    resp.raise_for_status()
        except Exception:
            log.exception("Agent API call failed at step %d", step + 1)
            return None
        body = resp.json()
        candidate = body["candidates"][0]
        response_parts = candidate["content"]["parts"]
        # Add assistant response to conversation
        contents.append({"role": "model", "parts": response_parts})
        # Check for function calls
        function_calls = [p for p in response_parts if "functionCall" in p]
        if not function_calls:
            # No tool calls — agent returned text, we're done
            text = "".join(p.get("text", "") for p in response_parts)
            log.info("Agent finished with text response (step %d)", step + 1)
            return text.strip() if text.strip() else "Done."
        # Execute each tool call
        tool_results = []
        done_summary = None
        for fc_part in function_calls:
            fc = fc_part["functionCall"]
            name = fc["name"]
            args = fc.get("args", {})
            print(f" 🔧 {name}({_summarize_args(args)})")
            result = _execute_tool(name, args)
            # Echo each result back to the model in functionResponse format.
            tool_results.append({
                "functionResponse": {
                    "name": name,
                    "response": {"result": result},
                }
            })
            if name == "done":
                done_summary = result
        # Add tool results to conversation
        contents.append({"role": "user", "parts": tool_results})
        if done_summary:
            log.info("Agent called done() at step %d: %s", step + 1, done_summary[:80])
            return done_summary
    log.warning("Agent hit step limit (%d)", MAX_AGENT_STEPS)
    return "Agent reached maximum steps without completing."
def _build_initial_parts(
vlm_payload: dict,
action: dict,
history: HistoryBuffer | None,
current_screenshot: bytes | None,
) -> list[dict]:
"""Build the initial message parts: screenshots + task prompt."""
parts: list[dict] = []
# Include screenshots so agent can read source content
if history:
entries = history.get_entries()
for i, entry in enumerate(entries):
b64 = base64.b64encode(entry.jpeg).decode()
parts.append({"text": f"[Screenshot {i + 1}/{len(entries)}]"})
parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
if current_screenshot:
b64 = base64.b64encode(current_screenshot).decode()
parts.append({"text": "[Current screenshot]"})
parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
friction = vlm_payload.get("friction", {})
prompt = f"""\
The user approved this action. Complete it using the tools available to you.
ACTION: {action.get('label', '')}
DETAILS: {action.get('details', '')}
Context:
User's task: {vlm_payload.get('inferred_task', '')}
Problem: {friction.get('description', '')}
Current state: {vlm_payload.get('checkpoint_note_update', '')}
Application: {vlm_payload.get('app_name', '')}
Source: {friction.get('source_context', '')}
Target: {friction.get('target_context', '')}
INSTRUCTIONS:
1. For BINARY files (PDFs, images, etc.): use your VISION. Read content directly
from the screenshots — this is your most reliable source for non-text files.
2. For TEXT files (code, markdown, configs, txt): use read_file to get exact content.
3. If you need a file but only know the filename (not the path), FIND IT FIRST:
- run_command("mdfind -name 'filename'") — fast macOS Spotlight search
- run_command("lsof -c AppName | grep filename") — find what file an app has open
Do NOT guess paths. Search first.
4. Choose the right output method:
- Binary format targets (docx, ppt, website forms, PDFs): use output() — user will copy/paste.
- Existing plain text files (code, markdown, config): use write_file() — modify directly.
- write_file() only works on files that ALREADY EXIST. Confirm the path with read_file first.
5. Use run_command to compile, test, or search for files. Never to write files.
6. Do NOT hallucinate content. If you can't read something, say so.
7. Call done() with a summary when the action is complete.
Working directory: {os.getcwd()}"""
parts.append({"text": prompt})
return parts
def _summarize_args(args: dict) -> str:
"""Short summary of tool args for terminal display."""
parts = []
for k, v in args.items():
sv = str(v)
if len(sv) > 50:
sv = sv[:47] + "..."
parts.append(f"{k}={sv}")
return ", ".join(parts)

415
argus/loop.py Normal file
View File

@@ -0,0 +1,415 @@
"""Main orchestrator loop — capture → analyze → notify → execute.
Three concurrent tasks:
1. VLM loop: capture screenshots, analyze, push results
2. Notification handler: show cards when friction detected, wait for response
3. Input reader: read user's terminal input (1=accept, 0=dismiss)
Swift portability notes:
- The VLM loop becomes a Timer or DispatchSourceTimer
- The notification handler becomes SwiftUI state updates
- The input reader becomes button tap handlers
- The three tasks map to Swift's structured concurrency (async let / TaskGroup)
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
from argus.backend import send_analysis
from argus.buffer import HistoryBuffer
from argus.capture import capture_screenshot
from argus.config import BUFFER_MAX_LEN, CAPTURE_INTERVAL_S, VLM_CALL_EVERY_N
from argus.executor import execute
from argus.notification import NotificationManager
from argus.session import SessionManager
from argus.vlm import TaskContext, VLMResult, analyze_screenshot
log = logging.getLogger(__name__)
# Track session actions already shown to avoid repeating
_handled_session_actions: set[str] = set()
def _session_action_key(sa) -> str:
return f"{sa.type}:{sa.session_id}"
def _show_session_card(title: str, body: str, options: list[str]) -> None:
"""Display a session-related card (not friction). Swift: native notification."""
print()
print("" + "" * 58 + "")
print(f"{title:<56}")
print("" + " " * 58 + "")
for line in body.split("\n"):
print(f"{line:<56}")
print("" + " " * 58 + "")
for i, opt in enumerate(options):
print(f"│ [{i + 1}] {opt:<53}")
print(f"│ [0] Not now{' ' * 44}")
print("" + "" * 58 + "")
print()
async def _wait_for_input() -> int:
    """Wait for a single integer input from stdin. Returns the number or 0.

    The blocking readline runs in the default thread-pool executor so the
    event loop stays responsive while waiting.

    Fix: use asyncio.get_running_loop() — get_event_loop() is deprecated
    inside coroutines since Python 3.10 and we are guaranteed to be running.
    """
    loop = asyncio.get_running_loop()
    line = await loop.run_in_executor(None, sys.stdin.readline)
    try:
        return int(line.strip())
    except (ValueError, EOFError):
        # Blank line, EOF, or non-numeric input all mean "dismiss".
        return 0
async def _handle_session_action(sa, result, sessions: SessionManager, notifier: NotificationManager) -> None:
    """Handle VLM session_action output — show card and act on approval.
    Swift portability: becomes SwiftUI state changes + button handlers.

    Args:
        sa: Session action from the VLM (reads .type, .session_id, .reason).
        result: Full VLM result (reads .inferred_task for "start_new").
        sessions: SessionManager whose local state is mutated on approval.
        notifier: NotificationManager (accepted but not used in this handler).
    """
    # Skip if we already showed this exact action
    key = _session_action_key(sa)
    if key in _handled_session_actions:
        return
    _handled_session_actions.add(key)
    if sa.type == "resume":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        checkpoint = session.checkpoint_note if session else ""
        task_title = session.task_title if session else sa.reason
        _show_session_card(
            f"📂 Resume: {task_title}",
            f"You left off: {checkpoint}" if checkpoint else sa.reason,
            ["Resume session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                # Real backend: fetch and print the generated resume card.
                resume_card = await sessions.get_resume_card(sa.session_id)
                if resume_card:
                    rc = resume_card.get("resume_card", {})
                    print(f" 💡 {rc.get('welcome_back', '')}")
                    print(f" {rc.get('you_were_doing', '')}")
                    print(f" {rc.get('next_step', '')}")
                    print(f" {rc.get('motivation', '')}")
            else:
                print(f" ✓ Resumed \"{task_title}\"")
                if checkpoint:
                    print(f" Last checkpoint: {checkpoint}")
            if session:
                # Mark active locally so subsequent iterations see the attachment.
                session.status = "active"
                sessions._active_session = session
        else:
            print(" dismissed.")
    elif sa.type == "start_new":
        task = result.inferred_task
        _show_session_card(
            "🆕 New focus session",
            f"You're working on: {task}",
            ["Start focus session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                resp = await sessions.start_session(task)
                if resp:
                    print(f" ✓ Session started: {resp.get('id', '?')}")
            else:
                print(f" ✓ (mock) Session started for \"{task}\"")
        else:
            print(" dismissed.")
    elif sa.type == "complete":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "session"
        _show_session_card(
            f"✅ Complete: {task_title}",
            sa.reason,
            ["Complete session"],
        )
        choice = await _wait_for_input()
        if choice == 1 and sa.session_id:
            if not sessions._mock:
                ok = await sessions.end_session(sa.session_id)
                if ok:
                    print(f" ✓ Session completed")
            else:
                print(f" ✓ (mock) Session \"{task_title}\" completed")
            # Drop the completed session from local state in both modes.
            sessions._sessions = [s for s in sessions._sessions if s.session_id != sa.session_id]
            if sessions._active_session and sessions._active_session.session_id == sa.session_id:
                sessions._active_session = None
        else:
            print(" dismissed.")
    elif sa.type == "switch":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "another task"
        _show_session_card(
            f"🔄 Switch to: {task_title}",
            sa.reason,
            [f"Switch to \"{task_title}\""],
        )
        choice = await _wait_for_input()
        if choice == 1:
            # NOTE(review): switch only prints confirmation — no state change here;
            # presumably the VLM loop re-attaches on a later iteration. Confirm.
            print(f" ✓ Switched to \"{task_title}\"")
        else:
            print(" dismissed.")
def _is_valid_session_id(session_id: str | None, sessions: SessionManager) -> bool:
"""Check if a session_id from VLM output actually exists in our sessions."""
if not session_id:
return False
return any(s.session_id == session_id for s in sessions.sessions)
async def run_loop(
    ctx: TaskContext,
    *,
    api_key: str | None = None,
    vlm_backend: str | None = None,
    jwt: str | None = None,
    base_url: str | None = None,
    dry_run: bool = False,
    max_iterations: int | None = None,
    on_result: None | (callable) = None,
    auto_execute: bool = False,
    mock_sessions: list | None = None,
) -> None:
    """Run the Argus VLM loop with notification and execution support.

    Args:
        ctx: Task/session context.
        api_key: Gemini API key override.
        vlm_backend: "ollama" or "gemini".
        jwt: Backend JWT override.
        base_url: Backend base URL override.
        dry_run: If True, skip sending to backend.
        max_iterations: Stop after N iterations (None = forever).
        on_result: Optional callback(VLMResult) per analysis.
        auto_execute: If True, enable notification + executor flow.
        mock_sessions: Optional pre-built session list injected instead of
            fetching from the backend (also disables periodic refresh).
    """
    history = HistoryBuffer(image_maxlen=BUFFER_MAX_LEN)
    notifier = NotificationManager()
    sessions = SessionManager(jwt=jwt, base_url=base_url)
    iteration = 0  # counts VLM calls, not raw screen captures
    log.info(
        "Argus loop starting — interval=%ds, session=%s, task=%s, executor=%s",
        CAPTURE_INTERVAL_S,
        ctx.session_id,
        ctx.task_title,
        "on" if auto_execute else "off",
    )
    # Load sessions — mock or from backend
    use_mock = mock_sessions is not None
    if use_mock:
        sessions._sessions = mock_sessions
        sessions._active_session = next((s for s in mock_sessions if s.status == "active"), None)
        sessions._mock = True  # prevent refresh from overwriting
        log.info("Loaded %d mock sessions", len(mock_sessions))
    else:
        sessions._mock = False
        try:
            await sessions.fetch_open_sessions()
        except Exception:
            log.exception("Startup: failed to fetch sessions — continuing without session context")
    # Log attachment status so it's visible in the console
    if sessions.active:
        s = sessions.active
        log.info(
            "ATTACHED to session %s — task=%r last_app=%s checkpoint=%r",
            s.session_id[:8], s.task_title, s.last_app or "(unknown)", s.checkpoint_note or "(none)",
        )
    else:
        log.info("No active session found — running in monitoring-only mode (dry-run=%s)", dry_run)
    # Pending frames collected between VLM calls
    pending_frames: list[bytes] = []
    capture_count = 0
    try:
        while max_iterations is None or iteration < max_iterations:
            t0 = time.monotonic()
            try:
                # 0. Refresh sessions periodically
                await sessions.maybe_refresh()
                # 1. Capture screenshot every interval
                screenshot = capture_screenshot()
                pending_frames.append(screenshot)
                capture_count += 1
                log.debug("Captured frame %d (%d pending)", capture_count, len(pending_frames))
                # 2. Only call VLM every N captures
                if len(pending_frames) < VLM_CALL_EVERY_N:
                    # Not enough frames yet — sleep out the rest of the interval
                    # and go capture another one.
                    elapsed = time.monotonic() - t0
                    sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
                    if sleep_for > 0:
                        await asyncio.sleep(sleep_for)
                    continue
                # ── VLM call with all pending frames ──
                iteration += 1
                # Push all pending frames into buffer (buffer keeps last BUFFER_MAX_LEN)
                for frame in pending_frames:
                    history.push(frame, "")  # summaries filled after VLM call
                pending_frames.clear()
                t_vlm = time.monotonic()
                result: VLMResult = await analyze_screenshot(
                    screenshot, ctx, history,
                    api_key=api_key,
                    backend=vlm_backend,
                    session_context=sessions.format_for_prompt(),
                )
                t_vlm_done = time.monotonic()
                payload = result.to_backend_payload(ctx.session_id)
                log.info(
                    "[%d] vlm=%.2fs frames=%d friction=%s summary=%s",
                    iteration,
                    t_vlm_done - t_vlm,
                    VLM_CALL_EVERY_N,
                    result.friction.type,
                    result.vlm_summary,
                )
                # Update last entry's summary now that we have it
                history.set_last_output(payload)
                # 4. Print / send to backend
                if dry_run:
                    print(json.dumps(payload, indent=2))
                else:
                    resp = await send_analysis(
                        result, ctx.session_id, jwt=jwt, base_url=base_url
                    )
                    log.debug("Backend response: %s", resp)
                if auto_execute:
                    sa = result.session_action
                    # Validate session_action — check that VLM output relates to the session.
                    # Match against filename, file stem (no extension), or task title.
                    if sa.type in ("resume", "switch") and sa.session_id:
                        session = next(
                            (s for s in sessions.sessions if s.session_id == sa.session_id), None
                        )
                        if session:
                            context = (result.vlm_summary + " " + result.inferred_task + " " + sa.reason).lower()
                            matches = []
                            if session.last_file:
                                matches.append(session.last_file.lower())
                                # Also check stem without extension (e.g. "receipt" from "receipt.pdf")
                                stem = session.last_file.rsplit(".", 1)[0].lower()
                                if stem:
                                    matches.append(stem)
                            if session.task_title:
                                # Check key words from task title
                                for word in session.task_title.lower().split():
                                    if len(word) > 3:  # skip short words
                                        matches.append(word)
                            if matches and not any(m in context for m in matches):
                                log.debug(
                                    "Suppressing session_action=%s — none of %s found in context",
                                    sa.type, matches,
                                )
                                sa.type = "none"
                    if not _is_valid_session_id(sa.session_id, sessions) and sa.type in ("resume", "switch"):
                        sa.type = "none"
                    # 5. Session actions take priority — but only if not already handled
                    session_handled = False
                    if sa.type != "none":
                        key = _session_action_key(sa)
                        # NOTE(review): key is only looked up here, never added to
                        # _handled_session_actions in this function — presumably
                        # _handle_session_action records it; confirm.
                        if key not in _handled_session_actions:
                            await _handle_session_action(sa, result, sessions, notifier)
                            session_handled = True
                    # 6. Friction notification + executor
                    if not session_handled and notifier.should_notify(payload):
                        card = notifier.create_card(payload)
                        notifier.show_card_terminal(card)
                        choice = await _wait_for_input()
                        action_idx = choice - 1
                        if choice > 0:
                            print(f"\n⚡ Executing action {action_idx + 1}...")
                            summary = await execute(
                                payload, action_idx,
                                history=history,
                                current_screenshot=screenshot,
                                api_key=api_key,
                            )
                            # Emit a parseable JSON block so Swift can surface the result
                            # in the HUD and clear the executing spinner.
                            print(json.dumps({"exec_summary": summary or ""}, indent=2), flush=True)
                            if summary:
                                history.set_last_execution(summary)
                        else:
                            print("dismissed.")
                    # 7. Nudge — VLM decided a nudge is appropriate (only if nothing else fired)
                    elif not session_handled and result.gentle_nudge:
                        print(f"\n💛 {result.gentle_nudge}")
                    # 8. Fallback: suggest new session if VLM didn't
                    elif not session_handled and (
                        sa.type == "none"
                        and result.on_task
                        and not sessions.active
                        and sessions.should_suggest_new_session(result.inferred_task)
                    ):
                        task = result.inferred_task
                        print(f"\n🆕 You've been working on \"{task}\" — start a focus session?")
                        print(f" [1] Start [0] Not now")
                        card = notifier.create_card({
                            "friction": {
                                "type": "none",
                                "confidence": 1.0,
                                "description": f"New task detected: {task}",
                                "proposed_actions": [{"label": "Start focus session", "action_type": "other", "details": task}],
                            }
                        })
                        notifier.show_card_terminal(card)
                        accepted, _ = await notifier.wait_for_response(timeout=60.0)
                        if accepted:
                            resp = await sessions.start_session(task)
                            if resp:
                                print(f" ✓ Session started: {resp.get('id', '?')}")
                    # Clear execution context after 3 iterations
                    if history.get_last_execution() and iteration % 3 == 0:
                        history.clear_last_execution()
                # 9. Callback
                if on_result:
                    on_result(result)
            except Exception:
                # Never let one bad iteration kill the loop.
                log.exception("Error in Argus loop iteration %d", iteration)
            # Sleep for remainder of capture interval
            elapsed = time.monotonic() - t0
            sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
            if sleep_for > 0:
                await asyncio.sleep(sleep_for)
    finally:
        pass

172
argus/notification.py Normal file
View File

@@ -0,0 +1,172 @@
"""Notification manager — decides when to show action cards to the user.
Only pushes a new notification when the proposed action meaningfully changes.
In production (Swift), this becomes a native macOS notification / floating card.
Here it's a terminal prompt for testing.
Swift portability notes:
- NotificationManager becomes a class with @Published properties
- show_card() triggers a SwiftUI overlay or UNUserNotificationCenter
- user_response() is wired to button taps instead of stdin
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass, field
log = logging.getLogger(__name__)
@dataclass
class ActionCard:
    """A proposed action shown to the user."""

    friction_type: str  # friction category from the VLM payload (e.g. "stalled")
    description: str    # human-readable description of the friction
    actions: list[dict] # proposed_actions dicts (label / action_type / details)
    source: str         # friction source_context (filename or app name)
    target: str         # friction target_context (filename or app name)
    vlm_payload: dict  # full VLM result for the executor
class NotificationManager:
    """Deduplicates notifications and manages the pending action card.

    Only shows a new card when the friction type or proposed action labels
    change from the previous notification.
    """

    # Action types that the executor can actually perform
    ACTIONABLE_TYPES = {"auto_extract", "brain_dump", "auto_fill", "summarize"}

    def __init__(self):
        self._last_fingerprint: str = ""  # dedup key of the last card shown
        self._pending_card: ActionCard | None = None
        self._response_event: asyncio.Event = asyncio.Event()
        self._user_accepted: bool = False
        # Fix: previously only assigned inside submit_response(), leaving the
        # attribute undefined until the first response; initialize it here so
        # any read path gets a well-defined "no selection" value.
        self._selected_index: int = -1

    def _fingerprint(self, vlm_payload: dict) -> str:
        """Generate a dedup key from friction type + action labels."""
        friction = vlm_payload.get("friction", {})
        ftype = friction.get("type", "none")
        labels = tuple(
            a.get("label", "") for a in friction.get("proposed_actions", [])
        )
        return f"{ftype}:{labels}"

    def should_notify(self, vlm_payload: dict) -> bool:
        """Check if this VLM result warrants a friction card (executor action).

        Only fires when proposed_actions contain something the executor can
        act on. Generic suggestions (action_type: "other") are nudges, not
        executor actions.
        """
        friction = vlm_payload.get("friction", {})
        # No friction or no proposed actions → no notification
        if friction.get("type") == "none":
            return False
        actions = friction.get("proposed_actions", [])
        if not actions:
            return False
        # Only notify if at least one action is executor-actionable
        has_actionable = any(
            a.get("action_type") in self.ACTIONABLE_TYPES for a in actions
        )
        if not has_actionable:
            return False
        # Same as last notification → skip
        fp = self._fingerprint(vlm_payload)
        if fp == self._last_fingerprint:
            return False
        return True

    def create_card(self, vlm_payload: dict) -> ActionCard:
        """Create an action card from VLM output and mark it as pending."""
        friction = vlm_payload.get("friction", {})
        card = ActionCard(
            friction_type=friction.get("type", ""),
            description=friction.get("description", ""),
            actions=friction.get("proposed_actions", []),
            source=friction.get("source_context", ""),
            target=friction.get("target_context", ""),
            vlm_payload=vlm_payload,
        )
        self._pending_card = card
        self._last_fingerprint = self._fingerprint(vlm_payload)
        # Reset response state so a stale answer can't leak into the new card.
        self._response_event.clear()
        self._user_accepted = False
        return card

    def show_card_terminal(self, card: ActionCard) -> None:
        """Print the action card to terminal. (Swift: show native UI.)"""
        print()
        print("" + "" * 58 + "")
        print(f"{'🔧 Friction detected':^58}")
        print("" + " " * 58 + "")
        desc_lines = _wrap(card.description, 56)
        for line in desc_lines:
            print(f"{line:<56}")
        print("" + " " * 58 + "")
        for i, action in enumerate(card.actions):
            label = action.get("label", "?")
            print(f"│ [{i + 1}] {label:<53}")
        print(f"│ [0] Not now{' ' * 44}")
        print("" + "" * 58 + "")
        print()

    async def wait_for_response(self, timeout: float = 30.0) -> tuple[bool, int]:
        """Wait for user response. Returns (accepted, action_index).

        On timeout the pending card is dropped and (False, -1) is returned.
        Swift portability: this becomes a button tap callback.
        """
        try:
            await asyncio.wait_for(self._response_event.wait(), timeout=timeout)
            return self._user_accepted, self._selected_index
        except asyncio.TimeoutError:
            log.debug("Notification timed out, dismissing")
            self._pending_card = None
            return False, -1

    def submit_response(self, choice: int) -> None:
        """Submit user's choice. Called from input reader task.

        choice > 0 selects action (choice - 1); 0 (or no pending card)
        dismisses. Swift portability: called from button tap handler.
        """
        if choice > 0 and self._pending_card:
            self._user_accepted = True
            self._selected_index = choice - 1
        else:
            self._user_accepted = False
            self._selected_index = -1
        self._response_event.set()

    @property
    def pending(self) -> ActionCard | None:
        """The card currently awaiting a response, if any."""
        return self._pending_card

    def dismiss(self) -> None:
        """Dismiss current card without action."""
        self._pending_card = None
        # Clearing the fingerprint lets the same card fire again later.
        self._last_fingerprint = ""
def _wrap(text: str, width: int) -> list[str]:
"""Simple word wrap."""
words = text.split()
lines: list[str] = []
current = ""
for word in words:
if len(current) + len(word) + 1 <= width:
current = f"{current} {word}" if current else word
else:
if current:
lines.append(current)
current = word
if current:
lines.append(current)
return lines or [""]

3
argus/requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
httpx>=0.27
Pillow>=10.0
python-dotenv>=1.0

293
argus/session.py Normal file
View File

@@ -0,0 +1,293 @@
"""Session manager — fetches and caches session state from the backend.
Provides session context to the VLM prompt and handles session lifecycle
actions (start, resume, switch, complete).
Swift portability notes:
- SessionManager becomes an ObservableObject with @Published properties
- Backend calls use URLSession instead of httpx
- match_session() logic is identical
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass, field
import httpx
from argus.config import BACKEND_BASE_URL, BACKEND_JWT
log = logging.getLogger(__name__)
@dataclass
class SessionInfo:
    """Snapshot of one backend focus session, as cached by SessionManager."""

    session_id: str       # backend session UUID (stringified)
    task_id: str | None   # linked task UUID, if any
    task_title: str       # from checkpoint["goal"] (see _parse_session)
    task_goal: str        # currently mirrors task_title
    status: str  # active | interrupted
    last_app: str         # checkpoint["active_app"]
    last_file: str        # checkpoint["active_file"]
    checkpoint_note: str  # last action / VLM summary from the checkpoint
    started_at: str       # ISO timestamp from backend
    ended_at: str | None  # ISO timestamp, None while still running
    minutes_ago: int | None = None  # minutes since ended_at, when computable
class SessionManager:
    """Caches open sessions and provides matching + lifecycle operations."""

    def __init__(self, jwt: str | None = None, base_url: str | None = None):
        # Credentials/endpoint fall back to module-level config values.
        self._jwt = jwt or BACKEND_JWT
        self._base_url = base_url or BACKEND_BASE_URL
        self._sessions: list[SessionInfo] = []
        self._active_session: SessionInfo | None = None
        self._last_fetch: float = 0
        self._fetch_interval = 30.0  # re-fetch every 30s
        self._mock: bool = False  # when True, refresh is a no-op (injected data)
        # Track inferred_task stability for new session suggestion
        self._inferred_task_history: list[str] = []
        self._stable_threshold = 3  # consecutive matching inferred_tasks

    @property
    def active(self) -> SessionInfo | None:
        """The currently attached session, or None."""
        return self._active_session

    @property
    def sessions(self) -> list[SessionInfo]:
        """All cached open sessions."""
        return self._sessions

    def has_sessions(self) -> bool:
        """True when at least one open session is cached."""
        return len(self._sessions) > 0

    # ── Backend communication ────────────────────────────────────────
    async def fetch_open_sessions(self) -> list[SessionInfo]:
        """Fetch the active session from backend (GET /sessions/active → single object or 404).
        Called on startup and periodically.

        On HTTP/network errors the previous cache is kept rather than cleared.
        """
        url = f"{self._base_url}/sessions/active"
        headers = {}
        if self._jwt:
            headers["Authorization"] = f"Bearer {self._jwt}"
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.get(url, headers=headers)
                if resp.status_code == 404:
                    # No active session — clear cache
                    self._sessions = []
                    self._active_session = None
                    self._last_fetch = time.monotonic()
                    log.info("No active session on backend")
                    return []
                resp.raise_for_status()
                data = resp.json()
                session = self._parse_session(data)
                self._sessions = [session]
                self._active_session = session
                self._last_fetch = time.monotonic()
                log.info("Fetched active session: %s (%s)", session.session_id, session.task_title)
                return self._sessions
        except httpx.HTTPStatusError as e:
            # Non-404 HTTP failure: keep serving the stale cache.
            log.warning("Failed to fetch sessions: %s", e.response.status_code)
            return self._sessions
        except httpx.RequestError as e:
            log.debug("Backend unreachable for sessions: %s", e)
            return self._sessions

    async def maybe_refresh(self) -> None:
        """Re-fetch if stale. Skips if using mock data."""
        if self._mock:
            return
        if time.monotonic() - self._last_fetch > self._fetch_interval:
            await self.fetch_open_sessions()

    async def start_session(self, task_title: str, task_goal: str = "") -> dict | None:
        """Start a new focus session. Returns session data or None.

        NOTE(review): task_title/task_goal are not sent in the request body —
        only {"platform": "mac"} is posted; confirm the backend derives the
        task itself or extend the payload.
        """
        url = f"{self._base_url}/sessions/start"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        # If we have a task_title but no task_id, the backend should
        # create an ad-hoc session (or we create the task first).
        payload = {"platform": "mac"}
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json=payload, headers=headers)
                resp.raise_for_status()
                result = resp.json()
                await self.fetch_open_sessions()  # refresh
                log.info("Started session: %s", result.get("id"))
                return result
        except Exception:
            log.exception("Failed to start session")
            return None

    async def end_session(self, session_id: str, status: str = "completed") -> bool:
        """End a session. Returns True on success."""
        url = f"{self._base_url}/sessions/{session_id}/end"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json={"status": status}, headers=headers)
                resp.raise_for_status()
                await self.fetch_open_sessions()  # refresh
                log.info("Ended session %s (%s)", session_id, status)
                return True
        except Exception:
            log.exception("Failed to end session %s", session_id)
            return False

    async def get_resume_card(self, session_id: str) -> dict | None:
        """Fetch AI-generated resume card for a session."""
        url = f"{self._base_url}/sessions/{session_id}/resume"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        try:
            async with httpx.AsyncClient(timeout=15.0) as client:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                return resp.json()
        except Exception:
            log.exception("Failed to get resume card for %s", session_id)
            return None

    # ── Session matching ─────────────────────────────────────────────
    def match_session(self, inferred_task: str, app_name: str) -> SessionInfo | None:
        """Find an open session that matches the current screen state.
        Returns the best match or None.

        Scoring: +2 per overlapping keyword (stop-words removed), +3 for an
        app match, +5 for a file match; a minimum total of 4 is required.
        """
        if not inferred_task or not self._sessions:
            return None
        inferred_lower = inferred_task.lower()
        best_match: SessionInfo | None = None
        best_score = 0
        for session in self._sessions:
            score = 0
            title_lower = session.task_title.lower()
            # Check for keyword overlap between inferred_task and session task_title
            inferred_words = set(inferred_lower.split())
            title_words = set(title_lower.split())
            overlap = inferred_words & title_words
            # Filter out common words
            overlap -= {"the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"}
            score += len(overlap) * 2
            # App match
            if app_name and session.last_app and app_name.lower() in session.last_app.lower():
                score += 3
            # File match — check if session's last_file appears in inferred_task
            if session.last_file and session.last_file.lower() in inferred_lower:
                score += 5
            if score > best_score:
                best_score = score
                best_match = session
        # Require minimum score to avoid false matches
        if best_score >= 4:
            return best_match
        return None

    def should_suggest_new_session(self, inferred_task: str) -> bool:
        """Check if we should suggest starting a new session.
        Returns True if inferred_task has been stable for N iterations
        and doesn't match any existing session.
        """
        if not inferred_task:
            # Empty observation breaks the stability streak.
            self._inferred_task_history.clear()
            return False
        self._inferred_task_history.append(inferred_task)
        # Keep only recent history
        if len(self._inferred_task_history) > self._stable_threshold + 2:
            self._inferred_task_history = self._inferred_task_history[-self._stable_threshold - 2:]
        if len(self._inferred_task_history) < self._stable_threshold:
            return False
        # Check if the last N inferred tasks are "similar" (share key words)
        recent = self._inferred_task_history[-self._stable_threshold:]
        first_words = set(recent[0].lower().split()) - {"the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"}
        all_similar = all(
            len(first_words & set(t.lower().split())) >= len(first_words) * 0.5
            for t in recent[1:]
        )
        if not all_similar:
            return False
        # Make sure it doesn't match an existing session
        matched = self.match_session(inferred_task, "")
        return matched is None

    # ── Prompt formatting ────────────────────────────────────────────
    def format_for_prompt(self) -> str:
        """Format open sessions for injection into the VLM prompt.
        Includes session_id so VLM can reference the exact ID in session_action.
        """
        if not self._sessions:
            return "(no open sessions)"
        lines: list[str] = []
        for s in self._sessions:
            status_tag = f"[{s.status}]"
            # NOTE: minutes_ago == 0 renders the same as None (truthiness check).
            ago = f" (paused {s.minutes_ago}m ago)" if s.minutes_ago else ""
            line = f" session_id=\"{s.session_id}\" {status_tag} \"{s.task_title}\" — last in {s.last_app}"
            if s.last_file:
                line += f"/{s.last_file}"
            if s.checkpoint_note:
                line += f", \"{s.checkpoint_note}\""
            line += ago
            lines.append(line)
        return "\n".join(lines)

    # ── Internal ─────────────────────────────────────────────────────
    def _parse_session(self, data: dict) -> SessionInfo:
        """Build a SessionInfo from a raw backend session object."""
        checkpoint = data.get("checkpoint", {}) or {}
        ended = data.get("ended_at")
        minutes_ago = None
        if ended:
            import datetime
            try:
                ended_dt = datetime.datetime.fromisoformat(ended.replace("Z", "+00:00"))
                now = datetime.datetime.now(datetime.timezone.utc)
                minutes_ago = int((now - ended_dt).total_seconds() / 60)
            except (ValueError, TypeError):
                # Unparseable timestamp — leave minutes_ago as None.
                pass
        # SessionOut has no nested "task" object — task title is stored in checkpoint["goal"]
        # by POST /sessions/start when a task_id is provided.
        task_title = checkpoint.get("goal", "")
        checkpoint_note = checkpoint.get("last_action_summary", checkpoint.get("last_vlm_summary", ""))
        return SessionInfo(
            session_id=str(data.get("id", "")),
            task_id=data.get("task_id"),
            task_title=task_title,
            task_goal=task_title,
            status=data.get("status", ""),
            last_app=checkpoint.get("active_app", ""),
            last_file=checkpoint.get("active_file", ""),
            checkpoint_note=checkpoint_note,
            started_at=data.get("started_at", ""),
            ended_at=ended,
            minutes_ago=minutes_ago,
        )

442
argus/vlm.py Normal file
View File

@@ -0,0 +1,442 @@
"""VLM client — supports Ollama (local, default) and Gemini (cloud fallback).
Sends the current screenshot plus text summaries of recent analyses.
Parses the structured JSON response.
"""
from __future__ import annotations
import asyncio
import base64
import json
import logging
import re
from dataclasses import dataclass, field
import httpx
from argus.buffer import HistoryBuffer
from argus.config import (
GEMINI_API_KEY,
GEMINI_URL,
OLLAMA_BASE_URL,
OLLAMA_MODEL,
VLM_BACKEND,
)
log = logging.getLogger(__name__)
# ── Task context passed in from the session ──────────────────────────────
@dataclass
class StepInfo:
    """One step of the user's task plan, rendered into the VLM prompt."""

    id: str          # step UUID (the VLM echoes it back as current_step_id)
    sort_order: int  # ordering index used when rendering the step list
    title: str       # short step description
    status: str  # pending | in_progress | done | skipped
    checkpoint_note: str | None = None  # last recorded progress on this step
@dataclass
class TaskContext:
    """Context about the user's current task/session, fed into the VLM prompt."""

    task_title: str = ""   # explicit task name; empty → VLM infers the task
    task_goal: str = ""    # free-text goal description
    steps: list[StepInfo] = field(default_factory=list)  # plan steps, may be empty
    window_title: str = ""  # frontmost window title reported by the OS
    session_id: str = ""   # backend session UUID used in payloads
# ── VLM response schema ─────────────────────────────────────────────────
@dataclass
class FrictionInfo:
    """Friction block of a VLM response (matches the prompt's JSON schema)."""

    type: str = "none"  # repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none
    confidence: float = 0.0  # VLM's 0..1 confidence in the friction call
    description: str | None = None  # what the user is struggling with
    proposed_actions: list[dict] = field(default_factory=list)  # label/action_type/details dicts
    source_context: str | None = None  # source filename or app name
    target_context: str | None = None  # target filename or app name
@dataclass
class SessionAction:
    """Session lifecycle suggestion emitted by the VLM."""

    type: str = "none"  # resume | switch | complete | start_new | none
    session_id: str | None = None  # exact backend session UUID, or None
    reason: str = ""  # VLM's justification for the suggested action
@dataclass
class VLMResult:
    """Typed view of one full VLM analysis response."""

    on_task: bool = True
    current_step_id: str | None = None
    inferred_task: str = ""
    checkpoint_note_update: str | None = None
    steps_completed: list[str] = field(default_factory=list)
    friction: FrictionInfo = field(default_factory=FrictionInfo)
    session_action: SessionAction = field(default_factory=SessionAction)
    intent: str | None = None
    distraction_type: str | None = None
    app_name: str = ""
    confidence: float = 0.0
    gentle_nudge: str | None = None
    vlm_summary: str = ""

    def to_backend_payload(self, session_id: str) -> dict:
        """Serialize to the JSON shape expected by POST /distractions/analyze-result."""
        # Assemble the nested objects first, then the envelope.
        friction_dict = {
            "type": self.friction.type,
            "confidence": self.friction.confidence,
            "description": self.friction.description,
            "proposed_actions": self.friction.proposed_actions,
            "source_context": self.friction.source_context,
            "target_context": self.friction.target_context,
        }
        action_dict = {
            "type": self.session_action.type,
            "session_id": self.session_action.session_id,
            "reason": self.session_action.reason,
        }
        return {
            "session_id": session_id,
            "on_task": self.on_task,
            "current_step_id": self.current_step_id,
            "inferred_task": self.inferred_task,
            "checkpoint_note_update": self.checkpoint_note_update,
            "steps_completed": self.steps_completed,
            "friction": friction_dict,
            "session_action": action_dict,
            "intent": self.intent,
            "distraction_type": self.distraction_type,
            "app_name": self.app_name,
            "confidence": self.confidence,
            "gentle_nudge": self.gentle_nudge,
            "vlm_summary": self.vlm_summary,
        }
# ── Prompt construction ──────────────────────────────────────────────────
def _format_steps(steps: list[StepInfo]) -> str:
lines: list[str] = []
for s in steps:
marker = {"pending": "", "in_progress": "", "done": "", "skipped": ""}.get(
s.status, "?"
)
line = f" {marker} [{s.status}] (id={s.id}) {s.sort_order}. {s.title}"
if s.checkpoint_note:
line += f" — checkpoint: {s.checkpoint_note}"
lines.append(line)
return "\n".join(lines) if lines else " (no steps)"
def build_system_prompt(ctx: TaskContext, history: HistoryBuffer, session_context: str = "") -> str:
    """Assemble the full system prompt for one VLM analysis call.

    Combines the task context, rendered step list, screenshot-history text,
    the previous analysis (if any), the last executed action (if any), and
    the backend session list into a single instruction string that ends
    with the required JSON response schema.
    """
    history_text = history.format_for_prompt()
    steps_text = _format_steps(ctx.steps)
    # Feed the previous analysis back so the model refines rather than restarts.
    prev_output = history.format_last_output()
    prev_section = ""
    if prev_output:
        prev_section = f"""
Your previous analysis (refine or correct this based on new evidence):
{prev_output}
If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it with new observations.
"""
    # If the executor just acted, tell the model not to re-flag the same friction.
    execution = history.get_last_execution()
    exec_section = ""
    if execution:
        exec_section = f"""
IMPORTANT — An AI agent just completed an action for the user:
{execution}
This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
"""
    return f"""\
You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
## How to read the screenshots
You receive screenshots in chronological order (oldest first, newest last).
You receive ~5 frames spanning ~20 seconds (one frame every 4 seconds). This means:
- 2 unchanged frames = 8+ seconds idle. That's significant.
- 3+ unchanged frames = 12-20 seconds idle. The user is stuck or distracted.
- If ALL frames are identical, the user has been idle for 20 seconds — definitely flag it.
- If the user wrote code and then 2+ frames show no changes, they are STUCK NOW.
Do NOT wait for many frames to flag problems. React fast.
Your PRIMARY signal is the DIFFERENCES between consecutive frames.
Where the screen CHANGED = where the user's ATTENTION is.
Where the screen is STATIC = background noise. Ignore it unless the user interacts with it.
Diff signals and what they mean:
- New text appearing / cursor advancing → user is actively typing (THIS is their task)
- Window or tab switch → context change, could be reference lookup or distraction
- Same content, no pixel changes → stalled, idle, or reading
- Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
- Scroll position change → reading or browsing
- Error message that APPEARED between frames → user just triggered it, relevant
- Error message that was ALREADY THERE in all frames → stale, ignore it
## Task inference
Infer the user's current task from what they are ACTIVELY DOING across the frames.
Do NOT assume static content (old terminal output, background panels, stale errors)
is the task. The region of the screen where pixels are changing IS the task.
CRITICAL — looking at something ≠ working on something:
- User switches to Preview/browser/another app and just LOOKS → this is NOT a new task.
  It could be a distraction, a quick reference, or idle browsing.
- User switches to another app AND starts TYPING/EDITING → this might be a new task.
- If the user has an active session and switches away WITHOUT typing in the new app,
  they are DISTRACTED from their session, not starting a new task.
- Only infer a new task when there is clear evidence of productive work (typing, editing,
  cursor movement between frames) in the new context.
- A single app switch is NEVER enough to infer a new task. Wait for active work.
If an explicit task is provided below, use it. Otherwise, infer from the screenshots.
Task: {ctx.task_title}
Goal: {ctx.task_goal}
Steps:
{steps_text}
Window title reported by OS: {ctx.window_title}
## Open sessions from backend (use EXACT session_id values below)
{session_context if session_context else "(no open sessions — suggest start_new if user is working on something)"}
Session matching rules — be STRICT:
- A session matches ONLY if the user is actively editing the session's last_file.
  Being in the same app (e.g. VS Code) is NOT enough. The user must be typing/editing
  in the specific file listed in the session (e.g. solution.cpp).
- Chatting in a sidebar, reading logs, or browsing in the same app ≠ working on the session.
- If the user is in VS Code but editing a DIFFERENT file or chatting in a panel,
  they are NOT on-task for the session. That's a distraction.
If the session's file IS being actively edited, output session_action: resume with the EXACT session_id.
If the user moved to a different open session's file, output session_action: switch with the EXACT session_id.
IMPORTANT: If you propose a friction action (e.g. auto_extract) that relates to an existing
session's task, ALSO output session_action: resume with that session's ID. The friction action
and session resume should go together — don't propose work related to a session without
linking it to the session.
If the user finished and closed the session's file, output session_action: complete with the EXACT session_id.
If no sessions exist but the user is actively working, output session_action: start_new (session_id: null).
NEVER invent session IDs. Use only the IDs listed above or null.
{prev_section}{exec_section}
Screenshot timeline:
{history_text}
## What to analyze
1. INFERRED TASK: What is the user actually working on right now? Base this on where
   the screen is changing, not on static content.
2. CHECKPOINT: What specific progress did the user make across these frames?
   Describe what changed (e.g., "typed 3 new lines of C++ code", "scrolled to next section").
3. FRICTION DETECTION: Is the user stuck in any of these patterns?
   - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
   - STALLED: No meaningful pixel changes across 2+ frames, OR user wrote code then
     deleted/undid it (write-then-delete = struggle, NOT "refining")
   - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
   - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
   - TASK_RESUMPTION: User just returned to a task from earlier (check text history)
   IMPORTANT signals to catch IMMEDIATELY (do NOT wait many frames):
   - User wrote code/text then deleted it → STUCK, not refining. Flag stalled.
   - User idle for 2+ frames after deletion → definitely stuck. Flag stalled.
   - User switching between source doc and target file repeatedly → TEDIOUS_MANUAL.
     This is NOT "fluent workflow." If the user is copying data from one place to
     another by switching windows, flag it on the SECOND switch. Don't wait.
   - Code is incomplete/wrong and user stopped typing → need help.
4. INTENT: If viewing informational content, is the user skimming, engaged, or unclear?
5. PROPOSED ACTION: If friction detected, suggest what the AI could DO.
   Be SPECIFIC: "Extract full text from writeup.pdf into transcript.md" not "Summarize text".
   The label should be a concrete verb phrase the user can approve with one tap.
   CRITICAL: The "details" field is the executor agent's instruction manual. Write it as
   a natural language spec — the executor has vision too, so tell it where to look and
   what to do, not the raw data:
   Bad: "Extract data from the document"
   Bad: "Help with the code"
   Good: "User is writing a report in report.md and has a PDF open with source data. They are manually copying table values from the PDF into markdown. Extract the table from the PDF (visible in screenshots), format as a markdown table matching the style in report.md, and append to the file."
   Good: "User is implementing quicksort in solution.cpp but has been idle after writing the function signature. They appear stuck on the partition logic. Provide a working C++ quicksort implementation that fits their existing code structure."
Respond ONLY with JSON:
{{
  "on_task": true,
  "current_step_id": "step UUID or null",
  "inferred_task": "what the user is actually working on, based on screen diffs",
  "checkpoint_note_update": "what changed across these frames specifically",
  "steps_completed": ["UUIDs"],
  "friction": {{
    "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
    "confidence": 0.0-1.0,
    "description": "what the user is struggling with, based on diff evidence",
    "proposed_actions": [
      {{"label": "specific verb phrase: what to do, from where, to where", "action_type": "auto_extract | brain_dump | auto_fill | summarize | other", "details": "Natural language spec for executor. Include: (1) what the user wants done, (2) where to look in the screenshots, (3) EXACT format to use — quote what the user already wrote so the executor matches it (e.g. if user wrote '3 tacos with steak' in plain text, say 'format as plain text lines like the user already started, NOT JSON'), (4) target file. The executor has vision to read screenshots — tell it WHERE to look, not the raw data."}}
    ],
    "source_context": "just the filename if visible (e.g. writeup.pdf), or app name if no file",
    "target_context": "just the filename if visible (e.g. transcript.md), or app name if no file"
  }},
  "session_action": {{
    "type": "resume | switch | complete | start_new | none",
    "session_id": "uuid of matching session, or null for start_new/none",
    "reason": "why this session action is suggested"
  }},
  "intent": "skimming | engaged | unclear | null",
  "distraction_type": "app_switch | browsing | idle | null",
  "app_name": "primary visible application",
  "confidence": 0.0-1.0,
  "gentle_nudge": "nudge text if distracted, null otherwise",
  "vlm_summary": "1-sentence description of what CHANGED across the frames (not what's static)"
}}"""
# ── Gemini API call ──────────────────────────────────────────────────────
def _extract_json(text: str) -> dict:
"""Extract JSON from Gemini response, handling markdown code fences."""
text = text.strip()
# Strip ```json ... ``` wrappers
m = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
if m:
text = m.group(1).strip()
return json.loads(text)
def _parse_vlm_response(raw: dict) -> VLMResult:
    """Parse the raw JSON dict into a VLMResult dataclass.

    Every field is read defensively with .get() defaults, so a partially
    malformed model response still yields a usable result object.
    """
    fr = raw.get("friction", {})
    sa = raw.get("session_action", {})
    return VLMResult(
        on_task=raw.get("on_task", True),
        current_step_id=raw.get("current_step_id"),
        inferred_task=raw.get("inferred_task", ""),
        checkpoint_note_update=raw.get("checkpoint_note_update"),
        steps_completed=raw.get("steps_completed", []),
        friction=FrictionInfo(
            type=fr.get("type", "none"),
            confidence=fr.get("confidence", 0.0),
            description=fr.get("description"),
            proposed_actions=fr.get("proposed_actions", []),
            source_context=fr.get("source_context"),
            target_context=fr.get("target_context"),
        ),
        session_action=SessionAction(
            type=sa.get("type", "none"),
            session_id=sa.get("session_id"),
            reason=sa.get("reason", ""),
        ),
        intent=raw.get("intent"),
        distraction_type=raw.get("distraction_type"),
        app_name=raw.get("app_name", ""),
        confidence=raw.get("confidence", 0.0),
        gentle_nudge=raw.get("gentle_nudge"),
        vlm_summary=raw.get("vlm_summary", ""),
    )
async def _call_ollama(system_prompt: str, b64_images: list[str]) -> str:
    """Call Ollama local VLM with multiple images and return raw text.

    Images are attached to a single user message; streaming is disabled so
    the full reply arrives in one response body.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": "/no_think\nAnalyze this screenshot sequence now.",
            "images": b64_images,
        },
    ]
    request_body = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        "keep_alive": -1,  # keep the model resident between calls
        "options": {"temperature": 0.2},
    }
    # Local inference can be slow with many images — allow a generous timeout.
    async with httpx.AsyncClient(timeout=300.0) as client:
        response = await client.post(f"{OLLAMA_BASE_URL}/api/chat", json=request_body)
        response.raise_for_status()
        return response.json()["message"]["content"]
async def _call_gemini(system_prompt: str, b64_images: list[str], api_key: str) -> str:
    """Call Gemini Vision API with multiple images and return raw text.

    Each image is preceded by a "[Screenshot i/N]" label (oldest first,
    newest last) so the model can reference individual frames. Retries up
    to 3 times on HTTP 429 with exponential backoff (1s, 2s, 4s); any
    other non-2xx status raises httpx.HTTPStatusError.
    """
    # Build request parts: interleave labels with images, newest last.
    parts: list[dict] = []
    total = len(b64_images)
    for i, b64 in enumerate(b64_images):
        parts.append({"text": f"[Screenshot {i + 1}/{total}]"})
        parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    parts.append({"text": "Analyze this screenshot sequence now."})
    payload = {
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "contents": [{"parts": parts}],
        "generationConfig": {
            "temperature": 0.2,
            "maxOutputTokens": 4096,
        },
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(3):
            resp = await client.post(f"{GEMINI_URL}?key={api_key}", json=payload)
            if resp.status_code == 429:
                wait = 2 ** attempt
                log.warning("Gemini 429 rate limited, retrying in %ds...", wait)
                await asyncio.sleep(wait)
                continue
            resp.raise_for_status()
            break
        else:
            # All 3 attempts hit 429 — surface the final error to the caller.
            resp.raise_for_status()
    body = resp.json()
    # Bug fix: Gemini may split its reply across multiple text parts; the old
    # code assigned (not appended) inside the loop and kept only the LAST
    # part. Concatenate every text part instead. Also use a distinct name so
    # the request-side `parts` list is not shadowed.
    reply_parts = body["candidates"][0]["content"]["parts"]
    return "".join(part["text"] for part in reply_parts if "text" in part)
async def analyze_screenshot(
    screenshot_jpeg: bytes,
    ctx: TaskContext,
    history: HistoryBuffer,
    *,
    api_key: str | None = None,
    backend: str | None = None,
    session_context: str = "",
) -> VLMResult:
    """Analyze a screenshot sequence via Ollama or Gemini.
    Sends all buffered screenshots + the current one as images (oldest first).

    Raises RuntimeError when the Gemini backend is selected without an API
    key, and ValueError for an unrecognized backend name.
    """
    chosen = backend or VLM_BACKEND
    prompt = build_system_prompt(ctx, history, session_context=session_context)
    # Base64-encode frames oldest-first: history buffer, then the newest frame.
    frames = [base64.b64encode(entry.jpeg).decode() for entry in history.get_entries()]
    frames.append(base64.b64encode(screenshot_jpeg).decode())
    log.debug("Sending %d screenshots to %s", len(frames), chosen)
    if chosen == "ollama":
        raw_text = await _call_ollama(prompt, frames)
    elif chosen == "gemini":
        key = api_key or GEMINI_API_KEY
        if not key:
            raise RuntimeError("GEMINI_API_KEY not set")
        raw_text = await _call_gemini(prompt, frames, key)
    else:
        raise ValueError(f"Unknown VLM backend: {chosen}")
    log.debug("VLM raw response: %s", raw_text)
    return _parse_vlm_response(_extract_json(raw_text))