include argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

0
argus/__init__.py Normal file
View File

113
argus/__main__.py Normal file
View File

@@ -0,0 +1,113 @@
"""CLI entry point: python -m argus [options]"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import sys
from argus.config import CAPTURE_INTERVAL_S
from argus.loop import run_loop
from argus.vlm import StepInfo, TaskContext
def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
prog="argus",
description="Argus VLM — proactive focus assistant screen analyzer",
)
p.add_argument("--session-id", default="00000000-0000-0000-0000-000000000000")
p.add_argument("--task-title", default="(no task)")
p.add_argument("--task-goal", default="")
p.add_argument(
"--steps-json",
default=None,
help='JSON array of steps: [{"id":"...", "sort_order":1, "title":"...", "status":"pending"}]',
)
p.add_argument("--window-title", default="")
p.add_argument("--vlm", choices=["ollama", "gemini"], default=None, help="VLM backend (default: ollama)")
p.add_argument("--gemini-key", default=None, help="Override GEMINI_API_KEY env var")
p.add_argument("--jwt", default=None, help="Override BACKEND_JWT env var")
p.add_argument("--backend-url", default=None, help="Override BACKEND_BASE_URL env var")
p.add_argument("--dry-run", action="store_true", help="Print JSON instead of POSTing")
p.add_argument("--execute", action="store_true", help="Enable notification + executor flow")
p.add_argument("--mock-sessions", default=None, help='JSON array of mock sessions: [{"task_title":"...", "status":"interrupted", "last_app":"VS Code", "last_file":"solution.cpp", "checkpoint_note":"stuck on impl"}]')
p.add_argument("--iterations", type=int, default=None, help="Stop after N iterations")
p.add_argument("-v", "--verbose", action="store_true")
return p.parse_args()
def main() -> None:
    """CLI entry point: configure logging, build the task context, run the loop."""
    args = _parse_args()
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(levelname)-5s %(name)s %(message)s",
        datefmt="%H:%M:%S",
    )
    # httpx logs every request/response at INFO — suppress to WARNING to avoid noisy 404s
    for noisy_logger in ("httpx", "httpcore"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
    # Optional ordered step list supplied as a JSON array on the command line.
    steps: list[StepInfo] = []
    if args.steps_json:
        steps = [
            StepInfo(
                id=raw["id"],
                sort_order=raw["sort_order"],
                title=raw["title"],
                status=raw.get("status", "pending"),
                checkpoint_note=raw.get("checkpoint_note"),
            )
            for raw in json.loads(args.steps_json)
        ]
    ctx = TaskContext(
        task_title=args.task_title,
        task_goal=args.task_goal,
        steps=steps,
        window_title=args.window_title,
        session_id=args.session_id,
    )
    # Parse mock sessions if provided (for demos/tests without a backend).
    mock_sessions = None
    if args.mock_sessions:
        from argus.session import SessionInfo
        mock_sessions = [
            SessionInfo(
                session_id=raw.get("session_id", f"mock-{idx}"),
                task_id=raw.get("task_id"),
                task_title=raw.get("task_title", ""),
                task_goal=raw.get("task_goal", ""),
                status=raw.get("status", "interrupted"),
                last_app=raw.get("last_app", ""),
                last_file=raw.get("last_file", ""),
                checkpoint_note=raw.get("checkpoint_note", ""),
                started_at=raw.get("started_at", ""),
                ended_at=raw.get("ended_at"),
                minutes_ago=raw.get("minutes_ago", 30),
            )
            for idx, raw in enumerate(json.loads(args.mock_sessions))
        ]
    asyncio.run(
        run_loop(
            ctx,
            api_key=args.gemini_key,
            vlm_backend=args.vlm,
            jwt=args.jwt,
            base_url=args.backend_url,
            dry_run=args.dry_run,
            max_iterations=args.iterations,
            auto_execute=args.execute,
            mock_sessions=mock_sessions,
        )
    )


if __name__ == "__main__":
    main()

50
argus/backend.py Normal file
View File

@@ -0,0 +1,50 @@
"""Backend API client — sends VLM analysis results to the LockInBro server.
POST /distractions/analyze-result with the enriched JSON payload.
"""
from __future__ import annotations
import logging
import httpx
from argus.config import BACKEND_BASE_URL, BACKEND_JWT
from argus.vlm import VLMResult
log = logging.getLogger(__name__)
async def send_analysis(
    result: VLMResult,
    session_id: str,
    *,
    jwt: str | None = None,
    base_url: str | None = None,
) -> dict:
    """Post VLM analysis result to the backend.

    Args:
        result: The analysis to serialize via ``to_backend_payload``.
        session_id: Session the analysis belongs to.
        jwt: Optional bearer-token override (falls back to BACKEND_JWT).
        base_url: Optional base-URL override (falls back to BACKEND_BASE_URL).

    Returns the backend response body on success, or an error dict.
    """
    token = jwt or BACKEND_JWT
    url = f"{base_url or BACKEND_BASE_URL}/distractions/analyze-result"
    payload = result.to_backend_payload(session_id)
    # Only attach an Authorization header when we actually have a token.
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    log.info("Sending analysis to %s on_task=%s", url, result.on_task)
    log.debug("Payload: %s", payload)
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(url, json=payload, headers=headers)
            resp.raise_for_status()
            return resp.json()
    except httpx.HTTPStatusError as exc:
        # Server replied but with an error status — report code + body.
        log.error("Backend returned %s: %s", exc.response.status_code, exc.response.text)
        return {"error": str(exc), "status_code": exc.response.status_code}
    except httpx.RequestError as exc:
        # No response at all (DNS, connect, timeout, ...).
        log.error("Backend unreachable: %s", exc)
        return {"error": str(exc)}

123
argus/buffer.py Normal file
View File

@@ -0,0 +1,123 @@
"""Rolling history buffer for VLM screenshot analysis.
Two tiers:
- Image buffer: deque(maxlen=4) of recent screenshots sent as images
- Text history: deque(maxlen=12) of older VLM summaries + previous outputs
for extended context (what happened 30-60s ago) and self-refinement
"""
from __future__ import annotations
import time
from collections import deque
from dataclasses import dataclass, field
@dataclass
class BufferEntry:
jpeg: bytes
vlm_summary: str
timestamp: float = field(default_factory=time.time)
@dataclass
class TextEntry:
vlm_summary: str
timestamp: float
class HistoryBuffer:
def __init__(self, image_maxlen: int = 4, text_maxlen: int = 12):
self._images: deque[BufferEntry] = deque(maxlen=image_maxlen)
self._text_history: deque[TextEntry] = deque(maxlen=text_maxlen)
self._last_output: dict | None = None
self._last_execution: str | None = None
def push(self, jpeg: bytes, vlm_summary: str) -> None:
now = time.time()
# When an image entry gets evicted from the image buffer,
# it's already captured in text_history, so nothing extra needed.
self._images.append(BufferEntry(jpeg=jpeg, vlm_summary=vlm_summary, timestamp=now))
self._text_history.append(TextEntry(vlm_summary=vlm_summary, timestamp=now))
def set_last_output(self, output: dict) -> None:
"""Store the previous VLM JSON output for self-refinement."""
self._last_output = output
def set_last_execution(self, summary: str | None) -> None:
"""Store the result of the last executor action."""
self._last_execution = summary
def get_last_execution(self) -> str | None:
return self._last_execution
def clear_last_execution(self) -> None:
self._last_execution = None
def get_entries(self) -> list[BufferEntry]:
"""Return image entries oldest-first."""
return list(self._images)
def format_for_prompt(self) -> str:
"""Format the full timeline: text history + image labels."""
now = time.time()
lines: list[str] = []
# Text-only history (entries older than what's in the image buffer)
image_timestamps = {round(e.timestamp, 2) for e in self._images}
older = [
e for e in self._text_history
if round(e.timestamp, 2) not in image_timestamps
]
older = [e for e in older if e.vlm_summary] # skip empty summaries
if older:
lines.append("Older context (text only, no images):")
for entry in older:
ago = int(now - entry.timestamp)
lines.append(f" - [{ago}s ago] {entry.vlm_summary}")
lines.append("")
# Image timeline
n = len(self._images)
if n > 0:
lines.append(f"Recent screenshots ({n} prior + 1 current = {n + 1} images):")
for i, entry in enumerate(self._images):
ago = int(now - entry.timestamp)
lines.append(f" - Screenshot {i + 1}/{n + 1}: [{ago}s ago]")
lines.append(f" - Screenshot {n + 1}/{n + 1}: [now] (current)")
else:
lines.append("Screenshots:")
lines.append(" - Screenshot 1/1: [now] (current, first capture)")
return "\n".join(lines)
def format_last_output(self) -> str:
"""Format previous VLM output for self-refinement context."""
if not self._last_output:
return ""
import json
# Only include the key fields, not the full blob
prev = self._last_output
parts = [
f" on_task: {prev.get('on_task')}",
f" app: {prev.get('app_name')}",
f" friction: {prev.get('friction', {}).get('type')}",
f" summary: {prev.get('vlm_summary')}",
]
note = prev.get("checkpoint_note_update")
if note:
parts.append(f" checkpoint: {note}")
desc = prev.get("friction", {}).get("description")
if desc:
parts.append(f" friction_desc: {desc}")
return "\n".join(parts)
def __len__(self) -> int:
return len(self._images)
def clear(self) -> None:
self._images.clear()
self._text_history.clear()
self._last_output = None
self._last_execution = None

80
argus/capture.py Normal file
View File

@@ -0,0 +1,80 @@
"""Screenshot capture for macOS.
Reads a JPEG file written by the Swift host app (LockInBro.app), which holds
the Screen Recording TCC permission. The Swift app writes atomically to
/tmp/lockinbro_capture.jpg every ~5s using ScreenCaptureKit.
capture_screenshot() blocks up to 15 seconds waiting for a fresh file,
then raises RuntimeError if nothing arrives — so loop.py can handle it
gracefully without falling back to the screencapture CLI (which would fail
with a permissions error when called from a Python subprocess).
"""
from __future__ import annotations
import io
import os
import time
from PIL import Image
from argus.config import SCREENSHOT_JPEG_QUALITY, SCREENSHOT_MAX_WIDTH
_SWIFT_FRAME_PATH = "/tmp/lockinbro_capture.jpg"
_SWIFT_FRAME_MAX_AGE_S = 10.0 # treat as stale if older than this
_WAIT_TIMEOUT_S = 15.0 # how long to wait for Swift to write the first frame
_WAIT_POLL_S = 0.5 # polling interval while waiting
def _to_jpeg(img: Image.Image) -> bytes:
    """Convert a PIL Image to JPEG bytes, handling RGBA → RGB.

    Wide images are downscaled to SCREENSHOT_MAX_WIDTH, preserving the
    aspect ratio, before encoding at SCREENSHOT_JPEG_QUALITY.
    """
    width = img.width
    if width > SCREENSHOT_MAX_WIDTH:
        scale = SCREENSHOT_MAX_WIDTH / width
        target = (SCREENSHOT_MAX_WIDTH, int(img.height * scale))
        img = img.resize(target, Image.LANCZOS)
    if img.mode == "RGBA":
        # JPEG has no alpha channel — flatten first.
        img = img.convert("RGB")
    out = io.BytesIO()
    img.save(out, format="JPEG", quality=SCREENSHOT_JPEG_QUALITY)
    return out.getvalue()
def _read_swift_frame() -> bytes | None:
    """Return JPEG bytes from the Swift-provided file if it exists and is fresh.

    Best-effort by design: any failure (missing file, stale mtime, corrupt
    image data) returns None so the caller can simply keep polling.
    """
    try:
        mtime = os.path.getmtime(_SWIFT_FRAME_PATH)
        if time.time() - mtime >= _SWIFT_FRAME_MAX_AGE_S:
            return None  # frame is too old — Swift app may have stopped writing
        with open(_SWIFT_FRAME_PATH, "rb") as fh:
            raw = fh.read()
        img = Image.open(io.BytesIO(raw))
        img.load()  # force full decode now so a truncated file fails inside the try
        return _to_jpeg(img)
    except Exception:
        return None
def capture_screenshot() -> bytes:
    """Return JPEG bytes for the current screen.

    Polls the Swift app's frame file every _WAIT_POLL_S seconds for up to
    _WAIT_TIMEOUT_S seconds.

    Raises:
        RuntimeError: if no fresh frame arrives before the deadline.
    """
    deadline = time.time() + _WAIT_TIMEOUT_S
    while time.time() < deadline:
        if (frame := _read_swift_frame()) is not None:
            return frame
        time.sleep(_WAIT_POLL_S)
    raise RuntimeError(
        f"No screenshot received from LockInBro.app within {_WAIT_TIMEOUT_S}s. "
        "Make sure the app is running and has Screen Recording permission."
    )
def load_image_file(path: str) -> bytes:
    """Load an image from disk and normalize it to JPEG bytes (testing helper)."""
    img = Image.open(path)
    return _to_jpeg(img)

28
argus/config.py Normal file
View File

@@ -0,0 +1,28 @@
import os
from pathlib import Path
from dotenv import load_dotenv

# Load .env from the repository root (one level above this package).
# override=True means values in the file win over already-set shell variables.
load_dotenv(Path(__file__).resolve().parent.parent / ".env", override=True)

# Gemini (cloud VLM) credentials and endpoint.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-pro")
GEMINI_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent"
)
# Ollama (local VLM — default)
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen3.5:9b")
# VLM backend: "ollama" (default) or "gemini"
VLM_BACKEND = os.environ.get("VLM_BACKEND", "ollama")

# LockInBro backend API (analysis results are POSTed here).
BACKEND_BASE_URL = os.environ.get("BACKEND_BASE_URL", "https://wahwa.com/api/v1")
BACKEND_JWT = os.environ.get("BACKEND_JWT", "")

# Capture cadence and screenshot encoding knobs.
CAPTURE_INTERVAL_S = 2.5  # screenshot every 2.5s
BUFFER_MAX_LEN = 4  # 4 frames accumulated
VLM_CALL_EVERY_N = 4  # call VLM every 4 captures (= every 10s)
SCREENSHOT_JPEG_QUALITY = 50
SCREENSHOT_MAX_WIDTH = 1280

384
argus/executor.py Normal file
View File

@@ -0,0 +1,384 @@
"""Agentic executor — uses Gemini 2.5 Pro with tool use to complete actions.
When the user approves a proposed action, the executor gets:
- The recent screenshots (so it can read source content)
- Tool access: read_file, write_file, run_command
- A loop: Gemini proposes tool calls → we execute → feed results back → repeat
This is a full agent loop, not a single-shot LLM call.
Swift portability notes:
- The agent loop is the same HTTP pattern (Gemini function calling API)
- Tool implementations map to FileManager, NSTask, NSPasteboard
- The loop structure is identical in Swift async/await
"""
from __future__ import annotations
import asyncio
import base64
import logging
import os
import subprocess
import httpx
from argus.buffer import HistoryBuffer
from argus.config import GEMINI_API_KEY
log = logging.getLogger(__name__)
# Executor model/endpoint — configurable independently of the analyzer VLM.
EXECUTOR_MODEL = os.environ.get("EXECUTOR_MODEL", "gemini-2.5-pro")
EXECUTOR_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{EXECUTOR_MODEL}:generateContent"
)
# Hard cap on agent-loop iterations so a confused model cannot loop forever.
MAX_AGENT_STEPS = 10
# ── Tool definitions (Gemini function calling format) ────────────────────
# A single functionDeclarations group; each entry uses the JSON-schema-style
# parameter spec that the Gemini generateContent API expects.
TOOLS = [
    {
        "functionDeclarations": [
            {
                "name": "read_file",
                "description": "Read the contents of a file. Use this to inspect source files, code, configs, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Absolute or relative file path to read",
                        }
                    },
                    "required": ["path"],
                },
            },
            {
                "name": "output",
                "description": "Display content to the user as a sticky note. Use for: extracted text from PDFs/images, form fill suggestions, content the user needs to paste into binary formats (docx, ppt, websites). The user will copy/paste from this.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "title": {
                            "type": "string",
                            "description": "Short title (e.g. 'Extracted receipt', 'Form values')",
                        },
                        "content": {
                            "type": "string",
                            "description": "The full content to display",
                        },
                    },
                    "required": ["title", "content"],
                },
            },
            {
                "name": "write_file",
                "description": "Write content to an EXISTING plain text file (code, markdown, config, txt). Only use when: (1) you have confirmed the file path via read_file or mdfind, AND (2) the file is a plain text format you can write correctly. NEVER use for binary formats (docx, ppt, pdf, images).",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Absolute file path (must have been confirmed to exist first)",
                        },
                        "content": {
                            "type": "string",
                            "description": "Complete file content to write",
                        },
                    },
                    "required": ["path", "content"],
                },
            },
            {
                "name": "run_command",
                "description": "Run a shell command and return its output. Use for compiling, testing, listing files, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "command": {
                            "type": "string",
                            "description": "The shell command to execute",
                        }
                    },
                    "required": ["command"],
                },
            },
            {
                "name": "done",
                "description": "Signal that the task is complete. Call this AFTER using output() to present results. Summary should describe what was shown to the user, not file operations.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "summary": {
                            "type": "string",
                            "description": "Brief summary of what was done",
                        }
                    },
                    "required": ["summary"],
                },
            },
        ]
    }
]
# ── Tool implementations ─────────────────────────────────────────────────
def _exec_read_file(path: str) -> str:
resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path)
try:
with open(resolved, "r") as f:
content = f.read()
log.info(" read_file: %s (%d bytes)", resolved, len(content))
return content
except OSError as e:
return f"Error reading {resolved}: {e}"
def _exec_write_file(path: str, content: str) -> str:
resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path)
# Safety: only write to existing text files
if not os.path.exists(resolved):
return f"Error: {resolved} does not exist. Use output() instead for new content."
try:
with open(resolved, "r"):
pass # confirm it's readable as text
except (OSError, UnicodeDecodeError):
return f"Error: {resolved} is not a readable text file. Use output() instead."
try:
with open(resolved, "w") as f:
f.write(content)
log.info(" write_file: %s (%d bytes)", resolved, len(content))
return f"Successfully wrote {len(content)} bytes to {resolved}"
except OSError as e:
return f"Error writing {resolved}: {e}"
def _exec_output(title: str, content: str) -> str:
    """Display content to the user via terminal.
    Swift portability: becomes a sticky note / floating card UI.

    Returns a short confirmation string fed back to the agent.
    """
    # NOTE(review): the box-drawing characters were lost in a text conversion
    # ('"" * max(...)' draws nothing); restored with ─ │ ┌ ┐ └ ┘ rules.
    print()
    print(f"┌── 📋 {title} " + "─" * max(0, 50 - len(title)) + "┐")
    for line in content.split("\n"):
        print(f"│ {line}")
    print("└" + "─" * 58 + "┘")
    print()
    log.info(" output: %s (%d chars)", title, len(content))
    return f"Displayed '{title}' to user ({len(content)} chars)"
def _exec_run_command(command: str) -> str:
    """Tool impl: run a shell command; return combined stdout/stderr, capped at 4000 chars."""
    log.info(" run_command: %s", command[:80])
    try:
        proc = subprocess.run(
            command, shell=True, capture_output=True, text=True, timeout=30
        )
    except subprocess.TimeoutExpired:
        return "Error: command timed out after 30s"
    pieces = [proc.stdout]
    if proc.stderr:
        pieces.append("\nSTDERR:\n" + proc.stderr)
    if proc.returncode != 0:
        # Surface non-zero exits so the agent can react to failures.
        pieces.append(f"\n(exit code {proc.returncode})")
    return "".join(pieces)[:4000]  # cap output length
def _execute_tool(name: str, args: dict) -> str:
if name == "read_file":
return _exec_read_file(args["path"])
elif name == "output":
return _exec_output(args["title"], args["content"])
elif name == "write_file":
return _exec_write_file(args["path"], args["content"])
elif name == "run_command":
return _exec_run_command(args["command"])
elif name == "done":
return args.get("summary", "Done.")
else:
return f"Unknown tool: {name}"
# ── Agent loop ───────────────────────────────────────────────────────────
async def execute(
    vlm_payload: dict,
    action_index: int = 0,
    *,
    history: HistoryBuffer | None = None,
    current_screenshot: bytes | None = None,
    api_key: str | None = None,
) -> str | None:
    """Run the agentic executor loop.
    The agent can read files, write files, and run commands to complete
    the user's approved action. It loops until it calls done() or hits
    the step limit.
    Returns a summary of what was done, or None on failure.

    Args:
        vlm_payload: VLM analysis dict; actions live under
            payload["friction"]["proposed_actions"].
        action_index: Index of the proposed action the user approved.
        history: Optional screenshot history attached as images.
        current_screenshot: Optional JPEG bytes of the latest frame.
        api_key: Gemini key override (falls back to GEMINI_API_KEY).
    """
    friction = vlm_payload.get("friction", {})
    actions = friction.get("proposed_actions", [])
    if action_index >= len(actions):
        log.warning("Action index %d out of range", action_index)
        return None
    chosen = actions[action_index]
    key = api_key or GEMINI_API_KEY
    if not key:
        log.warning("No API key for executor")
        return None
    log.info("Agent executing: %s", chosen.get("label", "?")[:80])
    # Build initial message with screenshots + task context
    initial_parts = _build_initial_parts(vlm_payload, chosen, history, current_screenshot)
    # Conversation history for the agent loop
    contents = [{"role": "user", "parts": initial_parts}]
    for step in range(MAX_AGENT_STEPS):
        log.debug("Agent step %d/%d", step + 1, MAX_AGENT_STEPS)
        payload = {
            "contents": contents,
            "tools": TOOLS,
            "generationConfig": {"temperature": 0.2, "maxOutputTokens": 8192},
        }
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                # Retry up to 3 times on 429 rate limits with exponential backoff.
                for attempt in range(3):
                    resp = await client.post(f"{EXECUTOR_URL}?key={key}", json=payload)
                    if resp.status_code == 429:
                        wait = 2 ** attempt
                        log.warning("Executor 429, retrying in %ds...", wait)
                        await asyncio.sleep(wait)
                        continue
                    resp.raise_for_status()
                    break
                else:
                    # All retries were rate-limited — surface the last 429 as an error.
                    resp.raise_for_status()
        except Exception:
            log.exception("Agent API call failed at step %d", step + 1)
            return None
        body = resp.json()
        candidate = body["candidates"][0]
        response_parts = candidate["content"]["parts"]
        # Add assistant response to conversation
        contents.append({"role": "model", "parts": response_parts})
        # Check for function calls
        function_calls = [p for p in response_parts if "functionCall" in p]
        if not function_calls:
            # No tool calls — agent returned text, we're done
            text = "".join(p.get("text", "") for p in response_parts)
            log.info("Agent finished with text response (step %d)", step + 1)
            return text.strip() if text.strip() else "Done."
        # Execute each tool call
        tool_results = []
        done_summary = None
        for fc_part in function_calls:
            fc = fc_part["functionCall"]
            name = fc["name"]
            args = fc.get("args", {})
            print(f" 🔧 {name}({_summarize_args(args)})")
            result = _execute_tool(name, args)
            # Echo each result back to the model in functionResponse format.
            tool_results.append({
                "functionResponse": {
                    "name": name,
                    "response": {"result": result},
                }
            })
            if name == "done":
                done_summary = result
        # Add tool results to conversation
        contents.append({"role": "user", "parts": tool_results})
        if done_summary:
            log.info("Agent called done() at step %d: %s", step + 1, done_summary[:80])
            return done_summary
    log.warning("Agent hit step limit (%d)", MAX_AGENT_STEPS)
    return "Agent reached maximum steps without completing."
def _build_initial_parts(
vlm_payload: dict,
action: dict,
history: HistoryBuffer | None,
current_screenshot: bytes | None,
) -> list[dict]:
"""Build the initial message parts: screenshots + task prompt."""
parts: list[dict] = []
# Include screenshots so agent can read source content
if history:
entries = history.get_entries()
for i, entry in enumerate(entries):
b64 = base64.b64encode(entry.jpeg).decode()
parts.append({"text": f"[Screenshot {i + 1}/{len(entries)}]"})
parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
if current_screenshot:
b64 = base64.b64encode(current_screenshot).decode()
parts.append({"text": "[Current screenshot]"})
parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
friction = vlm_payload.get("friction", {})
prompt = f"""\
The user approved this action. Complete it using the tools available to you.
ACTION: {action.get('label', '')}
DETAILS: {action.get('details', '')}
Context:
User's task: {vlm_payload.get('inferred_task', '')}
Problem: {friction.get('description', '')}
Current state: {vlm_payload.get('checkpoint_note_update', '')}
Application: {vlm_payload.get('app_name', '')}
Source: {friction.get('source_context', '')}
Target: {friction.get('target_context', '')}
INSTRUCTIONS:
1. For BINARY files (PDFs, images, etc.): use your VISION. Read content directly
from the screenshots — this is your most reliable source for non-text files.
2. For TEXT files (code, markdown, configs, txt): use read_file to get exact content.
3. If you need a file but only know the filename (not the path), FIND IT FIRST:
- run_command("mdfind -name 'filename'") — fast macOS Spotlight search
- run_command("lsof -c AppName | grep filename") — find what file an app has open
Do NOT guess paths. Search first.
4. Choose the right output method:
- Binary format targets (docx, ppt, website forms, PDFs): use output() — user will copy/paste.
- Existing plain text files (code, markdown, config): use write_file() — modify directly.
- write_file() only works on files that ALREADY EXIST. Confirm the path with read_file first.
5. Use run_command to compile, test, or search for files. Never to write files.
6. Do NOT hallucinate content. If you can't read something, say so.
7. Call done() with a summary when the action is complete.
Working directory: {os.getcwd()}"""
parts.append({"text": prompt})
return parts
def _summarize_args(args: dict) -> str:
"""Short summary of tool args for terminal display."""
parts = []
for k, v in args.items():
sv = str(v)
if len(sv) > 50:
sv = sv[:47] + "..."
parts.append(f"{k}={sv}")
return ", ".join(parts)

415
argus/loop.py Normal file
View File

@@ -0,0 +1,415 @@
"""Main orchestrator loop — capture → analyze → notify → execute.
Three concurrent tasks:
1. VLM loop: capture screenshots, analyze, push results
2. Notification handler: show cards when friction detected, wait for response
3. Input reader: read user's terminal input (1=accept, 0=dismiss)
Swift portability notes:
- The VLM loop becomes a Timer or DispatchSourceTimer
- The notification handler becomes SwiftUI state updates
- The input reader becomes button tap handlers
- The three tasks map to Swift's structured concurrency (async let / TaskGroup)
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
from argus.backend import send_analysis
from argus.buffer import HistoryBuffer
from argus.capture import capture_screenshot
from argus.config import BUFFER_MAX_LEN, CAPTURE_INTERVAL_S, VLM_CALL_EVERY_N
from argus.executor import execute
from argus.notification import NotificationManager
from argus.session import SessionManager
from argus.vlm import TaskContext, VLMResult, analyze_screenshot
log = logging.getLogger(__name__)
# Track session actions already shown to avoid repeating
_handled_session_actions: set[str] = set()
def _session_action_key(sa) -> str:
return f"{sa.type}:{sa.session_id}"
def _show_session_card(title: str, body: str, options: list[str]) -> None:
"""Display a session-related card (not friction). Swift: native notification."""
print()
print("" + "" * 58 + "")
print(f"{title:<56}")
print("" + " " * 58 + "")
for line in body.split("\n"):
print(f"{line:<56}")
print("" + " " * 58 + "")
for i, opt in enumerate(options):
print(f"│ [{i + 1}] {opt:<53}")
print(f"│ [0] Not now{' ' * 44}")
print("" + "" * 58 + "")
print()
async def _wait_for_input() -> int:
    """Wait for a single integer input from stdin. Returns the number or 0.

    The blocking readline runs in the default thread-pool executor so the
    event loop stays responsive while waiting.

    Fix: use asyncio.get_running_loop() — get_event_loop() is deprecated
    inside coroutines since Python 3.10 and we are guaranteed to be running.
    """
    loop = asyncio.get_running_loop()
    line = await loop.run_in_executor(None, sys.stdin.readline)
    try:
        return int(line.strip())
    except (ValueError, EOFError):
        # Blank line, EOF, or non-numeric input all mean "dismiss".
        return 0
async def _handle_session_action(sa, result, sessions: SessionManager, notifier: NotificationManager) -> None:
    """Handle VLM session_action output — show card and act on approval.
    Swift portability: becomes SwiftUI state changes + button handlers.

    Args:
        sa: Session action from the VLM (reads .type, .session_id, .reason).
        result: Full VLM result (reads .inferred_task for "start_new").
        sessions: SessionManager whose local state is mutated on approval.
        notifier: NotificationManager (accepted but not used in this handler).
    """
    # Skip if we already showed this exact action
    key = _session_action_key(sa)
    if key in _handled_session_actions:
        return
    _handled_session_actions.add(key)
    if sa.type == "resume":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        checkpoint = session.checkpoint_note if session else ""
        task_title = session.task_title if session else sa.reason
        _show_session_card(
            f"📂 Resume: {task_title}",
            f"You left off: {checkpoint}" if checkpoint else sa.reason,
            ["Resume session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                # Real backend: fetch and print the generated resume card.
                resume_card = await sessions.get_resume_card(sa.session_id)
                if resume_card:
                    rc = resume_card.get("resume_card", {})
                    print(f" 💡 {rc.get('welcome_back', '')}")
                    print(f" {rc.get('you_were_doing', '')}")
                    print(f" {rc.get('next_step', '')}")
                    print(f" {rc.get('motivation', '')}")
            else:
                print(f" ✓ Resumed \"{task_title}\"")
                if checkpoint:
                    print(f" Last checkpoint: {checkpoint}")
            if session:
                # Mark active locally so subsequent iterations see the attachment.
                session.status = "active"
                sessions._active_session = session
        else:
            print(" dismissed.")
    elif sa.type == "start_new":
        task = result.inferred_task
        _show_session_card(
            "🆕 New focus session",
            f"You're working on: {task}",
            ["Start focus session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                resp = await sessions.start_session(task)
                if resp:
                    print(f" ✓ Session started: {resp.get('id', '?')}")
            else:
                print(f" ✓ (mock) Session started for \"{task}\"")
        else:
            print(" dismissed.")
    elif sa.type == "complete":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "session"
        _show_session_card(
            f"✅ Complete: {task_title}",
            sa.reason,
            ["Complete session"],
        )
        choice = await _wait_for_input()
        if choice == 1 and sa.session_id:
            if not sessions._mock:
                ok = await sessions.end_session(sa.session_id)
                if ok:
                    print(f" ✓ Session completed")
            else:
                print(f" ✓ (mock) Session \"{task_title}\" completed")
            # Drop the completed session from local state in both modes.
            sessions._sessions = [s for s in sessions._sessions if s.session_id != sa.session_id]
            if sessions._active_session and sessions._active_session.session_id == sa.session_id:
                sessions._active_session = None
        else:
            print(" dismissed.")
    elif sa.type == "switch":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "another task"
        _show_session_card(
            f"🔄 Switch to: {task_title}",
            sa.reason,
            [f"Switch to \"{task_title}\""],
        )
        choice = await _wait_for_input()
        if choice == 1:
            # NOTE(review): switch only prints confirmation — no state change here;
            # presumably the VLM loop re-attaches on a later iteration. Confirm.
            print(f" ✓ Switched to \"{task_title}\"")
        else:
            print(" dismissed.")
def _is_valid_session_id(session_id: str | None, sessions: SessionManager) -> bool:
"""Check if a session_id from VLM output actually exists in our sessions."""
if not session_id:
return False
return any(s.session_id == session_id for s in sessions.sessions)
async def run_loop(
    ctx: TaskContext,
    *,
    api_key: str | None = None,
    vlm_backend: str | None = None,
    jwt: str | None = None,
    base_url: str | None = None,
    dry_run: bool = False,
    max_iterations: int | None = None,
    on_result: None | (callable) = None,
    auto_execute: bool = False,
    mock_sessions: list | None = None,
) -> None:
    """Run the Argus VLM loop with notification and execution support.

    Args:
        ctx: Task/session context.
        api_key: Gemini API key override.
        vlm_backend: "ollama" or "gemini".
        jwt: Backend JWT override.
        base_url: Backend base URL override.
        dry_run: If True, skip sending to backend.
        max_iterations: Stop after N iterations (None = forever).
        on_result: Optional callback(VLMResult) per analysis.
        auto_execute: If True, enable notification + executor flow.
        mock_sessions: Optional pre-built session list injected instead of
            fetching from the backend (also disables periodic refresh).
    """
    history = HistoryBuffer(image_maxlen=BUFFER_MAX_LEN)
    notifier = NotificationManager()
    sessions = SessionManager(jwt=jwt, base_url=base_url)
    iteration = 0  # counts VLM calls, not raw screen captures
    log.info(
        "Argus loop starting — interval=%ds, session=%s, task=%s, executor=%s",
        CAPTURE_INTERVAL_S,
        ctx.session_id,
        ctx.task_title,
        "on" if auto_execute else "off",
    )
    # Load sessions — mock or from backend
    use_mock = mock_sessions is not None
    if use_mock:
        sessions._sessions = mock_sessions
        sessions._active_session = next((s for s in mock_sessions if s.status == "active"), None)
        sessions._mock = True  # prevent refresh from overwriting
        log.info("Loaded %d mock sessions", len(mock_sessions))
    else:
        sessions._mock = False
        try:
            await sessions.fetch_open_sessions()
        except Exception:
            log.exception("Startup: failed to fetch sessions — continuing without session context")
    # Log attachment status so it's visible in the console
    if sessions.active:
        s = sessions.active
        log.info(
            "ATTACHED to session %s — task=%r last_app=%s checkpoint=%r",
            s.session_id[:8], s.task_title, s.last_app or "(unknown)", s.checkpoint_note or "(none)",
        )
    else:
        log.info("No active session found — running in monitoring-only mode (dry-run=%s)", dry_run)
    # Pending frames collected between VLM calls
    pending_frames: list[bytes] = []
    capture_count = 0
    try:
        while max_iterations is None or iteration < max_iterations:
            t0 = time.monotonic()
            try:
                # 0. Refresh sessions periodically
                await sessions.maybe_refresh()
                # 1. Capture screenshot every interval
                screenshot = capture_screenshot()
                pending_frames.append(screenshot)
                capture_count += 1
                log.debug("Captured frame %d (%d pending)", capture_count, len(pending_frames))
                # 2. Only call VLM every N captures
                if len(pending_frames) < VLM_CALL_EVERY_N:
                    # Not enough frames yet — sleep out the rest of the interval
                    # and go capture another one.
                    elapsed = time.monotonic() - t0
                    sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
                    if sleep_for > 0:
                        await asyncio.sleep(sleep_for)
                    continue
                # ── VLM call with all pending frames ──
                iteration += 1
                # Push all pending frames into buffer (buffer keeps last BUFFER_MAX_LEN)
                for frame in pending_frames:
                    history.push(frame, "")  # summaries filled after VLM call
                pending_frames.clear()
                t_vlm = time.monotonic()
                result: VLMResult = await analyze_screenshot(
                    screenshot, ctx, history,
                    api_key=api_key,
                    backend=vlm_backend,
                    session_context=sessions.format_for_prompt(),
                )
                t_vlm_done = time.monotonic()
                payload = result.to_backend_payload(ctx.session_id)
                log.info(
                    "[%d] vlm=%.2fs frames=%d friction=%s summary=%s",
                    iteration,
                    t_vlm_done - t_vlm,
                    VLM_CALL_EVERY_N,
                    result.friction.type,
                    result.vlm_summary,
                )
                # Update last entry's summary now that we have it
                history.set_last_output(payload)
                # 4. Print / send to backend
                if dry_run:
                    print(json.dumps(payload, indent=2))
                else:
                    resp = await send_analysis(
                        result, ctx.session_id, jwt=jwt, base_url=base_url
                    )
                    log.debug("Backend response: %s", resp)
                if auto_execute:
                    sa = result.session_action
                    # Validate session_action — check that VLM output relates to the session.
                    # Match against filename, file stem (no extension), or task title.
                    if sa.type in ("resume", "switch") and sa.session_id:
                        session = next(
                            (s for s in sessions.sessions if s.session_id == sa.session_id), None
                        )
                        if session:
                            context = (result.vlm_summary + " " + result.inferred_task + " " + sa.reason).lower()
                            matches = []
                            if session.last_file:
                                matches.append(session.last_file.lower())
                                # Also check stem without extension (e.g. "receipt" from "receipt.pdf")
                                stem = session.last_file.rsplit(".", 1)[0].lower()
                                if stem:
                                    matches.append(stem)
                            if session.task_title:
                                # Check key words from task title
                                for word in session.task_title.lower().split():
                                    if len(word) > 3:  # skip short words
                                        matches.append(word)
                            if matches and not any(m in context for m in matches):
                                log.debug(
                                    "Suppressing session_action=%s — none of %s found in context",
                                    sa.type, matches,
                                )
                                sa.type = "none"
                    if not _is_valid_session_id(sa.session_id, sessions) and sa.type in ("resume", "switch"):
                        sa.type = "none"
                    # 5. Session actions take priority — but only if not already handled
                    session_handled = False
                    if sa.type != "none":
                        key = _session_action_key(sa)
                        # NOTE(review): key is only looked up here, never added to
                        # _handled_session_actions in this function — presumably
                        # _handle_session_action records it; confirm.
                        if key not in _handled_session_actions:
                            await _handle_session_action(sa, result, sessions, notifier)
                            session_handled = True
                    # 6. Friction notification + executor
                    if not session_handled and notifier.should_notify(payload):
                        card = notifier.create_card(payload)
                        notifier.show_card_terminal(card)
                        choice = await _wait_for_input()
                        action_idx = choice - 1
                        if choice > 0:
                            print(f"\n⚡ Executing action {action_idx + 1}...")
                            summary = await execute(
                                payload, action_idx,
                                history=history,
                                current_screenshot=screenshot,
                                api_key=api_key,
                            )
                            # Emit a parseable JSON block so Swift can surface the result
                            # in the HUD and clear the executing spinner.
                            print(json.dumps({"exec_summary": summary or ""}, indent=2), flush=True)
                            if summary:
                                history.set_last_execution(summary)
                        else:
                            print("dismissed.")
                    # 7. Nudge — VLM decided a nudge is appropriate (only if nothing else fired)
                    elif not session_handled and result.gentle_nudge:
                        print(f"\n💛 {result.gentle_nudge}")
                    # 8. Fallback: suggest new session if VLM didn't
                    elif not session_handled and (
                        sa.type == "none"
                        and result.on_task
                        and not sessions.active
                        and sessions.should_suggest_new_session(result.inferred_task)
                    ):
                        task = result.inferred_task
                        print(f"\n🆕 You've been working on \"{task}\" — start a focus session?")
                        print(f" [1] Start [0] Not now")
                        card = notifier.create_card({
                            "friction": {
                                "type": "none",
                                "confidence": 1.0,
                                "description": f"New task detected: {task}",
                                "proposed_actions": [{"label": "Start focus session", "action_type": "other", "details": task}],
                            }
                        })
                        notifier.show_card_terminal(card)
                        accepted, _ = await notifier.wait_for_response(timeout=60.0)
                        if accepted:
                            resp = await sessions.start_session(task)
                            if resp:
                                print(f" ✓ Session started: {resp.get('id', '?')}")
                    # Clear execution context after 3 iterations
                    if history.get_last_execution() and iteration % 3 == 0:
                        history.clear_last_execution()
                # 9. Callback
                if on_result:
                    on_result(result)
            except Exception:
                # Never let one bad iteration kill the loop.
                log.exception("Error in Argus loop iteration %d", iteration)
            # Sleep for remainder of capture interval
            elapsed = time.monotonic() - t0
            sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
            if sleep_for > 0:
                await asyncio.sleep(sleep_for)
    finally:
        pass

172
argus/notification.py Normal file
View File

@@ -0,0 +1,172 @@
"""Notification manager — decides when to show action cards to the user.
Only pushes a new notification when the proposed action meaningfully changes.
In production (Swift), this becomes a native macOS notification / floating card.
Here it's a terminal prompt for testing.
Swift portability notes:
- NotificationManager becomes a class with @Published properties
- show_card() triggers a SwiftUI overlay or UNUserNotificationCenter
- user_response() is wired to button taps instead of stdin
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass, field
log = logging.getLogger(__name__)
@dataclass
class ActionCard:
    """A proposed action shown to the user."""

    friction_type: str  # friction category from the VLM payload (e.g. "stalled")
    description: str    # human-readable description of the friction
    actions: list[dict] # proposed_actions dicts (label / action_type / details)
    source: str         # friction source_context (filename or app name)
    target: str         # friction target_context (filename or app name)
    vlm_payload: dict  # full VLM result for the executor
class NotificationManager:
    """Deduplicates notifications and manages the pending action card.

    Only shows a new card when the friction type or proposed action labels
    change from the previous notification.
    """

    # Action types that the executor can actually perform
    ACTIONABLE_TYPES = {"auto_extract", "brain_dump", "auto_fill", "summarize"}

    def __init__(self):
        self._last_fingerprint: str = ""  # dedup key of the last card shown
        self._pending_card: ActionCard | None = None
        self._response_event: asyncio.Event = asyncio.Event()
        self._user_accepted: bool = False
        # Fix: previously only assigned inside submit_response(), leaving the
        # attribute undefined until the first response; initialize it here so
        # any read path gets a well-defined "no selection" value.
        self._selected_index: int = -1

    def _fingerprint(self, vlm_payload: dict) -> str:
        """Generate a dedup key from friction type + action labels."""
        friction = vlm_payload.get("friction", {})
        ftype = friction.get("type", "none")
        labels = tuple(
            a.get("label", "") for a in friction.get("proposed_actions", [])
        )
        return f"{ftype}:{labels}"

    def should_notify(self, vlm_payload: dict) -> bool:
        """Check if this VLM result warrants a friction card (executor action).

        Only fires when proposed_actions contain something the executor can
        act on. Generic suggestions (action_type: "other") are nudges, not
        executor actions.
        """
        friction = vlm_payload.get("friction", {})
        # No friction or no proposed actions → no notification
        if friction.get("type") == "none":
            return False
        actions = friction.get("proposed_actions", [])
        if not actions:
            return False
        # Only notify if at least one action is executor-actionable
        has_actionable = any(
            a.get("action_type") in self.ACTIONABLE_TYPES for a in actions
        )
        if not has_actionable:
            return False
        # Same as last notification → skip
        fp = self._fingerprint(vlm_payload)
        if fp == self._last_fingerprint:
            return False
        return True

    def create_card(self, vlm_payload: dict) -> ActionCard:
        """Create an action card from VLM output and mark it as pending."""
        friction = vlm_payload.get("friction", {})
        card = ActionCard(
            friction_type=friction.get("type", ""),
            description=friction.get("description", ""),
            actions=friction.get("proposed_actions", []),
            source=friction.get("source_context", ""),
            target=friction.get("target_context", ""),
            vlm_payload=vlm_payload,
        )
        self._pending_card = card
        self._last_fingerprint = self._fingerprint(vlm_payload)
        # Reset response state so a stale answer can't leak into the new card.
        self._response_event.clear()
        self._user_accepted = False
        return card

    def show_card_terminal(self, card: ActionCard) -> None:
        """Print the action card to terminal. (Swift: show native UI.)"""
        print()
        print("" + "" * 58 + "")
        print(f"{'🔧 Friction detected':^58}")
        print("" + " " * 58 + "")
        desc_lines = _wrap(card.description, 56)
        for line in desc_lines:
            print(f"{line:<56}")
        print("" + " " * 58 + "")
        for i, action in enumerate(card.actions):
            label = action.get("label", "?")
            print(f"│ [{i + 1}] {label:<53}")
        print(f"│ [0] Not now{' ' * 44}")
        print("" + "" * 58 + "")
        print()

    async def wait_for_response(self, timeout: float = 30.0) -> tuple[bool, int]:
        """Wait for user response. Returns (accepted, action_index).

        On timeout the pending card is dropped and (False, -1) is returned.
        Swift portability: this becomes a button tap callback.
        """
        try:
            await asyncio.wait_for(self._response_event.wait(), timeout=timeout)
            return self._user_accepted, self._selected_index
        except asyncio.TimeoutError:
            log.debug("Notification timed out, dismissing")
            self._pending_card = None
            return False, -1

    def submit_response(self, choice: int) -> None:
        """Submit user's choice. Called from input reader task.

        choice > 0 selects action (choice - 1); 0 (or no pending card)
        dismisses. Swift portability: called from button tap handler.
        """
        if choice > 0 and self._pending_card:
            self._user_accepted = True
            self._selected_index = choice - 1
        else:
            self._user_accepted = False
            self._selected_index = -1
        self._response_event.set()

    @property
    def pending(self) -> ActionCard | None:
        """The card currently awaiting a response, if any."""
        return self._pending_card

    def dismiss(self) -> None:
        """Dismiss current card without action."""
        self._pending_card = None
        # Clearing the fingerprint lets the same card fire again later.
        self._last_fingerprint = ""
def _wrap(text: str, width: int) -> list[str]:
"""Simple word wrap."""
words = text.split()
lines: list[str] = []
current = ""
for word in words:
if len(current) + len(word) + 1 <= width:
current = f"{current} {word}" if current else word
else:
if current:
lines.append(current)
current = word
if current:
lines.append(current)
return lines or [""]

3
argus/requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
httpx>=0.27
Pillow>=10.0
python-dotenv>=1.0

293
argus/session.py Normal file
View File

@@ -0,0 +1,293 @@
"""Session manager — fetches and caches session state from the backend.
Provides session context to the VLM prompt and handles session lifecycle
actions (start, resume, switch, complete).
Swift portability notes:
- SessionManager becomes an ObservableObject with @Published properties
- Backend calls use URLSession instead of httpx
- match_session() logic is identical
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass, field
import httpx
from argus.config import BACKEND_BASE_URL, BACKEND_JWT
log = logging.getLogger(__name__)
@dataclass
class SessionInfo:
    """Snapshot of one backend focus session, as cached by SessionManager."""

    session_id: str       # backend session UUID (stringified)
    task_id: str | None   # linked task UUID, if any
    task_title: str       # from checkpoint["goal"] (see _parse_session)
    task_goal: str        # currently mirrors task_title
    status: str  # active | interrupted
    last_app: str         # checkpoint["active_app"]
    last_file: str        # checkpoint["active_file"]
    checkpoint_note: str  # last action / VLM summary from the checkpoint
    started_at: str       # ISO timestamp from backend
    ended_at: str | None  # ISO timestamp, None while still running
    minutes_ago: int | None = None  # minutes since ended_at, when computable
class SessionManager:
    """Caches open sessions and provides matching + lifecycle operations."""

    def __init__(self, jwt: str | None = None, base_url: str | None = None):
        # Credentials/endpoint fall back to module-level config values.
        self._jwt = jwt or BACKEND_JWT
        self._base_url = base_url or BACKEND_BASE_URL
        self._sessions: list[SessionInfo] = []
        self._active_session: SessionInfo | None = None
        self._last_fetch: float = 0
        self._fetch_interval = 30.0  # re-fetch every 30s
        self._mock: bool = False  # when True, refresh is a no-op (injected data)
        # Track inferred_task stability for new session suggestion
        self._inferred_task_history: list[str] = []
        self._stable_threshold = 3  # consecutive matching inferred_tasks

    @property
    def active(self) -> SessionInfo | None:
        """The currently attached session, or None."""
        return self._active_session

    @property
    def sessions(self) -> list[SessionInfo]:
        """All cached open sessions."""
        return self._sessions

    def has_sessions(self) -> bool:
        """True when at least one open session is cached."""
        return len(self._sessions) > 0

    # ── Backend communication ────────────────────────────────────────
    async def fetch_open_sessions(self) -> list[SessionInfo]:
        """Fetch the active session from backend (GET /sessions/active → single object or 404).
        Called on startup and periodically.

        On HTTP/network errors the previous cache is kept rather than cleared.
        """
        url = f"{self._base_url}/sessions/active"
        headers = {}
        if self._jwt:
            headers["Authorization"] = f"Bearer {self._jwt}"
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.get(url, headers=headers)
                if resp.status_code == 404:
                    # No active session — clear cache
                    self._sessions = []
                    self._active_session = None
                    self._last_fetch = time.monotonic()
                    log.info("No active session on backend")
                    return []
                resp.raise_for_status()
                data = resp.json()
                session = self._parse_session(data)
                self._sessions = [session]
                self._active_session = session
                self._last_fetch = time.monotonic()
                log.info("Fetched active session: %s (%s)", session.session_id, session.task_title)
                return self._sessions
        except httpx.HTTPStatusError as e:
            # Non-404 HTTP failure: keep serving the stale cache.
            log.warning("Failed to fetch sessions: %s", e.response.status_code)
            return self._sessions
        except httpx.RequestError as e:
            log.debug("Backend unreachable for sessions: %s", e)
            return self._sessions

    async def maybe_refresh(self) -> None:
        """Re-fetch if stale. Skips if using mock data."""
        if self._mock:
            return
        if time.monotonic() - self._last_fetch > self._fetch_interval:
            await self.fetch_open_sessions()

    async def start_session(self, task_title: str, task_goal: str = "") -> dict | None:
        """Start a new focus session. Returns session data or None.

        NOTE(review): task_title/task_goal are not sent in the request body —
        only {"platform": "mac"} is posted; confirm the backend derives the
        task itself or extend the payload.
        """
        url = f"{self._base_url}/sessions/start"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        # If we have a task_title but no task_id, the backend should
        # create an ad-hoc session (or we create the task first).
        payload = {"platform": "mac"}
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json=payload, headers=headers)
                resp.raise_for_status()
                result = resp.json()
                await self.fetch_open_sessions()  # refresh
                log.info("Started session: %s", result.get("id"))
                return result
        except Exception:
            log.exception("Failed to start session")
            return None

    async def end_session(self, session_id: str, status: str = "completed") -> bool:
        """End a session. Returns True on success."""
        url = f"{self._base_url}/sessions/{session_id}/end"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json={"status": status}, headers=headers)
                resp.raise_for_status()
                await self.fetch_open_sessions()  # refresh
                log.info("Ended session %s (%s)", session_id, status)
                return True
        except Exception:
            log.exception("Failed to end session %s", session_id)
            return False

    async def get_resume_card(self, session_id: str) -> dict | None:
        """Fetch AI-generated resume card for a session."""
        url = f"{self._base_url}/sessions/{session_id}/resume"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        try:
            async with httpx.AsyncClient(timeout=15.0) as client:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                return resp.json()
        except Exception:
            log.exception("Failed to get resume card for %s", session_id)
            return None

    # ── Session matching ─────────────────────────────────────────────
    def match_session(self, inferred_task: str, app_name: str) -> SessionInfo | None:
        """Find an open session that matches the current screen state.
        Returns the best match or None.

        Scoring: +2 per overlapping keyword (stop-words removed), +3 for an
        app match, +5 for a file match; a minimum total of 4 is required.
        """
        if not inferred_task or not self._sessions:
            return None
        inferred_lower = inferred_task.lower()
        best_match: SessionInfo | None = None
        best_score = 0
        for session in self._sessions:
            score = 0
            title_lower = session.task_title.lower()
            # Check for keyword overlap between inferred_task and session task_title
            inferred_words = set(inferred_lower.split())
            title_words = set(title_lower.split())
            overlap = inferred_words & title_words
            # Filter out common words
            overlap -= {"the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"}
            score += len(overlap) * 2
            # App match
            if app_name and session.last_app and app_name.lower() in session.last_app.lower():
                score += 3
            # File match — check if session's last_file appears in inferred_task
            if session.last_file and session.last_file.lower() in inferred_lower:
                score += 5
            if score > best_score:
                best_score = score
                best_match = session
        # Require minimum score to avoid false matches
        if best_score >= 4:
            return best_match
        return None

    def should_suggest_new_session(self, inferred_task: str) -> bool:
        """Check if we should suggest starting a new session.
        Returns True if inferred_task has been stable for N iterations
        and doesn't match any existing session.
        """
        if not inferred_task:
            # Empty observation breaks the stability streak.
            self._inferred_task_history.clear()
            return False
        self._inferred_task_history.append(inferred_task)
        # Keep only recent history
        if len(self._inferred_task_history) > self._stable_threshold + 2:
            self._inferred_task_history = self._inferred_task_history[-self._stable_threshold - 2:]
        if len(self._inferred_task_history) < self._stable_threshold:
            return False
        # Check if the last N inferred tasks are "similar" (share key words)
        recent = self._inferred_task_history[-self._stable_threshold:]
        first_words = set(recent[0].lower().split()) - {"the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"}
        all_similar = all(
            len(first_words & set(t.lower().split())) >= len(first_words) * 0.5
            for t in recent[1:]
        )
        if not all_similar:
            return False
        # Make sure it doesn't match an existing session
        matched = self.match_session(inferred_task, "")
        return matched is None

    # ── Prompt formatting ────────────────────────────────────────────
    def format_for_prompt(self) -> str:
        """Format open sessions for injection into the VLM prompt.
        Includes session_id so VLM can reference the exact ID in session_action.
        """
        if not self._sessions:
            return "(no open sessions)"
        lines: list[str] = []
        for s in self._sessions:
            status_tag = f"[{s.status}]"
            # NOTE: minutes_ago == 0 renders the same as None (truthiness check).
            ago = f" (paused {s.minutes_ago}m ago)" if s.minutes_ago else ""
            line = f" session_id=\"{s.session_id}\" {status_tag} \"{s.task_title}\" — last in {s.last_app}"
            if s.last_file:
                line += f"/{s.last_file}"
            if s.checkpoint_note:
                line += f", \"{s.checkpoint_note}\""
            line += ago
            lines.append(line)
        return "\n".join(lines)

    # ── Internal ─────────────────────────────────────────────────────
    def _parse_session(self, data: dict) -> SessionInfo:
        """Build a SessionInfo from a raw backend session object."""
        checkpoint = data.get("checkpoint", {}) or {}
        ended = data.get("ended_at")
        minutes_ago = None
        if ended:
            import datetime
            try:
                ended_dt = datetime.datetime.fromisoformat(ended.replace("Z", "+00:00"))
                now = datetime.datetime.now(datetime.timezone.utc)
                minutes_ago = int((now - ended_dt).total_seconds() / 60)
            except (ValueError, TypeError):
                # Unparseable timestamp — leave minutes_ago as None.
                pass
        # SessionOut has no nested "task" object — task title is stored in checkpoint["goal"]
        # by POST /sessions/start when a task_id is provided.
        task_title = checkpoint.get("goal", "")
        checkpoint_note = checkpoint.get("last_action_summary", checkpoint.get("last_vlm_summary", ""))
        return SessionInfo(
            session_id=str(data.get("id", "")),
            task_id=data.get("task_id"),
            task_title=task_title,
            task_goal=task_title,
            status=data.get("status", ""),
            last_app=checkpoint.get("active_app", ""),
            last_file=checkpoint.get("active_file", ""),
            checkpoint_note=checkpoint_note,
            started_at=data.get("started_at", ""),
            ended_at=ended,
            minutes_ago=minutes_ago,
        )

442
argus/vlm.py Normal file
View File

@@ -0,0 +1,442 @@
"""VLM client — supports Ollama (local, default) and Gemini (cloud fallback).
Sends the current screenshot plus text summaries of recent analyses.
Parses the structured JSON response.
"""
from __future__ import annotations
import asyncio
import base64
import json
import logging
import re
from dataclasses import dataclass, field
import httpx
from argus.buffer import HistoryBuffer
from argus.config import (
GEMINI_API_KEY,
GEMINI_URL,
OLLAMA_BASE_URL,
OLLAMA_MODEL,
VLM_BACKEND,
)
log = logging.getLogger(__name__)
# ── Task context passed in from the session ──────────────────────────────
@dataclass
class StepInfo:
    """One step of the user's task plan, rendered into the VLM prompt."""

    id: str          # step UUID (the VLM echoes it back as current_step_id)
    sort_order: int  # ordering index used when rendering the step list
    title: str       # short step description
    status: str  # pending | in_progress | done | skipped
    checkpoint_note: str | None = None  # last recorded progress on this step
@dataclass
class TaskContext:
    """Context about the user's current task/session, fed into the VLM prompt."""

    task_title: str = ""   # explicit task name; empty → VLM infers the task
    task_goal: str = ""    # free-text goal description
    steps: list[StepInfo] = field(default_factory=list)  # plan steps, may be empty
    window_title: str = ""  # frontmost window title reported by the OS
    session_id: str = ""   # backend session UUID used in payloads
# ── VLM response schema ─────────────────────────────────────────────────
@dataclass
class FrictionInfo:
    """Friction block of a VLM response (matches the prompt's JSON schema)."""

    type: str = "none"  # repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none
    confidence: float = 0.0  # VLM's 0..1 confidence in the friction call
    description: str | None = None  # what the user is struggling with
    proposed_actions: list[dict] = field(default_factory=list)  # label/action_type/details dicts
    source_context: str | None = None  # source filename or app name
    target_context: str | None = None  # target filename or app name
@dataclass
class SessionAction:
    """Session lifecycle suggestion emitted by the VLM."""

    type: str = "none"  # resume | switch | complete | start_new | none
    session_id: str | None = None  # exact backend session UUID, or None
    reason: str = ""  # VLM's justification for the suggested action
@dataclass
class VLMResult:
    """Typed view of one full VLM analysis response."""

    on_task: bool = True
    current_step_id: str | None = None
    inferred_task: str = ""
    checkpoint_note_update: str | None = None
    steps_completed: list[str] = field(default_factory=list)
    friction: FrictionInfo = field(default_factory=FrictionInfo)
    session_action: SessionAction = field(default_factory=SessionAction)
    intent: str | None = None
    distraction_type: str | None = None
    app_name: str = ""
    confidence: float = 0.0
    gentle_nudge: str | None = None
    vlm_summary: str = ""

    def to_backend_payload(self, session_id: str) -> dict:
        """Serialize to the JSON shape expected by POST /distractions/analyze-result."""
        # Assemble the nested objects first, then the envelope.
        friction_dict = {
            "type": self.friction.type,
            "confidence": self.friction.confidence,
            "description": self.friction.description,
            "proposed_actions": self.friction.proposed_actions,
            "source_context": self.friction.source_context,
            "target_context": self.friction.target_context,
        }
        action_dict = {
            "type": self.session_action.type,
            "session_id": self.session_action.session_id,
            "reason": self.session_action.reason,
        }
        return {
            "session_id": session_id,
            "on_task": self.on_task,
            "current_step_id": self.current_step_id,
            "inferred_task": self.inferred_task,
            "checkpoint_note_update": self.checkpoint_note_update,
            "steps_completed": self.steps_completed,
            "friction": friction_dict,
            "session_action": action_dict,
            "intent": self.intent,
            "distraction_type": self.distraction_type,
            "app_name": self.app_name,
            "confidence": self.confidence,
            "gentle_nudge": self.gentle_nudge,
            "vlm_summary": self.vlm_summary,
        }
# ── Prompt construction ──────────────────────────────────────────────────
def _format_steps(steps: list[StepInfo]) -> str:
lines: list[str] = []
for s in steps:
marker = {"pending": "", "in_progress": "", "done": "", "skipped": ""}.get(
s.status, "?"
)
line = f" {marker} [{s.status}] (id={s.id}) {s.sort_order}. {s.title}"
if s.checkpoint_note:
line += f" — checkpoint: {s.checkpoint_note}"
lines.append(line)
return "\n".join(lines) if lines else " (no steps)"
def build_system_prompt(ctx: TaskContext, history: HistoryBuffer, session_context: str = "") -> str:
    """Assemble the full system prompt for one VLM analysis call.

    Combines the task context, rendered step list, screenshot-history text,
    the previous analysis (if any), the last executed action (if any), and
    the backend session list into a single instruction string that ends
    with the required JSON response schema.
    """
    history_text = history.format_for_prompt()
    steps_text = _format_steps(ctx.steps)
    # Feed the previous analysis back so the model refines rather than restarts.
    prev_output = history.format_last_output()
    prev_section = ""
    if prev_output:
        prev_section = f"""
Your previous analysis (refine or correct this based on new evidence):
{prev_output}
If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it with new observations.
"""
    # If the executor just acted, tell the model not to re-flag the same friction.
    execution = history.get_last_execution()
    exec_section = ""
    if execution:
        exec_section = f"""
IMPORTANT — An AI agent just completed an action for the user:
{execution}
This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
"""
    return f"""\
You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
## How to read the screenshots
You receive screenshots in chronological order (oldest first, newest last).
You receive ~5 frames spanning ~20 seconds (one frame every 4 seconds). This means:
- 2 unchanged frames = 8+ seconds idle. That's significant.
- 3+ unchanged frames = 12-20 seconds idle. The user is stuck or distracted.
- If ALL frames are identical, the user has been idle for 20 seconds — definitely flag it.
- If the user wrote code and then 2+ frames show no changes, they are STUCK NOW.
Do NOT wait for many frames to flag problems. React fast.
Your PRIMARY signal is the DIFFERENCES between consecutive frames.
Where the screen CHANGED = where the user's ATTENTION is.
Where the screen is STATIC = background noise. Ignore it unless the user interacts with it.
Diff signals and what they mean:
- New text appearing / cursor advancing → user is actively typing (THIS is their task)
- Window or tab switch → context change, could be reference lookup or distraction
- Same content, no pixel changes → stalled, idle, or reading
- Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
- Scroll position change → reading or browsing
- Error message that APPEARED between frames → user just triggered it, relevant
- Error message that was ALREADY THERE in all frames → stale, ignore it
## Task inference
Infer the user's current task from what they are ACTIVELY DOING across the frames.
Do NOT assume static content (old terminal output, background panels, stale errors)
is the task. The region of the screen where pixels are changing IS the task.
CRITICAL — looking at something ≠ working on something:
- User switches to Preview/browser/another app and just LOOKS → this is NOT a new task.
  It could be a distraction, a quick reference, or idle browsing.
- User switches to another app AND starts TYPING/EDITING → this might be a new task.
- If the user has an active session and switches away WITHOUT typing in the new app,
  they are DISTRACTED from their session, not starting a new task.
- Only infer a new task when there is clear evidence of productive work (typing, editing,
  cursor movement between frames) in the new context.
- A single app switch is NEVER enough to infer a new task. Wait for active work.
If an explicit task is provided below, use it. Otherwise, infer from the screenshots.
Task: {ctx.task_title}
Goal: {ctx.task_goal}
Steps:
{steps_text}
Window title reported by OS: {ctx.window_title}
## Open sessions from backend (use EXACT session_id values below)
{session_context if session_context else "(no open sessions — suggest start_new if user is working on something)"}
Session matching rules — be STRICT:
- A session matches ONLY if the user is actively editing the session's last_file.
  Being in the same app (e.g. VS Code) is NOT enough. The user must be typing/editing
  in the specific file listed in the session (e.g. solution.cpp).
- Chatting in a sidebar, reading logs, or browsing in the same app ≠ working on the session.
- If the user is in VS Code but editing a DIFFERENT file or chatting in a panel,
  they are NOT on-task for the session. That's a distraction.
If the session's file IS being actively edited, output session_action: resume with the EXACT session_id.
If the user moved to a different open session's file, output session_action: switch with the EXACT session_id.
IMPORTANT: If you propose a friction action (e.g. auto_extract) that relates to an existing
session's task, ALSO output session_action: resume with that session's ID. The friction action
and session resume should go together — don't propose work related to a session without
linking it to the session.
If the user finished and closed the session's file, output session_action: complete with the EXACT session_id.
If no sessions exist but the user is actively working, output session_action: start_new (session_id: null).
NEVER invent session IDs. Use only the IDs listed above or null.
{prev_section}{exec_section}
Screenshot timeline:
{history_text}
## What to analyze
1. INFERRED TASK: What is the user actually working on right now? Base this on where
   the screen is changing, not on static content.
2. CHECKPOINT: What specific progress did the user make across these frames?
   Describe what changed (e.g., "typed 3 new lines of C++ code", "scrolled to next section").
3. FRICTION DETECTION: Is the user stuck in any of these patterns?
   - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
   - STALLED: No meaningful pixel changes across 2+ frames, OR user wrote code then
     deleted/undid it (write-then-delete = struggle, NOT "refining")
   - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
   - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
   - TASK_RESUMPTION: User just returned to a task from earlier (check text history)
   IMPORTANT signals to catch IMMEDIATELY (do NOT wait many frames):
   - User wrote code/text then deleted it → STUCK, not refining. Flag stalled.
   - User idle for 2+ frames after deletion → definitely stuck. Flag stalled.
   - User switching between source doc and target file repeatedly → TEDIOUS_MANUAL.
     This is NOT "fluent workflow." If the user is copying data from one place to
     another by switching windows, flag it on the SECOND switch. Don't wait.
   - Code is incomplete/wrong and user stopped typing → need help.
4. INTENT: If viewing informational content, is the user skimming, engaged, or unclear?
5. PROPOSED ACTION: If friction detected, suggest what the AI could DO.
   Be SPECIFIC: "Extract full text from writeup.pdf into transcript.md" not "Summarize text".
   The label should be a concrete verb phrase the user can approve with one tap.
   CRITICAL: The "details" field is the executor agent's instruction manual. Write it as
   a natural language spec — the executor has vision too, so tell it where to look and
   what to do, not the raw data:
   Bad: "Extract data from the document"
   Bad: "Help with the code"
   Good: "User is writing a report in report.md and has a PDF open with source data. They are manually copying table values from the PDF into markdown. Extract the table from the PDF (visible in screenshots), format as a markdown table matching the style in report.md, and append to the file."
   Good: "User is implementing quicksort in solution.cpp but has been idle after writing the function signature. They appear stuck on the partition logic. Provide a working C++ quicksort implementation that fits their existing code structure."
Respond ONLY with JSON:
{{
  "on_task": true,
  "current_step_id": "step UUID or null",
  "inferred_task": "what the user is actually working on, based on screen diffs",
  "checkpoint_note_update": "what changed across these frames specifically",
  "steps_completed": ["UUIDs"],
  "friction": {{
    "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
    "confidence": 0.0-1.0,
    "description": "what the user is struggling with, based on diff evidence",
    "proposed_actions": [
      {{"label": "specific verb phrase: what to do, from where, to where", "action_type": "auto_extract | brain_dump | auto_fill | summarize | other", "details": "Natural language spec for executor. Include: (1) what the user wants done, (2) where to look in the screenshots, (3) EXACT format to use — quote what the user already wrote so the executor matches it (e.g. if user wrote '3 tacos with steak' in plain text, say 'format as plain text lines like the user already started, NOT JSON'), (4) target file. The executor has vision to read screenshots — tell it WHERE to look, not the raw data."}}
    ],
    "source_context": "just the filename if visible (e.g. writeup.pdf), or app name if no file",
    "target_context": "just the filename if visible (e.g. transcript.md), or app name if no file"
  }},
  "session_action": {{
    "type": "resume | switch | complete | start_new | none",
    "session_id": "uuid of matching session, or null for start_new/none",
    "reason": "why this session action is suggested"
  }},
  "intent": "skimming | engaged | unclear | null",
  "distraction_type": "app_switch | browsing | idle | null",
  "app_name": "primary visible application",
  "confidence": 0.0-1.0,
  "gentle_nudge": "nudge text if distracted, null otherwise",
  "vlm_summary": "1-sentence description of what CHANGED across the frames (not what's static)"
}}"""
# ── Gemini API call ──────────────────────────────────────────────────────
def _extract_json(text: str) -> dict:
"""Extract JSON from Gemini response, handling markdown code fences."""
text = text.strip()
# Strip ```json ... ``` wrappers
m = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
if m:
text = m.group(1).strip()
return json.loads(text)
def _parse_vlm_response(raw: dict) -> VLMResult:
    """Parse the raw JSON dict into a VLMResult dataclass.

    Every field is read defensively with .get() defaults, so a partially
    malformed model response still yields a usable result object.
    """
    fr = raw.get("friction", {})
    sa = raw.get("session_action", {})
    return VLMResult(
        on_task=raw.get("on_task", True),
        current_step_id=raw.get("current_step_id"),
        inferred_task=raw.get("inferred_task", ""),
        checkpoint_note_update=raw.get("checkpoint_note_update"),
        steps_completed=raw.get("steps_completed", []),
        friction=FrictionInfo(
            type=fr.get("type", "none"),
            confidence=fr.get("confidence", 0.0),
            description=fr.get("description"),
            proposed_actions=fr.get("proposed_actions", []),
            source_context=fr.get("source_context"),
            target_context=fr.get("target_context"),
        ),
        session_action=SessionAction(
            type=sa.get("type", "none"),
            session_id=sa.get("session_id"),
            reason=sa.get("reason", ""),
        ),
        intent=raw.get("intent"),
        distraction_type=raw.get("distraction_type"),
        app_name=raw.get("app_name", ""),
        confidence=raw.get("confidence", 0.0),
        gentle_nudge=raw.get("gentle_nudge"),
        vlm_summary=raw.get("vlm_summary", ""),
    )
async def _call_ollama(system_prompt: str, b64_images: list[str]) -> str:
    """Call Ollama local VLM with multiple images and return raw text.

    Images are attached to a single user message; streaming is disabled so
    the full reply arrives in one response body.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": "/no_think\nAnalyze this screenshot sequence now.",
            "images": b64_images,
        },
    ]
    request_body = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        "keep_alive": -1,  # keep the model resident between calls
        "options": {"temperature": 0.2},
    }
    # Local inference can be slow with many images — allow a generous timeout.
    async with httpx.AsyncClient(timeout=300.0) as client:
        response = await client.post(f"{OLLAMA_BASE_URL}/api/chat", json=request_body)
        response.raise_for_status()
        return response.json()["message"]["content"]
async def _call_gemini(system_prompt: str, b64_images: list[str], api_key: str) -> str:
    """Call Gemini Vision API with multiple images and return raw text.

    Each image is preceded by a "[Screenshot i/N]" label (oldest first,
    newest last) so the model can reference individual frames. Retries up
    to 3 times on HTTP 429 with exponential backoff (1s, 2s, 4s); any
    other non-2xx status raises httpx.HTTPStatusError.
    """
    # Build request parts: interleave labels with images, newest last.
    parts: list[dict] = []
    total = len(b64_images)
    for i, b64 in enumerate(b64_images):
        parts.append({"text": f"[Screenshot {i + 1}/{total}]"})
        parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    parts.append({"text": "Analyze this screenshot sequence now."})
    payload = {
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "contents": [{"parts": parts}],
        "generationConfig": {
            "temperature": 0.2,
            "maxOutputTokens": 4096,
        },
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(3):
            resp = await client.post(f"{GEMINI_URL}?key={api_key}", json=payload)
            if resp.status_code == 429:
                wait = 2 ** attempt
                log.warning("Gemini 429 rate limited, retrying in %ds...", wait)
                await asyncio.sleep(wait)
                continue
            resp.raise_for_status()
            break
        else:
            # All 3 attempts hit 429 — surface the final error to the caller.
            resp.raise_for_status()
    body = resp.json()
    # Bug fix: Gemini may split its reply across multiple text parts; the old
    # code assigned (not appended) inside the loop and kept only the LAST
    # part. Concatenate every text part instead. Also use a distinct name so
    # the request-side `parts` list is not shadowed.
    reply_parts = body["candidates"][0]["content"]["parts"]
    return "".join(part["text"] for part in reply_parts if "text" in part)
async def analyze_screenshot(
    screenshot_jpeg: bytes,
    ctx: TaskContext,
    history: HistoryBuffer,
    *,
    api_key: str | None = None,
    backend: str | None = None,
    session_context: str = "",
) -> VLMResult:
    """Analyze a screenshot sequence via Ollama or Gemini.
    Sends all buffered screenshots + the current one as images (oldest first).

    Raises RuntimeError when the Gemini backend is selected without an API
    key, and ValueError for an unrecognized backend name.
    """
    chosen = backend or VLM_BACKEND
    prompt = build_system_prompt(ctx, history, session_context=session_context)
    # Base64-encode frames oldest-first: history buffer, then the newest frame.
    frames = [base64.b64encode(entry.jpeg).decode() for entry in history.get_entries()]
    frames.append(base64.b64encode(screenshot_jpeg).decode())
    log.debug("Sending %d screenshots to %s", len(frames), chosen)
    if chosen == "ollama":
        raw_text = await _call_ollama(prompt, frames)
    elif chosen == "gemini":
        key = api_key or GEMINI_API_KEY
        if not key:
            raise RuntimeError("GEMINI_API_KEY not set")
        raw_text = await _call_gemini(prompt, frames, key)
    else:
        raise ValueError(f"Unknown VLM backend: {chosen}")
    log.debug("VLM raw response: %s", raw_text)
    return _parse_vlm_response(_extract_json(raw_text))