include argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

442
argus/vlm.py Normal file
View File

@@ -0,0 +1,442 @@
"""VLM client — supports Ollama (local, default) and Gemini (cloud fallback).
Sends the current screenshot plus text summaries of recent analyses.
Parses the structured JSON response.
"""
from __future__ import annotations
import asyncio
import base64
import json
import logging
import re
from dataclasses import dataclass, field
import httpx
from argus.buffer import HistoryBuffer
from argus.config import (
GEMINI_API_KEY,
GEMINI_URL,
OLLAMA_BASE_URL,
OLLAMA_MODEL,
VLM_BACKEND,
)
log = logging.getLogger(__name__)
# ── Task context passed in from the session ──────────────────────────────
@dataclass
class StepInfo:
    """One step of the active task plan, as reported by the backend."""

    id: str  # step UUID (backend identifier; echoed back in VLM responses)
    sort_order: int  # ordinal position used when rendering the step list
    title: str  # short human-readable description of the step
    status: str  # pending | in_progress | done | skipped
    checkpoint_note: str | None = None  # last recorded progress note for this step, if any
@dataclass
class TaskContext:
    """Context about the user's current task/session, fed into the VLM prompt."""

    task_title: str = ""  # explicit task title; empty when the task must be inferred
    task_goal: str = ""  # free-text goal description shown in the prompt
    steps: list[StepInfo] = field(default_factory=list)  # ordered plan steps
    window_title: str = ""  # foreground window title reported by the OS
    session_id: str = ""  # backend session UUID ("" presumably means no session yet — confirm with caller)
# ── VLM response schema ─────────────────────────────────────────────────
@dataclass
class FrictionInfo:
    """Friction pattern detected by the VLM (mirrors the 'friction' JSON object)."""

    type: str = "none"  # repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none
    confidence: float = 0.0  # model confidence, 0.0-1.0
    description: str | None = None  # what the user is struggling with
    proposed_actions: list[dict] = field(default_factory=list)  # dicts with label / action_type / details keys
    source_context: str | None = None  # filename (or app name) the data comes from
    target_context: str | None = None  # filename (or app name) the data goes to
@dataclass
class SessionAction:
    """Session lifecycle action suggested by the VLM (mirrors 'session_action' JSON)."""

    type: str = "none"  # resume | switch | complete | start_new | none
    session_id: str | None = None  # exact backend session UUID, or None (start_new/none)
    reason: str = ""  # model's justification for the suggested action
@dataclass
class VLMResult:
    """Full parsed VLM analysis for one screenshot sequence.

    Field names and defaults mirror the JSON schema requested in the
    system prompt; unknown/missing values fall back to these defaults.
    """

    on_task: bool = True
    current_step_id: str | None = None
    inferred_task: str = ""
    checkpoint_note_update: str | None = None
    steps_completed: list[str] = field(default_factory=list)
    friction: FrictionInfo = field(default_factory=FrictionInfo)
    session_action: SessionAction = field(default_factory=SessionAction)
    intent: str | None = None
    distraction_type: str | None = None
    app_name: str = ""
    confidence: float = 0.0
    gentle_nudge: str | None = None
    vlm_summary: str = ""

    def to_backend_payload(self, session_id: str) -> dict:
        """Serialize to the JSON shape expected by POST /distractions/analyze-result."""
        # Flatten the nested dataclasses first, then assemble the envelope.
        friction_payload = {
            "type": self.friction.type,
            "confidence": self.friction.confidence,
            "description": self.friction.description,
            "proposed_actions": self.friction.proposed_actions,
            "source_context": self.friction.source_context,
            "target_context": self.friction.target_context,
        }
        session_payload = {
            "type": self.session_action.type,
            "session_id": self.session_action.session_id,
            "reason": self.session_action.reason,
        }
        return {
            "session_id": session_id,
            "on_task": self.on_task,
            "current_step_id": self.current_step_id,
            "inferred_task": self.inferred_task,
            "checkpoint_note_update": self.checkpoint_note_update,
            "steps_completed": self.steps_completed,
            "friction": friction_payload,
            "session_action": session_payload,
            "intent": self.intent,
            "distraction_type": self.distraction_type,
            "app_name": self.app_name,
            "confidence": self.confidence,
            "gentle_nudge": self.gentle_nudge,
            "vlm_summary": self.vlm_summary,
        }
# ── Prompt construction ──────────────────────────────────────────────────
def _format_steps(steps: list[StepInfo]) -> str:
lines: list[str] = []
for s in steps:
marker = {"pending": "", "in_progress": "", "done": "", "skipped": ""}.get(
s.status, "?"
)
line = f" {marker} [{s.status}] (id={s.id}) {s.sort_order}. {s.title}"
if s.checkpoint_note:
line += f" — checkpoint: {s.checkpoint_note}"
lines.append(line)
return "\n".join(lines) if lines else " (no steps)"
def build_system_prompt(ctx: TaskContext, history: HistoryBuffer, session_context: str = "") -> str:
    """Build the full system prompt sent to the VLM.

    Combines the task context (title, goal, steps), text summaries from the
    history buffer, the previous analysis output (if any), the last executed
    agent action (if any), and the backend's open-session list into one
    instruction string. The model is asked to answer with JSON only.

    Args:
        ctx: Task/session context (explicit task may be empty → model infers it).
        history: HistoryBuffer providing frame summaries, the previous VLM
            output, and the last agent execution (see argus.buffer).
        session_context: Pre-formatted list of open sessions from the backend;
            empty string means no open sessions.

    Returns:
        The complete system prompt as a single string.
    """
    history_text = history.format_for_prompt()
    steps_text = _format_steps(ctx.steps)
    # Feed back the model's previous analysis so it can refine instead of
    # starting cold each cycle.
    prev_output = history.format_last_output()
    prev_section = ""
    if prev_output:
        prev_section = f"""
Your previous analysis (refine or correct this based on new evidence):
{prev_output}
If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it with new observations.
"""
    # If an agent action just ran, tell the model so it does not re-flag the
    # same friction it already resolved.
    execution = history.get_last_execution()
    exec_section = ""
    if execution:
        exec_section = f"""
IMPORTANT — An AI agent just completed an action for the user:
{execution}
This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
"""
    # NOTE: the prompt below is runtime text — keep it byte-identical.
    return f"""\
You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
## How to read the screenshots
You receive screenshots in chronological order (oldest first, newest last).
You receive ~5 frames spanning ~20 seconds (one frame every 4 seconds). This means:
- 2 unchanged frames = 8+ seconds idle. That's significant.
- 3+ unchanged frames = 12-20 seconds idle. The user is stuck or distracted.
- If ALL frames are identical, the user has been idle for 20 seconds — definitely flag it.
- If the user wrote code and then 2+ frames show no changes, they are STUCK NOW.
Do NOT wait for many frames to flag problems. React fast.
Your PRIMARY signal is the DIFFERENCES between consecutive frames.
Where the screen CHANGED = where the user's ATTENTION is.
Where the screen is STATIC = background noise. Ignore it unless the user interacts with it.
Diff signals and what they mean:
- New text appearing / cursor advancing → user is actively typing (THIS is their task)
- Window or tab switch → context change, could be reference lookup or distraction
- Same content, no pixel changes → stalled, idle, or reading
- Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
- Scroll position change → reading or browsing
- Error message that APPEARED between frames → user just triggered it, relevant
- Error message that was ALREADY THERE in all frames → stale, ignore it
## Task inference
Infer the user's current task from what they are ACTIVELY DOING across the frames.
Do NOT assume static content (old terminal output, background panels, stale errors)
is the task. The region of the screen where pixels are changing IS the task.
CRITICAL — looking at something ≠ working on something:
- User switches to Preview/browser/another app and just LOOKS → this is NOT a new task.
  It could be a distraction, a quick reference, or idle browsing.
- User switches to another app AND starts TYPING/EDITING → this might be a new task.
- If the user has an active session and switches away WITHOUT typing in the new app,
  they are DISTRACTED from their session, not starting a new task.
- Only infer a new task when there is clear evidence of productive work (typing, editing,
  cursor movement between frames) in the new context.
- A single app switch is NEVER enough to infer a new task. Wait for active work.
If an explicit task is provided below, use it. Otherwise, infer from the screenshots.
Task: {ctx.task_title}
Goal: {ctx.task_goal}
Steps:
{steps_text}
Window title reported by OS: {ctx.window_title}
## Open sessions from backend (use EXACT session_id values below)
{session_context if session_context else "(no open sessions — suggest start_new if user is working on something)"}
Session matching rules — be STRICT:
- A session matches ONLY if the user is actively editing the session's last_file.
  Being in the same app (e.g. VS Code) is NOT enough. The user must be typing/editing
  in the specific file listed in the session (e.g. solution.cpp).
- Chatting in a sidebar, reading logs, or browsing in the same app ≠ working on the session.
- If the user is in VS Code but editing a DIFFERENT file or chatting in a panel,
  they are NOT on-task for the session. That's a distraction.
If the session's file IS being actively edited, output session_action: resume with the EXACT session_id.
If the user moved to a different open session's file, output session_action: switch with the EXACT session_id.
IMPORTANT: If you propose a friction action (e.g. auto_extract) that relates to an existing
session's task, ALSO output session_action: resume with that session's ID. The friction action
and session resume should go together — don't propose work related to a session without
linking it to the session.
If the user finished and closed the session's file, output session_action: complete with the EXACT session_id.
If no sessions exist but the user is actively working, output session_action: start_new (session_id: null).
NEVER invent session IDs. Use only the IDs listed above or null.
{prev_section}{exec_section}
Screenshot timeline:
{history_text}
## What to analyze
1. INFERRED TASK: What is the user actually working on right now? Base this on where
   the screen is changing, not on static content.
2. CHECKPOINT: What specific progress did the user make across these frames?
   Describe what changed (e.g., "typed 3 new lines of C++ code", "scrolled to next section").
3. FRICTION DETECTION: Is the user stuck in any of these patterns?
   - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
   - STALLED: No meaningful pixel changes across 2+ frames, OR user wrote code then
     deleted/undid it (write-then-delete = struggle, NOT "refining")
   - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
   - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
   - TASK_RESUMPTION: User just returned to a task from earlier (check text history)
   IMPORTANT signals to catch IMMEDIATELY (do NOT wait many frames):
   - User wrote code/text then deleted it → STUCK, not refining. Flag stalled.
   - User idle for 2+ frames after deletion → definitely stuck. Flag stalled.
   - User switching between source doc and target file repeatedly → TEDIOUS_MANUAL.
     This is NOT "fluent workflow." If the user is copying data from one place to
     another by switching windows, flag it on the SECOND switch. Don't wait.
   - Code is incomplete/wrong and user stopped typing → need help.
4. INTENT: If viewing informational content, is the user skimming, engaged, or unclear?
5. PROPOSED ACTION: If friction detected, suggest what the AI could DO.
   Be SPECIFIC: "Extract full text from writeup.pdf into transcript.md" not "Summarize text".
   The label should be a concrete verb phrase the user can approve with one tap.
   CRITICAL: The "details" field is the executor agent's instruction manual. Write it as
   a natural language spec — the executor has vision too, so tell it where to look and
   what to do, not the raw data:
   Bad: "Extract data from the document"
   Bad: "Help with the code"
   Good: "User is writing a report in report.md and has a PDF open with source data. They are manually copying table values from the PDF into markdown. Extract the table from the PDF (visible in screenshots), format as a markdown table matching the style in report.md, and append to the file."
   Good: "User is implementing quicksort in solution.cpp but has been idle after writing the function signature. They appear stuck on the partition logic. Provide a working C++ quicksort implementation that fits their existing code structure."
Respond ONLY with JSON:
{{
  "on_task": true,
  "current_step_id": "step UUID or null",
  "inferred_task": "what the user is actually working on, based on screen diffs",
  "checkpoint_note_update": "what changed across these frames specifically",
  "steps_completed": ["UUIDs"],
  "friction": {{
    "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
    "confidence": 0.0-1.0,
    "description": "what the user is struggling with, based on diff evidence",
    "proposed_actions": [
      {{"label": "specific verb phrase: what to do, from where, to where", "action_type": "auto_extract | brain_dump | auto_fill | summarize | other", "details": "Natural language spec for executor. Include: (1) what the user wants done, (2) where to look in the screenshots, (3) EXACT format to use — quote what the user already wrote so the executor matches it (e.g. if user wrote '3 tacos with steak' in plain text, say 'format as plain text lines like the user already started, NOT JSON'), (4) target file. The executor has vision to read screenshots — tell it WHERE to look, not the raw data."}}
    ],
    "source_context": "just the filename if visible (e.g. writeup.pdf), or app name if no file",
    "target_context": "just the filename if visible (e.g. transcript.md), or app name if no file"
  }},
  "session_action": {{
    "type": "resume | switch | complete | start_new | none",
    "session_id": "uuid of matching session, or null for start_new/none",
    "reason": "why this session action is suggested"
  }},
  "intent": "skimming | engaged | unclear | null",
  "distraction_type": "app_switch | browsing | idle | null",
  "app_name": "primary visible application",
  "confidence": 0.0-1.0,
  "gentle_nudge": "nudge text if distracted, null otherwise",
  "vlm_summary": "1-sentence description of what CHANGED across the frames (not what's static)"
}}"""
# ── Gemini API call ──────────────────────────────────────────────────────
def _extract_json(text: str) -> dict:
"""Extract JSON from Gemini response, handling markdown code fences."""
text = text.strip()
# Strip ```json ... ``` wrappers
m = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
if m:
text = m.group(1).strip()
return json.loads(text)
def _parse_vlm_response(raw: dict) -> VLMResult:
    """Map the decoded JSON dict from the VLM onto a VLMResult dataclass."""
    fr = raw.get("friction", {})
    sa = raw.get("session_action", {})
    # Missing keys fall back to the schema defaults; nested objects are
    # built inline from their sub-dicts.
    return VLMResult(
        on_task=raw.get("on_task", True),
        current_step_id=raw.get("current_step_id"),
        inferred_task=raw.get("inferred_task", ""),
        checkpoint_note_update=raw.get("checkpoint_note_update"),
        steps_completed=raw.get("steps_completed", []),
        friction=FrictionInfo(
            type=fr.get("type", "none"),
            confidence=fr.get("confidence", 0.0),
            description=fr.get("description"),
            proposed_actions=fr.get("proposed_actions", []),
            source_context=fr.get("source_context"),
            target_context=fr.get("target_context"),
        ),
        session_action=SessionAction(
            type=sa.get("type", "none"),
            session_id=sa.get("session_id"),
            reason=sa.get("reason", ""),
        ),
        intent=raw.get("intent"),
        distraction_type=raw.get("distraction_type"),
        app_name=raw.get("app_name", ""),
        confidence=raw.get("confidence", 0.0),
        gentle_nudge=raw.get("gentle_nudge"),
        vlm_summary=raw.get("vlm_summary", ""),
    )
async def _call_ollama(system_prompt: str, b64_images: list[str]) -> str:
    """Send the prompt plus base64 images to the local Ollama chat API.

    Returns the raw assistant message text. keep_alive=-1 keeps the model
    resident in memory between calls; temperature 0.2 favors deterministic
    structured output.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": "/no_think\nAnalyze this screenshot sequence now.",
            "images": b64_images,
        },
    ]
    request_body = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        "keep_alive": -1,
        "options": {"temperature": 0.2},
    }
    # Generous timeout: local VLM inference on a screenshot batch can be slow.
    async with httpx.AsyncClient(timeout=300.0) as client:
        response = await client.post(f"{OLLAMA_BASE_URL}/api/chat", json=request_body)
        response.raise_for_status()
        return response.json()["message"]["content"]
async def _call_gemini(system_prompt: str, b64_images: list[str], api_key: str) -> str:
    """Call the Gemini Vision API with multiple images and return raw text.

    Images are interleaved with "[Screenshot i/N]" labels, oldest first.
    Retries up to 3 times with exponential backoff on HTTP 429; any other
    HTTP error raises immediately via raise_for_status().

    Raises:
        httpx.HTTPStatusError: on a non-retryable error status, or when all
            retries are exhausted on 429.
    """
    # Build request parts: label each image so the model can reference frames.
    # (Renamed from `parts` to avoid shadowing the response parts below.)
    request_parts: list[dict] = []
    total = len(b64_images)
    for i, b64 in enumerate(b64_images):
        request_parts.append({"text": f"[Screenshot {i + 1}/{total}]"})
        request_parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    request_parts.append({"text": "Analyze this screenshot sequence now."})
    payload = {
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "contents": [{"parts": request_parts}],
        "generationConfig": {
            "temperature": 0.2,
            "maxOutputTokens": 4096,
        },
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(3):
            resp = await client.post(f"{GEMINI_URL}?key={api_key}", json=payload)
            if resp.status_code == 429:
                wait = 2 ** attempt
                log.warning("Gemini 429 rate limited, retrying in %ds...", wait)
                await asyncio.sleep(wait)
                continue
            resp.raise_for_status()
            break
        else:
            # All retries exhausted on 429 — surface the rate-limit error.
            resp.raise_for_status()
        body = resp.json()
        # BUG FIX: the response may contain multiple text parts; the previous
        # loop did `text = part["text"]`, keeping only the LAST part. Join all
        # text parts in order instead.
        response_parts = body["candidates"][0]["content"]["parts"]
        return "".join(p["text"] for p in response_parts if "text" in p)
async def analyze_screenshot(
    screenshot_jpeg: bytes,
    ctx: TaskContext,
    history: HistoryBuffer,
    *,
    api_key: str | None = None,
    backend: str | None = None,
    session_context: str = "",
) -> VLMResult:
    """Analyze a screenshot sequence via Ollama or Gemini.

    Sends all buffered screenshots + the current one as images (oldest first).

    Raises:
        RuntimeError: gemini backend selected but no API key available.
        ValueError: unrecognized backend name.
    """
    chosen = backend or VLM_BACKEND
    prompt = build_system_prompt(ctx, history, session_context=session_context)
    # Encode buffered frames (oldest first), then append the current frame.
    frames = [base64.b64encode(entry.jpeg).decode() for entry in history.get_entries()]
    frames.append(base64.b64encode(screenshot_jpeg).decode())
    log.debug("Sending %d screenshots to %s", len(frames), chosen)
    if chosen == "ollama":
        raw_text = await _call_ollama(prompt, frames)
    elif chosen == "gemini":
        key = api_key or GEMINI_API_KEY
        if not key:
            raise RuntimeError("GEMINI_API_KEY not set")
        raw_text = await _call_gemini(prompt, frames, key)
    else:
        raise ValueError(f"Unknown VLM backend: {chosen}")
    log.debug("VLM raw response: %s", raw_text)
    return _parse_vlm_response(_extract_json(raw_text))