include argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

278
GeminiVLMClient.swift Normal file
View File

@@ -0,0 +1,278 @@
// GeminiVLMClient.swift — Native Swift Gemini Vision API client
// Ports the Python argus VLM analysis (vlm.py) directly into Swift.
// No subprocess required: screenshots go straight from ScreenCaptureKit → Gemini UI.
import Foundation
/// Native Swift client for the Gemini Vision (`generateContent`) REST API.
/// Ports the Python argus VLM analysis (vlm.py) directly into Swift so
/// screenshot frames can be analyzed without spawning a subprocess.
struct GeminiVLMClient {
    /// Base URL of the Gemini REST API and the model used for every call.
    private static let apiBase = "https://generativelanguage.googleapis.com/v1beta/models"
    private static let model = "gemini-3.1-pro-preview"

    /// API key, appended as a `key=` query parameter on each request.
    let apiKey: String

    // MARK: - Public

    /// Analyze a sequence of JPEG frames and return a structured distraction analysis.
    /// - Parameters:
    ///   - frames: JPEG screenshot frames, oldest first, newest last.
    ///   - taskTitle: Current task title (empty if no session).
    ///   - taskGoal: Task description / goal.
    ///   - steps: Active step list for the current task.
    ///   - windowTitle: Frontmost app name from NSWorkspace.
    ///   - recentSummaries: Rolling summaries from previous analyses (temporal context).
    /// - Returns: The decoded `DistractionAnalysisResponse` produced by the model.
    /// - Throws: `URLError` on transport, non-2xx, or unparseable responses;
    ///   `DecodingError` if the model's JSON does not match the expected schema.
    func analyze(
        frames: [Data],
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) async throws -> DistractionAnalysisResponse {
        let prompt = buildPrompt(
            taskTitle: taskTitle,
            taskGoal: taskGoal,
            steps: steps,
            windowTitle: windowTitle,
            recentSummaries: recentSummaries
        )
        let raw = try await callGemini(prompt: prompt, frames: frames)
        return try parseResponse(raw)
    }

    // MARK: - Prompt Builder (ported from vlm.py build_system_prompt)

    /// Assemble the system prompt from task context, step list, window title,
    /// and rolling summaries. Pure string building — no I/O.
    private func buildPrompt(
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) -> String {
        // Render the step list with a one-character status marker per line.
        // NOTE(review): the original marker glyphs were stripped by the diff
        // extraction (all three cases were empty); these are restorations —
        // confirm against vlm.py.
        let stepsText: String
        if steps.isEmpty {
            stepsText = " (no steps defined)"
        } else {
            stepsText = steps.map { s in
                let marker: String
                switch s.status {
                case "pending": marker = "○"
                case "in_progress": marker = "▶"
                case "done": marker = "✓"
                default: marker = "?"
                }
                var line = " \(marker) [\(s.status)] (id=\(s.id)) \(s.sortOrder). \(s.title)"
                if let note = s.checkpointNote { line += " — checkpoint: \(note)" }
                return line
            }.joined(separator: "\n")
        }

        // Render rolling summaries as "[frame N] …" lines (1-based).
        let historyText: String
        if recentSummaries.isEmpty {
            historyText = " (no previous frames)"
        } else {
            historyText = recentSummaries.enumerated()
                .map { i, s in " [frame \(i + 1)] \(s)" }
                .joined(separator: "\n")
        }

        return """
        You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
        ## How to read the screenshots
        You receive screenshots in chronological order (oldest first, newest last).
        Each frame is ~5 seconds apart. This means:
        - 2 unchanged frames = ~10 seconds idle — significant.
        - 3 unchanged frames = ~15 seconds idle — user is stuck or distracted.
        - If ALL frames are identical, the user has been idle for 15+ seconds — flag it.
        Your PRIMARY signal is the DIFFERENCES between consecutive frames.
        Where the screen CHANGED = where attention is. Static areas = ignore.
        Diff signals and what they mean:
        - New text appearing / cursor advancing → user is actively typing (this IS their task)
        - Window or tab switch → context change, could be reference or distraction
        - Same content, no pixel changes → stalled, idle, or reading
        - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
        - Error message that APPEARED between frames → user just triggered it, relevant
        - Error message already in ALL frames → stale, ignore
        CRITICAL — looking at something ≠ working on something:
        - User switches to browser/another app and just LOOKS → distraction or quick reference.
        - User switches and starts TYPING/EDITING → might be a new task.
        - If the user has an active session and switches away WITHOUT typing in the new app,
          they are DISTRACTED from their session, not starting a new task.
        - A single app switch is NEVER enough to infer a new task. Wait for active work.
        ## Current task context
        Task: \(taskTitle.isEmpty ? "(no active task)" : taskTitle)
        Goal: \(taskGoal.isEmpty ? taskTitle : taskGoal)
        Steps:
        \(stepsText)
        Window title (OS): \(windowTitle.isEmpty ? "(unknown)" : windowTitle)
        ## Recent screen history (for temporal context)
        \(historyText)
        ## What to output
        Analyze the screenshots and return JSON with EXACTLY this structure (no extra fields, no markdown):
        {
          "on_task": true,
          "current_step_id": "step UUID or null",
          "inferred_task": "what the user is actually working on based on screen diffs",
          "checkpoint_note_update": "what specifically changed across these frames",
          "steps_completed": [],
          "friction": {
            "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
            "confidence": 0.0,
            "description": "what the user is struggling with",
            "proposed_actions": [
              {
                "label": "specific verb phrase: what to do",
                "action_type": "auto_extract | brain_dump | other",
                "details": "natural language spec for what action to take"
              }
            ],
            "source_context": "filename or app name, or null",
            "target_context": "filename or app name, or null"
          },
          "session_action": {
            "type": "none",
            "session_id": null,
            "reason": ""
          },
          "intent": "skimming | engaged | unclear | null",
          "distraction_type": "app_switch | browsing | idle | null",
          "app_name": "primary visible application",
          "confidence": 0.8,
          "gentle_nudge": "short nudge message if distracted but no friction action applies, otherwise null",
          "vlm_summary": "1-sentence description of what CHANGED across the frames (not what is static)"
        }
        FRICTION DETECTION rules:
        - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
        - STALLED: No meaningful pixel changes across 2+ frames; or user wrote then deleted
        - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
        - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
        - TASK_RESUMPTION: User just returned to a task they were working on earlier
        If friction confidence < 0.5, set type to "none".
        Only set gentle_nudge when user is off-task AND no actionable friction applies.
        """
    }

    // MARK: - Action Executor

    /// Execute a user-approved proactive action and return a plain-text result.
    /// - Parameters:
    ///   - label: The action label the user approved (quoted verbatim in the prompt).
    ///   - actionType: One of "auto_extract", "brain_dump", or anything else
    ///     (falls through to a generic next-steps instruction).
    ///   - details: Free-form spec for the action; "(none)" is substituted if empty.
    ///   - screenshot: Optional JPEG to attach as context.
    /// - Throws: `URLError` on transport / API / response-shape failures.
    func executeAction(
        label: String,
        actionType: String,
        details: String,
        screenshot: Data?
    ) async throws -> String {
        let taskInstruction: String
        switch actionType {
        case "auto_extract":
            taskInstruction = "Extract the relevant data from the screenshot and present it concisely as plain text."
        case "brain_dump":
            taskInstruction = "Format this as a short brain-dump note the user should add to their task list."
        default:
            // FIX: the diff-mangled original read "23 concrete next steps".
            taskInstruction = "Provide 2-3 concrete next steps the user can take right now."
        }
        // FIX: the diff-mangled original read "(35 sentences max)".
        let prompt = """
        You are a productivity assistant. The user approved this action: "\(label)"
        Details: \(details.isEmpty ? "(none)" : details)
        \(taskInstruction)
        Be specific and brief (3-5 sentences max). No markdown, no preamble, plain text only.
        """
        let frames: [Data] = screenshot.map { [$0] } ?? []
        return try await callGemini(prompt: prompt, frames: frames)
    }

    // MARK: - Gemini REST API Call

    /// POST the prompt (as systemInstruction) plus the frames to Gemini and
    /// return the first candidate's text.
    /// - Throws: `URLError(.badURL)` if the URL cannot be built,
    ///   `.badServerResponse` on a non-2xx status, `.cannotParseResponse`
    ///   when the response JSON lacks the expected candidate/part shape.
    private func callGemini(prompt: String, frames: [Data]) async throws -> String {
        let urlStr = "\(Self.apiBase)/\(Self.model):generateContent?key=\(apiKey)"
        guard let url = URL(string: urlStr) else { throw URLError(.badURL) }

        // Build content parts: an age label + inline image for each frame,
        // then the final instruction. Frames arrive oldest-first, so the
        // oldest frame is (total * 5)s old and the newest is 5s old.
        var parts: [[String: Any]] = []
        let total = frames.count
        for (i, frame) in frames.enumerated() {
            // FIX: the diff-mangled original had no separator between the
            // counter and the age ("…\(total)\((total - i) * 5)s ago").
            parts.append(["text": "[Screenshot \(i + 1)/\(total) — \((total - i) * 5)s ago]"])
            parts.append([
                "inlineData": [
                    "mimeType": "image/jpeg",
                    "data": frame.base64EncodedString()
                ]
            ])
        }
        parts.append(["text": "Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences."])

        let body: [String: Any] = [
            "systemInstruction": ["parts": [["text": prompt]]],
            "contents": [["parts": parts]],
            "generationConfig": [
                "temperature": 0.2,
                "maxOutputTokens": 1024
            ]
        ]

        var request = URLRequest(url: url)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        request.httpBody = try JSONSerialization.data(withJSONObject: body)
        request.timeoutInterval = 60

        let (data, response) = try await URLSession.shared.data(for: request)

        // FIX: accept the whole 2xx success range, not just exactly 200.
        if let http = response as? HTTPURLResponse, !(200..<300).contains(http.statusCode) {
            let msg = String(data: data, encoding: .utf8) ?? "HTTP \(http.statusCode)"
            print("[GeminiVLM] API error \(http.statusCode): \(msg)")
            throw URLError(.badServerResponse)
        }

        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let candidates = json["candidates"] as? [[String: Any]],
              let first = candidates.first,
              let content = first["content"] as? [String: Any],
              let contentParts = content["parts"] as? [[String: Any]],
              let text = contentParts.first?["text"] as? String
        else {
            let raw = String(data: data, encoding: .utf8) ?? ""
            print("[GeminiVLM] Unexpected response shape: \(raw.prefix(300))")
            throw URLError(.cannotParseResponse)
        }
        print("[GeminiVLM] Response (\(text.count) chars): \(text.prefix(200))")
        return text
    }

    // MARK: - Response Parsing

    /// Strip optional markdown code fences from the model's reply, locate the
    /// outermost `{…}` span, and decode it as `DistractionAnalysisResponse`.
    /// - Throws: `URLError(.cannotParseResponse)` if no JSON object is found;
    ///   `DecodingError` if the JSON does not match the response schema.
    private func parseResponse(_ text: String) throws -> DistractionAnalysisResponse {
        var cleaned = text.trimmingCharacters(in: .whitespacesAndNewlines)

        // Strip ```json … ``` or ``` … ``` fences. FIX: the original did an
        // unconditional dropFirst()-line, which discarded the payload when the
        // entire fenced reply was on a single line; remove the fence markers
        // themselves instead.
        if cleaned.hasPrefix("```") {
            cleaned = String(cleaned.dropFirst(3))
            // Drop a language tag (e.g. "json") that directly follows the fence.
            if let newline = cleaned.firstIndex(of: "\n"),
               !cleaned[..<newline].contains("{") {
                cleaned = String(cleaned[cleaned.index(after: newline)...])
            }
            if let backtickRange = cleaned.range(of: "```", options: .backwards) {
                cleaned = String(cleaned[..<backtickRange.lowerBound])
            }
            cleaned = cleaned.trimmingCharacters(in: .whitespacesAndNewlines)
        }

        // Find JSON object boundaries robustly: outermost "{" … "}".
        guard let start = cleaned.firstIndex(of: "{"),
              let end = cleaned.lastIndex(of: "}") else {
            throw URLError(.cannotParseResponse)
        }
        let jsonStr = String(cleaned[start...end])
        guard let jsonData = jsonStr.data(using: .utf8) else {
            throw URLError(.cannotParseResponse)
        }
        return try JSONDecoder().decode(DistractionAnalysisResponse.self, from: jsonData)
    }
}