// GeminiVLMClient.swift — Native Swift Gemini Vision API client
// Ports the Python argus VLM analysis (vlm.py) directly into Swift.
// No subprocess required: screenshots go straight from ScreenCaptureKit → Gemini → UI.

import Foundation

/// Stateless client for Google's Gemini `generateContent` REST endpoint.
///
/// Sends a chronological sequence of JPEG screenshots plus a task-context
/// system prompt, and parses the model's JSON reply into a
/// `DistractionAnalysisResponse` (project-declared type).
struct GeminiVLMClient {
    // Base URL for the Generative Language REST API (v1beta).
    private static let apiBase = "https://generativelanguage.googleapis.com/v1beta/models"
    // Model identifier used for every call.
    private static let model = "gemini-3.1-pro-preview"

    /// Google AI Studio API key; appended as a query parameter on every request.
    let apiKey: String

    // MARK: - Public

    /// Analyze a sequence of JPEG frames and return a structured distraction analysis.
    /// - Parameters:
    ///   - frames: JPEG screenshot frames, oldest first, newest last.
    ///   - taskTitle: Current task title (empty if no session).
    ///   - taskGoal: Task description / goal.
    ///   - steps: Active step list for the current task.
    ///   - windowTitle: Frontmost app name from NSWorkspace.
    ///   - recentSummaries: Rolling summaries from previous analyses (temporal context).
    /// - Throws: `URLError` for transport/HTTP failures or an unparseable reply.
    func analyze(
        frames: [Data],
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) async throws -> DistractionAnalysisResponse {
        let prompt = buildPrompt(
            taskTitle: taskTitle,
            taskGoal: taskGoal,
            steps: steps,
            windowTitle: windowTitle,
            recentSummaries: recentSummaries
        )
        let raw = try await callGemini(prompt: prompt, frames: frames)
        return try parseResponse(raw)
    }

    // MARK: - Prompt Builder (ported from vlm.py build_system_prompt)

    /// Render the system prompt: frame-reading rules, task context, step list,
    /// rolling screen history, and the exact JSON schema the model must emit.
    private func buildPrompt(
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) -> String {
        // One line per step: status marker + status + id + sort order + title
        // (+ optional checkpoint note).
        let stepsText: String
        if steps.isEmpty {
            stepsText = " (no steps defined)"
        } else {
            stepsText = steps.map { s in
                let marker: String
                switch s.status {
                case "pending": marker = "○"
                case "in_progress": marker = "►"
                case "done": marker = "✓"
                default: marker = "?"
                }
                var line = " \(marker) [\(s.status)] (id=\(s.id)) \(s.sortOrder). \(s.title)"
                if let note = s.checkpointNote { line += " — checkpoint: \(note)" }
                return line
            }.joined(separator: "\n")
        }

        // Rolling summaries from prior analyses, oldest first.
        let historyText: String
        if recentSummaries.isEmpty {
            historyText = " (no previous frames)"
        } else {
            historyText = recentSummaries.enumerated()
                .map { i, s in " [frame \(i + 1)] \(s)" }
                .joined(separator: "\n")
        }

        return """
        You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.

        ## How to read the screenshots
        You receive screenshots in chronological order (oldest first, newest last).
        Each frame is ~5 seconds apart. This means:
        - 2 unchanged frames = ~10 seconds idle — significant.
        - 3 unchanged frames = ~15 seconds idle — user is stuck or distracted.
        - If ALL frames are identical, the user has been idle for 15+ seconds — flag it.

        Your PRIMARY signal is the DIFFERENCES between consecutive frames.
        Where the screen CHANGED = where attention is. Static areas = ignore.

        Diff signals and what they mean:
        - New text appearing / cursor advancing → user is actively typing (this IS their task)
        - Window or tab switch → context change, could be reference or distraction
        - Same content, no pixel changes → stalled, idle, or reading
        - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
        - Error message that APPEARED between frames → user just triggered it, relevant
        - Error message already in ALL frames → stale, ignore

        CRITICAL — looking at something ≠ working on something:
        - User switches to browser/another app and just LOOKS → distraction or quick reference.
        - User switches and starts TYPING/EDITING → might be a new task.
        - If the user has an active session and switches away WITHOUT typing in the new app, they are DISTRACTED from their session, not starting a new task.
        - A single app switch is NEVER enough to infer a new task. Wait for active work.

        ## Current task context
        Task: \(taskTitle.isEmpty ? "(no active task)" : taskTitle)
        Goal: \(taskGoal.isEmpty ? taskTitle : taskGoal)
        Steps:
        \(stepsText)
        Window title (OS): \(windowTitle.isEmpty ? "(unknown)" : windowTitle)

        ## Recent screen history (for temporal context)
        \(historyText)

        ## What to output
        Analyze the screenshots and return JSON with EXACTLY this structure (no extra fields, no markdown):
        {
          "on_task": true,
          "current_step_id": "step UUID or null",
          "inferred_task": "what the user is actually working on based on screen diffs",
          "checkpoint_note_update": "what specifically changed across these frames",
          "steps_completed": [],
          "friction": {
            "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
            "confidence": 0.0,
            "description": "what the user is struggling with",
            "proposed_actions": [
              {
                "label": "specific verb phrase: what to do",
                "action_type": "auto_extract | brain_dump | other",
                "details": "natural language spec for what action to take"
              }
            ],
            "source_context": "filename or app name, or null",
            "target_context": "filename or app name, or null"
          },
          "session_action": { "type": "none", "session_id": null, "reason": "" },
          "intent": "skimming | engaged | unclear | null",
          "distraction_type": "app_switch | browsing | idle | null",
          "app_name": "primary visible application",
          "confidence": 0.8,
          "gentle_nudge": "short nudge message if distracted but no friction action applies, otherwise null",
          "vlm_summary": "1-sentence description of what CHANGED across the frames (not what is static)"
        }

        FRICTION DETECTION rules:
        - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
        - STALLED: No meaningful pixel changes across 2+ frames; or user wrote then deleted
        - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
        - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
        - TASK_RESUMPTION: User just returned to a task they were working on earlier

        If friction confidence < 0.5, set type to "none".
        Only set gentle_nudge when user is off-task AND no actionable friction applies.
        """
    }

    // MARK: - Action Executor

    /// Execute a user-approved proactive action and return a plain-text result.
    /// - Parameters:
    ///   - label: The approved action's label, echoed to the model verbatim.
    ///   - actionType: One of "auto_extract", "brain_dump", or anything else
    ///     (falls back to a generic next-steps instruction).
    ///   - details: Natural-language spec for the action (may be empty).
    ///   - screenshot: Optional single JPEG frame to ground the action.
    func executeAction(
        label: String,
        actionType: String,
        details: String,
        screenshot: Data?
    ) async throws -> String {
        // Map action type to a concrete instruction for the model.
        let taskInstruction: String
        switch actionType {
        case "auto_extract":
            taskInstruction = "Extract the relevant data from the screenshot and present it concisely as plain text."
        case "brain_dump":
            taskInstruction = "Format this as a short brain-dump note the user should add to their task list."
        default:
            taskInstruction = "Provide 2–3 concrete next steps the user can take right now."
        }

        let prompt = """
        You are a productivity assistant. The user approved this action: "\(label)"
        Details: \(details.isEmpty ? "(none)" : details)
        \(taskInstruction)
        Be specific and brief (3–5 sentences max). No markdown, no preamble, plain text only.
        """

        // Zero or one frame depending on whether a screenshot was supplied.
        let frames: [Data] = screenshot.map { [$0] } ?? []
        return try await callGemini(prompt: prompt, frames: frames)
    }

    // MARK: - Gemini REST API Call

    /// POST `prompt` (as systemInstruction) plus base64 `frames` to the
    /// generateContent endpoint and return the first candidate's text part.
    /// - Throws: `URLError(.badURL)`, `.badServerResponse` on non-2xx status,
    ///   or `.cannotParseResponse` when the reply shape is unexpected.
    private func callGemini(prompt: String, frames: [Data]) async throws -> String {
        let urlStr = "\(Self.apiBase)/\(Self.model):generateContent?key=\(apiKey)"
        guard let url = URL(string: urlStr) else { throw URLError(.badURL) }

        // Build content parts: label + image for each frame, then instruction.
        var parts: [[String: Any]] = []
        let total = frames.count
        for (i, frame) in frames.enumerated() {
            // FIX: age label was off by one frame interval — the newest frame
            // (i == total - 1) was labeled "5s ago" even though it is the most
            // recent capture. Newest is now "0s ago", oldest (total-1)*5s ago,
            // matching the "~5 seconds apart, newest last" contract in the prompt.
            parts.append(["text": "[Screenshot \(i + 1)/\(total) — \((total - 1 - i) * 5)s ago]"])
            parts.append([
                "inlineData": [
                    "mimeType": "image/jpeg",
                    "data": frame.base64EncodedString()
                ]
            ])
        }
        parts.append(["text": "Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences."])

        let body: [String: Any] = [
            "systemInstruction": ["parts": [["text": prompt]]],
            "contents": [["parts": parts]],
            "generationConfig": [
                "temperature": 0.2,
                // NOTE(review): 1024 output tokens is tight for the full analysis
                // JSON schema — confirm replies are never truncated mid-object.
                "maxOutputTokens": 1024
            ]
        ]

        var request = URLRequest(url: url)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        request.httpBody = try JSONSerialization.data(withJSONObject: body)
        request.timeoutInterval = 60

        let (data, response) = try await URLSession.shared.data(for: request)

        // FIX: accept the whole 2xx success range instead of exactly 200.
        if let http = response as? HTTPURLResponse, !(200...299).contains(http.statusCode) {
            let msg = String(data: data, encoding: .utf8) ?? "HTTP \(http.statusCode)"
            print("[GeminiVLM] API error \(http.statusCode): \(msg)")
            throw URLError(.badServerResponse)
        }

        // Drill into candidates[0].content.parts[0].text — the standard
        // generateContent response shape.
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let candidates = json["candidates"] as? [[String: Any]],
              let first = candidates.first,
              let content = first["content"] as? [String: Any],
              let contentParts = content["parts"] as? [[String: Any]],
              let text = contentParts.first?["text"] as? String
        else {
            let raw = String(data: data, encoding: .utf8) ?? ""
            print("[GeminiVLM] Unexpected response shape: \(raw.prefix(300))")
            throw URLError(.cannotParseResponse)
        }

        print("[GeminiVLM] Response (\(text.count) chars): \(text.prefix(200))")
        return text
    }

    // MARK: - Response Parsing

    /// Strip optional Markdown code fences from the model's reply and decode it
    /// into a `DistractionAnalysisResponse`.
    /// - Throws: `URLError(.cannotParseResponse)` or a `DecodingError`.
    private func parseResponse(_ text: String) throws -> DistractionAnalysisResponse {
        var cleaned = text.trimmingCharacters(in: .whitespacesAndNewlines)

        // Strip ```json ... ``` or ``` ... ``` fences the model sometimes adds
        // despite the "no code fences" instruction.
        if cleaned.hasPrefix("```") {
            let lines = cleaned.components(separatedBy: "\n")
            cleaned = lines.dropFirst().joined(separator: "\n")
            if let backtickRange = cleaned.range(of: "```") {
                cleaned = String(cleaned[..<backtickRange.lowerBound])
            }
            cleaned = cleaned.trimmingCharacters(in: .whitespacesAndNewlines)
        }

        // NOTE(review): the tail of this function was truncated in the source
        // under review (it ended mid-expression at `String(cleaned[..`). The
        // fence-stripping above and the plain JSONDecoder call below are a
        // reconstruction of the obvious continuation — verify against the
        // original file, in particular whether a custom keyDecodingStrategy or
        // CodingKeys mapping handles the snake_case keys in the schema.
        guard let jsonData = cleaned.data(using: .utf8) else {
            throw URLError(.cannotParseResponse)
        }
        return try JSONDecoder().decode(DistractionAnalysisResponse.self, from: jsonData)
    }
}