Files
LockInBroMacOS/GeminiVLMClient.swift
2026-03-29 06:29:18 -04:00

279 lines
12 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// GeminiVLMClient.swift — Native Swift Gemini Vision API client
// Ports the Python argus VLM analysis (vlm.py) directly into Swift.
// No subprocess required: screenshots go straight from ScreenCaptureKit → Gemini API.
import Foundation
/// Native Swift client for the Gemini Vision (`generateContent`) REST API.
///
/// Uploads a short burst of JPEG screenshots together with a focus-assistant
/// system prompt, and decodes the model's JSON verdict into
/// `DistractionAnalysisResponse`. All requests are plain HTTPS via
/// `URLSession` — no subprocess, no SDK dependency.
struct GeminiVLMClient {

    /// Base URL of the Gemini REST API (v1beta models collection).
    private static let apiBase = "https://generativelanguage.googleapis.com/v1beta/models"

    /// Model identifier appended to `apiBase` for every request.
    private static let model = "gemini-3.1-pro-preview"

    /// Google AI Studio API key. Sent via the `x-goog-api-key` header (not the
    /// query string) so it cannot leak into URL logs or proxies.
    let apiKey: String

    // MARK: - Public

    /// Analyze a sequence of JPEG frames and return a structured distraction analysis.
    /// - Parameters:
    ///   - frames: JPEG screenshot frames, oldest first, newest last.
    ///   - taskTitle: Current task title (empty if no session).
    ///   - taskGoal: Task description / goal.
    ///   - steps: Active step list for the current task.
    ///   - windowTitle: Frontmost app name from NSWorkspace.
    ///   - recentSummaries: Rolling summaries from previous analyses (temporal context).
    /// - Returns: The decoded model verdict.
    /// - Throws: `URLError` on transport failure, non-200 status, or an
    ///   unexpected response envelope; `DecodingError` when the model's JSON
    ///   does not match `DistractionAnalysisResponse`.
    func analyze(
        frames: [Data],
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) async throws -> DistractionAnalysisResponse {
        let prompt = buildPrompt(
            taskTitle: taskTitle,
            taskGoal: taskGoal,
            steps: steps,
            windowTitle: windowTitle,
            recentSummaries: recentSummaries
        )
        let raw = try await callGemini(prompt: prompt, frames: frames)
        return try parseResponse(raw)
    }

    // MARK: - Prompt Builder (ported from vlm.py build_system_prompt)

    /// Assemble the system prompt: analysis rules, current task context,
    /// recent screen history, and the exact JSON schema the model must emit.
    private func buildPrompt(
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) -> String {
        // Render the step list as one line per step, with a status glyph.
        let stepsText: String
        if steps.isEmpty {
            stepsText = " (no steps defined)"
        } else {
            stepsText = steps.map { s in
                // NOTE(review): the original marker glyphs were stripped by a
                // lossy copy ("hidden Unicode characters"); restored with
                // sensible equivalents — confirm against the original file.
                let marker: String
                switch s.status {
                case "pending": marker = "○"
                case "in_progress": marker = "◐"
                case "done": marker = "●"
                default: marker = "?"
                }
                var line = " \(marker) [\(s.status)] (id=\(s.id)) \(s.sortOrder). \(s.title)"
                if let note = s.checkpointNote { line += " — checkpoint: \(note)" }
                return line
            }.joined(separator: "\n")
        }

        // Rolling one-line summaries from prior analyses give the model
        // temporal context beyond the current frame window.
        let historyText: String
        if recentSummaries.isEmpty {
            historyText = " (no previous frames)"
        } else {
            historyText = recentSummaries.enumerated()
                .map { i, s in " [frame \(i + 1)] \(s)" }
                .joined(separator: "\n")
        }

        return """
        You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
        ## How to read the screenshots
        You receive screenshots in chronological order (oldest first, newest last).
        Each frame is ~5 seconds apart. This means:
        - 2 unchanged frames = ~10 seconds idle — significant.
        - 3 unchanged frames = ~15 seconds idle — user is stuck or distracted.
        - If ALL frames are identical, the user has been idle for 15+ seconds — flag it.
        Your PRIMARY signal is the DIFFERENCES between consecutive frames.
        Where the screen CHANGED = where attention is. Static areas = ignore.
        Diff signals and what they mean:
        - New text appearing / cursor advancing → user is actively typing (this IS their task)
        - Window or tab switch → context change, could be reference or distraction
        - Same content, no pixel changes → stalled, idle, or reading
        - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
        - Error message that APPEARED between frames → user just triggered it, relevant
        - Error message already in ALL frames → stale, ignore
        CRITICAL — looking at something ≠ working on something:
        - User switches to browser/another app and just LOOKS → distraction or quick reference.
        - User switches and starts TYPING/EDITING → might be a new task.
        - If the user has an active session and switches away WITHOUT typing in the new app,
        they are DISTRACTED from their session, not starting a new task.
        - A single app switch is NEVER enough to infer a new task. Wait for active work.
        ## Current task context
        Task: \(taskTitle.isEmpty ? "(no active task)" : taskTitle)
        Goal: \(taskGoal.isEmpty ? taskTitle : taskGoal)
        Steps:
        \(stepsText)
        Window title (OS): \(windowTitle.isEmpty ? "(unknown)" : windowTitle)
        ## Recent screen history (for temporal context)
        \(historyText)
        ## What to output
        Analyze the screenshots and return JSON with EXACTLY this structure (no extra fields, no markdown):
        {
          "on_task": true,
          "current_step_id": "step UUID or null",
          "inferred_task": "what the user is actually working on based on screen diffs",
          "checkpoint_note_update": "what specifically changed across these frames",
          "steps_completed": [],
          "friction": {
            "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
            "confidence": 0.0,
            "description": "what the user is struggling with",
            "proposed_actions": [
              {
                "label": "specific verb phrase: what to do",
                "action_type": "auto_extract | brain_dump | other",
                "details": "natural language spec for what action to take"
              }
            ],
            "source_context": "filename or app name, or null",
            "target_context": "filename or app name, or null"
          },
          "session_action": {
            "type": "none",
            "session_id": null,
            "reason": ""
          },
          "intent": "skimming | engaged | unclear | null",
          "distraction_type": "app_switch | browsing | idle | null",
          "app_name": "primary visible application",
          "confidence": 0.8,
          "gentle_nudge": "short nudge message if distracted but no friction action applies, otherwise null",
          "vlm_summary": "1-sentence description of what CHANGED across the frames (not what is static)"
        }
        FRICTION DETECTION rules:
        - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
        - STALLED: No meaningful pixel changes across 2+ frames; or user wrote then deleted
        - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
        - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
        - TASK_RESUMPTION: User just returned to a task they were working on earlier
        If friction confidence < 0.5, set type to "none".
        Only set gentle_nudge when user is off-task AND no actionable friction applies.
        """
    }

    // MARK: - Action Executor

    /// Execute a user-approved proactive action and return a plain-text result.
    /// - Parameters:
    ///   - label: The action label the user approved (shown back to the model).
    ///   - actionType: "auto_extract" or "brain_dump"; any other value falls
    ///     back to generic next-step advice.
    ///   - details: Natural-language spec produced by the analysis pass.
    ///   - screenshot: Optional JPEG giving the model visual context.
    /// - Throws: Whatever `callGemini` throws (transport / parse errors).
    func executeAction(
        label: String,
        actionType: String,
        details: String,
        screenshot: Data?
    ) async throws -> String {
        let taskInstruction: String
        switch actionType {
        case "auto_extract":
            taskInstruction = "Extract the relevant data from the screenshot and present it concisely as plain text."
        case "brain_dump":
            taskInstruction = "Format this as a short brain-dump note the user should add to their task list."
        default:
            // Was "23 concrete next steps" — a dash was stripped by a lossy
            // copy; restored as "2-3".
            taskInstruction = "Provide 2-3 concrete next steps the user can take right now."
        }
        let prompt = """
        You are a productivity assistant. The user approved this action: "\(label)"
        Details: \(details.isEmpty ? "(none)" : details)
        \(taskInstruction)
        Be specific and brief (3-5 sentences max). No markdown, no preamble, plain text only.
        """
        // Zero or one frame: reuse the multi-frame transport path.
        let frames: [Data] = screenshot.map { [$0] } ?? []
        return try await callGemini(prompt: prompt, frames: frames)
    }

    // MARK: - Gemini REST API Call

    /// POST the prompt + frames to `generateContent` and return the model's
    /// raw text reply (first part of the first candidate).
    /// - Throws: `URLError(.badURL)` if the endpoint URL cannot be formed,
    ///   `URLError(.badServerResponse)` on non-200 status,
    ///   `URLError(.cannotParseResponse)` if the response envelope is malformed.
    private func callGemini(prompt: String, frames: [Data]) async throws -> String {
        // Authenticate via header instead of `?key=` so the secret never
        // appears in URLs (logs, proxies, crash reports).
        let urlStr = "\(Self.apiBase)/\(Self.model):generateContent"
        guard let url = URL(string: urlStr) else { throw URLError(.badURL) }

        // Build content parts: a timing label + inline image per frame, then
        // the final instruction text.
        var parts: [[String: Any]] = []
        let total = frames.count
        for (i, frame) in frames.enumerated() {
            // Frames are ~5s apart with the newest last, so frame i was
            // (total - i) * 5 seconds ago. The separator dash restores a glyph
            // stripped by a lossy copy (label previously rendered "1/310s ago").
            parts.append(["text": "[Screenshot \(i + 1)/\(total) — \((total - i) * 5)s ago]"])
            parts.append([
                "inlineData": [
                    "mimeType": "image/jpeg",
                    "data": frame.base64EncodedString()
                ]
            ])
        }
        parts.append(["text": "Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences."])

        let body: [String: Any] = [
            "systemInstruction": ["parts": [["text": prompt]]],
            "contents": [["parts": parts]],
            "generationConfig": [
                "temperature": 0.2,      // low temperature → stable, parseable JSON
                "maxOutputTokens": 1024
            ]
        ]

        var request = URLRequest(url: url)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        request.setValue(apiKey, forHTTPHeaderField: "x-goog-api-key")
        request.httpBody = try JSONSerialization.data(withJSONObject: body)
        request.timeoutInterval = 60

        let (data, response) = try await URLSession.shared.data(for: request)
        if let http = response as? HTTPURLResponse, http.statusCode != 200 {
            let msg = String(data: data, encoding: .utf8) ?? "HTTP \(http.statusCode)"
            print("[GeminiVLM] API error \(http.statusCode): \(msg)")
            throw URLError(.badServerResponse)
        }

        // Dig candidates[0].content.parts[0].text out of the response envelope.
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let candidates = json["candidates"] as? [[String: Any]],
              let first = candidates.first,
              let content = first["content"] as? [String: Any],
              let contentParts = content["parts"] as? [[String: Any]],
              let text = contentParts.first?["text"] as? String
        else {
            let raw = String(data: data, encoding: .utf8) ?? ""
            print("[GeminiVLM] Unexpected response shape: \(raw.prefix(300))")
            throw URLError(.cannotParseResponse)
        }
        print("[GeminiVLM] Response (\(text.count) chars): \(text.prefix(200))")
        return text
    }

    // MARK: - Response Parsing

    /// Decode the model's reply into `DistractionAnalysisResponse`, tolerating
    /// markdown code fences and stray text around the JSON object.
    /// - Throws: `URLError(.cannotParseResponse)` if no `{...}` span is found;
    ///   `DecodingError` if the JSON does not match the expected schema.
    private func parseResponse(_ text: String) throws -> DistractionAnalysisResponse {
        var cleaned = text.trimmingCharacters(in: .whitespacesAndNewlines)
        // Strip ```json ... ``` or ``` ... ``` fences despite the prompt
        // asking for bare JSON — models occasionally add them anyway.
        if cleaned.hasPrefix("```") {
            let lines = cleaned.components(separatedBy: "\n")
            cleaned = lines.dropFirst().joined(separator: "\n")
            if let backtickRange = cleaned.range(of: "```") {
                cleaned = String(cleaned[..<backtickRange.lowerBound])
            }
            cleaned = cleaned.trimmingCharacters(in: .whitespacesAndNewlines)
        }
        // Take the outermost { ... } span so leading/trailing prose is ignored.
        guard let start = cleaned.firstIndex(of: "{"),
              let end = cleaned.lastIndex(of: "}") else {
            throw URLError(.cannotParseResponse)
        }
        let jsonStr = String(cleaned[start...end])
        guard let jsonData = jsonStr.data(using: .utf8) else {
            throw URLError(.cannotParseResponse)
        }
        return try JSONDecoder().decode(DistractionAnalysisResponse.self, from: jsonData)
    }
}