// GeminiVLMClient.swift — Native Swift Gemini Vision API client
// Ports the Python argus VLM analysis (vlm.py) directly into Swift.
// No subprocess required: screenshots go straight from ScreenCaptureKit → Gemini → UI.

import Foundation

struct GeminiVLMClient {

    /// Base URL of the Gemini generative-language REST API.
    private static let apiBase = "https://generativelanguage.googleapis.com/v1beta/models"
    /// Model identifier used for every vision call.
    private static let model = "gemini-3.1-pro-preview"
    /// Capture cadence in seconds between frames. Keep in sync with the
    /// "~5 seconds apart" wording baked into the analysis prompt.
    private static let frameIntervalSeconds = 5

    /// Google AI Studio API key. Sent via the `x-goog-api-key` request header
    /// rather than the URL query string so it cannot leak into server logs,
    /// proxies, or crash reports.
    let apiKey: String

    // MARK: - Public

    /// Analyze a sequence of JPEG frames and return a structured distraction analysis.
    /// - Parameters:
    ///   - frames: JPEG screenshot frames, oldest first, newest last.
    ///   - taskTitle: Current task title (empty if no session).
    ///   - taskGoal: Task description / goal.
    ///   - steps: Active step list for the current task.
    ///   - windowTitle: Frontmost app name from NSWorkspace.
    ///   - recentSummaries: Rolling summaries from previous analyses (temporal context).
    /// - Returns: The model's reply decoded as `DistractionAnalysisResponse`.
    /// - Throws: `URLError` for transport/HTTP failures or an unparsable reply;
    ///   `DecodingError` when the returned JSON does not match the schema.
    func analyze(
        frames: [Data],
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) async throws -> DistractionAnalysisResponse {
        let prompt = buildPrompt(
            taskTitle: taskTitle,
            taskGoal: taskGoal,
            steps: steps,
            windowTitle: windowTitle,
            recentSummaries: recentSummaries
        )
        let raw = try await callGemini(prompt: prompt, frames: frames)
        return try parseResponse(raw)
    }

    // MARK: - Prompt Builder (ported from vlm.py build_system_prompt)

    /// Render the system prompt: frame-diff reading instructions, the current
    /// task context and step list, the rolling screen history, and the exact
    /// JSON schema the model must emit.
    private func buildPrompt(
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) -> String {
        // Render each step as " <marker> [status] (id=...) <order>. <title>".
        let stepsText: String
        if steps.isEmpty {
            stepsText = " (no steps defined)"
        } else {
            stepsText = steps.map { s in
                let marker: String
                switch s.status {
                case "pending": marker = "○"
                case "in_progress": marker = "►"
                case "done": marker = "✓"
                default: marker = "?"
                }
                var line = " \(marker) [\(s.status)] (id=\(s.id)) \(s.sortOrder). \(s.title)"
                if let note = s.checkpointNote { line += " — checkpoint: \(note)" }
                return line
            }.joined(separator: "\n")
        }

        // Prior analysis summaries, oldest first, so the model sees trajectory.
        let historyText: String
        if recentSummaries.isEmpty {
            historyText = " (no previous frames)"
        } else {
            historyText = recentSummaries.enumerated()
                .map { i, s in " [frame \(i + 1)] \(s)" }
                .joined(separator: "\n")
        }

        return """
        You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.

        ## How to read the screenshots

        You receive screenshots in chronological order (oldest first, newest last).
        Each frame is ~5 seconds apart. This means:
        - 2 unchanged frames = ~10 seconds idle — significant.
        - 3 unchanged frames = ~15 seconds idle — user is stuck or distracted.
        - If ALL frames are identical, the user has been idle for 15+ seconds — flag it.

        Your PRIMARY signal is the DIFFERENCES between consecutive frames.
        Where the screen CHANGED = where attention is. Static areas = ignore.

        Diff signals and what they mean:
        - New text appearing / cursor advancing → user is actively typing (this IS their task)
        - Window or tab switch → context change, could be reference or distraction
        - Same content, no pixel changes → stalled, idle, or reading
        - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
        - Error message that APPEARED between frames → user just triggered it, relevant
        - Error message already in ALL frames → stale, ignore

        CRITICAL — looking at something ≠ working on something:
        - User switches to browser/another app and just LOOKS → distraction or quick reference.
        - User switches and starts TYPING/EDITING → might be a new task.
        - If the user has an active session and switches away WITHOUT typing in the new app,
        they are DISTRACTED from their session, not starting a new task.
        - A single app switch is NEVER enough to infer a new task. Wait for active work.

        ## Current task context

        Task: \(taskTitle.isEmpty ? "(no active task)" : taskTitle)
        Goal: \(taskGoal.isEmpty ? taskTitle : taskGoal)
        Steps:
        \(stepsText)
        Window title (OS): \(windowTitle.isEmpty ? "(unknown)" : windowTitle)

        ## Recent screen history (for temporal context)
        \(historyText)

        ## What to output

        Analyze the screenshots and return JSON with EXACTLY this structure (no extra fields, no markdown):
        {
        "on_task": true,
        "current_step_id": "step UUID or null",
        "inferred_task": "what the user is actually working on based on screen diffs",
        "checkpoint_note_update": "what specifically changed across these frames",
        "steps_completed": [],
        "friction": {
        "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
        "confidence": 0.0,
        "description": "what the user is struggling with",
        "proposed_actions": [
        {
        "label": "specific verb phrase: what to do",
        "action_type": "auto_extract | brain_dump | other",
        "details": "natural language spec for what action to take"
        }
        ],
        "source_context": "filename or app name, or null",
        "target_context": "filename or app name, or null"
        },
        "session_action": {
        "type": "none",
        "session_id": null,
        "reason": ""
        },
        "intent": "skimming | engaged | unclear | null",
        "distraction_type": "app_switch | browsing | idle | null",
        "app_name": "primary visible application",
        "confidence": 0.8,
        "gentle_nudge": "short nudge message if distracted but no friction action applies, otherwise null",
        "vlm_summary": "1-sentence description of what CHANGED across the frames (not what is static)"
        }

        FRICTION DETECTION rules:
        - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
        - STALLED: No meaningful pixel changes across 2+ frames; or user wrote then deleted
        - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
        - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
        - TASK_RESUMPTION: User just returned to a task they were working on earlier

        If friction confidence < 0.5, set type to "none".
        Only set gentle_nudge when user is off-task AND no actionable friction applies.
        """
    }

    // MARK: - Action Executor

    /// Execute a user-approved proactive action and return a plain-text result.
    /// - Parameters:
    ///   - label: The action label the user approved (shown to the model verbatim).
    ///   - actionType: One of "auto_extract", "brain_dump", or anything else
    ///     (falls back to a generic next-steps instruction).
    ///   - details: Natural-language spec from the earlier analysis (may be empty).
    ///   - screenshot: Optional JPEG of the current screen for extraction context.
    func executeAction(
        label: String,
        actionType: String,
        details: String,
        screenshot: Data?
    ) async throws -> String {
        // Map the approved action type onto a concrete model instruction.
        let taskInstruction: String
        switch actionType {
        case "auto_extract":
            taskInstruction = "Extract the relevant data from the screenshot and present it concisely as plain text."
        case "brain_dump":
            taskInstruction = "Format this as a short brain-dump note the user should add to their task list."
        default:
            taskInstruction = "Provide 2–3 concrete next steps the user can take right now."
        }
        let prompt = """
        You are a productivity assistant. The user approved this action: "\(label)"
        Details: \(details.isEmpty ? "(none)" : details)
        \(taskInstruction)
        Be specific and brief (3–5 sentences max). No markdown, no preamble, plain text only.
        """
        let frames: [Data] = screenshot.map { [$0] } ?? []
        return try await callGemini(prompt: prompt, frames: frames)
    }

    // MARK: - Gemini REST API Call

    /// POST a generateContent request and return the model's raw text reply.
    /// The prompt travels as `systemInstruction`; each frame becomes a
    /// timestamp label followed by an inline base64 JPEG part.
    /// - Throws: `URLError(.badURL)` if the endpoint cannot be formed,
    ///   `URLError(.badServerResponse)` on a non-2xx status, or
    ///   `URLError(.cannotParseResponse)` when the reply shape is unexpected.
    private func callGemini(prompt: String, frames: [Data]) async throws -> String {
        // NOTE: the API key deliberately stays out of the URL (see `apiKey` doc);
        // it is attached below as the `x-goog-api-key` header instead.
        let urlStr = "\(Self.apiBase)/\(Self.model):generateContent"
        guard let url = URL(string: urlStr) else { throw URLError(.badURL) }

        // Build content parts: label + image for each frame, then instruction
        var parts: [[String: Any]] = []
        let total = frames.count
        for (i, frame) in frames.enumerated() {
            parts.append(["text": "[Screenshot \(i + 1)/\(total) — \((total - i) * Self.frameIntervalSeconds)s ago]"])
            parts.append([
                "inlineData": [
                    "mimeType": "image/jpeg",
                    "data": frame.base64EncodedString()
                ]
            ])
        }
        parts.append(["text": "Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences."])

        let body: [String: Any] = [
            "systemInstruction": ["parts": [["text": prompt]]],
            "contents": [["parts": parts]],
            "generationConfig": [
                "temperature": 0.2,
                "maxOutputTokens": 1024
            ]
        ]

        var request = URLRequest(url: url)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        request.setValue(apiKey, forHTTPHeaderField: "x-goog-api-key")
        request.httpBody = try JSONSerialization.data(withJSONObject: body)
        request.timeoutInterval = 60

        let (data, response) = try await URLSession.shared.data(for: request)

        // Accept any 2xx success code; surface the server's error body for diagnostics.
        if let http = response as? HTTPURLResponse, !(200...299).contains(http.statusCode) {
            let msg = String(data: data, encoding: .utf8) ?? "HTTP \(http.statusCode)"
            print("[GeminiVLM] API error \(http.statusCode): \(msg)")
            throw URLError(.badServerResponse)
        }

        // Drill into candidates[0].content.parts[0].text — the standard
        // generateContent response shape.
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let candidates = json["candidates"] as? [[String: Any]],
              let first = candidates.first,
              let content = first["content"] as? [String: Any],
              let contentParts = content["parts"] as? [[String: Any]],
              let text = contentParts.first?["text"] as? String
        else {
            let raw = String(data: data, encoding: .utf8) ?? ""
            print("[GeminiVLM] Unexpected response shape: \(raw.prefix(300))")
            throw URLError(.cannotParseResponse)
        }

        print("[GeminiVLM] Response (\(text.count) chars): \(text.prefix(200))")
        return text
    }

    // MARK: - Response Parsing

    /// Decode the model's reply into a `DistractionAnalysisResponse`.
    /// Tolerates markdown code fences and surrounding chatter by extracting
    /// the outermost `{...}` span before decoding.
    private func parseResponse(_ text: String) throws -> DistractionAnalysisResponse {
        var cleaned = text.trimmingCharacters(in: .whitespacesAndNewlines)
        // Strip ```json ... ``` or ``` ... ``` fences
        if cleaned.hasPrefix("```") {
            let lines = cleaned.components(separatedBy: "\n")
            cleaned = lines.dropFirst().joined(separator: "\n")
            if let backtickRange = cleaned.range(of: "```") {
                cleaned = String(cleaned[..<backtickRange.lowerBound])
            }
            cleaned = cleaned.trimmingCharacters(in: .whitespacesAndNewlines)
        }
        // Find JSON object boundaries robustly. `start <= end` is required:
        // subscripting with an inverted range (e.g. for the input "}{")
        // would trap at runtime instead of throwing.
        guard let start = cleaned.firstIndex(of: "{"),
              let end = cleaned.lastIndex(of: "}"),
              start <= end else {
            throw URLError(.cannotParseResponse)
        }
        let jsonStr = String(cleaned[start...end])
        guard let jsonData = jsonStr.data(using: .utf8) else {
            throw URLError(.cannotParseResponse)
        }
        return try JSONDecoder().decode(DistractionAnalysisResponse.self, from: jsonData)
    }
}
|