include argus workflow
This commit is contained in:
@@ -1,4 +1,6 @@
|
||||
// SessionManager.swift — Focus session state, screenshot engine, distraction detection
|
||||
// SessionManager.swift — Focus session state, native VLM screen analysis
|
||||
// Screenshot capture → Gemini Vision API → apply results to UI + post to backend.
|
||||
// No Python subprocess. No external process management.
|
||||
|
||||
import AppKit
|
||||
import SwiftUI
|
||||
@@ -25,30 +27,27 @@ final class SessionManager {
|
||||
var errorMessage: String?
|
||||
var isLoading: Bool = false
|
||||
|
||||
// Proactive agent
|
||||
// VLM / proactive agent
|
||||
var proactiveCard: ProactiveCard?
|
||||
/// Set when the user approves a proposed action — shown as a confirmation toast
|
||||
var approvedActionLabel: String?
|
||||
/// Latest one-sentence summary from the VLM, shown in the floating HUD
|
||||
var latestVlmSummary: String?
|
||||
/// True while the argus executor is running an approved action
|
||||
var latestInferredTask: String?
|
||||
var isExecuting: Bool = false
|
||||
/// Result produced by the executor's output() tool — shown as a sticky card in the HUD
|
||||
var executorOutput: (title: String, content: String)?
|
||||
var monitoringError: String?
|
||||
|
||||
// Screenshot engine
|
||||
var isCapturing: Bool = false
|
||||
|
||||
private var captureTask: Task<Void, Never>?
|
||||
@ObservationIgnored private var captureTask: Task<Void, Never>?
|
||||
private let captureInterval: TimeInterval = 5.0
|
||||
|
||||
// Rolling screenshot history buffer (max 4 entries, ~20-second window)
|
||||
// Provides temporal context to the VLM so it can detect patterns across captures.
|
||||
private struct ScreenshotHistoryEntry {
|
||||
let summary: String // vlm_summary text from the previous analysis
|
||||
let timestamp: Date
|
||||
}
|
||||
@ObservationIgnored private var screenshotHistory: [ScreenshotHistoryEntry] = []
|
||||
// Frame buffer — accumulate N frames before calling VLM for temporal diff context
|
||||
@ObservationIgnored private var frameBuffer: [Data] = []
|
||||
private let framesPerVLMCall = 3
|
||||
|
||||
// Rolling summary history fed as context into subsequent VLM calls
|
||||
private struct HistoryEntry { let summary: String; let timestamp: Date }
|
||||
@ObservationIgnored private var screenshotHistory: [HistoryEntry] = []
|
||||
|
||||
// App switch tracking
|
||||
@ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = []
|
||||
@@ -56,15 +55,8 @@ final class SessionManager {
|
||||
@ObservationIgnored private var lastApp: (name: String, bundleId: String) = ("", "")
|
||||
@ObservationIgnored private var lastAppEnteredAt: Date = Date()
|
||||
|
||||
// Argus subprocess (device-side VLM)
|
||||
@ObservationIgnored private var argusProcess: Process?
|
||||
@ObservationIgnored private var argusReadTask: Task<Void, Never>?
|
||||
@ObservationIgnored private var argusStdinPipe: Pipe?
|
||||
/// Whether the current proactive card came from VLM (needs argus stdin response) vs local heuristic
|
||||
@ObservationIgnored private var proactiveCardNeedsArgusResponse = false
|
||||
// Proactive card auto-dismiss timer
|
||||
@ObservationIgnored private var proactiveCardTimer: Task<Void, Never>?
|
||||
private let argusPythonPath = "/Users/joyzhuo/miniconda3/envs/gmr/bin/python3"
|
||||
private let argusRepoPath = "/Users/joyzhuo/yhack/lockinbro-argus"
|
||||
|
||||
private init() {}
|
||||
|
||||
@@ -83,9 +75,96 @@ final class SessionManager {
|
||||
return Date().timeIntervalSince(start)
|
||||
}
|
||||
|
||||
// MARK: - Monitoring Lifecycle
|
||||
|
||||
/// Tears down every monitoring facility and wipes in-memory session state.
/// Purely local: no backend request is issued from here.
func stopMonitoring() {
    // 1. Stop the background machinery first so nothing repopulates state below.
    stopCapture()
    stopAppObserver()
    proactiveCardTimer?.cancel()
    proactiveCardTimer = nil

    // 2. Session / task state.
    isSessionActive = false
    activeSession = nil
    activeTask = nil
    activeSteps = []
    sessionStartDate = nil
    persistedSessionId = nil

    // 3. Everything the HUD renders.
    proactiveCard = nil
    resumeCard = nil
    showingResumeCard = false
    lastNudge = nil
    latestVlmSummary = nil
    latestInferredTask = nil
    executorOutput = nil
    isExecuting = false
    monitoringError = nil

    // 4. VLM context buffers.
    frameBuffer = []
    screenshotHistory = []
}
|
||||
|
||||
/// Entry point invoked once after login.
///
/// Requires an auth token and a capture loop that is not already running.
/// If Screen Recording access is missing, the system prompt is requested,
/// `monitoringError` is set, and the method returns without starting anything.
/// Otherwise an active backend session is auto-resumed when one exists;
/// when there is none — or the lookup fails — the capture loop and the
/// app-switch observer are started directly.
func startMonitoring() async {
    guard TokenStore.shared.token != nil, !isCapturing else { return }

    monitoringError = nil
    await requestNotificationPermission()

    // Preflight is silent; the request call is only made when access is missing.
    guard CGPreflightScreenCaptureAccess() else {
        CGRequestScreenCaptureAccess()
        monitoringError = "Screen Recording permission required — enable in System Settings → Privacy & Security → Screen Recording, then tap Retry"
        return
    }

    // A failed lookup is treated the same as "no active session".
    if let existing = try? await APIClient.shared.getActiveSession() {
        await autoResumeSession(existing)
        return
    }

    startCapture()
    startAppObserver()
}
|
||||
|
||||
/// Silently resumes an active session found on the backend (no loading UI shown).
///
/// Resets per-session counters and VLM buffers, then tries to load the session's
/// task and steps. Loading is best-effort: a failure is logged and the session is
/// still resumed without task context. Finally the capture loop and app-switch
/// observer are started.
///
/// - Parameter session: The active session returned by the backend.
private func autoResumeSession(_ session: FocusSession) async {
    activeSession = session
    persistedSessionId = session.id
    isSessionActive = true
    // The elapsed-time clock restarts from the moment of resumption.
    sessionStartDate = Date()
    distractionCount = 0
    lastNudge = nil
    screenshotHistory = []
    frameBuffer = []

    if let taskId = session.taskId {
        do {
            let tasks = try await APIClient.shared.getTasks()
            activeTask = tasks.first(where: { $0.id == taskId })
            if let task = activeTask {
                let steps = try await APIClient.shared.getSteps(taskId: task.id)
                activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
                // Prefer the in-progress step, then the first pending one, else the first step.
                currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
                    ?? activeSteps.firstIndex(where: { $0.status == "pending" })
                    ?? 0
            }
        } catch {
            // Deliberately non-fatal: monitoring continues without task context,
            // but the failure is no longer swallowed silently.
            print("[Session] Failed to load task context for resumed session: \(error)")
        }
    }

    let shortId = String(session.id.prefix(8))
    let taskLabel = activeTask?.title ?? "(no task)"
    latestVlmSummary = "Resumed session \(shortId) · \(taskLabel)"

    startCapture()
    startAppObserver()
}
|
||||
|
||||
// MARK: - Session Lifecycle
|
||||
|
||||
// Persisted so we can end a stale session after an app restart
|
||||
private var persistedSessionId: String? {
|
||||
get { UserDefaults.standard.string(forKey: "lockInBro.lastSessionId") }
|
||||
set {
|
||||
@@ -98,18 +177,16 @@ final class SessionManager {
|
||||
isLoading = true
|
||||
errorMessage = nil
|
||||
do {
|
||||
let session: FocusSession
|
||||
do {
|
||||
session = try await APIClient.shared.startSession(taskId: task?.id)
|
||||
} catch NetworkError.httpError(409, _) {
|
||||
// End whichever session is active — prefer the locally known one,
|
||||
// fall back to the last persisted ID (survives app restarts)
|
||||
let staleId = activeSession?.id ?? persistedSessionId
|
||||
if let id = staleId {
|
||||
_ = try? await APIClient.shared.endSession(sessionId: id, status: "completed")
|
||||
}
|
||||
session = try await APIClient.shared.startSession(taskId: task?.id)
|
||||
// End any existing session first
|
||||
var staleId: String? = activeSession?.id ?? persistedSessionId
|
||||
if staleId == nil {
|
||||
staleId = (try? await APIClient.shared.getActiveSession())?.id
|
||||
}
|
||||
if let id = staleId {
|
||||
_ = try? await APIClient.shared.endSession(sessionId: id, status: "completed")
|
||||
}
|
||||
|
||||
let session = try await APIClient.shared.startSession(taskId: task?.id)
|
||||
activeSession = session
|
||||
persistedSessionId = session.id
|
||||
activeTask = task
|
||||
@@ -119,20 +196,22 @@ final class SessionManager {
|
||||
sessionStartDate = Date()
|
||||
distractionCount = 0
|
||||
lastNudge = nil
|
||||
screenshotHistory = []
|
||||
frameBuffer = []
|
||||
|
||||
if let task {
|
||||
let steps = try await APIClient.shared.getSteps(taskId: task.id)
|
||||
activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
|
||||
// Pick first in-progress or first pending step
|
||||
currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
|
||||
?? activeSteps.firstIndex(where: { $0.status == "pending" })
|
||||
?? 0
|
||||
}
|
||||
|
||||
screenshotHistory = []
|
||||
await requestNotificationPermission()
|
||||
startArgus(session: session, task: task)
|
||||
startAppObserver()
|
||||
// Restart capture loop (in case it wasn't running or was in monitoring-only mode)
|
||||
stopCapture()
|
||||
startCapture()
|
||||
if appSwitchObserver == nil { startAppObserver() }
|
||||
} catch {
|
||||
errorMessage = error.localizedDescription
|
||||
}
|
||||
@@ -140,7 +219,6 @@ final class SessionManager {
|
||||
}
|
||||
|
||||
func endSession(status: String = "completed") async {
|
||||
stopArgus()
|
||||
stopCapture()
|
||||
stopAppObserver()
|
||||
if let session = activeSession {
|
||||
@@ -155,14 +233,21 @@ final class SessionManager {
|
||||
resumeCard = nil
|
||||
showingResumeCard = false
|
||||
proactiveCard = nil
|
||||
approvedActionLabel = nil
|
||||
latestVlmSummary = nil
|
||||
latestInferredTask = nil
|
||||
isExecuting = false
|
||||
executorOutput = nil
|
||||
proactiveCardTimer?.cancel()
|
||||
proactiveCardTimer = nil
|
||||
screenshotHistory = []
|
||||
frameBuffer = []
|
||||
persistedSessionId = nil
|
||||
|
||||
// Keep the capture loop running for app-switch heuristics
|
||||
if TokenStore.shared.token != nil {
|
||||
startCapture()
|
||||
startAppObserver()
|
||||
}
|
||||
}
|
||||
|
||||
func fetchResumeCard() async {
|
||||
@@ -183,7 +268,6 @@ final class SessionManager {
|
||||
if let idx = activeSteps.firstIndex(where: { $0.id == updated.id }) {
|
||||
activeSteps[idx] = updated
|
||||
}
|
||||
// Advance to next pending
|
||||
if let next = activeSteps.firstIndex(where: { $0.status == "pending" }) {
|
||||
currentStepIndex = next
|
||||
}
|
||||
@@ -192,12 +276,19 @@ final class SessionManager {
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Retry (HUD Retry button)
|
||||
|
||||
/// Handler for the HUD "Retry" button: clears the surfaced error and any
/// partially filled frame batch, restarts the capture loop, and makes sure
/// the app-switch observer is installed.
func retryMonitoring() {
    monitoringError = nil
    frameBuffer.removeAll()

    // Full restart of the capture loop.
    stopCapture()
    startCapture()

    // The observer is only installed when it is not already present.
    guard appSwitchObserver == nil else { return }
    startAppObserver()
}
|
||||
|
||||
// MARK: - Proactive Card Lifecycle
|
||||
|
||||
/// Show a proactive card and start the 15-second auto-dismiss timer.
|
||||
/// - Parameter vlmCard: Pass true when the card came from VLM so argus gets a stdin response on dismiss.
|
||||
private func showProactiveCard(_ card: ProactiveCard, vlmCard: Bool = false) {
|
||||
proactiveCardNeedsArgusResponse = vlmCard
|
||||
private func showProactiveCard(_ card: ProactiveCard) {
|
||||
proactiveCardTimer?.cancel()
|
||||
withAnimation { proactiveCard = card }
|
||||
|
||||
@@ -208,31 +299,44 @@ final class SessionManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Dismisses the current proactive card (user tapped "Not now" or 15s elapsed).
/// If the card is flagged as needing an argus reply, choice `0` is written to
/// argus; the flag is cleared either way.
func dismissProactiveCard() {
    let argusAwaitingReply = proactiveCardNeedsArgusResponse
    proactiveCardNeedsArgusResponse = false

    proactiveCardTimer?.cancel()
    proactiveCardTimer = nil
    withAnimation { proactiveCard = nil }

    guard argusAwaitingReply else { return }
    sendArgusResponse(0)
}
|
||||
|
||||
/// Approve action at the given index (0-based). Argus stdin uses 1-based (1 = action 0).
|
||||
func approveProactiveCard(actionIndex: Int) {
|
||||
proactiveCardTimer?.cancel()
|
||||
proactiveCardTimer = nil
|
||||
let card = proactiveCard
|
||||
withAnimation { proactiveCard = nil }
|
||||
if proactiveCardNeedsArgusResponse {
|
||||
sendArgusResponse(actionIndex + 1)
|
||||
isExecuting = true
|
||||
guard case .vlmFriction(_, _, let actions) = card?.source,
|
||||
actionIndex < actions.count else { return }
|
||||
let action = actions[actionIndex]
|
||||
isExecuting = true
|
||||
Task {
|
||||
do {
|
||||
let screenshot = await captureScreen()
|
||||
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
|
||||
guard !geminiKey.isEmpty else {
|
||||
isExecuting = false
|
||||
executorOutput = (title: action.label, content: action.details ?? "Action approved.")
|
||||
return
|
||||
}
|
||||
let client = GeminiVLMClient(apiKey: geminiKey)
|
||||
let result = try await client.executeAction(
|
||||
label: action.label,
|
||||
actionType: action.actionType,
|
||||
details: action.details ?? "",
|
||||
screenshot: screenshot
|
||||
)
|
||||
isExecuting = false
|
||||
executorOutput = (title: action.label, content: result)
|
||||
} catch {
|
||||
isExecuting = false
|
||||
executorOutput = (title: action.label, content: action.details ?? "Couldn't complete automatically.")
|
||||
}
|
||||
}
|
||||
proactiveCardNeedsArgusResponse = false
|
||||
}
|
||||
|
||||
/// Writes `choice` followed by a newline to the argus stdin pipe.
/// Does nothing when no pipe is attached; write failures are ignored.
private func sendArgusResponse(_ choice: Int) {
    guard let stdin = argusStdinPipe else { return }
    let payload = Data("\(choice)\n".utf8)
    try? stdin.fileHandleForWriting.write(contentsOf: payload)
}
|
||||
|
||||
// MARK: - App Switch Observer
|
||||
@@ -269,7 +373,7 @@ final class SessionManager {
|
||||
|
||||
guard name != lastApp.name else { return }
|
||||
|
||||
// Log previous app's dwell time to backend (fire-and-forget)
|
||||
// Log previous app dwell time to backend
|
||||
let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt)))
|
||||
let prev = lastApp
|
||||
if let session = activeSession, !prev.name.isEmpty {
|
||||
@@ -289,159 +393,36 @@ final class SessionManager {
|
||||
appSwitches.append((name: name, bundleId: bundleId, time: now))
|
||||
if appSwitches.count > 30 { appSwitches.removeFirst() }
|
||||
|
||||
// Only trigger card during active session and when none is already showing
|
||||
guard isSessionActive, proactiveCard == nil else { return }
|
||||
if let loop = detectRepetitiveLoop() {
|
||||
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)), vlmCard: false)
|
||||
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Detects a back-and-forth pattern between exactly two apps within the last
/// five minutes. The six most recent switches must strictly alternate
/// (three full cycles) to keep false positives down.
///
/// - Returns: The two app names (sorted) with a cycle count of 3, or `nil`
///   when no such loop is present.
private func detectRepetitiveLoop() -> (apps: [String], count: Int)? {
    let windowStart = Date().addingTimeInterval(-300)
    let recentNames = appSwitches.filter { $0.time > windowStart }.map(\.name)
    guard recentNames.count >= 6 else { return nil }

    let tail = Array(recentNames.suffix(6))
    let distinctApps = Set(tail)
    guard distinctApps.count == 2 else { return nil }

    // Strict alternation: no entry may equal its predecessor.
    let alternates = zip(tail, tail.dropFirst()).allSatisfy { pair in pair.0 != pair.1 }
    guard alternates else { return nil }

    return (apps: distinctApps.sorted(), count: 3)
}
|
||||
|
||||
// MARK: - Argus Subprocess (device-side VLM)
|
||||
|
||||
/// Launches the argus Python daemon as a child process and consumes its stdout.
///
/// Per the original notes, argus captures screenshots itself, runs them through a
/// VLM, posts results to the backend, and emits `RESULT:{json}` lines for Swift.
/// If the interpreter or repo path is missing, or the process fails to start,
/// this falls back to the internal `startCapture()` loop.
///
/// - Parameters:
///   - session: Session whose id is passed to argus.
///   - task: Optional task supplying the title/goal arguments.
///
/// NOTE(review): the JWT and Gemini key are passed as command-line arguments and
/// the interpreter/repo paths are hard-coded elsewhere in this type — worth
/// confirming this is acceptable outside a development machine.
private func startArgus(session: FocusSession, task: AppTask?) {
    let fm = FileManager.default
    guard fm.fileExists(atPath: argusPythonPath), fm.fileExists(atPath: argusRepoPath) else {
        startCapture()
        return
    }

    // Serialise the steps for --steps-json; "[]" when there are none or encoding fails.
    let stepsJSON: String = {
        guard !activeSteps.isEmpty else { return "[]" }
        let encodable: [[String: Any]] = activeSteps.map { step in
            var entry: [String: Any] = [
                "id": step.id,
                "sort_order": step.sortOrder,
                "title": step.title,
                "status": step.status
            ]
            if let note = step.checkpointNote { entry["checkpoint_note"] = note }
            return entry
        }
        guard let raw = try? JSONSerialization.data(withJSONObject: encodable),
              let text = String(data: raw, encoding: .utf8) else { return "[]" }
        return text
    }()

    let jwt = TokenStore.shared.token ?? ""
    let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""

    var argv = [
        "-m", "argus",
        "--session-id", session.id,
        "--task-title", task?.title ?? "(no task)",
        "--task-goal", task?.description ?? "",
        "--steps-json", stepsJSON,
        "--window-title", NSWorkspace.shared.frontmostApplication?.localizedName ?? "",
        "--vlm", "gemini",
        "--jwt", jwt,
        "--backend-url", "https://wahwa.com/api/v1",
        "--swift-ipc",
        "--execute" // enables agentic executor; Swift sends 0/1/2 via stdin
    ]
    if !geminiKey.isEmpty {
        argv.append(contentsOf: ["--gemini-key", geminiKey])
    }

    // stdout carries RESULT:/STATUS:/EXEC_OUTPUT: lines. stderr is intentionally
    // left unset so argus can log without a pipe buffer filling up and blocking it.
    let outPipe = Pipe()
    let inPipe = Pipe()

    let child = Process()
    child.executableURL = URL(fileURLWithPath: argusPythonPath)
    child.currentDirectoryURL = URL(fileURLWithPath: argusRepoPath)
    child.arguments = argv
    child.standardOutput = outPipe
    child.standardInput = inPipe

    do {
        try child.run()
    } catch {
        startCapture()
        return
    }

    argusProcess = child
    argusStdinPipe = inPipe
    isCapturing = true

    // Background reader for the child's stdout.
    let stdoutHandle = outPipe.fileHandleForReading
    argusReadTask = Task { [weak self] in
        do {
            for try await line in stdoutHandle.bytes.lines {
                guard let self, !Task.isCancelled else { break }

                if line.hasPrefix("RESULT:") {
                    let payload = Data(line.dropFirst("RESULT:".count).utf8)
                    if let analysis = try? JSONDecoder().decode(DistractionAnalysisResponse.self, from: payload) {
                        await MainActor.run { self.applyDistractionResult(analysis) }
                    }
                } else if line.hasPrefix("STATUS:exec_done:") {
                    await MainActor.run { self.isExecuting = false }
                } else if line.hasPrefix("EXEC_OUTPUT:") {
                    let payload = Data(line.dropFirst("EXEC_OUTPUT:".count).utf8)
                    if let fields = try? JSONSerialization.jsonObject(with: payload) as? [String: String],
                       let title = fields["title"], let content = fields["content"] {
                        await MainActor.run { self.executorOutput = (title: title, content: content) }
                    }
                }
            }
        } catch {
            // Pipe closed — argus process ended
        }
    }
}
|
||||
|
||||
/// Cancels the stdout reader, drops the stdin pipe and, when a child process
/// is tracked, terminates it and marks capturing as stopped.
private func stopArgus() {
    argusReadTask?.cancel()
    argusReadTask = nil
    argusStdinPipe = nil

    // `isCapturing` is only touched when a process was actually tracked.
    guard let running = argusProcess else { return }
    running.terminate()
    argusProcess = nil
    isCapturing = false
}
|
||||
|
||||
// MARK: - Screenshot Capture Loop (fallback when argus is unavailable)
|
||||
// MARK: - Screenshot Capture Loop
|
||||
|
||||
private func startCapture() {
|
||||
guard !isCapturing else { return }
|
||||
isCapturing = true
|
||||
captureTask = Task { [weak self] in
|
||||
guard let self else { return }
|
||||
// Capture immediately on session start, then repeat on interval
|
||||
// Capture immediately, then repeat on interval
|
||||
await self.captureAndAnalyze()
|
||||
while !Task.isCancelled && self.isSessionActive {
|
||||
while !Task.isCancelled {
|
||||
try? await Task.sleep(for: .seconds(self.captureInterval))
|
||||
guard !Task.isCancelled && self.isSessionActive else { break }
|
||||
guard !Task.isCancelled else { break }
|
||||
await self.captureAndAnalyze()
|
||||
}
|
||||
}
|
||||
@@ -453,56 +434,77 @@ final class SessionManager {
|
||||
isCapturing = false
|
||||
}
|
||||
|
||||
/// Capture one frame, buffer it, and call VLM every `framesPerVLMCall` frames.
|
||||
private func captureAndAnalyze() async {
|
||||
guard let session = activeSession else { return }
|
||||
guard let imageData = await captureScreen() else { return }
|
||||
|
||||
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? "Unknown"
|
||||
var context = buildTaskContext()
|
||||
frameBuffer.append(imageData)
|
||||
// Keep buffer bounded — rolling window of most recent frames
|
||||
if frameBuffer.count > framesPerVLMCall { frameBuffer.removeFirst() }
|
||||
|
||||
// Inject rolling history so the VLM has temporal context across captures.
|
||||
// Only summaries (text) are sent — not the raw images — to keep token cost low.
|
||||
if !screenshotHistory.isEmpty {
|
||||
let iso = ISO8601DateFormatter()
|
||||
context["screenshot_history"] = screenshotHistory.map { entry in
|
||||
["summary": entry.summary, "timestamp": iso.string(from: entry.timestamp)]
|
||||
}
|
||||
// Only call VLM once we have a full batch for temporal diff analysis
|
||||
guard frameBuffer.count >= framesPerVLMCall else { return }
|
||||
|
||||
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
|
||||
guard !geminiKey.isEmpty else {
|
||||
print("[VLM] No Gemini API key set — skipping analysis")
|
||||
return
|
||||
}
|
||||
|
||||
do {
|
||||
let result = try await APIClient.shared.analyzeScreenshot(
|
||||
imageData: imageData,
|
||||
windowTitle: windowTitle,
|
||||
sessionId: session.id,
|
||||
taskContext: context
|
||||
)
|
||||
let client = GeminiVLMClient(apiKey: geminiKey)
|
||||
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? ""
|
||||
let recentSummaries = screenshotHistory.map(\.summary)
|
||||
let frames = frameBuffer // snapshot before async gap
|
||||
|
||||
// Append this result's summary to the rolling buffer (max 4 entries)
|
||||
if let summary = result.vlmSummary {
|
||||
screenshotHistory.append(ScreenshotHistoryEntry(summary: summary, timestamp: Date()))
|
||||
do {
|
||||
print("[VLM] Calling Gemini with \(frames.count) frames…")
|
||||
let result = try await client.analyze(
|
||||
frames: frames,
|
||||
taskTitle: activeTask?.title ?? "",
|
||||
taskGoal: activeTask?.description ?? "",
|
||||
steps: activeSteps,
|
||||
windowTitle: windowTitle,
|
||||
recentSummaries: recentSummaries
|
||||
)
|
||||
print("[VLM] Result: on_task=\(result.onTask), friction=\(result.friction?.type ?? "none"), summary=\(result.vlmSummary ?? "")")
|
||||
|
||||
// Append to rolling summary history
|
||||
if let summary = result.vlmSummary, !summary.isEmpty {
|
||||
screenshotHistory.append(HistoryEntry(summary: summary, timestamp: Date()))
|
||||
if screenshotHistory.count > 4 { screenshotHistory.removeFirst() }
|
||||
}
|
||||
|
||||
// Clear frame buffer — next batch starts fresh
|
||||
frameBuffer.removeAll()
|
||||
|
||||
monitoringError = nil
|
||||
applyDistractionResult(result)
|
||||
|
||||
// Post result to backend (fire-and-forget)
|
||||
if let session = activeSession {
|
||||
Task {
|
||||
try? await APIClient.shared.postAnalysisResult(result, sessionId: session.id)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// Silent fail — don't interrupt the user
|
||||
print("[VLM] Analysis error: \(error)")
|
||||
// Don't surface transient errors — the next attempt will retry automatically
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Screen Capture
|
||||
|
||||
private func captureScreen() async -> Data? {
|
||||
guard CGPreflightScreenCaptureAccess() else { return nil }
|
||||
do {
|
||||
let content = try await SCShareableContent.current
|
||||
guard let display = content.displays.first else { return nil }
|
||||
|
||||
let config = SCStreamConfiguration()
|
||||
config.width = 1280
|
||||
config.height = 720
|
||||
|
||||
let filter = SCContentFilter(display: display, excludingWindows: [])
|
||||
let image = try await SCScreenshotManager.captureImage(
|
||||
contentFilter: filter,
|
||||
configuration: config
|
||||
)
|
||||
contentFilter: filter, configuration: config)
|
||||
return cgImageToJPEG(image)
|
||||
} catch {
|
||||
return nil
|
||||
@@ -518,29 +520,13 @@ final class SessionManager {
|
||||
return jpeg
|
||||
}
|
||||
|
||||
/// Builds the task-context dictionary for the active task.
///
/// - Returns: An empty dictionary when there is no active task; otherwise
///   `task_title`, `task_goal` (falling back to the title when the task has no
///   description) and `steps`, one dictionary per active step.
private func buildTaskContext() -> [String: Any] {
    guard let task = activeTask else { return [:] }

    let stepEntries: [[String: Any]] = activeSteps.map { step in
        var entry: [String: Any] = [
            "id": step.id,
            "sort_order": step.sortOrder,
            "title": step.title,
            "status": step.status
        ]
        // The checkpoint note is optional and omitted when absent.
        if let note = step.checkpointNote { entry["checkpoint_note"] = note }
        return entry
    }

    return [
        "task_title": task.title,
        "task_goal": task.description ?? task.title,
        "steps": stepEntries
    ]
}
|
||||
// MARK: - Apply VLM Result
|
||||
|
||||
private func applyDistractionResult(_ result: DistractionAnalysisResponse) {
|
||||
// 0. Store latest summary for the floating HUD
|
||||
if let summary = result.vlmSummary { latestVlmSummary = summary }
|
||||
if let task = result.inferredTask, !task.isEmpty { latestInferredTask = task }
|
||||
|
||||
// 1. Apply step side-effects (always)
|
||||
// Apply step side-effects
|
||||
for completedId in result.stepsCompleted {
|
||||
if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) {
|
||||
activeSteps[idx].status = "done"
|
||||
@@ -556,22 +542,25 @@ final class SessionManager {
|
||||
currentStepIndex = idx
|
||||
}
|
||||
|
||||
// 2. Notification priority (design spec §1.5):
|
||||
// Proactive friction help → Context resume → Gentle nudge
|
||||
// NEVER nudge when the system could help instead.
|
||||
if let friction = result.friction, friction.isActionable {
|
||||
if friction.isResumption {
|
||||
// Task resumption detected — auto-surface resume card without button press
|
||||
Task { await fetchResumeCard() }
|
||||
} else if proactiveCard == nil {
|
||||
showProactiveCard(ProactiveCard(source: .vlmFriction(
|
||||
frictionType: friction.type,
|
||||
description: friction.description,
|
||||
actions: friction.proposedActions
|
||||
)), vlmCard: true)
|
||||
// Notification priority: friction card (formal or has actions) → nudge
|
||||
if let friction = result.friction {
|
||||
let shouldShow = friction.isActionable || !friction.proposedActions.isEmpty
|
||||
if shouldShow {
|
||||
if friction.isResumption {
|
||||
Task { await fetchResumeCard() }
|
||||
} else if proactiveCard == nil {
|
||||
showProactiveCard(ProactiveCard(source: .vlmFriction(
|
||||
frictionType: friction.type,
|
||||
description: friction.description,
|
||||
actions: friction.proposedActions
|
||||
)))
|
||||
}
|
||||
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
|
||||
distractionCount += 1
|
||||
lastNudge = nudge
|
||||
sendNudgeNotification(nudge)
|
||||
}
|
||||
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
|
||||
// Only nudge if VLM found no actionable friction
|
||||
distractionCount += 1
|
||||
lastNudge = nudge
|
||||
sendNudgeNotification(nudge)
|
||||
|
||||
Reference in New Issue
Block a user