more proactive vlm agent features

This commit is contained in:
joyzhuo
2026-03-29 00:58:22 -04:00
parent 15943b4759
commit 275a53ab40
7 changed files with 431 additions and 120 deletions

View File

@@ -29,11 +29,15 @@ final class SessionManager {
var proactiveCard: ProactiveCard?
/// Set when the user approves a proposed action; shown as a confirmation toast.
var approvedActionLabel: String?
/// Latest one-sentence summary from the VLM, shown in the floating HUD.
/// Only overwritten when a result actually carries a summary.
var latestVlmSummary: String?
/// True while the argus executor is running an approved action.
/// Set after an approval is written to argus' stdin and cleared when a
/// `STATUS:exec_done:` line arrives on argus' stdout.
var isExecuting: Bool = false
/// Result produced by the executor's output() tool, shown as a sticky card in the HUD.
/// Populated from `EXEC_OUTPUT:` lines whose JSON carries `title` and `content` strings.
var executorOutput: (title: String, content: String)?
// Screenshot engine
/// Set to true once the argus subprocess has launched successfully.
/// NOTE(review): presumably also toggled by the fallback capture path — confirm.
var isCapturing: Bool = false
/// Live pipeline status shown in FocusSessionView (updated at each stage)
var argusStatus: String = ""
/// NOTE(review): presumably the handle of the fallback screenshot loop — confirm against startCapture().
private var captureTask: Task<Void, Never>?
/// NOTE(review): presumably the number of seconds between fallback captures — confirm against startCapture().
private let captureInterval: TimeInterval = 5.0
@@ -55,6 +59,10 @@ final class SessionManager {
// Argus subprocess (device-side VLM)
/// Handle to the running argus process; terminated and cleared when argus is stopped.
@ObservationIgnored private var argusProcess: Process?
/// Background task that reads argus' stdout line by line and dispatches on the line prefix.
@ObservationIgnored private var argusReadTask: Task<Void, Never>?
/// Pipe connected to argus' stdin; the user's card decisions are written to it as single lines.
@ObservationIgnored private var argusStdinPipe: Pipe?
/// Whether the current proactive card came from VLM (needs argus stdin response) vs local heuristic
@ObservationIgnored private var proactiveCardNeedsArgusResponse = false
/// Auto-dismiss timer for the card currently on screen; cancelled whenever the card is
/// replaced, dismissed, or approved.
@ObservationIgnored private var proactiveCardTimer: Task<Void, Never>?
/// Python interpreter used to launch argus. Its existence is checked before launch.
/// NOTE(review): absolute path into one developer's home directory — will not exist on other machines.
private let argusPythonPath = "/Users/joyzhuo/miniconda3/envs/gmr/bin/python3"
/// Argus checkout, used as the subprocess working directory.
/// NOTE(review): absolute path into one developer's home directory — will not exist on other machines.
private let argusRepoPath = "/Users/joyzhuo/yhack/lockinbro-argus"
@@ -148,6 +156,11 @@ final class SessionManager {
showingResumeCard = false
proactiveCard = nil
approvedActionLabel = nil
latestVlmSummary = nil
isExecuting = false
executorOutput = nil
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
screenshotHistory = []
persistedSessionId = nil
}
@@ -179,6 +192,49 @@ final class SessionManager {
}
}
// MARK: - Proactive Card Lifecycle
/// Show a proactive card and (re)start its 15-second auto-dismiss timer.
///
/// Any timer left over from a previously shown card is cancelled first, so only the
/// timer belonging to the card currently on screen may dismiss it.
/// - Parameters:
///   - card: The card to present.
///   - vlmCard: Pass true when the card came from VLM so argus gets a stdin response on dismiss.
private func showProactiveCard(_ card: ProactiveCard, vlmCard: Bool = false) {
    proactiveCardNeedsArgusResponse = vlmCard
    proactiveCardTimer?.cancel()
    withAnimation { proactiveCard = card }
    proactiveCardTimer = Task { [weak self] in
        // A cancelled sleep throws; the cancellation check below covers that case.
        try? await Task.sleep(for: .seconds(15))
        guard !Task.isCancelled, let self else { return }
        await MainActor.run {
            // Re-check after the hop to the main actor: the card may have been replaced,
            // dismissed, or approved (cancelling this timer) while the hop was pending.
            // Without this, a stale timer could dismiss a newer card and send a spurious
            // "0" to argus on its behalf.
            guard !Task.isCancelled else { return }
            self.dismissProactiveCard()
        }
    }
}
/// Hide the card currently on screen without approving anything.
///
/// Called when the user taps "Not now" or when the 15-second timer elapses.
/// If the card originated from the VLM, argus is told the proposal was declined
/// by writing choice 0 to its stdin.
func dismissProactiveCard() {
    // Whatever path we leave by, the next card starts with a clean flag.
    defer { proactiveCardNeedsArgusResponse = false }

    if let pendingTimer = proactiveCardTimer {
        pendingTimer.cancel()
        proactiveCardTimer = nil
    }
    withAnimation { proactiveCard = nil }

    guard proactiveCardNeedsArgusResponse else { return }
    sendArgusResponse(0)
}
/// Accept one of the actions offered by the card currently on screen.
///
/// - Parameter actionIndex: 0-based index of the chosen action. Argus' stdin protocol is
///   1-based (1 = action 0), so the value written is `actionIndex + 1`.
///
/// For VLM-originated cards the choice is forwarded to argus and `isExecuting` is raised;
/// for locally generated cards the card is simply hidden.
func approveProactiveCard(actionIndex: Int) {
    // Whatever path we leave by, the next card starts with a clean flag.
    defer { proactiveCardNeedsArgusResponse = false }

    proactiveCardTimer?.cancel()
    proactiveCardTimer = nil
    withAnimation { proactiveCard = nil }

    guard proactiveCardNeedsArgusResponse else { return }
    let argusChoice = actionIndex + 1
    sendArgusResponse(argusChoice)
    isExecuting = true
}
/// Write the user's decision about the current VLM card to argus' stdin as one line.
///
/// Does nothing when no stdin pipe is attached (argus is not running).
/// - Parameter choice: 0 when the card was dismissed; otherwise the 1-based index of the
///   approved action.
private func sendArgusResponse(_ choice: Int) {
    guard let pipe = argusStdinPipe else { return }
    // Encoding a Swift String as UTF-8 cannot fail, so no optional handling is needed.
    let line = Data("\(choice)\n".utf8)
    do {
        try pipe.fileHandleForWriting.write(contentsOf: line)
    } catch {
        // Still best-effort, but no longer silent: a failed write means argus never
        // receives the decision, so leave a trace for debugging.
        NSLog("%@", "Argus stdin write failed for choice \(choice): \(error)")
    }
}
// MARK: - App Switch Observer
private func startAppObserver() {
@@ -236,7 +292,7 @@ final class SessionManager {
// Only trigger card during active session and when none is already showing
guard isSessionActive, proactiveCard == nil else { return }
if let loop = detectRepetitiveLoop() {
proactiveCard = ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count))
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)), vlmCard: false)
}
}
@@ -266,7 +322,6 @@ final class SessionManager {
private func startArgus(session: FocusSession, task: AppTask?) {
guard FileManager.default.fileExists(atPath: argusPythonPath),
FileManager.default.fileExists(atPath: argusRepoPath) else {
argusStatus = "⚠️ Argus not found — using fallback capture"
startCapture()
return
}
@@ -303,7 +358,8 @@ final class SessionManager {
"--vlm", "gemini",
"--jwt", jwt,
"--backend-url", "https://wahwa.com/api/v1",
"--swift-ipc"
"--swift-ipc",
"--execute" // enables agentic executor; Swift sends 0/1/2 via stdin
]
if !geminiKey.isEmpty {
arguments += ["--gemini-key", geminiKey]
@@ -314,49 +370,47 @@ final class SessionManager {
process.currentDirectoryURL = URL(fileURLWithPath: argusRepoPath)
process.arguments = arguments
// Pipe stdout for RESULT: lines; redirect stderr so argus logs don't clutter console
// Pipe stdout for RESULT:/STATUS:/EXEC_OUTPUT: lines
// stderr is NOT captured — leaving it unset lets argus log to the system console
// without risk of the pipe buffer filling and blocking the process.
let stdoutPipe = Pipe()
let stderrPipe = Pipe()
let stdinPipe = Pipe()
process.standardOutput = stdoutPipe
process.standardError = stderrPipe
process.standardInput = stdinPipe
do {
try process.launch()
try process.run()
} catch {
argusStatus = "⚠️ Argus failed to launch — using fallback capture"
startCapture()
return
}
argusProcess = process
argusStdinPipe = stdinPipe
isCapturing = true
let taskLabel = task?.title ?? "session"
argusStatus = "🚀 Argus started — waiting for first screenshot…"
sendDebugNotification(title: "🚀 Argus VLM Started", body: "Screen monitoring active for \(taskLabel)")
// Read RESULT: lines from argus stdout in a background task
// Read RESULT:/STATUS:/EXEC_OUTPUT: lines from argus stdout in a background task
let fileHandle = stdoutPipe.fileHandleForReading
sendDebugNotification(title: "🚀 Argus VLM Started", body: "Screen monitoring active for \(taskLabel)")
argusReadTask = Task { [weak self] in
do {
for try await line in fileHandle.bytes.lines {
guard let self, !Task.isCancelled else { break }
if line.hasPrefix("STATUS:") {
let event = String(line.dropFirst("STATUS:".count))
await MainActor.run { self.handleArgusStatus(event) }
} else if line.hasPrefix("RESULT:") {
if line.hasPrefix("RESULT:") {
let jsonStr = String(line.dropFirst("RESULT:".count))
guard let data = jsonStr.data(using: .utf8) else { continue }
if let result = try? JSONDecoder().decode(DistractionAnalysisResponse.self, from: data) {
await MainActor.run {
let summary = result.vlmSummary ?? "no summary"
self.argusStatus = "\(summary)"
self.sendDebugNotification(title: "✅ VLM Result", body: summary)
self.applyDistractionResult(result)
}
if let data = jsonStr.data(using: .utf8),
let result = try? JSONDecoder().decode(DistractionAnalysisResponse.self, from: data) {
await MainActor.run { self.applyDistractionResult(result) }
}
} else if line.hasPrefix("STATUS:exec_done:") {
await MainActor.run { self.isExecuting = false }
} else if line.hasPrefix("EXEC_OUTPUT:") {
let jsonStr = String(line.dropFirst("EXEC_OUTPUT:".count))
if let data = jsonStr.data(using: .utf8),
let obj = try? JSONSerialization.jsonObject(with: data) as? [String: String],
let title = obj["title"], let content = obj["content"] {
await MainActor.run { self.executorOutput = (title: title, content: content) }
}
}
}
@@ -369,6 +423,7 @@ final class SessionManager {
private func stopArgus() {
argusReadTask?.cancel()
argusReadTask = nil
argusStdinPipe = nil
if let proc = argusProcess {
proc.terminate()
argusProcess = nil
@@ -482,6 +537,9 @@ final class SessionManager {
}
private func applyDistractionResult(_ result: DistractionAnalysisResponse) {
// 0. Store latest summary for the floating HUD
if let summary = result.vlmSummary { latestVlmSummary = summary }
// 1. Apply step side-effects (always)
for completedId in result.stepsCompleted {
if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) {
@@ -506,11 +564,11 @@ final class SessionManager {
// Task resumption detected — auto-surface the resume card without a button press
Task { await fetchResumeCard() }
} else if proactiveCard == nil {
proactiveCard = ProactiveCard(source: .vlmFriction(
showProactiveCard(ProactiveCard(source: .vlmFriction(
frictionType: friction.type,
description: friction.description,
actions: friction.proposedActions
))
)), vlmCard: true)
}
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
// Only nudge if VLM found no actionable friction
@@ -522,34 +580,6 @@ final class SessionManager {
// MARK: - Notifications
/// Reflect an argus pipeline stage in `argusStatus` and post a matching debug notification.
///
/// - Parameter event: Stage name taken from a `STATUS:` line. Names not listed in the
///   table below are ignored.
private func handleArgusStatus(_ event: String) {
    // Stage name → (status line for the session view, notification title, notification body).
    let stages: [String: (status: String, title: String, body: String)] = [
        "screenshot_captured": (
            status: "📸 Screenshot captured — sending to VLM…",
            title: "📸 Screenshot Captured",
            body: "Sending to VLM for analysis…"
        ),
        "vlm_running": (
            status: "🤖 VLM analyzing screen…",
            title: "🤖 VLM Running",
            body: "Gemini is analyzing your screen…"
        ),
        "vlm_done": (
            status: "🧠 VLM done — applying result…",
            title: "🧠 VLM Done",
            body: "Analysis complete, processing result…"
        ),
    ]

    guard let stage = stages[event] else { return }
    argusStatus = stage.status
    sendDebugNotification(title: stage.title, body: stage.body)
}
/// Post a user notification for debugging, delivered immediately (no trigger).
///
/// - Parameters:
///   - title: Notification title.
///   - body: Notification body text.
///
/// Each request gets a fresh `debug-<UUID>` identifier.
private func sendDebugNotification(title: String, body: String) {
    let payload = UNMutableNotificationContent()
    payload.title = title
    payload.body = body

    let requestID = "debug-\(UUID().uuidString)"
    let request = UNNotificationRequest(identifier: requestID, content: payload, trigger: nil)
    UNUserNotificationCenter.current().add(request)
}
private func sendNudgeNotification(_ nudge: String) {
let content = UNMutableNotificationContent()
content.title = "Hey, quick check-in!"