include argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

View File

@@ -1,4 +1,6 @@
// SessionManager.swift Focus session state, screenshot engine, distraction detection
// SessionManager.swift — Focus session state, native VLM screen analysis
// Screenshot capture → Gemini Vision API → apply results to UI + post to backend.
// No Python subprocess. No external process management.
import AppKit
import SwiftUI
@@ -25,30 +27,27 @@ final class SessionManager {
var errorMessage: String?
var isLoading: Bool = false
// Proactive agent
// VLM / proactive agent
var proactiveCard: ProactiveCard?
/// Set when the user approves a proposed action shown as a confirmation toast
var approvedActionLabel: String?
/// Latest one-sentence summary from the VLM, shown in the floating HUD
var latestVlmSummary: String?
/// True while the argus executor is running an approved action
var latestInferredTask: String?
var isExecuting: Bool = false
/// Result produced by the executor's output() tool shown as a sticky card in the HUD
var executorOutput: (title: String, content: String)?
var monitoringError: String?
// Screenshot engine
var isCapturing: Bool = false
private var captureTask: Task<Void, Never>?
@ObservationIgnored private var captureTask: Task<Void, Never>?
private let captureInterval: TimeInterval = 5.0
// Rolling screenshot history buffer (max 4 entries, ~20-second window)
// Provides temporal context to the VLM so it can detect patterns across captures.
private struct ScreenshotHistoryEntry {
let summary: String // vlm_summary text from the previous analysis
let timestamp: Date
}
@ObservationIgnored private var screenshotHistory: [ScreenshotHistoryEntry] = []
// Frame buffer — accumulate N frames before calling the VLM, for temporal diff context
@ObservationIgnored private var frameBuffer: [Data] = []
private let framesPerVLMCall = 3
// Rolling summary history fed as context into subsequent VLM calls
private struct HistoryEntry { let summary: String; let timestamp: Date }
@ObservationIgnored private var screenshotHistory: [HistoryEntry] = []
// App switch tracking
@ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = []
@@ -56,15 +55,8 @@ final class SessionManager {
@ObservationIgnored private var lastApp: (name: String, bundleId: String) = ("", "")
@ObservationIgnored private var lastAppEnteredAt: Date = Date()
// Argus subprocess (device-side VLM)
@ObservationIgnored private var argusProcess: Process?
@ObservationIgnored private var argusReadTask: Task<Void, Never>?
@ObservationIgnored private var argusStdinPipe: Pipe?
/// Whether the current proactive card came from VLM (needs argus stdin response) vs local heuristic
@ObservationIgnored private var proactiveCardNeedsArgusResponse = false
// Proactive card auto-dismiss timer
@ObservationIgnored private var proactiveCardTimer: Task<Void, Never>?
private let argusPythonPath = "/Users/joyzhuo/miniconda3/envs/gmr/bin/python3"
private let argusRepoPath = "/Users/joyzhuo/yhack/lockinbro-argus"
private init() {}
@@ -83,9 +75,96 @@ final class SessionManager {
return Date().timeIntervalSince(start)
}
// MARK: - Monitoring Lifecycle
/// Immediately shuts down all monitoring without making any API calls.
///
/// Stops the capture loop and the app-switch observer, cancels the proactive-card
/// auto-dismiss timer, and resets session, HUD, and buffer state to their idle values.
/// The backend session (if any) is NOT ended here; only local state is cleared.
func stopMonitoring() {
    // Background work
    stopCapture()
    stopAppObserver()
    proactiveCardTimer?.cancel()
    proactiveCardTimer = nil

    // Session state
    activeSession = nil
    activeTask = nil
    activeSteps = []
    isSessionActive = false
    sessionStartDate = nil
    persistedSessionId = nil

    // Nudges and cards
    lastNudge = nil
    resumeCard = nil
    showingResumeCard = false
    proactiveCard = nil
    // Cleared here for parity with endSession(), which also resets it; otherwise a
    // stale confirmation label could outlive the monitoring shutdown.
    approvedActionLabel = nil

    // VLM / executor HUD state
    latestVlmSummary = nil
    latestInferredTask = nil
    isExecuting = false
    executorOutput = nil
    monitoringError = nil

    // Rolling buffers
    screenshotHistory = []
    frameBuffer = []
}
/// Called once after login. Auto-resumes any existing active session and starts the capture loop.
///
/// Does nothing when there is no auth token or the capture loop is already running.
/// When Screen Recording permission is missing, the system permission request is
/// triggered, `monitoringError` is set to a user-facing message, and monitoring
/// does not start.
func startMonitoring() async {
    guard TokenStore.shared.token != nil, !isCapturing else { return }

    monitoringError = nil
    await requestNotificationPermission()

    // Preflight first; only issue the permission request when access is not yet granted.
    guard CGPreflightScreenCaptureAccess() else {
        CGRequestScreenCaptureAccess()
        monitoringError = "Screen Recording permission required — enable in System Settings → Privacy & Security → Screen Recording, then tap Retry"
        return
    }

    // Look up a session that is still active on the backend. A failed lookup is
    // logged and treated like "no active session" so monitoring still starts.
    let existing: FocusSession?
    do {
        existing = try await APIClient.shared.getActiveSession()
    } catch {
        print("[Monitoring] Active-session lookup failed: \(error)")
        existing = nil
    }

    if let existing {
        await autoResumeSession(existing)
    } else {
        startCapture()
        startAppObserver()
    }
}
/// Silently resume an active session found on the backend (no loading UI shown).
///
/// Adopts `session` as the active session, resets per-session counters and buffers,
/// tries to restore the session's task and steps, then starts the capture loop and
/// the app-switch observer.
/// - Parameter session: The active session returned by the backend.
private func autoResumeSession(_ session: FocusSession) async {
    activeSession = session
    persistedSessionId = session.id
    isSessionActive = true
    // NOTE(review): the elapsed-time clock restarts from "now" on resume rather than
    // from the session's original start — confirm this is intended.
    sessionStartDate = Date()
    distractionCount = 0
    lastNudge = nil
    screenshotHistory = []
    frameBuffer = []

    if let taskId = session.taskId {
        do {
            let tasks = try await APIClient.shared.getTasks()
            activeTask = tasks.first { $0.id == taskId }
            if let task = activeTask {
                let steps = try await APIClient.shared.getSteps(taskId: task.id)
                activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
                // Prefer the in-progress step, then the first pending one, else the first step.
                currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
                    ?? activeSteps.firstIndex(where: { $0.status == "pending" })
                    ?? 0
            }
        } catch {
            // Best effort: the session still resumes without task context, but the
            // failure is logged instead of being dropped silently.
            print("[Monitoring] Could not restore task context for resumed session: \(error)")
        }
    }

    let shortId = String(session.id.prefix(8))
    let taskLabel = activeTask?.title ?? "(no task)"
    latestVlmSummary = "Resumed session \(shortId) · \(taskLabel)"

    startCapture()
    startAppObserver()
}
// MARK: - Session Lifecycle
// Persisted so we can end a stale session after an app restart
private var persistedSessionId: String? {
get { UserDefaults.standard.string(forKey: "lockInBro.lastSessionId") }
set {
@@ -98,18 +177,16 @@ final class SessionManager {
isLoading = true
errorMessage = nil
do {
let session: FocusSession
do {
session = try await APIClient.shared.startSession(taskId: task?.id)
} catch NetworkError.httpError(409, _) {
// End whichever session is active prefer the locally known one,
// fall back to the last persisted ID (survives app restarts)
let staleId = activeSession?.id ?? persistedSessionId
if let id = staleId {
_ = try? await APIClient.shared.endSession(sessionId: id, status: "completed")
}
session = try await APIClient.shared.startSession(taskId: task?.id)
// End any existing session first
var staleId: String? = activeSession?.id ?? persistedSessionId
if staleId == nil {
staleId = (try? await APIClient.shared.getActiveSession())?.id
}
if let id = staleId {
_ = try? await APIClient.shared.endSession(sessionId: id, status: "completed")
}
let session = try await APIClient.shared.startSession(taskId: task?.id)
activeSession = session
persistedSessionId = session.id
activeTask = task
@@ -119,20 +196,22 @@ final class SessionManager {
sessionStartDate = Date()
distractionCount = 0
lastNudge = nil
screenshotHistory = []
frameBuffer = []
if let task {
let steps = try await APIClient.shared.getSteps(taskId: task.id)
activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
// Pick first in-progress or first pending step
currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
?? activeSteps.firstIndex(where: { $0.status == "pending" })
?? 0
}
screenshotHistory = []
await requestNotificationPermission()
startArgus(session: session, task: task)
startAppObserver()
// Restart capture loop (in case it wasn't running or was in monitoring-only mode)
stopCapture()
startCapture()
if appSwitchObserver == nil { startAppObserver() }
} catch {
errorMessage = error.localizedDescription
}
@@ -140,7 +219,6 @@ final class SessionManager {
}
func endSession(status: String = "completed") async {
stopArgus()
stopCapture()
stopAppObserver()
if let session = activeSession {
@@ -155,14 +233,21 @@ final class SessionManager {
resumeCard = nil
showingResumeCard = false
proactiveCard = nil
approvedActionLabel = nil
latestVlmSummary = nil
latestInferredTask = nil
isExecuting = false
executorOutput = nil
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
screenshotHistory = []
frameBuffer = []
persistedSessionId = nil
// Keep the capture loop running for app-switch heuristics
if TokenStore.shared.token != nil {
startCapture()
startAppObserver()
}
}
func fetchResumeCard() async {
@@ -183,7 +268,6 @@ final class SessionManager {
if let idx = activeSteps.firstIndex(where: { $0.id == updated.id }) {
activeSteps[idx] = updated
}
// Advance to next pending
if let next = activeSteps.firstIndex(where: { $0.status == "pending" }) {
currentStepIndex = next
}
@@ -192,12 +276,19 @@ final class SessionManager {
}
}
// MARK: - Retry (HUD Retry button)
/// Restarts monitoring after an error (wired to the HUD Retry button).
///
/// Clears the surfaced error and any buffered frames, restarts the capture loop,
/// and installs the app-switch observer if one is not already present.
func retryMonitoring() {
    monitoringError = nil
    frameBuffer.removeAll()

    // Stop first so startCapture() is not short-circuited by its isCapturing guard.
    stopCapture()
    startCapture()

    guard appSwitchObserver == nil else { return }
    startAppObserver()
}
// MARK: - Proactive Card Lifecycle
/// Show a proactive card and start the 15-second auto-dismiss timer.
/// - Parameter vlmCard: Pass true when the card came from VLM so argus gets a stdin response on dismiss.
private func showProactiveCard(_ card: ProactiveCard, vlmCard: Bool = false) {
proactiveCardNeedsArgusResponse = vlmCard
private func showProactiveCard(_ card: ProactiveCard) {
proactiveCardTimer?.cancel()
withAnimation { proactiveCard = card }
@@ -208,31 +299,44 @@ final class SessionManager {
}
}
/// Dismiss the current card (user tapped "Not now" or 15s elapsed).
func dismissProactiveCard() {
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
withAnimation { proactiveCard = nil }
if proactiveCardNeedsArgusResponse { sendArgusResponse(0) }
proactiveCardNeedsArgusResponse = false
}
/// Approve action at the given index (0-based). Argus stdin uses 1-based (1 = action 0).
func approveProactiveCard(actionIndex: Int) {
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
let card = proactiveCard
withAnimation { proactiveCard = nil }
if proactiveCardNeedsArgusResponse {
sendArgusResponse(actionIndex + 1)
isExecuting = true
guard case .vlmFriction(_, _, let actions) = card?.source,
actionIndex < actions.count else { return }
let action = actions[actionIndex]
isExecuting = true
Task {
do {
let screenshot = await captureScreen()
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
guard !geminiKey.isEmpty else {
isExecuting = false
executorOutput = (title: action.label, content: action.details ?? "Action approved.")
return
}
let client = GeminiVLMClient(apiKey: geminiKey)
let result = try await client.executeAction(
label: action.label,
actionType: action.actionType,
details: action.details ?? "",
screenshot: screenshot
)
isExecuting = false
executorOutput = (title: action.label, content: result)
} catch {
isExecuting = false
executorOutput = (title: action.label, content: action.details ?? "Couldn't complete automatically.")
}
}
proactiveCardNeedsArgusResponse = false
}
/// Writes the user's choice, followed by a newline, to the argus stdin pipe.
/// Does nothing when no pipe is attached; write failures are ignored.
/// - Parameter choice: The integer sent to argus as a decimal line.
private func sendArgusResponse(_ choice: Int) {
    guard let stdinPipe = argusStdinPipe else { return }
    let payload = Data("\(choice)\n".utf8)
    try? stdinPipe.fileHandleForWriting.write(contentsOf: payload)
}
// MARK: - App Switch Observer
@@ -269,7 +373,7 @@ final class SessionManager {
guard name != lastApp.name else { return }
// Log previous app's dwell time to backend (fire-and-forget)
// Log previous app dwell time to backend
let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt)))
let prev = lastApp
if let session = activeSession, !prev.name.isEmpty {
@@ -289,159 +393,36 @@ final class SessionManager {
appSwitches.append((name: name, bundleId: bundleId, time: now))
if appSwitches.count > 30 { appSwitches.removeFirst() }
// Only trigger card during active session and when none is already showing
guard isSessionActive, proactiveCard == nil else { return }
if let loop = detectRepetitiveLoop() {
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)), vlmCard: false)
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)))
}
}
/// Detects a back-and-forth pattern between exactly two apps within the last 5 minutes.
///
/// The six most recent switches inside the window must involve exactly two distinct
/// app names and strictly alternate (three full cycles), which keeps false positives down.
/// - Returns: The two app names (sorted) and the cycle count, or `nil` when no loop is found.
private func detectRepetitiveLoop() -> (apps: [String], count: Int)? {
    let windowStart = Date().addingTimeInterval(-300)
    let recentNames = appSwitches
        .filter { $0.time > windowStart }
        .map(\.name)
    guard recentNames.count >= 6 else { return nil }

    let tail = Array(recentNames.suffix(6))
    let distinctApps = Set(tail)
    guard distinctApps.count == 2 else { return nil }

    // Strictly alternating: no two neighbouring entries may name the same app.
    let alternates = zip(tail, tail.dropFirst()).allSatisfy { pair in pair.0 != pair.1 }
    guard alternates else { return nil }

    return (apps: distinctApps.sorted(), count: 3)
}
// MARK: - Argus Subprocess (device-side VLM)
/// Launch the argus Python daemon as a subprocess.
/// Argus captures screenshots itself, runs them through a local VLM (Ollama/Gemini),
/// posts results to the backend, and emits RESULT:{json} lines to stdout for Swift to consume.
/// Falls back to the internal `startCapture()` loop if the process cannot be launched.
///
/// - Parameters:
///   - session: The focus session whose id is passed to argus via `--session-id`.
///   - task: Optional task; its title and description are passed as `--task-title` / `--task-goal`.
///
/// On a successful launch this stores the process and its stdin pipe, sets `isCapturing`,
/// and starts a background task that consumes argus stdout line by line.
private func startArgus(session: FocusSession, task: AppTask?) {
// Both the interpreter and the repo checkout must exist; otherwise use the built-in capture loop.
guard FileManager.default.fileExists(atPath: argusPythonPath),
FileManager.default.fileExists(atPath: argusRepoPath) else {
startCapture()
return
}
// Encode steps as JSON for the --steps-json arg ("[]" when there are no steps or encoding fails)
var stepsJSONString = "[]"
if !activeSteps.isEmpty {
let stepsArray: [[String: Any]] = activeSteps.map { step in
var s: [String: Any] = [
"id": step.id,
"sort_order": step.sortOrder,
"title": step.title,
"status": step.status
]
// checkpoint_note is only included when the step has one
if let note = step.checkpointNote { s["checkpoint_note"] = note }
return s
}
if let data = try? JSONSerialization.data(withJSONObject: stepsArray),
let str = String(data: data, encoding: .utf8) {
stepsJSONString = str
}
}
// NOTE(review): the JWT and Gemini key below are passed as command-line arguments;
// process arguments are commonly readable by other local processes — consider
// handing secrets over via environment or stdin instead. Confirm threat model.
let jwt = TokenStore.shared.token ?? ""
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
var arguments = [
"-m", "argus",
"--session-id", session.id,
"--task-title", task?.title ?? "(no task)",
"--task-goal", task?.description ?? "",
"--steps-json", stepsJSONString,
"--window-title", NSWorkspace.shared.frontmostApplication?.localizedName ?? "",
"--vlm", "gemini",
"--jwt", jwt,
"--backend-url", "https://wahwa.com/api/v1",
"--swift-ipc",
"--execute" // enables agentic executor; Swift sends 0/1/2 via stdin
]
// The key flag is omitted entirely when no key is configured.
if !geminiKey.isEmpty {
arguments += ["--gemini-key", geminiKey]
}
let process = Process()
process.executableURL = URL(fileURLWithPath: argusPythonPath)
process.currentDirectoryURL = URL(fileURLWithPath: argusRepoPath)
process.arguments = arguments
// Pipe stdout for RESULT:/STATUS:/EXEC_OUTPUT: lines
// stderr is NOT captured — leaving it unset lets argus log to the system console
// without risk of the pipe buffer filling and blocking the process.
let stdoutPipe = Pipe()
let stdinPipe = Pipe()
process.standardOutput = stdoutPipe
process.standardInput = stdinPipe
do {
try process.run()
} catch {
// Launch failed — fall back to the built-in capture loop.
startCapture()
return
}
argusProcess = process
argusStdinPipe = stdinPipe
isCapturing = true
// Read RESULT:/STATUS:/EXEC_OUTPUT: lines from argus stdout in a background task
let fileHandle = stdoutPipe.fileHandleForReading
argusReadTask = Task { [weak self] in
do {
for try await line in fileHandle.bytes.lines {
// Stop reading once the manager is gone or the task was cancelled.
guard let self, !Task.isCancelled else { break }
if line.hasPrefix("RESULT:") {
// Analysis result: decode and apply on the main actor; undecodable lines are ignored.
let jsonStr = String(line.dropFirst("RESULT:".count))
if let data = jsonStr.data(using: .utf8),
let result = try? JSONDecoder().decode(DistractionAnalysisResponse.self, from: data) {
await MainActor.run { self.applyDistractionResult(result) }
}
} else if line.hasPrefix("STATUS:exec_done:") {
// Executor finished — clear the executing flag.
await MainActor.run { self.isExecuting = false }
} else if line.hasPrefix("EXEC_OUTPUT:") {
// Executor output: expects a JSON object with string "title" and "content" values.
let jsonStr = String(line.dropFirst("EXEC_OUTPUT:".count))
if let data = jsonStr.data(using: .utf8),
let obj = try? JSONSerialization.jsonObject(with: data) as? [String: String],
let title = obj["title"], let content = obj["content"] {
await MainActor.run { self.executorOutput = (title: title, content: content) }
}
}
}
} catch {
// Pipe closed — argus process ended
}
}
}
/// Tears down the argus subprocess: cancels the stdout reader, drops the stdin pipe,
/// and, when a process is present, terminates it and clears `isCapturing`.
private func stopArgus() {
    argusReadTask?.cancel()
    argusReadTask = nil
    argusStdinPipe = nil

    // Only touch isCapturing when argus was actually running.
    guard let runningProcess = argusProcess else { return }
    runningProcess.terminate()
    argusProcess = nil
    isCapturing = false
}
// MARK: - Screenshot Capture Loop (fallback when argus is unavailable)
// MARK: - Screenshot Capture Loop
private func startCapture() {
guard !isCapturing else { return }
isCapturing = true
captureTask = Task { [weak self] in
guard let self else { return }
// Capture immediately on session start, then repeat on interval
// Capture immediately, then repeat on interval
await self.captureAndAnalyze()
while !Task.isCancelled && self.isSessionActive {
while !Task.isCancelled {
try? await Task.sleep(for: .seconds(self.captureInterval))
guard !Task.isCancelled && self.isSessionActive else { break }
guard !Task.isCancelled else { break }
await self.captureAndAnalyze()
}
}
@@ -453,56 +434,77 @@ final class SessionManager {
isCapturing = false
}
/// Capture one frame, buffer it, and call VLM every `framesPerVLMCall` frames.
private func captureAndAnalyze() async {
guard let session = activeSession else { return }
guard let imageData = await captureScreen() else { return }
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? "Unknown"
var context = buildTaskContext()
frameBuffer.append(imageData)
// Keep buffer bounded rolling window of most recent frames
if frameBuffer.count > framesPerVLMCall { frameBuffer.removeFirst() }
// Inject rolling history so the VLM has temporal context across captures.
// Only summaries (text) are sent not the raw images to keep token cost low.
if !screenshotHistory.isEmpty {
let iso = ISO8601DateFormatter()
context["screenshot_history"] = screenshotHistory.map { entry in
["summary": entry.summary, "timestamp": iso.string(from: entry.timestamp)]
}
// Only call VLM once we have a full batch for temporal diff analysis
guard frameBuffer.count >= framesPerVLMCall else { return }
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
guard !geminiKey.isEmpty else {
print("[VLM] No Gemini API key set — skipping analysis")
return
}
do {
let result = try await APIClient.shared.analyzeScreenshot(
imageData: imageData,
windowTitle: windowTitle,
sessionId: session.id,
taskContext: context
)
let client = GeminiVLMClient(apiKey: geminiKey)
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? ""
let recentSummaries = screenshotHistory.map(\.summary)
let frames = frameBuffer // snapshot before async gap
// Append this result's summary to the rolling buffer (max 4 entries)
if let summary = result.vlmSummary {
screenshotHistory.append(ScreenshotHistoryEntry(summary: summary, timestamp: Date()))
do {
print("[VLM] Calling Gemini with \(frames.count) frames…")
let result = try await client.analyze(
frames: frames,
taskTitle: activeTask?.title ?? "",
taskGoal: activeTask?.description ?? "",
steps: activeSteps,
windowTitle: windowTitle,
recentSummaries: recentSummaries
)
print("[VLM] Result: on_task=\(result.onTask), friction=\(result.friction?.type ?? "none"), summary=\(result.vlmSummary ?? "")")
// Append to rolling summary history
if let summary = result.vlmSummary, !summary.isEmpty {
screenshotHistory.append(HistoryEntry(summary: summary, timestamp: Date()))
if screenshotHistory.count > 4 { screenshotHistory.removeFirst() }
}
// Clear frame buffer next batch starts fresh
frameBuffer.removeAll()
monitoringError = nil
applyDistractionResult(result)
// Post result to backend (fire-and-forget)
if let session = activeSession {
Task {
try? await APIClient.shared.postAnalysisResult(result, sessionId: session.id)
}
}
} catch {
// Silent fail don't interrupt the user
print("[VLM] Analysis error: \(error)")
// Don't surface transient errors the next attempt will retry automatically
}
}
// MARK: - Screen Capture
private func captureScreen() async -> Data? {
guard CGPreflightScreenCaptureAccess() else { return nil }
do {
let content = try await SCShareableContent.current
guard let display = content.displays.first else { return nil }
let config = SCStreamConfiguration()
config.width = 1280
config.height = 720
let filter = SCContentFilter(display: display, excludingWindows: [])
let image = try await SCScreenshotManager.captureImage(
contentFilter: filter,
configuration: config
)
contentFilter: filter, configuration: config)
return cgImageToJPEG(image)
} catch {
return nil
@@ -518,29 +520,13 @@ final class SessionManager {
return jpeg
}
/// Builds the task-context dictionary sent alongside a screenshot analysis request.
///
/// - Returns: An empty dictionary when there is no active task; otherwise the task title,
///   its goal (description, falling back to the title), and the encoded steps.
private func buildTaskContext() -> [String: Any] {
    guard let task = activeTask else { return [:] }

    let encodedSteps = activeSteps.map { step -> [String: Any] in
        var encoded: [String: Any] = [
            "id": step.id,
            "sort_order": step.sortOrder,
            "title": step.title,
            "status": step.status
        ]
        // The checkpoint note is optional and only emitted when present.
        if let note = step.checkpointNote { encoded["checkpoint_note"] = note }
        return encoded
    }

    var context: [String: Any] = [:]
    context["task_title"] = task.title
    context["task_goal"] = task.description ?? task.title
    context["steps"] = encodedSteps
    return context
}
// MARK: - Apply VLM Result
private func applyDistractionResult(_ result: DistractionAnalysisResponse) {
// 0. Store latest summary for the floating HUD
if let summary = result.vlmSummary { latestVlmSummary = summary }
if let task = result.inferredTask, !task.isEmpty { latestInferredTask = task }
// 1. Apply step side-effects (always)
// Apply step side-effects
for completedId in result.stepsCompleted {
if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) {
activeSteps[idx].status = "done"
@@ -556,22 +542,25 @@ final class SessionManager {
currentStepIndex = idx
}
// 2. Notification priority (design spec §1.5):
// Proactive friction help Context resume Gentle nudge
// NEVER nudge when the system could help instead.
if let friction = result.friction, friction.isActionable {
if friction.isResumption {
// Task resumption detected auto-surface resume card without button press
Task { await fetchResumeCard() }
} else if proactiveCard == nil {
showProactiveCard(ProactiveCard(source: .vlmFriction(
frictionType: friction.type,
description: friction.description,
actions: friction.proposedActions
)), vlmCard: true)
// Notification priority: friction card (formal or has actions) nudge
if let friction = result.friction {
let shouldShow = friction.isActionable || !friction.proposedActions.isEmpty
if shouldShow {
if friction.isResumption {
Task { await fetchResumeCard() }
} else if proactiveCard == nil {
showProactiveCard(ProactiveCard(source: .vlmFriction(
frictionType: friction.type,
description: friction.description,
actions: friction.proposedActions
)))
}
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
distractionCount += 1
lastNudge = nudge
sendNudgeNotification(nudge)
}
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
// Only nudge if VLM found no actionable friction
distractionCount += 1
lastNudge = nudge
sendNudgeNotification(nudge)