include argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

View File

@@ -24,16 +24,34 @@ enum NetworkError: Error, LocalizedError {
final class TokenStore {
static let shared = TokenStore()
private let key = "lockInBro.jwt"
private let accessKey = "lockInBro.jwt"
private let refreshKey = "lockInBro.refreshToken"
private init() {}
var token: String? {
get { UserDefaults.standard.string(forKey: key) }
get { UserDefaults.standard.string(forKey: accessKey) }
set {
if let v = newValue { UserDefaults.standard.set(v, forKey: key) }
else { UserDefaults.standard.removeObject(forKey: key) }
if let v = newValue { UserDefaults.standard.set(v, forKey: accessKey) }
else { UserDefaults.standard.removeObject(forKey: accessKey) }
}
}
var refreshToken: String? {
get { UserDefaults.standard.string(forKey: refreshKey) }
set {
if let v = newValue { UserDefaults.standard.set(v, forKey: refreshKey) }
else { UserDefaults.standard.removeObject(forKey: refreshKey) }
}
}
func clear() {
token = nil
refreshToken = nil
}
}
extension Notification.Name {
static let lockInBroAuthExpired = Notification.Name("lockInBroAuthExpired")
}
// MARK: - APIClient
@@ -46,13 +64,17 @@ final class APIClient {
// MARK: Core Request
// Coalesces concurrent 401-triggered refreshes into one request
private var activeRefreshTask: Task<Bool, Never>?
private func req(
_ path: String,
method: String = "GET",
body: Data? = nil,
contentType: String = "application/json",
auth: Bool = true,
timeout: TimeInterval = 30
timeout: TimeInterval = 30,
isRetry: Bool = false
) async throws -> Data {
guard let url = URL(string: base + path) else {
throw NetworkError.unknown(URLError(.badURL))
@@ -75,6 +97,17 @@ final class APIClient {
throw NetworkError.unknown(URLError(.badServerResponse))
}
guard http.statusCode < 400 else {
if http.statusCode == 401 && auth && !isRetry {
// Try to silently refresh the access token, then retry once
let refreshed = await refreshAccessToken()
if refreshed {
return try await req(path, method: method, body: body,
contentType: contentType, auth: auth,
timeout: timeout, isRetry: true)
}
// Refresh also failed force logout
await MainActor.run { AuthManager.shared.handleSessionExpired() }
}
let msg = (try? JSONDecoder().decode(APIErrorResponse.self, from: data))?.detail
?? String(data: data, encoding: .utf8)
?? "Unknown error"
@@ -83,6 +116,32 @@ final class APIClient {
return data
}
/// Exchanges the stored refresh token for a new token pair via `POST /auth/refresh`.
///
/// Concurrent callers share one in-flight request: whoever finds a task in
/// `activeRefreshTask` awaits that task instead of starting another.
///
/// - Returns: `true` when new tokens were written to `TokenStore`; `false` when no
///   refresh token is stored, the request fails, the status is not 200, or the
///   response cannot be decoded.
private func refreshAccessToken() async -> Bool {
    if let existing = activeRefreshTask { return await existing.value }

    let task = Task<Bool, Never> {
        guard let refresh = TokenStore.shared.refreshToken else { return false }
        do {
            let body = try JSONSerialization.data(withJSONObject: ["refresh_token": refresh])
            guard let url = URL(string: base + "/auth/refresh") else { return false }
            // Named `request` so it does not shadow the `req(...)` method.
            var request = URLRequest(url: url)
            request.httpMethod = "POST"
            request.setValue("application/json", forHTTPHeaderField: "Content-Type")
            request.httpBody = body
            request.timeoutInterval = 30
            let (data, res) = try await urlSession.data(for: request)
            guard let http = res as? HTTPURLResponse, http.statusCode == 200 else { return false }
            let auth = try self.decode(AuthResponse.self, from: data)
            TokenStore.shared.token = auth.accessToken
            TokenStore.shared.refreshToken = auth.refreshToken
            return true
        } catch { return false }
    }
    activeRefreshTask = task
    let refreshed = await task.value

    // Clear the slot from the creator, after the task has finished. Clearing it
    // from inside the task can run *before* the assignment above when the task
    // completes without suspending, which would leave a finished task parked in
    // the slot and make every later refresh return its stale result.
    // Only clear it if it still refers to this task, so a newer refresh started
    // in the meantime is not discarded.
    if activeRefreshTask == task { activeRefreshTask = nil }
    return refreshed
}
private func decode<T: Decodable>(_ type: T.Type, from data: Data) throws -> T {
let decoder = JSONDecoder()
decoder.dateDecodingStrategy = .iso8601
@@ -206,6 +265,16 @@ final class APIClient {
// MARK: - Sessions
/// Fetches `/sessions/active` and decodes it as a `FocusSession`.
///
/// - Returns: The decoded session, or `nil` when the request fails with HTTP 404.
/// - Throws: Any other error from the request or from decoding.
func getActiveSession() async throws -> FocusSession? {
    let payload: Data
    do {
        payload = try await req("/sessions/active")
    } catch NetworkError.httpError(404, _) {
        // 404 is the "no active session" answer, not a failure.
        return nil
    }
    return try decode(FocusSession.self, from: payload)
}
func startSession(taskId: String?) async throws -> FocusSession {
var dict: [String: Any] = ["platform": "mac"]
if let tid = taskId { dict["task_id"] = tid }
@@ -268,9 +337,35 @@ final class APIClient {
_ = try await req("/distractions/app-activity", method: "POST", body: body)
}
// MARK: - Distraction / Screenshot Analysis
// Note: spec primary endpoint is POST /distractions/analyze-result (device-side VLM, JSON only).
// Backend currently implements analyze-screenshot (legacy fallback) using that until analyze-result is deployed.
// MARK: - Distraction / VLM Analysis
/// Serializes a VLM analysis result to JSON and POSTs it to `/distractions/analyze-result`.
///
/// Always sends `session_id`, `on_task`, `confidence`, `vlm_summary` (empty string when
/// absent) and `steps_completed`. The step id, checkpoint note, app name, nudge and
/// friction object are added only when present on `result`.
///
/// - Parameters:
///   - result: The analysis to report.
///   - sessionId: Identifier of the session the analysis belongs to.
/// - Throws: Errors from JSON serialization or from the request.
func postAnalysisResult(_ result: DistractionAnalysisResponse, sessionId: String) async throws {
    var json: [String: Any] = [
        "session_id": sessionId,
        "on_task": result.onTask,
        "confidence": result.confidence,
        "vlm_summary": result.vlmSummary ?? "",
        "steps_completed": result.stepsCompleted,
    ]

    // Optional scalar fields: key is omitted entirely when the value is nil.
    if let currentStep = result.currentStepId {
        json["current_step_id"] = currentStep
    }
    if let checkpointNote = result.checkpointNoteUpdate {
        json["checkpoint_note_update"] = checkpointNote
    }
    if let appName = result.appName {
        json["app_name"] = appName
    }
    if let nudgeText = result.gentleNudge {
        json["gentle_nudge"] = nudgeText
    }

    if let friction = result.friction {
        let actions: [[String: Any]] = friction.proposedActions.map { action in
            ["label": action.label, "action_type": action.actionType, "details": action.details as Any]
        }
        json["friction"] = [
            "type": friction.type,
            "confidence": friction.confidence,
            "description": friction.description as Any,
            "proposed_actions": actions,
        ]
    }

    let encoded = try JSONSerialization.data(withJSONObject: json)
    _ = try await req("/distractions/analyze-result", method: "POST", body: encoded)
}
func analyzeScreenshot(
imageData: Data,

View File

@@ -22,6 +22,7 @@ final class AuthManager {
do {
let response = try await APIClient.shared.login(email: email, password: password)
TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user
isLoggedIn = true
} catch {
@@ -40,6 +41,7 @@ final class AuthManager {
displayName: displayName
)
TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user
isLoggedIn = true
} catch {
@@ -58,6 +60,7 @@ final class AuthManager {
fullName: fullName
)
TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user
isLoggedIn = true
} catch {
@@ -67,8 +70,20 @@ final class AuthManager {
}
func logout() {
TokenStore.shared.token = nil
SessionManager.shared.stopMonitoring()
TokenStore.shared.clear()
currentUser = nil
isLoggedIn = false
errorMessage = nil
}
/// Called by APIClient when the server returns 401 and the refresh token is also dead.
///
/// Performs the same teardown as a logout (stop monitoring, clear stored tokens,
/// drop the current user) but leaves an explanatory `errorMessage` set.
func handleSessionExpired() {
// No-op when already logged out, so repeated 401s do not repeat the teardown.
guard isLoggedIn else { return }
SessionManager.shared.stopMonitoring()
TokenStore.shared.clear()
currentUser = nil
isLoggedIn = false
errorMessage = "Your session expired — please log in again."
}
}

View File

@@ -401,6 +401,8 @@ private struct ProactiveCardView: View {
return description ?? "I noticed something that might be slowing you down."
case .appSwitchLoop(let apps, let count):
return "You've switched between \(apps.joined(separator: "")) \(count)× in a row — are you stuck?"
case .sessionAction(_, _, let checkpoint, let reason, _):
return checkpoint.isEmpty ? reason : "Left off: \(checkpoint)"
}
}
}

View File

@@ -2,8 +2,24 @@
import SwiftUI
// MARK: - AppDelegate (subprocess cleanup on quit)
/// Application delegate whose only job is to stop monitoring when the app terminates.
final class AppDelegate: NSObject, NSApplicationDelegate {
/// Stops all session monitoring just before the application exits.
///
/// NOTE(review): the earlier comment said this also runs on SIGTERM and kills an
/// argus subprocess. Nothing visible here handles signals or a subprocess — confirm
/// and drop that expectation if it no longer applies.
func applicationWillTerminate(_ notification: Notification) {
// applicationWillTerminate runs on the main thread, so we can safely
// call @MainActor methods synchronously via assumeIsolated.
MainActor.assumeIsolated {
SessionManager.shared.stopMonitoring()
}
}
}
@main
struct LockInBroApp: App {
@NSApplicationDelegateAdaptor(AppDelegate.self) var appDelegate
@State private var auth = AuthManager.shared
@State private var session = SessionManager.shared
@@ -13,9 +29,11 @@ struct LockInBroApp: App {
ContentView()
.environment(auth)
.environment(session)
.onChange(of: session.isSessionActive) { _, isActive in
if isActive {
.onChange(of: auth.isLoggedIn, initial: true) { _, loggedIn in
if loggedIn {
// Show HUD and start always-on monitoring as soon as user logs in
FloatingPanelController.shared.show(session: session)
Task { await session.startMonitoring() }
} else {
FloatingPanelController.shared.close()
}

View File

@@ -109,6 +109,32 @@ struct Step: Identifiable, Codable, Hashable {
// MARK: - Focus Session
/// Subset of the JSONB checkpoint dict stored on the backend session record.
/// Populated by argus when it POSTs to /distractions/analyze-result.
///
/// Every field is optional, so a checkpoint missing any of these keys still decodes.
struct SessionCheckpoint: Codable {
/// Written by POST /distractions/analyze-result (argus live mode).
let lastVlmSummary: String?
/// Written by POST /distractions/analyze-screenshot (Swift fallback).
let lastScreenshotAnalysis: String?
/// Concise summary of the last completed action.
let lastActionSummary: String?
/// Frontmost application name at last checkpoint.
let activeApp: String?
/// Running count of distractions logged during this session.
let distractionCount: Int?
/// `lastVlmSummary` when it is non-nil, otherwise `lastScreenshotAnalysis`.
/// The choice is by field priority only; no timestamps are compared.
var vlmSummary: String? { lastVlmSummary ?? lastScreenshotAnalysis }
// Maps the camelCase properties to the snake_case keys used in the JSON payload.
enum CodingKeys: String, CodingKey {
case lastVlmSummary = "last_vlm_summary"
case lastScreenshotAnalysis = "last_screenshot_analysis"
case lastActionSummary = "last_action_summary"
case activeApp = "active_app"
case distractionCount = "distraction_count"
}
}
struct FocusSession: Identifiable, Codable {
let id: String
let userId: String
@@ -117,9 +143,11 @@ struct FocusSession: Identifiable, Codable {
let startedAt: String
var endedAt: String?
var status: String
/// Live checkpoint data written by argus (nil when no checkpoint exists yet).
var checkpoint: SessionCheckpoint?
enum CodingKeys: String, CodingKey {
case id, platform, status
case id, platform, status, checkpoint
case userId = "user_id"
case taskId = "task_id"
case startedAt = "started_at"
@@ -144,8 +172,10 @@ struct BrainDumpResponse: Codable {
struct ParsedTask: Codable, Identifiable {
// local UUID for list identity before saving
var localId: String = UUID().uuidString
var id: String { localId }
var id: String { taskId ?? localId }
/// Set by backend when the brain-dump endpoint creates the task automatically.
let taskId: String?
let title: String
let description: String?
let priority: Int
@@ -154,6 +184,7 @@ struct ParsedTask: Codable, Identifiable {
let tags: [String]
enum CodingKeys: String, CodingKey {
case taskId = "task_id"
case title, description, priority, deadline, tags
case estimatedMinutes = "estimated_minutes"
}
@@ -208,14 +239,29 @@ struct FrictionInfo: Codable {
var isResumption: Bool { type == "task_resumption" }
}
/// Session lifecycle action suggested by the VLM (new argus feature).
struct SessionAction: Codable {
/// resume | switch | complete | start_new | none
/// Kept as a raw string; unrecognized values are not rejected at decode time.
let type: String
/// Decoded from the `session_id` key; nil when the key is absent or null.
let sessionId: String?
/// Optional explanation accompanying the suggestion.
let reason: String?
// Only `sessionId` needs renaming; `type` and `reason` match their JSON keys.
enum CodingKeys: String, CodingKey {
case type, reason
case sessionId = "session_id"
}
}
struct DistractionAnalysisResponse: Codable {
let onTask: Bool
let currentStepId: String?
let inferredTask: String?
let checkpointNoteUpdate: String?
let stepsCompleted: [String]
// Upgraded Argus prompt fields (nil when backend uses legacy prompt)
let friction: FrictionInfo?
let intent: String? // skimming | engaged | unclear | null
let sessionAction: SessionAction? // new argus: session lifecycle suggestions
let intent: String? // skimming | engaged | unclear | null
let distractionType: String?
let appName: String?
let confidence: Double
@@ -225,9 +271,11 @@ struct DistractionAnalysisResponse: Codable {
enum CodingKeys: String, CodingKey {
case onTask = "on_task"
case currentStepId = "current_step_id"
case inferredTask = "inferred_task"
case checkpointNoteUpdate = "checkpoint_note_update"
case stepsCompleted = "steps_completed"
case friction, intent
case sessionAction = "session_action"
case distractionType = "distraction_type"
case appName = "app_name"
case confidence
@@ -298,6 +346,8 @@ struct ProactiveCard: Identifiable {
case vlmFriction(frictionType: String, description: String?, actions: [ProposedAction])
/// Heuristic app-switch loop detected by NSWorkspace observer (fallback when VLM hasn't returned friction yet).
case appSwitchLoop(apps: [String], switchCount: Int)
/// VLM suggests a session lifecycle action (new argus: resume, switch, complete, start_new).
case sessionAction(type: String, taskTitle: String, checkpoint: String, reason: String, sessionId: String?)
}
let id = UUID()
@@ -316,6 +366,14 @@ struct ProactiveCard: Identifiable {
}
case .appSwitchLoop:
return "Repetitive Pattern Detected"
case .sessionAction(let type, let taskTitle, _, _, _):
switch type {
case "resume": return "Resume: \(taskTitle)"
case "switch": return "Switch to: \(taskTitle)"
case "complete": return "Done with \(taskTitle)?"
case "start_new": return "Start a Focus Session?"
default: return "Session Suggestion"
}
}
}
@@ -332,6 +390,14 @@ struct ProactiveCard: Identifiable {
}
case .appSwitchLoop:
return "arrow.triangle.2.circlepath"
case .sessionAction(let type, _, _, _, _):
switch type {
case "resume": return "arrow.counterclockwise.circle"
case "switch": return "arrow.left.arrow.right"
case "complete": return "checkmark.circle"
case "start_new": return "plus.circle"
default: return "circle"
}
}
}
}

View File

@@ -1,4 +1,6 @@
// SessionManager.swift Focus session state, screenshot engine, distraction detection
// SessionManager.swift — Focus session state, native VLM screen analysis
// Screenshot capture → Gemini Vision API → apply results to UI + post to backend.
// No Python subprocess. No external process management.
import AppKit
import SwiftUI
@@ -25,30 +27,27 @@ final class SessionManager {
var errorMessage: String?
var isLoading: Bool = false
// Proactive agent
// VLM / proactive agent
var proactiveCard: ProactiveCard?
/// Set when the user approves a proposed action shown as a confirmation toast
var approvedActionLabel: String?
/// Latest one-sentence summary from the VLM, shown in the floating HUD
var latestVlmSummary: String?
/// True while the argus executor is running an approved action
var latestInferredTask: String?
var isExecuting: Bool = false
/// Result produced by the executor's output() tool shown as a sticky card in the HUD
var executorOutput: (title: String, content: String)?
var monitoringError: String?
// Screenshot engine
var isCapturing: Bool = false
private var captureTask: Task<Void, Never>?
@ObservationIgnored private var captureTask: Task<Void, Never>?
private let captureInterval: TimeInterval = 5.0
// Rolling screenshot history buffer (max 4 entries, ~20-second window)
// Provides temporal context to the VLM so it can detect patterns across captures.
private struct ScreenshotHistoryEntry {
let summary: String // vlm_summary text from the previous analysis
let timestamp: Date
}
@ObservationIgnored private var screenshotHistory: [ScreenshotHistoryEntry] = []
// Frame buffer accumulate N frames before calling VLM for temporal diff context
@ObservationIgnored private var frameBuffer: [Data] = []
private let framesPerVLMCall = 3
// Rolling summary history fed as context into subsequent VLM calls
private struct HistoryEntry { let summary: String; let timestamp: Date }
@ObservationIgnored private var screenshotHistory: [HistoryEntry] = []
// App switch tracking
@ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = []
@@ -56,15 +55,8 @@ final class SessionManager {
@ObservationIgnored private var lastApp: (name: String, bundleId: String) = ("", "")
@ObservationIgnored private var lastAppEnteredAt: Date = Date()
// Argus subprocess (device-side VLM)
@ObservationIgnored private var argusProcess: Process?
@ObservationIgnored private var argusReadTask: Task<Void, Never>?
@ObservationIgnored private var argusStdinPipe: Pipe?
/// Whether the current proactive card came from VLM (needs argus stdin response) vs local heuristic
@ObservationIgnored private var proactiveCardNeedsArgusResponse = false
// Proactive card auto-dismiss timer
@ObservationIgnored private var proactiveCardTimer: Task<Void, Never>?
private let argusPythonPath = "/Users/joyzhuo/miniconda3/envs/gmr/bin/python3"
private let argusRepoPath = "/Users/joyzhuo/yhack/lockinbro-argus"
private init() {}
@@ -83,9 +75,96 @@ final class SessionManager {
return Date().timeIntervalSince(start)
}
// MARK: - Monitoring Lifecycle
/// Immediately shuts down all monitoring without making any API calls.
///
/// Stops the capture loop and the app observer, cancels the proactive-card timer,
/// and resets session, HUD and buffer state to "no session". The backend session
/// is not ended here; only local state (including the persisted session id) is cleared.
func stopMonitoring() {
// Stop the background work first so nothing repopulates the state cleared below.
stopCapture()
stopAppObserver()
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
// Session state
activeSession = nil
activeTask = nil
activeSteps = []
isSessionActive = false
sessionStartDate = nil
lastNudge = nil
// Cards and HUD state
resumeCard = nil
showingResumeCard = false
proactiveCard = nil
latestVlmSummary = nil
latestInferredTask = nil
isExecuting = false
executorOutput = nil
monitoringError = nil
// VLM context buffers
screenshotHistory = []
frameBuffer = []
persistedSessionId = nil
}
/// Called once after login. Auto-resumes any existing active session and starts the capture loop.
///
/// Does nothing when no access token is stored or when capture is already running.
/// If Screen Recording permission has not been granted, requests it, sets
/// `monitoringError`, and returns without starting capture.
///
/// A failed active-session lookup is treated as "no active session". Capture still
/// starts in that case, provided the user is still logged in once the lookup returns.
func startMonitoring() async {
    guard TokenStore.shared.token != nil else { return }
    guard !isCapturing else { return }
    monitoringError = nil
    await requestNotificationPermission()

    // Silent preflight — never shows UI; only request permission if not yet granted.
    if !CGPreflightScreenCaptureAccess() {
        CGRequestScreenCaptureAccess()
        monitoringError = "Screen Recording permission required — enable in System Settings → Privacy & Security → Screen Recording, then tap Retry"
        return
    }

    // Lookup errors are non-fatal: fall back to monitoring without a session.
    let existing = try? await APIClient.shared.getActiveSession()

    // The login can end while the lookup is in flight: a 401 whose token refresh
    // also fails logs the user out and stops monitoring before the error reaches
    // us. Re-check the token so capture is not restarted for a logged-out user.
    guard TokenStore.shared.token != nil else { return }

    if let existing {
        await autoResumeSession(existing)
    } else {
        startCapture()
        startAppObserver()
    }
}
/// Silently resume an active session found on the backend (no loading UI shown).
///
/// Adopts `session` as the active session, resets per-session counters and VLM
/// buffers, tries to load the session's task and steps, then starts the capture
/// loop and the app observer.
///
/// - Parameter session: The active session returned by the backend.
private func autoResumeSession(_ session: FocusSession) async {
activeSession = session
persistedSessionId = session.id
isSessionActive = true
// NOTE(review): the start date is set to "now", not derived from `session.startedAt`,
// so anything computed from `sessionStartDate` restarts on resume — confirm intended.
sessionStartDate = Date()
distractionCount = 0
lastNudge = nil
screenshotHistory = []
frameBuffer = []
if let taskId = session.taskId {
do {
let tasks = try await APIClient.shared.getTasks()
activeTask = tasks.first(where: { $0.id == taskId })
if let task = activeTask {
let steps = try await APIClient.shared.getSteps(taskId: task.id)
activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
// Prefer the in-progress step, then the first pending one, else the first step.
currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
?? activeSteps.firstIndex(where: { $0.status == "pending" })
?? 0
}
// Best-effort: if loading fails, the session resumes with whatever task/step state was set so far.
} catch {}
}
// Show a short status line identifying the resumed session (first 8 chars of its id).
let shortId = String(session.id.prefix(8))
let taskLabel = activeTask?.title ?? "(no task)"
latestVlmSummary = "Resumed session \(shortId) · \(taskLabel)"
startCapture()
startAppObserver()
}
// MARK: - Session Lifecycle
// Persisted so we can end a stale session after an app restart
private var persistedSessionId: String? {
get { UserDefaults.standard.string(forKey: "lockInBro.lastSessionId") }
set {
@@ -98,18 +177,16 @@ final class SessionManager {
isLoading = true
errorMessage = nil
do {
let session: FocusSession
do {
session = try await APIClient.shared.startSession(taskId: task?.id)
} catch NetworkError.httpError(409, _) {
// End whichever session is active prefer the locally known one,
// fall back to the last persisted ID (survives app restarts)
let staleId = activeSession?.id ?? persistedSessionId
if let id = staleId {
_ = try? await APIClient.shared.endSession(sessionId: id, status: "completed")
}
session = try await APIClient.shared.startSession(taskId: task?.id)
// End any existing session first
var staleId: String? = activeSession?.id ?? persistedSessionId
if staleId == nil {
staleId = (try? await APIClient.shared.getActiveSession())?.id
}
if let id = staleId {
_ = try? await APIClient.shared.endSession(sessionId: id, status: "completed")
}
let session = try await APIClient.shared.startSession(taskId: task?.id)
activeSession = session
persistedSessionId = session.id
activeTask = task
@@ -119,20 +196,22 @@ final class SessionManager {
sessionStartDate = Date()
distractionCount = 0
lastNudge = nil
screenshotHistory = []
frameBuffer = []
if let task {
let steps = try await APIClient.shared.getSteps(taskId: task.id)
activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
// Pick first in-progress or first pending step
currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
?? activeSteps.firstIndex(where: { $0.status == "pending" })
?? 0
}
screenshotHistory = []
await requestNotificationPermission()
startArgus(session: session, task: task)
startAppObserver()
// Restart capture loop (in case it wasn't running or was in monitoring-only mode)
stopCapture()
startCapture()
if appSwitchObserver == nil { startAppObserver() }
} catch {
errorMessage = error.localizedDescription
}
@@ -140,7 +219,6 @@ final class SessionManager {
}
func endSession(status: String = "completed") async {
stopArgus()
stopCapture()
stopAppObserver()
if let session = activeSession {
@@ -155,14 +233,21 @@ final class SessionManager {
resumeCard = nil
showingResumeCard = false
proactiveCard = nil
approvedActionLabel = nil
latestVlmSummary = nil
latestInferredTask = nil
isExecuting = false
executorOutput = nil
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
screenshotHistory = []
frameBuffer = []
persistedSessionId = nil
// Keep the capture loop running for app-switch heuristics
if TokenStore.shared.token != nil {
startCapture()
startAppObserver()
}
}
func fetchResumeCard() async {
@@ -183,7 +268,6 @@ final class SessionManager {
if let idx = activeSteps.firstIndex(where: { $0.id == updated.id }) {
activeSteps[idx] = updated
}
// Advance to next pending
if let next = activeSteps.firstIndex(where: { $0.status == "pending" }) {
currentStepIndex = next
}
@@ -192,12 +276,19 @@ final class SessionManager {
}
}
// MARK: - Retry (HUD Retry button)
/// Clears the current monitoring error and restarts the capture loop from an empty frame buffer.
func retryMonitoring() {
monitoringError = nil
frameBuffer = []
// Stop first so the subsequent start always launches a fresh capture loop.
stopCapture()
startCapture()
// Leave an existing app observer in place; only create one if none is registered.
if appSwitchObserver == nil { startAppObserver() }
}
// MARK: - Proactive Card Lifecycle
/// Show a proactive card and start the 15-second auto-dismiss timer.
/// - Parameter vlmCard: Pass true when the card came from VLM so argus gets a stdin response on dismiss.
private func showProactiveCard(_ card: ProactiveCard, vlmCard: Bool = false) {
proactiveCardNeedsArgusResponse = vlmCard
private func showProactiveCard(_ card: ProactiveCard) {
proactiveCardTimer?.cancel()
withAnimation { proactiveCard = card }
@@ -208,31 +299,44 @@ final class SessionManager {
}
}
/// Dismiss the current card (user tapped "Not now" or 15s elapsed).
func dismissProactiveCard() {
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
withAnimation { proactiveCard = nil }
if proactiveCardNeedsArgusResponse { sendArgusResponse(0) }
proactiveCardNeedsArgusResponse = false
}
/// Approve action at the given index (0-based). Argus stdin uses 1-based (1 = action 0).
func approveProactiveCard(actionIndex: Int) {
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
let card = proactiveCard
withAnimation { proactiveCard = nil }
if proactiveCardNeedsArgusResponse {
sendArgusResponse(actionIndex + 1)
isExecuting = true
guard case .vlmFriction(_, _, let actions) = card?.source,
actionIndex < actions.count else { return }
let action = actions[actionIndex]
isExecuting = true
Task {
do {
let screenshot = await captureScreen()
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
guard !geminiKey.isEmpty else {
isExecuting = false
executorOutput = (title: action.label, content: action.details ?? "Action approved.")
return
}
let client = GeminiVLMClient(apiKey: geminiKey)
let result = try await client.executeAction(
label: action.label,
actionType: action.actionType,
details: action.details ?? "",
screenshot: screenshot
)
isExecuting = false
executorOutput = (title: action.label, content: result)
} catch {
isExecuting = false
executorOutput = (title: action.label, content: action.details ?? "Couldn't complete automatically.")
}
}
proactiveCardNeedsArgusResponse = false
}
private func sendArgusResponse(_ choice: Int) {
guard let pipe = argusStdinPipe,
let data = "\(choice)\n".data(using: .utf8) else { return }
try? pipe.fileHandleForWriting.write(contentsOf: data)
}
// MARK: - App Switch Observer
@@ -269,7 +373,7 @@ final class SessionManager {
guard name != lastApp.name else { return }
// Log previous app's dwell time to backend (fire-and-forget)
// Log previous app dwell time to backend
let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt)))
let prev = lastApp
if let session = activeSession, !prev.name.isEmpty {
@@ -289,159 +393,36 @@ final class SessionManager {
appSwitches.append((name: name, bundleId: bundleId, time: now))
if appSwitches.count > 30 { appSwitches.removeFirst() }
// Only trigger card during active session and when none is already showing
guard isSessionActive, proactiveCard == nil else { return }
if let loop = detectRepetitiveLoop() {
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)), vlmCard: false)
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)))
}
}
/// Looks for a back-and-forth pattern between exactly two apps.
///
/// Only switches recorded in the last 300 seconds are considered. The six most
/// recent of those must involve exactly two distinct app names and strictly
/// alternate (no name repeated back to back).
///
/// - Returns: The two app names in sorted order with a fixed cycle count of 3,
///   or `nil` when the pattern is not present.
private func detectRepetitiveLoop() -> (apps: [String], count: Int)? {
    let windowStart = Date().addingTimeInterval(-300)
    let names = appSwitches.filter { $0.time > windowStart }.map(\.name)
    guard names.count >= 6 else { return nil }

    let tail = Array(names.suffix(6))
    let distinct = Set(tail)
    guard distinct.count == 2 else { return nil }

    // Pair each entry with its successor; any equal pair breaks strict alternation.
    let alternates = zip(tail, tail.dropFirst()).allSatisfy { $0 != $1 }
    guard alternates else { return nil }

    return (apps: distinct.sorted(), count: 3)
}
// MARK: - Argus Subprocess (device-side VLM)
/// Launch the argus Python daemon as a subprocess.
/// Argus captures screenshots itself, runs them through a local VLM (Ollama/Gemini),
/// posts results to the backend, and emits RESULT:{json} lines to stdout for Swift to consume.
/// Falls back to the internal `startCapture()` loop if the process cannot be launched.
private func startArgus(session: FocusSession, task: AppTask?) {
guard FileManager.default.fileExists(atPath: argusPythonPath),
FileManager.default.fileExists(atPath: argusRepoPath) else {
startCapture()
return
}
// Encode steps as JSON for --steps-json arg
var stepsJSONString = "[]"
if !activeSteps.isEmpty {
let stepsArray: [[String: Any]] = activeSteps.map { step in
var s: [String: Any] = [
"id": step.id,
"sort_order": step.sortOrder,
"title": step.title,
"status": step.status
]
if let note = step.checkpointNote { s["checkpoint_note"] = note }
return s
}
if let data = try? JSONSerialization.data(withJSONObject: stepsArray),
let str = String(data: data, encoding: .utf8) {
stepsJSONString = str
}
}
let jwt = TokenStore.shared.token ?? ""
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
var arguments = [
"-m", "argus",
"--session-id", session.id,
"--task-title", task?.title ?? "(no task)",
"--task-goal", task?.description ?? "",
"--steps-json", stepsJSONString,
"--window-title", NSWorkspace.shared.frontmostApplication?.localizedName ?? "",
"--vlm", "gemini",
"--jwt", jwt,
"--backend-url", "https://wahwa.com/api/v1",
"--swift-ipc",
"--execute" // enables agentic executor; Swift sends 0/1/2 via stdin
]
if !geminiKey.isEmpty {
arguments += ["--gemini-key", geminiKey]
}
let process = Process()
process.executableURL = URL(fileURLWithPath: argusPythonPath)
process.currentDirectoryURL = URL(fileURLWithPath: argusRepoPath)
process.arguments = arguments
// Pipe stdout for RESULT:/STATUS:/EXEC_OUTPUT: lines
// stderr is NOT captured leaving it unset lets argus log to the system console
// without risk of the pipe buffer filling and blocking the process.
let stdoutPipe = Pipe()
let stdinPipe = Pipe()
process.standardOutput = stdoutPipe
process.standardInput = stdinPipe
do {
try process.run()
} catch {
startCapture()
return
}
argusProcess = process
argusStdinPipe = stdinPipe
isCapturing = true
// Read RESULT:/STATUS:/EXEC_OUTPUT: lines from argus stdout in a background task
let fileHandle = stdoutPipe.fileHandleForReading
argusReadTask = Task { [weak self] in
do {
for try await line in fileHandle.bytes.lines {
guard let self, !Task.isCancelled else { break }
if line.hasPrefix("RESULT:") {
let jsonStr = String(line.dropFirst("RESULT:".count))
if let data = jsonStr.data(using: .utf8),
let result = try? JSONDecoder().decode(DistractionAnalysisResponse.self, from: data) {
await MainActor.run { self.applyDistractionResult(result) }
}
} else if line.hasPrefix("STATUS:exec_done:") {
await MainActor.run { self.isExecuting = false }
} else if line.hasPrefix("EXEC_OUTPUT:") {
let jsonStr = String(line.dropFirst("EXEC_OUTPUT:".count))
if let data = jsonStr.data(using: .utf8),
let obj = try? JSONSerialization.jsonObject(with: data) as? [String: String],
let title = obj["title"], let content = obj["content"] {
await MainActor.run { self.executorOutput = (title: title, content: content) }
}
}
}
} catch {
// Pipe closed argus process ended
}
}
}
private func stopArgus() {
argusReadTask?.cancel()
argusReadTask = nil
argusStdinPipe = nil
if let proc = argusProcess {
proc.terminate()
argusProcess = nil
isCapturing = false
}
}
// MARK: - Screenshot Capture Loop (fallback when argus is unavailable)
// MARK: - Screenshot Capture Loop
private func startCapture() {
guard !isCapturing else { return }
isCapturing = true
captureTask = Task { [weak self] in
guard let self else { return }
// Capture immediately on session start, then repeat on interval
// Capture immediately, then repeat on interval
await self.captureAndAnalyze()
while !Task.isCancelled && self.isSessionActive {
while !Task.isCancelled {
try? await Task.sleep(for: .seconds(self.captureInterval))
guard !Task.isCancelled && self.isSessionActive else { break }
guard !Task.isCancelled else { break }
await self.captureAndAnalyze()
}
}
@@ -453,56 +434,77 @@ final class SessionManager {
isCapturing = false
}
/// Capture one frame, buffer it, and call VLM every `framesPerVLMCall` frames.
///
/// Returns early (doing nothing further) when there is no active session,
/// when `captureScreen()` yields no image, when fewer than `framesPerVLMCall`
/// frames are buffered, or when no Gemini API key is stored in `UserDefaults`.
///
/// NOTE(review): this listing appears to interleave two variants of the body —
/// one calling `APIClient.shared.analyzeScreenshot` and one calling
/// `GeminiVLMClient.analyze` (there are two `do {` openers and two
/// `windowTitle` declarations). Confirm which lines are live before editing.
private func captureAndAnalyze() async {
guard let session = activeSession else { return }
guard let imageData = await captureScreen() else { return }
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? "Unknown"
var context = buildTaskContext()
frameBuffer.append(imageData)
// Keep buffer bounded — rolling window of most recent frames
if frameBuffer.count > framesPerVLMCall { frameBuffer.removeFirst() }
// Inject rolling history so the VLM has temporal context across captures.
// Only summaries (text) are sent — not the raw images — to keep token cost low.
if !screenshotHistory.isEmpty {
let iso = ISO8601DateFormatter()
context["screenshot_history"] = screenshotHistory.map { entry in
["summary": entry.summary, "timestamp": iso.string(from: entry.timestamp)]
}
// Only call VLM once we have a full batch for temporal diff analysis
guard frameBuffer.count >= framesPerVLMCall else { return }
// The Gemini key is read from UserDefaults on every call; an empty or missing key skips analysis.
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
guard !geminiKey.isEmpty else {
print("[VLM] No Gemini API key set — skipping analysis")
return
}
do {
let result = try await APIClient.shared.analyzeScreenshot(
imageData: imageData,
windowTitle: windowTitle,
sessionId: session.id,
taskContext: context
)
let client = GeminiVLMClient(apiKey: geminiKey)
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? ""
let recentSummaries = screenshotHistory.map(\.summary)
let frames = frameBuffer // snapshot before async gap
// Append this result's summary to the rolling buffer (max 4 entries)
if let summary = result.vlmSummary {
screenshotHistory.append(ScreenshotHistoryEntry(summary: summary, timestamp: Date()))
do {
print("[VLM] Calling Gemini with \(frames.count) frames…")
let result = try await client.analyze(
frames: frames,
taskTitle: activeTask?.title ?? "",
taskGoal: activeTask?.description ?? "",
steps: activeSteps,
windowTitle: windowTitle,
recentSummaries: recentSummaries
)
print("[VLM] Result: on_task=\(result.onTask), friction=\(result.friction?.type ?? "none"), summary=\(result.vlmSummary ?? "")")
// Append to rolling summary history (non-empty summaries only, capped at 4 entries)
if let summary = result.vlmSummary, !summary.isEmpty {
screenshotHistory.append(HistoryEntry(summary: summary, timestamp: Date()))
if screenshotHistory.count > 4 { screenshotHistory.removeFirst() }
}
// Clear frame buffer — next batch starts fresh
frameBuffer.removeAll()
monitoringError = nil
applyDistractionResult(result)
// Post result to backend (fire-and-forget): errors from the upload are discarded via `try?`
if let session = activeSession {
Task {
try? await APIClient.shared.postAnalysisResult(result, sessionId: session.id)
}
}
} catch {
// Silent fail — don't interrupt the user
print("[VLM] Analysis error: \(error)")
// Don't surface transient errors — the next attempt will retry automatically
}
}
// MARK: - Screen Capture
private func captureScreen() async -> Data? {
guard CGPreflightScreenCaptureAccess() else { return nil }
do {
let content = try await SCShareableContent.current
guard let display = content.displays.first else { return nil }
let config = SCStreamConfiguration()
config.width = 1280
config.height = 720
let filter = SCContentFilter(display: display, excludingWindows: [])
let image = try await SCScreenshotManager.captureImage(
contentFilter: filter,
configuration: config
)
contentFilter: filter, configuration: config)
return cgImageToJPEG(image)
} catch {
return nil
@@ -518,29 +520,13 @@ final class SessionManager {
return jpeg
}
/// Builds the task-context dictionary describing the active task and its steps.
///
/// - Returns: An empty dictionary when `activeTask` is `nil`. Otherwise a
///   dictionary with `task_title`, `task_goal` (the task description, falling
///   back to the title), and `steps` — one dictionary per entry in
///   `activeSteps` carrying `id`, `sort_order`, `title`, `status`, and
///   `checkpoint_note` when the step has one.
private func buildTaskContext() -> [String: Any] {
    guard let currentTask = activeTask else { return [:] }

    var payload: [String: Any] = [:]
    payload["task_title"] = currentTask.title
    payload["task_goal"] = currentTask.description ?? currentTask.title
    payload["steps"] = activeSteps.map { step -> [String: Any] in
        var stepInfo: [String: Any] = [
            "id": step.id,
            "sort_order": step.sortOrder,
            "title": step.title,
            "status": step.status
        ]
        // The checkpoint note is optional; the key is omitted entirely when absent.
        if let note = step.checkpointNote {
            stepInfo["checkpoint_note"] = note
        }
        return stepInfo
    }
    return payload
}
// MARK: - Apply VLM Result
private func applyDistractionResult(_ result: DistractionAnalysisResponse) {
// 0. Store latest summary for the floating HUD
if let summary = result.vlmSummary { latestVlmSummary = summary }
if let task = result.inferredTask, !task.isEmpty { latestInferredTask = task }
// 1. Apply step side-effects (always)
// Apply step side-effects
for completedId in result.stepsCompleted {
if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) {
activeSteps[idx].status = "done"
@@ -556,22 +542,25 @@ final class SessionManager {
currentStepIndex = idx
}
// 2. Notification priority (design spec §1.5):
// Proactive friction help > Context resume > Gentle nudge
// NEVER nudge when the system could help instead.
if let friction = result.friction, friction.isActionable {
if friction.isResumption {
// Task resumption detected — auto-surface resume card without button press
Task { await fetchResumeCard() }
} else if proactiveCard == nil {
showProactiveCard(ProactiveCard(source: .vlmFriction(
frictionType: friction.type,
description: friction.description,
actions: friction.proposedActions
)), vlmCard: true)
// Notification priority: friction card (formal or has actions) > nudge
if let friction = result.friction {
let shouldShow = friction.isActionable || !friction.proposedActions.isEmpty
if shouldShow {
if friction.isResumption {
Task { await fetchResumeCard() }
} else if proactiveCard == nil {
showProactiveCard(ProactiveCard(source: .vlmFriction(
frictionType: friction.type,
description: friction.description,
actions: friction.proposedActions
)))
}
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
distractionCount += 1
lastNudge = nudge
sendNudgeNotification(nudge)
}
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
// Only nudge if VLM found no actionable friction
distractionCount += 1
lastNudge = nudge
sendNudgeNotification(nudge)