include argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

11
.gitignore vendored Normal file
View File

@@ -0,0 +1,11 @@
# Python venv — run `python3 -m venv .venv && .venv/bin/pip install -r argus/requirements.txt` to recreate
.venv/
# Python bytecode
argus/__pycache__/
argus/**/__pycache__/
*.pyc
*.pyo
# macOS
.DS_Store

View File

@@ -18,6 +18,7 @@ struct FloatingHUDView: View {
.animation(.spring(duration: 0.3), value: session.proactiveCard?.id) .animation(.spring(duration: 0.3), value: session.proactiveCard?.id)
.animation(.spring(duration: 0.3), value: session.isExecuting) .animation(.spring(duration: 0.3), value: session.isExecuting)
.animation(.spring(duration: 0.3), value: session.executorOutput?.title) .animation(.spring(duration: 0.3), value: session.executorOutput?.title)
.animation(.spring(duration: 0.3), value: session.monitoringError)
} }
// MARK: - Header // MARK: - Header
@@ -58,6 +59,12 @@ struct FloatingHUDView: View {
@ViewBuilder @ViewBuilder
private var content: some View { private var content: some View {
// Error / warning banner shown above all other content when monitoring has a problem
if let error = session.monitoringError {
MonitoringErrorBanner(message: error)
.transition(.move(edge: .top).combined(with: .opacity))
}
// Executor output sticky card (highest priority persists until dismissed) // Executor output sticky card (highest priority persists until dismissed)
if let output = session.executorOutput { if let output = session.executorOutput {
ExecutorOutputCard(title: output.title, content: output.content) { ExecutorOutputCard(title: output.title, content: output.content) {
@@ -83,11 +90,21 @@ struct FloatingHUDView: View {
.transition(.move(edge: .top).combined(with: .opacity)) .transition(.move(edge: .top).combined(with: .opacity))
} }
// Latest VLM summary (idle state) // Latest VLM summary (idle state)
else { else if session.monitoringError == nil {
VStack(alignment: .leading, spacing: 4) {
if let task = session.latestInferredTask, !task.isEmpty {
Text(task)
.font(.caption.bold())
.foregroundStyle(.primary)
.fixedSize(horizontal: false, vertical: true)
.lineLimit(2)
}
Text(session.latestVlmSummary ?? "Monitoring your screen…") Text(session.latestVlmSummary ?? "Monitoring your screen…")
.font(.caption) .font(.caption)
.foregroundStyle(.secondary) .foregroundStyle(.secondary)
.fixedSize(horizontal: false, vertical: true) .fixedSize(horizontal: false, vertical: true)
.lineLimit(3)
}
.padding(14) .padding(14)
.transition(.opacity) .transition(.opacity)
} }
@@ -128,8 +145,17 @@ private struct HUDCardView: View {
.buttonStyle(.plain) .buttonStyle(.plain)
} }
// Action buttons only for VLM friction with proposed actions // Action buttons
if case .vlmFriction(_, _, let actions) = card.source, !actions.isEmpty { actionButtons
}
.padding(14)
.background(Color.purple.opacity(0.07))
}
@ViewBuilder
private var actionButtons: some View {
switch card.source {
case .vlmFriction(_, _, let actions) where !actions.isEmpty:
VStack(alignment: .leading, spacing: 6) { VStack(alignment: .leading, spacing: 6) {
ForEach(Array(actions.prefix(2).enumerated()), id: \.offset) { index, action in ForEach(Array(actions.prefix(2).enumerated()), id: \.offset) { index, action in
Button { Button {
@@ -157,18 +183,49 @@ private struct HUDCardView: View {
.buttonStyle(.plain) .buttonStyle(.plain)
.foregroundStyle(.purple) .foregroundStyle(.purple)
} }
notNowButton
}
case .sessionAction(let type, _, _, _, _):
VStack(alignment: .leading, spacing: 6) {
Button {
session.approveProactiveCard(actionIndex: 0)
} label: {
Text(sessionActionButtonLabel(type))
.font(.caption.bold())
.frame(maxWidth: .infinity, alignment: .leading)
.padding(.horizontal, 10)
.padding(.vertical, 6)
.background(Color.purple.opacity(0.12))
.clipShape(.rect(cornerRadius: 8))
}
.buttonStyle(.plain)
.foregroundStyle(.purple)
notNowButton
}
default:
EmptyView()
}
}
private var notNowButton: some View {
Button("Not now — I'm good") { session.dismissProactiveCard() } Button("Not now — I'm good") { session.dismissProactiveCard() }
.font(.caption) .font(.caption)
.foregroundStyle(.secondary) .foregroundStyle(.secondary)
.buttonStyle(.plain) .buttonStyle(.plain)
.padding(.top, 2) .padding(.top, 2)
} }
private func sessionActionButtonLabel(_ type: String) -> String {
switch type {
case "resume": return "Resume session"
case "switch": return "Switch to this task"
case "complete": return "Mark complete"
case "start_new": return "Start focus session"
default: return "OK"
} }
} }
.padding(14)
.background(Color.purple.opacity(0.07))
}
private var bodyText: String { private var bodyText: String {
switch card.source { switch card.source {
@@ -176,10 +233,59 @@ private struct HUDCardView: View {
return description ?? "I noticed something that might be slowing you down." return description ?? "I noticed something that might be slowing you down."
case .appSwitchLoop(let apps, let count): case .appSwitchLoop(let apps, let count):
return "You've switched between \(apps.joined(separator: "")) \(count)× — are you stuck?" return "You've switched between \(apps.joined(separator: "")) \(count)× — are you stuck?"
case .sessionAction(_, _, let checkpoint, let reason, _):
if !checkpoint.isEmpty { return "Left off: \(checkpoint)" }
return reason.isEmpty ? "Argus noticed a session change." : reason
} }
} }
} }
// MARK: - Monitoring Error Banner
private struct MonitoringErrorBanner: View {
// Error / status text to display; also drives the banner's styling via `isRestarting`.
let message: String
@Environment(SessionManager.self) private var session
// Heuristic: treat the banner as a transient "restarting" state (orange, pulsing
// icon, no Retry button) when the message mentions "restarting".
// NOTE(review): substring matching on user-facing text is fragile — a typed
// error/restarting state on SessionManager would be safer; confirm the message
// wording is stable wherever monitoringError is set.
private var isRestarting: Bool { message.contains("restarting") }
var body: some View {
VStack(alignment: .leading, spacing: 8) {
HStack(spacing: 8) {
// Spinning-arrow icon while auto-restarting, warning triangle on hard failure.
Image(systemName: isRestarting ? "arrow.clockwise" : "exclamationmark.triangle.fill")
.font(.caption)
.foregroundStyle(isRestarting ? .orange : .red)
.symbolEffect(.pulse, isActive: isRestarting)
Text(message)
.font(.caption)
.foregroundStyle(isRestarting ? .orange : .red)
.fixedSize(horizontal: false, vertical: true)
Spacer(minLength: 0)
}
// Manual retry only applies to hard failures; while auto-restarting we just show status.
if !isRestarting {
Button("Retry") { session.retryMonitoring() }
.font(.caption.bold())
.foregroundStyle(.white)
.padding(.horizontal, 12)
.padding(.vertical, 5)
.background(Color.red.opacity(0.8))
.clipShape(.rect(cornerRadius: 6))
.buttonStyle(.plain)
}
}
.padding(12)
// Tint the whole banner to match severity (orange = transient, red = failure).
.background(isRestarting ? Color.orange.opacity(0.08) : Color.red.opacity(0.08))
// 3pt accent stripe on the leading edge, colored to match severity.
.overlay(
Rectangle()
.frame(width: 3)
.foregroundStyle(isRestarting ? Color.orange : Color.red),
alignment: .leading
)
}
}
// MARK: - Executor Output Sticky Card // MARK: - Executor Output Sticky Card
private struct ExecutorOutputCard: View { private struct ExecutorOutputCard: View {

278
GeminiVLMClient.swift Normal file
View File

@@ -0,0 +1,278 @@
// GeminiVLMClient.swift Native Swift Gemini Vision API client
// Ports the Python argus VLM analysis (vlm.py) directly into Swift.
// No subprocess required: screenshots go straight from ScreenCaptureKit to the Gemini API.
import Foundation
struct GeminiVLMClient {
    private static let apiBase = "https://generativelanguage.googleapis.com/v1beta/models"
    private static let model = "gemini-3.1-pro-preview"

    /// Gemini API key. Sent via the `x-goog-api-key` request header rather than a
    /// URL query parameter, so the secret never appears in URLs (which routinely
    /// end up in logs, proxies, and crash reports).
    let apiKey: String

    // MARK: - Public

    /// Analyze a sequence of JPEG frames and return a structured distraction analysis.
    /// - Parameters:
    ///   - frames: JPEG screenshot frames, oldest first, newest last.
    ///   - taskTitle: Current task title (empty if no session).
    ///   - taskGoal: Task description / goal.
    ///   - steps: Active step list for the current task.
    ///   - windowTitle: Frontmost app name from NSWorkspace.
    ///   - recentSummaries: Rolling summaries from previous analyses (temporal context).
    /// - Throws: `URLError` for transport failures, non-200 responses, or an
    ///   unparseable reply; `DecodingError` if the model's JSON does not match
    ///   `DistractionAnalysisResponse`.
    func analyze(
        frames: [Data],
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) async throws -> DistractionAnalysisResponse {
        let prompt = buildPrompt(
            taskTitle: taskTitle,
            taskGoal: taskGoal,
            steps: steps,
            windowTitle: windowTitle,
            recentSummaries: recentSummaries
        )
        let raw = try await callGemini(prompt: prompt, frames: frames)
        return try parseResponse(raw)
    }

    // MARK: - Prompt Builder (ported from vlm.py build_system_prompt)

    /// Builds the system prompt describing the frame sequence, task context,
    /// step list, and the exact JSON schema the model must return.
    private func buildPrompt(
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) -> String {
        let stepsText: String
        if steps.isEmpty {
            stepsText = " (no steps defined)"
        } else {
            stepsText = steps.map { s in
                // Visual status marker so the model can scan step state at a glance.
                // (Restored: the markers were previously empty strings, which made
                // all three known statuses render identically.)
                let marker: String
                switch s.status {
                case "pending": marker = "○"
                case "in_progress": marker = "▶"
                case "done": marker = "✓"
                default: marker = "?"
                }
                var line = " \(marker) [\(s.status)] (id=\(s.id)) \(s.sortOrder). \(s.title)"
                if let note = s.checkpointNote { line += " — checkpoint: \(note)" }
                return line
            }.joined(separator: "\n")
        }
        let historyText: String
        if recentSummaries.isEmpty {
            historyText = " (no previous frames)"
        } else {
            historyText = recentSummaries.enumerated()
                .map { i, s in " [frame \(i + 1)] \(s)" }
                .joined(separator: "\n")
        }
        return """
        You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
        ## How to read the screenshots
        You receive screenshots in chronological order (oldest first, newest last).
        Each frame is ~5 seconds apart. This means:
        - 2 unchanged frames = ~10 seconds idle — significant.
        - 3 unchanged frames = ~15 seconds idle — user is stuck or distracted.
        - If ALL frames are identical, the user has been idle for 15+ seconds — flag it.
        Your PRIMARY signal is the DIFFERENCES between consecutive frames.
        Where the screen CHANGED = where attention is. Static areas = ignore.
        Diff signals and what they mean:
        - New text appearing / cursor advancing → user is actively typing (this IS their task)
        - Window or tab switch → context change, could be reference or distraction
        - Same content, no pixel changes → stalled, idle, or reading
        - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
        - Error message that APPEARED between frames → user just triggered it, relevant
        - Error message already in ALL frames → stale, ignore
        CRITICAL — looking at something ≠ working on something:
        - User switches to browser/another app and just LOOKS → distraction or quick reference.
        - User switches and starts TYPING/EDITING → might be a new task.
        - If the user has an active session and switches away WITHOUT typing in the new app,
        they are DISTRACTED from their session, not starting a new task.
        - A single app switch is NEVER enough to infer a new task. Wait for active work.
        ## Current task context
        Task: \(taskTitle.isEmpty ? "(no active task)" : taskTitle)
        Goal: \(taskGoal.isEmpty ? taskTitle : taskGoal)
        Steps:
        \(stepsText)
        Window title (OS): \(windowTitle.isEmpty ? "(unknown)" : windowTitle)
        ## Recent screen history (for temporal context)
        \(historyText)
        ## What to output
        Analyze the screenshots and return JSON with EXACTLY this structure (no extra fields, no markdown):
        {
        "on_task": true,
        "current_step_id": "step UUID or null",
        "inferred_task": "what the user is actually working on based on screen diffs",
        "checkpoint_note_update": "what specifically changed across these frames",
        "steps_completed": [],
        "friction": {
        "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
        "confidence": 0.0,
        "description": "what the user is struggling with",
        "proposed_actions": [
        {
        "label": "specific verb phrase: what to do",
        "action_type": "auto_extract | brain_dump | other",
        "details": "natural language spec for what action to take"
        }
        ],
        "source_context": "filename or app name, or null",
        "target_context": "filename or app name, or null"
        },
        "session_action": {
        "type": "none",
        "session_id": null,
        "reason": ""
        },
        "intent": "skimming | engaged | unclear | null",
        "distraction_type": "app_switch | browsing | idle | null",
        "app_name": "primary visible application",
        "confidence": 0.8,
        "gentle_nudge": "short nudge message if distracted but no friction action applies, otherwise null",
        "vlm_summary": "1-sentence description of what CHANGED across the frames (not what is static)"
        }
        FRICTION DETECTION rules:
        - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
        - STALLED: No meaningful pixel changes across 2+ frames; or user wrote then deleted
        - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
        - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
        - TASK_RESUMPTION: User just returned to a task they were working on earlier
        If friction confidence < 0.5, set type to "none".
        Only set gentle_nudge when user is off-task AND no actionable friction applies.
        """
    }

    // MARK: - Action Executor

    /// Execute a user-approved proactive action and return a plain-text result.
    /// - Parameters:
    ///   - label: The action label the user approved (quoted verbatim to the model).
    ///   - actionType: "auto_extract" | "brain_dump" | anything else — selects the instruction.
    ///   - details: Natural-language spec from the analysis step (may be empty).
    ///   - screenshot: Optional JPEG of the current screen for visual context.
    func executeAction(
        label: String,
        actionType: String,
        details: String,
        screenshot: Data?
    ) async throws -> String {
        let taskInstruction: String
        switch actionType {
        case "auto_extract":
            taskInstruction = "Extract the relevant data from the screenshot and present it concisely as plain text."
        case "brain_dump":
            taskInstruction = "Format this as a short brain-dump note the user should add to their task list."
        default:
            // Fixed: previously read "Provide 23 concrete next steps" (dropped dash).
            taskInstruction = "Provide 2-3 concrete next steps the user can take right now."
        }
        let prompt = """
        You are a productivity assistant. The user approved this action: "\(label)"
        Details: \(details.isEmpty ? "(none)" : details)
        \(taskInstruction)
        Be specific and brief (3-5 sentences max). No markdown, no preamble, plain text only.
        """
        // Wrap the optional screenshot into the frames array expected by callGemini.
        let frames: [Data] = screenshot.map { [$0] } ?? []
        return try await callGemini(prompt: prompt, frames: frames)
    }

    // MARK: - Gemini REST API Call

    /// POSTs the prompt + frames to the Gemini generateContent endpoint and
    /// returns the first candidate's raw text.
    /// - Throws: `URLError(.badServerResponse)` on non-200 status,
    ///   `URLError(.cannotParseResponse)` when the response shape is unexpected.
    private func callGemini(prompt: String, frames: [Data]) async throws -> String {
        // API key travels in a header (below), never in the URL.
        let urlStr = "\(Self.apiBase)/\(Self.model):generateContent"
        guard let url = URL(string: urlStr) else { throw URLError(.badURL) }
        // Build content parts: a caption + image for each frame, then the instruction.
        var parts: [[String: Any]] = []
        let total = frames.count
        for (i, frame) in frames.enumerated() {
            // Caption gives each frame's position and approximate age; frames are
            // captured ~5s apart with the newest last, so frame i is (total-i)*5s old.
            parts.append(["text": "[Screenshot \(i + 1)/\(total), \((total - i) * 5)s ago]"])
            parts.append([
                "inlineData": [
                    "mimeType": "image/jpeg",
                    "data": frame.base64EncodedString()
                ]
            ])
        }
        parts.append(["text": "Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences."])
        let body: [String: Any] = [
            // System prompt goes in systemInstruction; the frames + final
            // instruction form the single user turn.
            "systemInstruction": ["parts": [["text": prompt]]],
            "contents": [["parts": parts]],
            "generationConfig": [
                "temperature": 0.2,
                "maxOutputTokens": 1024
            ]
        ]
        var request = URLRequest(url: url)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        // Official header-based auth; keeps the key out of logged URLs.
        request.setValue(apiKey, forHTTPHeaderField: "x-goog-api-key")
        request.httpBody = try JSONSerialization.data(withJSONObject: body)
        request.timeoutInterval = 60
        let (data, response) = try await URLSession.shared.data(for: request)
        if let http = response as? HTTPURLResponse, http.statusCode != 200 {
            let msg = String(data: data, encoding: .utf8) ?? "HTTP \(http.statusCode)"
            print("[GeminiVLM] API error \(http.statusCode): \(msg)")
            throw URLError(.badServerResponse)
        }
        // Drill into candidates[0].content.parts[0].text — the standard
        // generateContent response shape.
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let candidates = json["candidates"] as? [[String: Any]],
              let first = candidates.first,
              let content = first["content"] as? [String: Any],
              let contentParts = content["parts"] as? [[String: Any]],
              let text = contentParts.first?["text"] as? String
        else {
            let raw = String(data: data, encoding: .utf8) ?? ""
            print("[GeminiVLM] Unexpected response shape: \(raw.prefix(300))")
            throw URLError(.cannotParseResponse)
        }
        print("[GeminiVLM] Response (\(text.count) chars): \(text.prefix(200))")
        return text
    }

    // MARK: - Response Parsing

    /// Parse the model's text reply into a `DistractionAnalysisResponse`.
    /// Tolerates markdown code fences and chatter surrounding the JSON object.
    private func parseResponse(_ text: String) throws -> DistractionAnalysisResponse {
        var cleaned = text.trimmingCharacters(in: .whitespacesAndNewlines)
        // Strip ```json ... ``` or ``` ... ``` fences
        if cleaned.hasPrefix("```") {
            let lines = cleaned.components(separatedBy: "\n")
            cleaned = lines.dropFirst().joined(separator: "\n")
            if let backtickRange = cleaned.range(of: "```") {
                cleaned = String(cleaned[..<backtickRange.lowerBound])
            }
            cleaned = cleaned.trimmingCharacters(in: .whitespacesAndNewlines)
        }
        // Find JSON object boundaries robustly: take the outermost { ... } span,
        // discarding any preamble/epilogue the model added despite instructions.
        guard let start = cleaned.firstIndex(of: "{"),
              let end = cleaned.lastIndex(of: "}") else {
            throw URLError(.cannotParseResponse)
        }
        let jsonStr = String(cleaned[start...end])
        guard let jsonData = jsonStr.data(using: .utf8) else {
            throw URLError(.cannotParseResponse)
        }
        return try JSONDecoder().decode(DistractionAnalysisResponse.self, from: jsonData)
    }
}

View File

@@ -7,6 +7,7 @@
objects = { objects = {
/* Begin PBXBuildFile section */ /* Begin PBXBuildFile section */
FF341F642F7932FA00B5716A /* GeminiVLMClient.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */; };
FF935B1E2F78A83100ED3330 /* SpeakerKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1D2F78A83100ED3330 /* SpeakerKit */; }; FF935B1E2F78A83100ED3330 /* SpeakerKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1D2F78A83100ED3330 /* SpeakerKit */; };
FF935B202F78A83100ED3330 /* TTSKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1F2F78A83100ED3330 /* TTSKit */; }; FF935B202F78A83100ED3330 /* TTSKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1F2F78A83100ED3330 /* TTSKit */; };
FF935B222F78A83100ED3330 /* WhisperKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B212F78A83100ED3330 /* WhisperKit */; }; FF935B222F78A83100ED3330 /* WhisperKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B212F78A83100ED3330 /* WhisperKit */; };
@@ -16,6 +17,7 @@
/* Begin PBXFileReference section */ /* Begin PBXFileReference section */
FF3296C22F785B3300C734EB /* LockInBro.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LockInBro.app; sourceTree = BUILT_PRODUCTS_DIR; }; FF3296C22F785B3300C734EB /* LockInBro.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LockInBro.app; sourceTree = BUILT_PRODUCTS_DIR; };
FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GeminiVLMClient.swift; sourceTree = "<group>"; };
FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingPanel.swift; sourceTree = "<group>"; }; FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingPanel.swift; sourceTree = "<group>"; };
FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingHUDView.swift; sourceTree = "<group>"; }; FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingHUDView.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
@@ -49,6 +51,7 @@
FF3296C32F785B3300C734EB /* Products */, FF3296C32F785B3300C734EB /* Products */,
FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */, FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */,
FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */, FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */,
FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */,
); );
sourceTree = "<group>"; sourceTree = "<group>";
}; };
@@ -141,6 +144,7 @@
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
FF935B262F78D0BF00ED3330 /* FloatingHUDView.swift in Sources */, FF935B262F78D0BF00ED3330 /* FloatingHUDView.swift in Sources */,
FF341F642F7932FA00B5716A /* GeminiVLMClient.swift in Sources */,
FF935B242F78D0AA00ED3330 /* FloatingPanel.swift in Sources */, FF935B242F78D0AA00ED3330 /* FloatingPanel.swift in Sources */,
); );
runOnlyForDeploymentPostprocessing = 0; runOnlyForDeploymentPostprocessing = 0;

View File

@@ -24,16 +24,34 @@ enum NetworkError: Error, LocalizedError {
final class TokenStore { final class TokenStore {
static let shared = TokenStore() static let shared = TokenStore()
private let key = "lockInBro.jwt" private let accessKey = "lockInBro.jwt"
private let refreshKey = "lockInBro.refreshToken"
private init() {} private init() {}
var token: String? { var token: String? {
get { UserDefaults.standard.string(forKey: key) } get { UserDefaults.standard.string(forKey: accessKey) }
set { set {
if let v = newValue { UserDefaults.standard.set(v, forKey: key) } if let v = newValue { UserDefaults.standard.set(v, forKey: accessKey) }
else { UserDefaults.standard.removeObject(forKey: key) } else { UserDefaults.standard.removeObject(forKey: accessKey) }
} }
} }
var refreshToken: String? {
get { UserDefaults.standard.string(forKey: refreshKey) }
set {
if let v = newValue { UserDefaults.standard.set(v, forKey: refreshKey) }
else { UserDefaults.standard.removeObject(forKey: refreshKey) }
}
}
func clear() {
token = nil
refreshToken = nil
}
}
extension Notification.Name {
static let lockInBroAuthExpired = Notification.Name("lockInBroAuthExpired")
} }
// MARK: - APIClient // MARK: - APIClient
@@ -46,13 +64,17 @@ final class APIClient {
// MARK: Core Request // MARK: Core Request
// Coalesces concurrent 401-triggered refreshes into one request
private var activeRefreshTask: Task<Bool, Never>?
private func req( private func req(
_ path: String, _ path: String,
method: String = "GET", method: String = "GET",
body: Data? = nil, body: Data? = nil,
contentType: String = "application/json", contentType: String = "application/json",
auth: Bool = true, auth: Bool = true,
timeout: TimeInterval = 30 timeout: TimeInterval = 30,
isRetry: Bool = false
) async throws -> Data { ) async throws -> Data {
guard let url = URL(string: base + path) else { guard let url = URL(string: base + path) else {
throw NetworkError.unknown(URLError(.badURL)) throw NetworkError.unknown(URLError(.badURL))
@@ -75,6 +97,17 @@ final class APIClient {
throw NetworkError.unknown(URLError(.badServerResponse)) throw NetworkError.unknown(URLError(.badServerResponse))
} }
guard http.statusCode < 400 else { guard http.statusCode < 400 else {
if http.statusCode == 401 && auth && !isRetry {
// Try to silently refresh the access token, then retry once
let refreshed = await refreshAccessToken()
if refreshed {
return try await req(path, method: method, body: body,
contentType: contentType, auth: auth,
timeout: timeout, isRetry: true)
}
// Refresh also failed — force the user to log in again
await MainActor.run { AuthManager.shared.handleSessionExpired() }
}
let msg = (try? JSONDecoder().decode(APIErrorResponse.self, from: data))?.detail let msg = (try? JSONDecoder().decode(APIErrorResponse.self, from: data))?.detail
?? String(data: data, encoding: .utf8) ?? String(data: data, encoding: .utf8)
?? "Unknown error" ?? "Unknown error"
@@ -83,6 +116,32 @@ final class APIClient {
return data return data
} }
/// Refreshes the access token. Concurrent callers share one in-flight request.
private func refreshAccessToken() async -> Bool {
if let existing = activeRefreshTask { return await existing.value }
let task = Task<Bool, Never> {
defer { self.activeRefreshTask = nil }
guard let refresh = TokenStore.shared.refreshToken else { return false }
do {
let body = try JSONSerialization.data(withJSONObject: ["refresh_token": refresh])
guard let url = URL(string: base + "/auth/refresh") else { return false }
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.httpBody = body
req.timeoutInterval = 30
let (data, res) = try await urlSession.data(for: req)
guard let http = res as? HTTPURLResponse, http.statusCode == 200 else { return false }
let auth = try self.decode(AuthResponse.self, from: data)
TokenStore.shared.token = auth.accessToken
TokenStore.shared.refreshToken = auth.refreshToken
return true
} catch { return false }
}
activeRefreshTask = task
return await task.value
}
private func decode<T: Decodable>(_ type: T.Type, from data: Data) throws -> T { private func decode<T: Decodable>(_ type: T.Type, from data: Data) throws -> T {
let decoder = JSONDecoder() let decoder = JSONDecoder()
decoder.dateDecodingStrategy = .iso8601 decoder.dateDecodingStrategy = .iso8601
@@ -206,6 +265,16 @@ final class APIClient {
// MARK: - Sessions // MARK: - Sessions
/// Returns the currently active session, or nil if none (404).
func getActiveSession() async throws -> FocusSession? {
do {
let data = try await req("/sessions/active")
return try decode(FocusSession.self, from: data)
} catch NetworkError.httpError(404, _) {
return nil
}
}
func startSession(taskId: String?) async throws -> FocusSession { func startSession(taskId: String?) async throws -> FocusSession {
var dict: [String: Any] = ["platform": "mac"] var dict: [String: Any] = ["platform": "mac"]
if let tid = taskId { dict["task_id"] = tid } if let tid = taskId { dict["task_id"] = tid }
@@ -268,9 +337,35 @@ final class APIClient {
_ = try await req("/distractions/app-activity", method: "POST", body: body) _ = try await req("/distractions/app-activity", method: "POST", body: body)
} }
// MARK: - Distraction / Screenshot Analysis // MARK: - Distraction / VLM Analysis
// Note: spec primary endpoint is POST /distractions/analyze-result (device-side VLM, JSON only).
// Backend currently implements analyze-screenshot (legacy fallback) using that until analyze-result is deployed. /// Post a VLM analysis result (from GeminiVLMClient) to the backend.
/// This updates the session checkpoint so the backend has the latest on_task / friction data.
func postAnalysisResult(_ result: DistractionAnalysisResponse, sessionId: String) async throws {
var payload: [String: Any] = [
"session_id": sessionId,
"on_task": result.onTask,
"confidence": result.confidence,
"vlm_summary": result.vlmSummary ?? "",
"steps_completed": result.stepsCompleted,
]
if let stepId = result.currentStepId { payload["current_step_id"] = stepId }
if let note = result.checkpointNoteUpdate { payload["checkpoint_note_update"] = note }
if let app = result.appName { payload["app_name"] = app }
if let nudge = result.gentleNudge { payload["gentle_nudge"] = nudge }
if let friction = result.friction {
payload["friction"] = [
"type": friction.type,
"confidence": friction.confidence,
"description": friction.description as Any,
"proposed_actions": friction.proposedActions.map {
["label": $0.label, "action_type": $0.actionType, "details": $0.details as Any]
},
]
}
let body = try JSONSerialization.data(withJSONObject: payload)
_ = try await req("/distractions/analyze-result", method: "POST", body: body)
}
func analyzeScreenshot( func analyzeScreenshot(
imageData: Data, imageData: Data,

View File

@@ -22,6 +22,7 @@ final class AuthManager {
do { do {
let response = try await APIClient.shared.login(email: email, password: password) let response = try await APIClient.shared.login(email: email, password: password)
TokenStore.shared.token = response.accessToken TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user currentUser = response.user
isLoggedIn = true isLoggedIn = true
} catch { } catch {
@@ -40,6 +41,7 @@ final class AuthManager {
displayName: displayName displayName: displayName
) )
TokenStore.shared.token = response.accessToken TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user currentUser = response.user
isLoggedIn = true isLoggedIn = true
} catch { } catch {
@@ -58,6 +60,7 @@ final class AuthManager {
fullName: fullName fullName: fullName
) )
TokenStore.shared.token = response.accessToken TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user currentUser = response.user
isLoggedIn = true isLoggedIn = true
} catch { } catch {
@@ -67,8 +70,20 @@ final class AuthManager {
} }
func logout() { func logout() {
TokenStore.shared.token = nil SessionManager.shared.stopMonitoring()
TokenStore.shared.clear()
currentUser = nil currentUser = nil
isLoggedIn = false isLoggedIn = false
errorMessage = nil
}
/// Called by APIClient when the server returns 401 and the refresh token is also dead.
func handleSessionExpired() {
guard isLoggedIn else { return }
SessionManager.shared.stopMonitoring()
TokenStore.shared.clear()
currentUser = nil
isLoggedIn = false
errorMessage = "Your session expired — please log in again."
} }
} }

View File

@@ -401,6 +401,8 @@ private struct ProactiveCardView: View {
return description ?? "I noticed something that might be slowing you down." return description ?? "I noticed something that might be slowing you down."
case .appSwitchLoop(let apps, let count): case .appSwitchLoop(let apps, let count):
return "You've switched between \(apps.joined(separator: "")) \(count)× in a row — are you stuck?" return "You've switched between \(apps.joined(separator: "")) \(count)× in a row — are you stuck?"
case .sessionAction(_, _, let checkpoint, let reason, _):
return checkpoint.isEmpty ? reason : "Left off: \(checkpoint)"
} }
} }
} }

View File

@@ -2,8 +2,24 @@
import SwiftUI import SwiftUI
// MARK: - AppDelegate (subprocess cleanup on quit)
final class AppDelegate: NSObject, NSApplicationDelegate {
/// Called for normal quits (Cmd+Q), window close, and SIGTERM.
/// Ensures the argus subprocess is killed before the process exits.
/// NOTE(review): applicationWillTerminate does not run on force-quit (SIGKILL),
/// so a hard kill can still leave the subprocess orphaned.
func applicationWillTerminate(_ notification: Notification) {
// applicationWillTerminate runs on the main thread, so we can safely
// call @MainActor methods synchronously via assumeIsolated.
MainActor.assumeIsolated {
SessionManager.shared.stopMonitoring()
}
}
}
@main @main
struct LockInBroApp: App { struct LockInBroApp: App {
@NSApplicationDelegateAdaptor(AppDelegate.self) var appDelegate
@State private var auth = AuthManager.shared @State private var auth = AuthManager.shared
@State private var session = SessionManager.shared @State private var session = SessionManager.shared
@@ -13,9 +29,11 @@ struct LockInBroApp: App {
ContentView() ContentView()
.environment(auth) .environment(auth)
.environment(session) .environment(session)
.onChange(of: session.isSessionActive) { _, isActive in .onChange(of: auth.isLoggedIn, initial: true) { _, loggedIn in
if isActive { if loggedIn {
// Show HUD and start always-on monitoring as soon as user logs in
FloatingPanelController.shared.show(session: session) FloatingPanelController.shared.show(session: session)
Task { await session.startMonitoring() }
} else { } else {
FloatingPanelController.shared.close() FloatingPanelController.shared.close()
} }

View File

@@ -109,6 +109,32 @@ struct Step: Identifiable, Codable, Hashable {
// MARK: - Focus Session // MARK: - Focus Session
/// Subset of the JSONB checkpoint dict stored on the backend session record.
/// Populated by argus when it POSTs to /distractions/analyze-result.
struct SessionCheckpoint: Codable {
/// Written by POST /distractions/analyze-result (argus live mode).
let lastVlmSummary: String?
/// Written by POST /distractions/analyze-screenshot (Swift fallback).
let lastScreenshotAnalysis: String?
/// Concise summary of the last completed action.
let lastActionSummary: String?
/// Frontmost application name at last checkpoint.
let activeApp: String?
/// Running count of distractions logged during this session.
let distractionCount: Int?
/// Whichever VLM summary field is populated, preferring the argus live-mode
/// field over the Swift fallback. NOTE(review): this is source priority, not
/// recency — if both fields are set, the fallback value is ignored even when newer.
var vlmSummary: String? { lastVlmSummary ?? lastScreenshotAnalysis }
// Snake_case keys mirror the backend's JSONB checkpoint dict.
enum CodingKeys: String, CodingKey {
case lastVlmSummary = "last_vlm_summary"
case lastScreenshotAnalysis = "last_screenshot_analysis"
case lastActionSummary = "last_action_summary"
case activeApp = "active_app"
case distractionCount = "distraction_count"
}
}
struct FocusSession: Identifiable, Codable { struct FocusSession: Identifiable, Codable {
let id: String let id: String
let userId: String let userId: String
@@ -117,9 +143,11 @@ struct FocusSession: Identifiable, Codable {
let startedAt: String let startedAt: String
var endedAt: String? var endedAt: String?
var status: String var status: String
/// Live checkpoint data written by argus (nil when no checkpoint exists yet).
var checkpoint: SessionCheckpoint?
enum CodingKeys: String, CodingKey { enum CodingKeys: String, CodingKey {
case id, platform, status case id, platform, status, checkpoint
case userId = "user_id" case userId = "user_id"
case taskId = "task_id" case taskId = "task_id"
case startedAt = "started_at" case startedAt = "started_at"
@@ -144,8 +172,10 @@ struct BrainDumpResponse: Codable {
struct ParsedTask: Codable, Identifiable { struct ParsedTask: Codable, Identifiable {
// local UUID for list identity before saving // local UUID for list identity before saving
var localId: String = UUID().uuidString var localId: String = UUID().uuidString
var id: String { localId } var id: String { taskId ?? localId }
/// Set by backend when the brain-dump endpoint creates the task automatically.
let taskId: String?
let title: String let title: String
let description: String? let description: String?
let priority: Int let priority: Int
@@ -154,6 +184,7 @@ struct ParsedTask: Codable, Identifiable {
let tags: [String] let tags: [String]
enum CodingKeys: String, CodingKey { enum CodingKeys: String, CodingKey {
case taskId = "task_id"
case title, description, priority, deadline, tags case title, description, priority, deadline, tags
case estimatedMinutes = "estimated_minutes" case estimatedMinutes = "estimated_minutes"
} }
@@ -208,13 +239,28 @@ struct FrictionInfo: Codable {
var isResumption: Bool { type == "task_resumption" } var isResumption: Bool { type == "task_resumption" }
} }
/// Session lifecycle action suggested by the VLM (new argus feature).
struct SessionAction: Codable {
    /// Suggested lifecycle action: "resume" | "switch" | "complete" | "start_new" | "none".
    let type: String
    /// Backend session ID the suggestion refers to — presumably nil for
    /// "start_new"/"none"; confirm against the backend contract.
    let sessionId: String?
    /// Human-readable explanation from the VLM for why this action is suggested.
    let reason: String?

    /// Maps the backend's snake_case "session_id" key; the remaining keys match
    /// their property names directly.
    enum CodingKeys: String, CodingKey {
        case type, reason
        case sessionId = "session_id"
    }
}
struct DistractionAnalysisResponse: Codable { struct DistractionAnalysisResponse: Codable {
let onTask: Bool let onTask: Bool
let currentStepId: String? let currentStepId: String?
let inferredTask: String?
let checkpointNoteUpdate: String? let checkpointNoteUpdate: String?
let stepsCompleted: [String] let stepsCompleted: [String]
// Upgraded Argus prompt fields (nil when backend uses legacy prompt) // Upgraded Argus prompt fields (nil when backend uses legacy prompt)
let friction: FrictionInfo? let friction: FrictionInfo?
let sessionAction: SessionAction? // new argus: session lifecycle suggestions
let intent: String? // skimming | engaged | unclear | null let intent: String? // skimming | engaged | unclear | null
let distractionType: String? let distractionType: String?
let appName: String? let appName: String?
@@ -225,9 +271,11 @@ struct DistractionAnalysisResponse: Codable {
enum CodingKeys: String, CodingKey { enum CodingKeys: String, CodingKey {
case onTask = "on_task" case onTask = "on_task"
case currentStepId = "current_step_id" case currentStepId = "current_step_id"
case inferredTask = "inferred_task"
case checkpointNoteUpdate = "checkpoint_note_update" case checkpointNoteUpdate = "checkpoint_note_update"
case stepsCompleted = "steps_completed" case stepsCompleted = "steps_completed"
case friction, intent case friction, intent
case sessionAction = "session_action"
case distractionType = "distraction_type" case distractionType = "distraction_type"
case appName = "app_name" case appName = "app_name"
case confidence case confidence
@@ -298,6 +346,8 @@ struct ProactiveCard: Identifiable {
case vlmFriction(frictionType: String, description: String?, actions: [ProposedAction]) case vlmFriction(frictionType: String, description: String?, actions: [ProposedAction])
/// Heuristic app-switch loop detected by NSWorkspace observer (fallback when VLM hasn't returned friction yet). /// Heuristic app-switch loop detected by NSWorkspace observer (fallback when VLM hasn't returned friction yet).
case appSwitchLoop(apps: [String], switchCount: Int) case appSwitchLoop(apps: [String], switchCount: Int)
/// VLM suggests a session lifecycle action (new argus: resume, switch, complete, start_new).
case sessionAction(type: String, taskTitle: String, checkpoint: String, reason: String, sessionId: String?)
} }
let id = UUID() let id = UUID()
@@ -316,6 +366,14 @@ struct ProactiveCard: Identifiable {
} }
case .appSwitchLoop: case .appSwitchLoop:
return "Repetitive Pattern Detected" return "Repetitive Pattern Detected"
case .sessionAction(let type, let taskTitle, _, _, _):
switch type {
case "resume": return "Resume: \(taskTitle)"
case "switch": return "Switch to: \(taskTitle)"
case "complete": return "Done with \(taskTitle)?"
case "start_new": return "Start a Focus Session?"
default: return "Session Suggestion"
}
} }
} }
@@ -332,6 +390,14 @@ struct ProactiveCard: Identifiable {
} }
case .appSwitchLoop: case .appSwitchLoop:
return "arrow.triangle.2.circlepath" return "arrow.triangle.2.circlepath"
case .sessionAction(let type, _, _, _, _):
switch type {
case "resume": return "arrow.counterclockwise.circle"
case "switch": return "arrow.left.arrow.right"
case "complete": return "checkmark.circle"
case "start_new": return "plus.circle"
default: return "circle"
}
} }
} }
} }

View File

@@ -1,4 +1,6 @@
// SessionManager.swift Focus session state, screenshot engine, distraction detection // SessionManager.swift Focus session state, native VLM screen analysis
// Screenshot capture Gemini Vision API apply results to UI + post to backend.
// No Python subprocess. No external process management.
import AppKit import AppKit
import SwiftUI import SwiftUI
@@ -25,30 +27,27 @@ final class SessionManager {
var errorMessage: String? var errorMessage: String?
var isLoading: Bool = false var isLoading: Bool = false
// Proactive agent // VLM / proactive agent
var proactiveCard: ProactiveCard? var proactiveCard: ProactiveCard?
/// Set when the user approves a proposed action shown as a confirmation toast
var approvedActionLabel: String?
/// Latest one-sentence summary from the VLM, shown in the floating HUD
var latestVlmSummary: String? var latestVlmSummary: String?
/// True while the argus executor is running an approved action var latestInferredTask: String?
var isExecuting: Bool = false var isExecuting: Bool = false
/// Result produced by the executor's output() tool shown as a sticky card in the HUD
var executorOutput: (title: String, content: String)? var executorOutput: (title: String, content: String)?
var monitoringError: String?
// Screenshot engine // Screenshot engine
var isCapturing: Bool = false var isCapturing: Bool = false
private var captureTask: Task<Void, Never>? @ObservationIgnored private var captureTask: Task<Void, Never>?
private let captureInterval: TimeInterval = 5.0 private let captureInterval: TimeInterval = 5.0
// Rolling screenshot history buffer (max 4 entries, ~20-second window) // Frame buffer accumulate N frames before calling VLM for temporal diff context
// Provides temporal context to the VLM so it can detect patterns across captures. @ObservationIgnored private var frameBuffer: [Data] = []
private struct ScreenshotHistoryEntry { private let framesPerVLMCall = 3
let summary: String // vlm_summary text from the previous analysis
let timestamp: Date // Rolling summary history fed as context into subsequent VLM calls
} private struct HistoryEntry { let summary: String; let timestamp: Date }
@ObservationIgnored private var screenshotHistory: [ScreenshotHistoryEntry] = [] @ObservationIgnored private var screenshotHistory: [HistoryEntry] = []
// App switch tracking // App switch tracking
@ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = [] @ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = []
@@ -56,15 +55,8 @@ final class SessionManager {
@ObservationIgnored private var lastApp: (name: String, bundleId: String) = ("", "") @ObservationIgnored private var lastApp: (name: String, bundleId: String) = ("", "")
@ObservationIgnored private var lastAppEnteredAt: Date = Date() @ObservationIgnored private var lastAppEnteredAt: Date = Date()
// Argus subprocess (device-side VLM) // Proactive card auto-dismiss timer
@ObservationIgnored private var argusProcess: Process?
@ObservationIgnored private var argusReadTask: Task<Void, Never>?
@ObservationIgnored private var argusStdinPipe: Pipe?
/// Whether the current proactive card came from VLM (needs argus stdin response) vs local heuristic
@ObservationIgnored private var proactiveCardNeedsArgusResponse = false
@ObservationIgnored private var proactiveCardTimer: Task<Void, Never>? @ObservationIgnored private var proactiveCardTimer: Task<Void, Never>?
private let argusPythonPath = "/Users/joyzhuo/miniconda3/envs/gmr/bin/python3"
private let argusRepoPath = "/Users/joyzhuo/yhack/lockinbro-argus"
private init() {} private init() {}
@@ -83,9 +75,96 @@ final class SessionManager {
return Date().timeIntervalSince(start) return Date().timeIntervalSince(start)
} }
// MARK: - Monitoring Lifecycle
/// Immediately shuts down all monitoring without making any API calls.
func stopMonitoring() {
stopCapture()
stopAppObserver()
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
activeSession = nil
activeTask = nil
activeSteps = []
isSessionActive = false
sessionStartDate = nil
lastNudge = nil
resumeCard = nil
showingResumeCard = false
proactiveCard = nil
latestVlmSummary = nil
latestInferredTask = nil
isExecuting = false
executorOutput = nil
monitoringError = nil
screenshotHistory = []
frameBuffer = []
persistedSessionId = nil
}
/// Called once after login. Auto-resumes any existing active session and starts the capture loop.
func startMonitoring() async {
guard TokenStore.shared.token != nil else { return }
guard !isCapturing else { return }
monitoringError = nil
await requestNotificationPermission()
// Silent preflight never shows UI; only request permission if not yet granted.
if !CGPreflightScreenCaptureAccess() {
CGRequestScreenCaptureAccess()
monitoringError = "Screen Recording permission required — enable in System Settings → Privacy & Security → Screen Recording, then tap Retry"
return
}
do {
if let existing = try await APIClient.shared.getActiveSession() {
await autoResumeSession(existing)
} else {
startCapture()
startAppObserver()
}
} catch {
startCapture()
startAppObserver()
}
}
/// Silently resume an active session found on the backend (no loading UI shown).
private func autoResumeSession(_ session: FocusSession) async {
activeSession = session
persistedSessionId = session.id
isSessionActive = true
sessionStartDate = Date()
distractionCount = 0
lastNudge = nil
screenshotHistory = []
frameBuffer = []
if let taskId = session.taskId {
do {
let tasks = try await APIClient.shared.getTasks()
activeTask = tasks.first(where: { $0.id == taskId })
if let task = activeTask {
let steps = try await APIClient.shared.getSteps(taskId: task.id)
activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
?? activeSteps.firstIndex(where: { $0.status == "pending" })
?? 0
}
} catch {}
}
let shortId = String(session.id.prefix(8))
let taskLabel = activeTask?.title ?? "(no task)"
latestVlmSummary = "Resumed session \(shortId) · \(taskLabel)"
startCapture()
startAppObserver()
}
// MARK: - Session Lifecycle // MARK: - Session Lifecycle
// Persisted so we can end a stale session after an app restart
private var persistedSessionId: String? { private var persistedSessionId: String? {
get { UserDefaults.standard.string(forKey: "lockInBro.lastSessionId") } get { UserDefaults.standard.string(forKey: "lockInBro.lastSessionId") }
set { set {
@@ -98,18 +177,16 @@ final class SessionManager {
isLoading = true isLoading = true
errorMessage = nil errorMessage = nil
do { do {
let session: FocusSession // End any existing session first
do { var staleId: String? = activeSession?.id ?? persistedSessionId
session = try await APIClient.shared.startSession(taskId: task?.id) if staleId == nil {
} catch NetworkError.httpError(409, _) { staleId = (try? await APIClient.shared.getActiveSession())?.id
// End whichever session is active prefer the locally known one, }
// fall back to the last persisted ID (survives app restarts)
let staleId = activeSession?.id ?? persistedSessionId
if let id = staleId { if let id = staleId {
_ = try? await APIClient.shared.endSession(sessionId: id, status: "completed") _ = try? await APIClient.shared.endSession(sessionId: id, status: "completed")
} }
session = try await APIClient.shared.startSession(taskId: task?.id)
} let session = try await APIClient.shared.startSession(taskId: task?.id)
activeSession = session activeSession = session
persistedSessionId = session.id persistedSessionId = session.id
activeTask = task activeTask = task
@@ -119,20 +196,22 @@ final class SessionManager {
sessionStartDate = Date() sessionStartDate = Date()
distractionCount = 0 distractionCount = 0
lastNudge = nil lastNudge = nil
screenshotHistory = []
frameBuffer = []
if let task { if let task {
let steps = try await APIClient.shared.getSteps(taskId: task.id) let steps = try await APIClient.shared.getSteps(taskId: task.id)
activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder } activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
// Pick first in-progress or first pending step
currentStepIndex = activeSteps.firstIndex(where: { $0.isActive }) currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
?? activeSteps.firstIndex(where: { $0.status == "pending" }) ?? activeSteps.firstIndex(where: { $0.status == "pending" })
?? 0 ?? 0
} }
screenshotHistory = []
await requestNotificationPermission() await requestNotificationPermission()
startArgus(session: session, task: task) // Restart capture loop (in case it wasn't running or was in monitoring-only mode)
startAppObserver() stopCapture()
startCapture()
if appSwitchObserver == nil { startAppObserver() }
} catch { } catch {
errorMessage = error.localizedDescription errorMessage = error.localizedDescription
} }
@@ -140,7 +219,6 @@ final class SessionManager {
} }
func endSession(status: String = "completed") async { func endSession(status: String = "completed") async {
stopArgus()
stopCapture() stopCapture()
stopAppObserver() stopAppObserver()
if let session = activeSession { if let session = activeSession {
@@ -155,14 +233,21 @@ final class SessionManager {
resumeCard = nil resumeCard = nil
showingResumeCard = false showingResumeCard = false
proactiveCard = nil proactiveCard = nil
approvedActionLabel = nil
latestVlmSummary = nil latestVlmSummary = nil
latestInferredTask = nil
isExecuting = false isExecuting = false
executorOutput = nil executorOutput = nil
proactiveCardTimer?.cancel() proactiveCardTimer?.cancel()
proactiveCardTimer = nil proactiveCardTimer = nil
screenshotHistory = [] screenshotHistory = []
frameBuffer = []
persistedSessionId = nil persistedSessionId = nil
// Keep the capture loop running for app-switch heuristics
if TokenStore.shared.token != nil {
startCapture()
startAppObserver()
}
} }
func fetchResumeCard() async { func fetchResumeCard() async {
@@ -183,7 +268,6 @@ final class SessionManager {
if let idx = activeSteps.firstIndex(where: { $0.id == updated.id }) { if let idx = activeSteps.firstIndex(where: { $0.id == updated.id }) {
activeSteps[idx] = updated activeSteps[idx] = updated
} }
// Advance to next pending
if let next = activeSteps.firstIndex(where: { $0.status == "pending" }) { if let next = activeSteps.firstIndex(where: { $0.status == "pending" }) {
currentStepIndex = next currentStepIndex = next
} }
@@ -192,12 +276,19 @@ final class SessionManager {
} }
} }
// MARK: - Retry (HUD Retry button)
func retryMonitoring() {
monitoringError = nil
frameBuffer = []
stopCapture()
startCapture()
if appSwitchObserver == nil { startAppObserver() }
}
// MARK: - Proactive Card Lifecycle // MARK: - Proactive Card Lifecycle
/// Show a proactive card and start the 15-second auto-dismiss timer. private func showProactiveCard(_ card: ProactiveCard) {
/// - Parameter vlmCard: Pass true when the card came from VLM so argus gets a stdin response on dismiss.
private func showProactiveCard(_ card: ProactiveCard, vlmCard: Bool = false) {
proactiveCardNeedsArgusResponse = vlmCard
proactiveCardTimer?.cancel() proactiveCardTimer?.cancel()
withAnimation { proactiveCard = card } withAnimation { proactiveCard = card }
@@ -208,31 +299,44 @@ final class SessionManager {
} }
} }
/// Dismiss the current card (user tapped "Not now" or 15s elapsed).
func dismissProactiveCard() { func dismissProactiveCard() {
proactiveCardTimer?.cancel() proactiveCardTimer?.cancel()
proactiveCardTimer = nil proactiveCardTimer = nil
withAnimation { proactiveCard = nil } withAnimation { proactiveCard = nil }
if proactiveCardNeedsArgusResponse { sendArgusResponse(0) }
proactiveCardNeedsArgusResponse = false
} }
/// Approve action at the given index (0-based). Argus stdin uses 1-based (1 = action 0).
func approveProactiveCard(actionIndex: Int) { func approveProactiveCard(actionIndex: Int) {
proactiveCardTimer?.cancel() proactiveCardTimer?.cancel()
proactiveCardTimer = nil proactiveCardTimer = nil
let card = proactiveCard
withAnimation { proactiveCard = nil } withAnimation { proactiveCard = nil }
if proactiveCardNeedsArgusResponse { guard case .vlmFriction(_, _, let actions) = card?.source,
sendArgusResponse(actionIndex + 1) actionIndex < actions.count else { return }
let action = actions[actionIndex]
isExecuting = true isExecuting = true
Task {
do {
let screenshot = await captureScreen()
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
guard !geminiKey.isEmpty else {
isExecuting = false
executorOutput = (title: action.label, content: action.details ?? "Action approved.")
return
}
let client = GeminiVLMClient(apiKey: geminiKey)
let result = try await client.executeAction(
label: action.label,
actionType: action.actionType,
details: action.details ?? "",
screenshot: screenshot
)
isExecuting = false
executorOutput = (title: action.label, content: result)
} catch {
isExecuting = false
executorOutput = (title: action.label, content: action.details ?? "Couldn't complete automatically.")
} }
proactiveCardNeedsArgusResponse = false
} }
private func sendArgusResponse(_ choice: Int) {
guard let pipe = argusStdinPipe,
let data = "\(choice)\n".data(using: .utf8) else { return }
try? pipe.fileHandleForWriting.write(contentsOf: data)
} }
// MARK: - App Switch Observer // MARK: - App Switch Observer
@@ -269,7 +373,7 @@ final class SessionManager {
guard name != lastApp.name else { return } guard name != lastApp.name else { return }
// Log previous app's dwell time to backend (fire-and-forget) // Log previous app dwell time to backend
let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt))) let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt)))
let prev = lastApp let prev = lastApp
if let session = activeSession, !prev.name.isEmpty { if let session = activeSession, !prev.name.isEmpty {
@@ -289,159 +393,36 @@ final class SessionManager {
appSwitches.append((name: name, bundleId: bundleId, time: now)) appSwitches.append((name: name, bundleId: bundleId, time: now))
if appSwitches.count > 30 { appSwitches.removeFirst() } if appSwitches.count > 30 { appSwitches.removeFirst() }
// Only trigger card during active session and when none is already showing
guard isSessionActive, proactiveCard == nil else { return } guard isSessionActive, proactiveCard == nil else { return }
if let loop = detectRepetitiveLoop() { if let loop = detectRepetitiveLoop() {
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)), vlmCard: false) showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)))
} }
} }
// Detects a back-and-forth pattern between exactly 2 apps within a 5-minute window.
// Requires 3 full cycles (6 consecutive alternating switches) to avoid false positives.
private func detectRepetitiveLoop() -> (apps: [String], count: Int)? { private func detectRepetitiveLoop() -> (apps: [String], count: Int)? {
let cutoff = Date().addingTimeInterval(-300) let cutoff = Date().addingTimeInterval(-300)
let recent = appSwitches.filter { $0.time > cutoff }.map(\.name) let recent = appSwitches.filter { $0.time > cutoff }.map(\.name)
guard recent.count >= 6 else { return nil } guard recent.count >= 6 else { return nil }
let last6 = Array(recent.suffix(6)) let last6 = Array(recent.suffix(6))
guard Set(last6).count == 2 else { return nil } guard Set(last6).count == 2 else { return nil }
// Strictly alternating no two consecutive identical app names
for i in 1..<last6.count { for i in 1..<last6.count {
if last6[i] == last6[i - 1] { return nil } if last6[i] == last6[i - 1] { return nil }
} }
return (apps: Array(Set(last6)).sorted(), count: 3) return (apps: Array(Set(last6)).sorted(), count: 3)
} }
// MARK: - Argus Subprocess (device-side VLM) // MARK: - Screenshot Capture Loop
/// Launch the argus Python daemon as a subprocess.
/// Argus captures screenshots itself, runs them through a local VLM (Ollama/Gemini),
/// posts results to the backend, and emits RESULT:{json} lines to stdout for Swift to consume.
/// Falls back to the internal `startCapture()` loop if the process cannot be launched.
private func startArgus(session: FocusSession, task: AppTask?) {
guard FileManager.default.fileExists(atPath: argusPythonPath),
FileManager.default.fileExists(atPath: argusRepoPath) else {
startCapture()
return
}
// Encode steps as JSON for --steps-json arg
var stepsJSONString = "[]"
if !activeSteps.isEmpty {
let stepsArray: [[String: Any]] = activeSteps.map { step in
var s: [String: Any] = [
"id": step.id,
"sort_order": step.sortOrder,
"title": step.title,
"status": step.status
]
if let note = step.checkpointNote { s["checkpoint_note"] = note }
return s
}
if let data = try? JSONSerialization.data(withJSONObject: stepsArray),
let str = String(data: data, encoding: .utf8) {
stepsJSONString = str
}
}
let jwt = TokenStore.shared.token ?? ""
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
var arguments = [
"-m", "argus",
"--session-id", session.id,
"--task-title", task?.title ?? "(no task)",
"--task-goal", task?.description ?? "",
"--steps-json", stepsJSONString,
"--window-title", NSWorkspace.shared.frontmostApplication?.localizedName ?? "",
"--vlm", "gemini",
"--jwt", jwt,
"--backend-url", "https://wahwa.com/api/v1",
"--swift-ipc",
"--execute" // enables agentic executor; Swift sends 0/1/2 via stdin
]
if !geminiKey.isEmpty {
arguments += ["--gemini-key", geminiKey]
}
let process = Process()
process.executableURL = URL(fileURLWithPath: argusPythonPath)
process.currentDirectoryURL = URL(fileURLWithPath: argusRepoPath)
process.arguments = arguments
// Pipe stdout for RESULT:/STATUS:/EXEC_OUTPUT: lines
// stderr is NOT captured leaving it unset lets argus log to the system console
// without risk of the pipe buffer filling and blocking the process.
let stdoutPipe = Pipe()
let stdinPipe = Pipe()
process.standardOutput = stdoutPipe
process.standardInput = stdinPipe
do {
try process.run()
} catch {
startCapture()
return
}
argusProcess = process
argusStdinPipe = stdinPipe
isCapturing = true
// Read RESULT:/STATUS:/EXEC_OUTPUT: lines from argus stdout in a background task
let fileHandle = stdoutPipe.fileHandleForReading
argusReadTask = Task { [weak self] in
do {
for try await line in fileHandle.bytes.lines {
guard let self, !Task.isCancelled else { break }
if line.hasPrefix("RESULT:") {
let jsonStr = String(line.dropFirst("RESULT:".count))
if let data = jsonStr.data(using: .utf8),
let result = try? JSONDecoder().decode(DistractionAnalysisResponse.self, from: data) {
await MainActor.run { self.applyDistractionResult(result) }
}
} else if line.hasPrefix("STATUS:exec_done:") {
await MainActor.run { self.isExecuting = false }
} else if line.hasPrefix("EXEC_OUTPUT:") {
let jsonStr = String(line.dropFirst("EXEC_OUTPUT:".count))
if let data = jsonStr.data(using: .utf8),
let obj = try? JSONSerialization.jsonObject(with: data) as? [String: String],
let title = obj["title"], let content = obj["content"] {
await MainActor.run { self.executorOutput = (title: title, content: content) }
}
}
}
} catch {
// Pipe closed argus process ended
}
}
}
private func stopArgus() {
argusReadTask?.cancel()
argusReadTask = nil
argusStdinPipe = nil
if let proc = argusProcess {
proc.terminate()
argusProcess = nil
isCapturing = false
}
}
// MARK: - Screenshot Capture Loop (fallback when argus is unavailable)
private func startCapture() { private func startCapture() {
guard !isCapturing else { return }
isCapturing = true isCapturing = true
captureTask = Task { [weak self] in captureTask = Task { [weak self] in
guard let self else { return } guard let self else { return }
// Capture immediately on session start, then repeat on interval // Capture immediately, then repeat on interval
await self.captureAndAnalyze() await self.captureAndAnalyze()
while !Task.isCancelled && self.isSessionActive { while !Task.isCancelled {
try? await Task.sleep(for: .seconds(self.captureInterval)) try? await Task.sleep(for: .seconds(self.captureInterval))
guard !Task.isCancelled && self.isSessionActive else { break } guard !Task.isCancelled else { break }
await self.captureAndAnalyze() await self.captureAndAnalyze()
} }
} }
@@ -453,56 +434,77 @@ final class SessionManager {
isCapturing = false isCapturing = false
} }
/// Capture one frame, buffer it, and call VLM every `framesPerVLMCall` frames.
private func captureAndAnalyze() async { private func captureAndAnalyze() async {
guard let session = activeSession else { return }
guard let imageData = await captureScreen() else { return } guard let imageData = await captureScreen() else { return }
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? "Unknown" frameBuffer.append(imageData)
var context = buildTaskContext() // Keep buffer bounded rolling window of most recent frames
if frameBuffer.count > framesPerVLMCall { frameBuffer.removeFirst() }
// Inject rolling history so the VLM has temporal context across captures. // Only call VLM once we have a full batch for temporal diff analysis
// Only summaries (text) are sent not the raw images to keep token cost low. guard frameBuffer.count >= framesPerVLMCall else { return }
if !screenshotHistory.isEmpty {
let iso = ISO8601DateFormatter() let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
context["screenshot_history"] = screenshotHistory.map { entry in guard !geminiKey.isEmpty else {
["summary": entry.summary, "timestamp": iso.string(from: entry.timestamp)] print("[VLM] No Gemini API key set — skipping analysis")
} return
} }
let client = GeminiVLMClient(apiKey: geminiKey)
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? ""
let recentSummaries = screenshotHistory.map(\.summary)
let frames = frameBuffer // snapshot before async gap
do { do {
let result = try await APIClient.shared.analyzeScreenshot( print("[VLM] Calling Gemini with \(frames.count) frames…")
imageData: imageData, let result = try await client.analyze(
frames: frames,
taskTitle: activeTask?.title ?? "",
taskGoal: activeTask?.description ?? "",
steps: activeSteps,
windowTitle: windowTitle, windowTitle: windowTitle,
sessionId: session.id, recentSummaries: recentSummaries
taskContext: context
) )
print("[VLM] Result: on_task=\(result.onTask), friction=\(result.friction?.type ?? "none"), summary=\(result.vlmSummary ?? "")")
// Append this result's summary to the rolling buffer (max 4 entries) // Append to rolling summary history
if let summary = result.vlmSummary { if let summary = result.vlmSummary, !summary.isEmpty {
screenshotHistory.append(ScreenshotHistoryEntry(summary: summary, timestamp: Date())) screenshotHistory.append(HistoryEntry(summary: summary, timestamp: Date()))
if screenshotHistory.count > 4 { screenshotHistory.removeFirst() } if screenshotHistory.count > 4 { screenshotHistory.removeFirst() }
} }
// Clear frame buffer next batch starts fresh
frameBuffer.removeAll()
monitoringError = nil
applyDistractionResult(result) applyDistractionResult(result)
// Post result to backend (fire-and-forget)
if let session = activeSession {
Task {
try? await APIClient.shared.postAnalysisResult(result, sessionId: session.id)
}
}
} catch { } catch {
// Silent fail don't interrupt the user print("[VLM] Analysis error: \(error)")
// Don't surface transient errors the next attempt will retry automatically
} }
} }
// MARK: - Screen Capture
private func captureScreen() async -> Data? { private func captureScreen() async -> Data? {
guard CGPreflightScreenCaptureAccess() else { return nil }
do { do {
let content = try await SCShareableContent.current let content = try await SCShareableContent.current
guard let display = content.displays.first else { return nil } guard let display = content.displays.first else { return nil }
let config = SCStreamConfiguration() let config = SCStreamConfiguration()
config.width = 1280 config.width = 1280
config.height = 720 config.height = 720
let filter = SCContentFilter(display: display, excludingWindows: []) let filter = SCContentFilter(display: display, excludingWindows: [])
let image = try await SCScreenshotManager.captureImage( let image = try await SCScreenshotManager.captureImage(
contentFilter: filter, contentFilter: filter, configuration: config)
configuration: config
)
return cgImageToJPEG(image) return cgImageToJPEG(image)
} catch { } catch {
return nil return nil
@@ -518,29 +520,13 @@ final class SessionManager {
return jpeg return jpeg
} }
private func buildTaskContext() -> [String: Any] { // MARK: - Apply VLM Result
var ctx: [String: Any] = [:]
guard let task = activeTask else { return ctx }
ctx["task_title"] = task.title
ctx["task_goal"] = task.description ?? task.title
ctx["steps"] = activeSteps.map { step -> [String: Any] in
var s: [String: Any] = [
"id": step.id,
"sort_order": step.sortOrder,
"title": step.title,
"status": step.status
]
if let note = step.checkpointNote { s["checkpoint_note"] = note }
return s
}
return ctx
}
private func applyDistractionResult(_ result: DistractionAnalysisResponse) { private func applyDistractionResult(_ result: DistractionAnalysisResponse) {
// 0. Store latest summary for the floating HUD
if let summary = result.vlmSummary { latestVlmSummary = summary } if let summary = result.vlmSummary { latestVlmSummary = summary }
if let task = result.inferredTask, !task.isEmpty { latestInferredTask = task }
// 1. Apply step side-effects (always) // Apply step side-effects
for completedId in result.stepsCompleted { for completedId in result.stepsCompleted {
if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) { if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) {
activeSteps[idx].status = "done" activeSteps[idx].status = "done"
@@ -556,22 +542,25 @@ final class SessionManager {
currentStepIndex = idx currentStepIndex = idx
} }
// 2. Notification priority (design spec §1.5): // Notification priority: friction card (formal or has actions) nudge
// Proactive friction help Context resume Gentle nudge if let friction = result.friction {
// NEVER nudge when the system could help instead. let shouldShow = friction.isActionable || !friction.proposedActions.isEmpty
if let friction = result.friction, friction.isActionable { if shouldShow {
if friction.isResumption { if friction.isResumption {
// Task resumption detected auto-surface resume card without button press
Task { await fetchResumeCard() } Task { await fetchResumeCard() }
} else if proactiveCard == nil { } else if proactiveCard == nil {
showProactiveCard(ProactiveCard(source: .vlmFriction( showProactiveCard(ProactiveCard(source: .vlmFriction(
frictionType: friction.type, frictionType: friction.type,
description: friction.description, description: friction.description,
actions: friction.proposedActions actions: friction.proposedActions
)), vlmCard: true) )))
}
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
distractionCount += 1
lastNudge = nudge
sendNudgeNotification(nudge)
} }
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge { } else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
// Only nudge if VLM found no actionable friction
distractionCount += 1 distractionCount += 1
lastNudge = nudge lastNudge = nudge
sendNudgeNotification(nudge) sendNudgeNotification(nudge)

104
README.md Normal file
View File

@@ -0,0 +1,104 @@
# LockInBro
ADHD-friendly macOS focus assistant. Monitors your screen with a local VLM agent (Argus), detects friction and distractions, and nudges you back on track.
## Requirements
- macOS 14+
- Xcode 16+
- Python 3.11+ (system or Homebrew)
## Setup
### 1. Python environment
Create the venv once from the project root:
```bash
cd ~/yhack/LockInBro
python3 -m venv .venv
.venv/bin/pip install -r argus/requirements.txt
```
### 2. API keys
Create a `.env` file in the project root (next to `argus/`):
```bash
cp argus/.env.example .env # if example exists, otherwise create manually
```
`.env` contents:
```
GEMINI_API_KEY=your_gemini_api_key_here
BACKEND_BASE_URL=https://wahwa.com/api/v1
```
The Gemini API key is also set at runtime from the app's Settings screen — the `.env` is only needed if running Argus directly from the command line.
### 3. Xcode permissions
In Xcode → Signing & Capabilities, ensure the app has:
- **Screen Recording** — required for screenshot capture
- **App Sandbox** disabled — Screen Recording is granted through the user-approved privacy (TCC) prompt rather than a sandbox entitlement (there is no public `com.apple.security.screen-recording` entitlement key); verify your sandbox configuration allows screen capture before shipping
### 4. Run
Open `LockInBro.xcodeproj` in Xcode and press **Run** (⌘R).
On first launch:
- Grant **Screen Recording** permission when prompted
- Log in or register via the app
- Enter your Gemini API key in Settings
---
## Argus (VLM agent)
The `argus/` directory contains the Python screen-analysis agent. It runs as a subprocess of the Swift app — you do not need to launch it manually.
### Running Argus directly (for debugging)
```bash
cd ~/yhack/LockInBro
.venv/bin/python3 -m argus \
--vlm gemini \
--gemini-key YOUR_KEY \
--dry-run \
--task-title "debug run"
```
### Recreating the venv
```bash
rm -rf .venv
python3 -m venv .venv
.venv/bin/pip install -r argus/requirements.txt
```
---
## Project structure
```
LockInBro/
├── .venv/ # Python venv (gitignored)
├── .env # API keys (gitignored)
├── argus/ # VLM agent (Python)
│ ├── requirements.txt
│ ├── capture.py # Reads screenshots from /tmp/lockinbro_capture.jpg
│ ├── loop.py # Main analysis loop
│ ├── vlm.py # Gemini / Ollama client
│ └── ...
├── LockInBro/ # Swift app source
│ ├── SessionManager.swift
│ ├── APIClient.swift
│ └── ...
└── LockInBro.xcodeproj
```
## Backend
Deployed at `https://wahwa.com/api/v1` (FastAPI on DigitalOcean).
Source: `~/yhack/lockinbro-api`

0
argus/__init__.py Normal file
View File

113
argus/__main__.py Normal file
View File

@@ -0,0 +1,113 @@
"""CLI entry point: python -m argus [options]"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import sys
from argus.config import CAPTURE_INTERVAL_S
from argus.loop import run_loop
from argus.vlm import StepInfo, TaskContext
def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
prog="argus",
description="Argus VLM — proactive focus assistant screen analyzer",
)
p.add_argument("--session-id", default="00000000-0000-0000-0000-000000000000")
p.add_argument("--task-title", default="(no task)")
p.add_argument("--task-goal", default="")
p.add_argument(
"--steps-json",
default=None,
help='JSON array of steps: [{"id":"...", "sort_order":1, "title":"...", "status":"pending"}]',
)
p.add_argument("--window-title", default="")
p.add_argument("--vlm", choices=["ollama", "gemini"], default=None, help="VLM backend (default: ollama)")
p.add_argument("--gemini-key", default=None, help="Override GEMINI_API_KEY env var")
p.add_argument("--jwt", default=None, help="Override BACKEND_JWT env var")
p.add_argument("--backend-url", default=None, help="Override BACKEND_BASE_URL env var")
p.add_argument("--dry-run", action="store_true", help="Print JSON instead of POSTing")
p.add_argument("--execute", action="store_true", help="Enable notification + executor flow")
p.add_argument("--mock-sessions", default=None, help='JSON array of mock sessions: [{"task_title":"...", "status":"interrupted", "last_app":"VS Code", "last_file":"solution.cpp", "checkpoint_note":"stuck on impl"}]')
p.add_argument("--iterations", type=int, default=None, help="Stop after N iterations")
p.add_argument("-v", "--verbose", action="store_true")
return p.parse_args()
def main() -> None:
    """CLI entry point: configure logging, build the task context, run the loop."""
    opts = _parse_args()

    logging.basicConfig(
        level=logging.DEBUG if opts.verbose else logging.INFO,
        format="%(asctime)s %(levelname)-5s %(name)s %(message)s",
        datefmt="%H:%M:%S",
    )
    # httpx logs every request/response at INFO — suppress to WARNING to avoid noisy 404s
    for noisy_logger in ("httpx", "httpcore"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    # Decode --steps-json (when present) into StepInfo records.
    step_infos: list[StepInfo] = []
    if opts.steps_json:
        step_infos = [
            StepInfo(
                id=raw["id"],
                sort_order=raw["sort_order"],
                title=raw["title"],
                status=raw.get("status", "pending"),
                checkpoint_note=raw.get("checkpoint_note"),
            )
            for raw in json.loads(opts.steps_json)
        ]

    ctx = TaskContext(
        task_title=opts.task_title,
        task_goal=opts.task_goal,
        steps=step_infos,
        window_title=opts.window_title,
        session_id=opts.session_id,
    )

    # Decode --mock-sessions (when present) into SessionInfo records.
    mock_sessions = None
    if opts.mock_sessions:
        from argus.session import SessionInfo

        mock_sessions = [
            SessionInfo(
                session_id=raw.get("session_id", f"mock-{i}"),
                task_id=raw.get("task_id"),
                task_title=raw.get("task_title", ""),
                task_goal=raw.get("task_goal", ""),
                status=raw.get("status", "interrupted"),
                last_app=raw.get("last_app", ""),
                last_file=raw.get("last_file", ""),
                checkpoint_note=raw.get("checkpoint_note", ""),
                started_at=raw.get("started_at", ""),
                ended_at=raw.get("ended_at"),
                minutes_ago=raw.get("minutes_ago", 30),
            )
            for i, raw in enumerate(json.loads(opts.mock_sessions))
        ]

    asyncio.run(
        run_loop(
            ctx,
            api_key=opts.gemini_key,
            vlm_backend=opts.vlm,
            jwt=opts.jwt,
            base_url=opts.backend_url,
            dry_run=opts.dry_run,
            max_iterations=opts.iterations,
            auto_execute=opts.execute,
            mock_sessions=mock_sessions,
        )
    )


if __name__ == "__main__":
    main()

50
argus/backend.py Normal file
View File

@@ -0,0 +1,50 @@
"""Backend API client — sends VLM analysis results to the LockInBro server.
POST /distractions/analyze-result with the enriched JSON payload.
"""
from __future__ import annotations
import logging
import httpx
from argus.config import BACKEND_BASE_URL, BACKEND_JWT
from argus.vlm import VLMResult
log = logging.getLogger(__name__)
async def send_analysis(
    result: VLMResult,
    session_id: str,
    *,
    jwt: str | None = None,
    base_url: str | None = None,
) -> dict:
    """Post VLM analysis result to the backend.

    Returns the backend response body on success, or an error dict.
    """
    endpoint = f"{base_url or BACKEND_BASE_URL}/distractions/analyze-result"
    auth_token = jwt or BACKEND_JWT
    body = result.to_backend_payload(session_id)

    request_headers: dict[str, str] = {}
    if auth_token:
        request_headers["Authorization"] = f"Bearer {auth_token}"

    log.info("Sending analysis to %s on_task=%s", endpoint, result.on_task)
    log.debug("Payload: %s", body)
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(endpoint, json=body, headers=request_headers)
            resp.raise_for_status()
            return resp.json()
    except httpx.HTTPStatusError as exc:
        # Server answered with a 4xx/5xx — surface status code to the caller.
        log.error("Backend returned %s: %s", exc.response.status_code, exc.response.text)
        return {"error": str(exc), "status_code": exc.response.status_code}
    except httpx.RequestError as exc:
        # Network-level failure (DNS, refused connection, timeout).
        log.error("Backend unreachable: %s", exc)
        return {"error": str(exc)}

123
argus/buffer.py Normal file
View File

@@ -0,0 +1,123 @@
"""Rolling history buffer for VLM screenshot analysis.
Two tiers:
- Image buffer: deque(maxlen=4) of recent screenshots sent as images
- Text history: deque(maxlen=12) of older VLM summaries + previous outputs
for extended context (what happened 30-60s ago) and self-refinement
"""
from __future__ import annotations
import time
from collections import deque
from dataclasses import dataclass, field
@dataclass
class BufferEntry:
    """One screenshot kept in the image tier of the rolling buffer."""

    jpeg: bytes  # encoded screenshot bytes sent to the VLM
    vlm_summary: str  # VLM summary associated with this frame
    timestamp: float = field(default_factory=time.time)  # capture time (epoch seconds)


@dataclass
class TextEntry:
    """One VLM summary kept in the longer text-only tier."""

    vlm_summary: str
    timestamp: float


class HistoryBuffer:
    """Two-tier rolling history for VLM screenshot analysis.

    - Image tier: deque of the most recent screenshots (sent as images).
    - Text tier: longer deque of summaries for extended context (30-60s ago).
    Also tracks the previous VLM JSON output (for self-refinement) and the
    result of the last executor action.
    """

    def __init__(self, image_maxlen: int = 4, text_maxlen: int = 12):
        self._images: deque[BufferEntry] = deque(maxlen=image_maxlen)
        self._text_history: deque[TextEntry] = deque(maxlen=text_maxlen)
        self._last_output: dict | None = None
        self._last_execution: str | None = None

    def push(self, jpeg: bytes, vlm_summary: str) -> None:
        """Record a new frame + summary in both tiers (shared timestamp)."""
        now = time.time()
        # When an image entry gets evicted from the image buffer,
        # it's already captured in text_history, so nothing extra needed.
        self._images.append(BufferEntry(jpeg=jpeg, vlm_summary=vlm_summary, timestamp=now))
        self._text_history.append(TextEntry(vlm_summary=vlm_summary, timestamp=now))

    def set_last_output(self, output: dict) -> None:
        """Store the previous VLM JSON output for self-refinement."""
        self._last_output = output

    def set_last_execution(self, summary: str | None) -> None:
        """Store the result of the last executor action."""
        self._last_execution = summary

    def get_last_execution(self) -> str | None:
        return self._last_execution

    def clear_last_execution(self) -> None:
        self._last_execution = None

    def get_entries(self) -> list[BufferEntry]:
        """Return image entries oldest-first."""
        return list(self._images)

    def format_for_prompt(self) -> str:
        """Format the full timeline: text history + image labels."""
        now = time.time()
        lines: list[str] = []
        # Text-only history (entries older than what's in the image buffer).
        # NOTE: dedup is by timestamp rounded to 10ms — two pushes inside the
        # same 10ms window can collide; acceptable at the 2.5s capture cadence.
        image_timestamps = {round(e.timestamp, 2) for e in self._images}
        older = [
            e for e in self._text_history
            if round(e.timestamp, 2) not in image_timestamps
        ]
        older = [e for e in older if e.vlm_summary]  # skip empty summaries
        if older:
            lines.append("Older context (text only, no images):")
            for entry in older:
                ago = int(now - entry.timestamp)
                lines.append(f" - [{ago}s ago] {entry.vlm_summary}")
            lines.append("")
        # Image timeline
        n = len(self._images)
        if n > 0:
            lines.append(f"Recent screenshots ({n} prior + 1 current = {n + 1} images):")
            for i, entry in enumerate(self._images):
                ago = int(now - entry.timestamp)
                lines.append(f" - Screenshot {i + 1}/{n + 1}: [{ago}s ago]")
            lines.append(f" - Screenshot {n + 1}/{n + 1}: [now] (current)")
        else:
            lines.append("Screenshots:")
            lines.append(" - Screenshot 1/1: [now] (current, first capture)")
        return "\n".join(lines)

    def format_last_output(self) -> str:
        """Format previous VLM output for self-refinement context.

        Returns "" when no previous output has been stored.
        """
        if not self._last_output:
            return ""
        # Only include the key fields, not the full blob
        prev = self._last_output
        # "friction" may be present but explicitly None — the original
        # `prev.get('friction', {})` returned None in that case and crashed
        # on the chained .get(); normalize to a dict first.
        friction = prev.get("friction") or {}
        parts = [
            f" on_task: {prev.get('on_task')}",
            f" app: {prev.get('app_name')}",
            f" friction: {friction.get('type')}",
            f" summary: {prev.get('vlm_summary')}",
        ]
        note = prev.get("checkpoint_note_update")
        if note:
            parts.append(f" checkpoint: {note}")
        desc = friction.get("description")
        if desc:
            parts.append(f" friction_desc: {desc}")
        return "\n".join(parts)

    def __len__(self) -> int:
        return len(self._images)

    def clear(self) -> None:
        """Reset all tiers and cached outputs."""
        self._images.clear()
        self._text_history.clear()
        self._last_output = None
        self._last_execution = None

80
argus/capture.py Normal file
View File

@@ -0,0 +1,80 @@
"""Screenshot capture for macOS.
Reads a JPEG file written by the Swift host app (LockInBro.app), which holds
the Screen Recording TCC permission. The Swift app writes atomically to
/tmp/lockinbro_capture.jpg every ~5s using ScreenCaptureKit.
capture_screenshot() blocks up to 15 seconds waiting for a fresh file,
then raises RuntimeError if nothing arrives — so loop.py can handle it
gracefully without falling back to the screencapture CLI (which would fail
with a permissions error when called from a Python subprocess).
"""
from __future__ import annotations
import io
import os
import time
from PIL import Image
from argus.config import SCREENSHOT_JPEG_QUALITY, SCREENSHOT_MAX_WIDTH
_SWIFT_FRAME_PATH = "/tmp/lockinbro_capture.jpg"
_SWIFT_FRAME_MAX_AGE_S = 10.0 # treat as stale if older than this
_WAIT_TIMEOUT_S = 15.0 # how long to wait for Swift to write the first frame
_WAIT_POLL_S = 0.5 # polling interval while waiting
def _to_jpeg(img: Image.Image) -> bytes:
    """Convert a PIL Image to JPEG bytes, downscaling and normalizing mode.

    - Resizes to SCREENSHOT_MAX_WIDTH when wider, preserving aspect ratio.
    - Converts any mode JPEG cannot encode to RGB. The original only handled
      RGBA, so palette ("P") or grayscale-alpha ("LA") inputs crashed
      img.save() with "cannot write mode ... as JPEG".
    """
    if img.width > SCREENSHOT_MAX_WIDTH:
        ratio = SCREENSHOT_MAX_WIDTH / img.width
        img = img.resize(
            (SCREENSHOT_MAX_WIDTH, int(img.height * ratio)),
            Image.LANCZOS,  # alias of Image.Resampling.LANCZOS on Pillow >= 9.1
        )
    # JPEG supports RGB, L and CMYK natively; everything else → RGB.
    if img.mode not in ("RGB", "L", "CMYK"):
        img = img.convert("RGB")
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=SCREENSHOT_JPEG_QUALITY)
    return buf.getvalue()
def _read_swift_frame() -> bytes | None:
    """Return JPEG bytes from the Swift-provided file if it exists and is fresh."""
    try:
        mtime = os.path.getmtime(_SWIFT_FRAME_PATH)
        if time.time() - mtime >= _SWIFT_FRAME_MAX_AGE_S:
            return None  # stale frame — Swift app probably stopped writing
        with open(_SWIFT_FRAME_PATH, "rb") as fh:
            raw = fh.read()
        img = Image.open(io.BytesIO(raw))
        img.load()  # force full decode so truncated writes fail here
        return _to_jpeg(img)
    except Exception:
        # Best-effort by design: missing file, partial write, or decode
        # failure all mean "no fresh frame yet".
        return None
def capture_screenshot() -> bytes:
    """Return JPEG bytes for the current screen.

    Blocks up to _WAIT_TIMEOUT_S seconds waiting for the Swift app to write
    a fresh frame. Raises RuntimeError if no frame arrives in time.
    """
    give_up_at = time.time() + _WAIT_TIMEOUT_S
    while True:
        if time.time() >= give_up_at:
            raise RuntimeError(
                f"No screenshot received from LockInBro.app within {_WAIT_TIMEOUT_S}s. "
                "Make sure the app is running and has Screen Recording permission."
            )
        jpeg = _read_swift_frame()
        if jpeg is not None:
            return jpeg
        time.sleep(_WAIT_POLL_S)
def load_image_file(path: str) -> bytes:
    """Read an image from disk and re-encode it as JPEG bytes (test helper)."""
    img = Image.open(path)
    return _to_jpeg(img)

28
argus/config.py Normal file
View File

@@ -0,0 +1,28 @@
import os
from pathlib import Path
from dotenv import load_dotenv

# Load the project-root .env (one directory above argus/); override=True lets
# the file win over variables already present in the environment.
load_dotenv(Path(__file__).resolve().parent.parent / ".env", override=True)

# ── Gemini (cloud VLM) ───────────────────────────────────────────────────
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-pro")
# REST endpoint derived from the model name at import time — changing
# GEMINI_MODEL after import will NOT update this URL.
GEMINI_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent"
)

# Ollama (local VLM — default)
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen3.5:9b")
# VLM backend: "ollama" (default) or "gemini"
VLM_BACKEND = os.environ.get("VLM_BACKEND", "ollama")

# ── LockInBro backend API ────────────────────────────────────────────────
BACKEND_BASE_URL = os.environ.get("BACKEND_BASE_URL", "https://wahwa.com/api/v1")
BACKEND_JWT = os.environ.get("BACKEND_JWT", "")

# ── Capture cadence & encoding ───────────────────────────────────────────
CAPTURE_INTERVAL_S = 2.5  # screenshot every 2.5s
BUFFER_MAX_LEN = 4  # 4 frames accumulated
VLM_CALL_EVERY_N = 4  # call VLM every 4 captures (= every 10s)
SCREENSHOT_JPEG_QUALITY = 50
SCREENSHOT_MAX_WIDTH = 1280

384
argus/executor.py Normal file
View File

@@ -0,0 +1,384 @@
"""Agentic executor — uses Gemini 2.5 Pro with tool use to complete actions.
When the user approves a proposed action, the executor gets:
- The recent screenshots (so it can read source content)
- Tool access: read_file, write_file, run_command
- A loop: Gemini proposes tool calls → we execute → feed results back → repeat
This is a full agent loop, not a single-shot LLM call.
Swift portability notes:
- The agent loop is the same HTTP pattern (Gemini function calling API)
- Tool implementations map to FileManager, NSTask, NSPasteboard
- The loop structure is identical in Swift async/await
"""
from __future__ import annotations
import asyncio
import base64
import logging
import os
import subprocess
import httpx
from argus.buffer import HistoryBuffer
from argus.config import GEMINI_API_KEY
log = logging.getLogger(__name__)
# Model + endpoint for the executor agent (separate from the analysis VLM).
EXECUTOR_MODEL = os.environ.get("EXECUTOR_MODEL", "gemini-2.5-pro")
# Derived at import time — changing EXECUTOR_MODEL later will not update it.
EXECUTOR_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{EXECUTOR_MODEL}:generateContent"
)
# Hard cap on agent-loop iterations so a confused model cannot loop forever.
MAX_AGENT_STEPS = 10
# ── Tool definitions (Gemini function calling format) ────────────────────
# Declarations only — the matching implementations are the _exec_* functions
# below, dispatched via _execute_tool().
TOOLS = [
    {
        "functionDeclarations": [
            {
                "name": "read_file",
                "description": "Read the contents of a file. Use this to inspect source files, code, configs, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Absolute or relative file path to read",
                        }
                    },
                    "required": ["path"],
                },
            },
            {
                "name": "output",
                "description": "Display content to the user as a sticky note. Use for: extracted text from PDFs/images, form fill suggestions, content the user needs to paste into binary formats (docx, ppt, websites). The user will copy/paste from this.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "title": {
                            "type": "string",
                            "description": "Short title (e.g. 'Extracted receipt', 'Form values')",
                        },
                        "content": {
                            "type": "string",
                            "description": "The full content to display",
                        },
                    },
                    "required": ["title", "content"],
                },
            },
            {
                "name": "write_file",
                "description": "Write content to an EXISTING plain text file (code, markdown, config, txt). Only use when: (1) you have confirmed the file path via read_file or mdfind, AND (2) the file is a plain text format you can write correctly. NEVER use for binary formats (docx, ppt, pdf, images).",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Absolute file path (must have been confirmed to exist first)",
                        },
                        "content": {
                            "type": "string",
                            "description": "Complete file content to write",
                        },
                    },
                    "required": ["path", "content"],
                },
            },
            {
                "name": "run_command",
                "description": "Run a shell command and return its output. Use for compiling, testing, listing files, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "command": {
                            "type": "string",
                            "description": "The shell command to execute",
                        }
                    },
                    "required": ["command"],
                },
            },
            {
                "name": "done",
                "description": "Signal that the task is complete. Call this AFTER using output() to present results. Summary should describe what was shown to the user, not file operations.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "summary": {
                            "type": "string",
                            "description": "Brief summary of what was done",
                        }
                    },
                    "required": ["summary"],
                },
            },
        ]
    }
]
# ── Tool implementations ─────────────────────────────────────────────────
def _exec_read_file(path: str) -> str:
resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path)
try:
with open(resolved, "r") as f:
content = f.read()
log.info(" read_file: %s (%d bytes)", resolved, len(content))
return content
except OSError as e:
return f"Error reading {resolved}: {e}"
def _exec_write_file(path: str, content: str) -> str:
resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path)
# Safety: only write to existing text files
if not os.path.exists(resolved):
return f"Error: {resolved} does not exist. Use output() instead for new content."
try:
with open(resolved, "r"):
pass # confirm it's readable as text
except (OSError, UnicodeDecodeError):
return f"Error: {resolved} is not a readable text file. Use output() instead."
try:
with open(resolved, "w") as f:
f.write(content)
log.info(" write_file: %s (%d bytes)", resolved, len(content))
return f"Successfully wrote {len(content)} bytes to {resolved}"
except OSError as e:
return f"Error writing {resolved}: {e}"
def _exec_output(title: str, content: str) -> str:
    """Display content to the user via terminal.

    Swift portability: becomes a sticky note / floating card UI.
    Returns a confirmation string fed back to the agent.
    """
    # NOTE(review): the box-drawing characters in this function were corrupted
    # to empty string literals in the source; reconstructed below — confirm
    # the glyph choices against the original rendering.
    print()
    print(f"┌── 📋 {title} " + "─" * max(0, 50 - len(title)) + "┐")
    for line in content.split("\n"):
        print(f"│ {line}")
    print("└" + "─" * 58 + "┘")
    print()
    log.info(" output: %s (%d chars)", title, len(content))
    return f"Displayed '{title}' to user ({len(content)} chars)"
def _exec_run_command(command: str) -> str:
    """Run `command` through the shell and return combined stdout/stderr,
    annotated with the exit code when non-zero and capped at 4000 chars."""
    log.info(" run_command: %s", command[:80])
    try:
        proc = subprocess.run(
            command, shell=True, capture_output=True, text=True, timeout=30
        )
    except subprocess.TimeoutExpired:
        return "Error: command timed out after 30s"
    pieces = [proc.stdout]
    if proc.stderr:
        pieces.append("\nSTDERR:\n" + proc.stderr)
    if proc.returncode != 0:
        pieces.append(f"\n(exit code {proc.returncode})")
    return "".join(pieces)[:4000]  # cap output length
def _execute_tool(name: str, args: dict) -> str:
if name == "read_file":
return _exec_read_file(args["path"])
elif name == "output":
return _exec_output(args["title"], args["content"])
elif name == "write_file":
return _exec_write_file(args["path"], args["content"])
elif name == "run_command":
return _exec_run_command(args["command"])
elif name == "done":
return args.get("summary", "Done.")
else:
return f"Unknown tool: {name}"
# ── Agent loop ───────────────────────────────────────────────────────────
async def execute(
    vlm_payload: dict,
    action_index: int = 0,
    *,
    history: HistoryBuffer | None = None,
    current_screenshot: bytes | None = None,
    api_key: str | None = None,
) -> str | None:
    """Run the agentic executor loop.

    The agent can read files, write files, and run commands to complete
    the user's approved action. It loops until it calls done() or hits
    the step limit.

    Args:
        vlm_payload: Raw VLM analysis JSON; friction["proposed_actions"] holds
            the candidate actions.
        action_index: Which proposed action the user approved.
        history: Optional screenshot history so the agent can read content.
        current_screenshot: Optional most-recent frame (JPEG bytes).
        api_key: Gemini key override; falls back to GEMINI_API_KEY.

    Returns a summary of what was done, or None on failure.
    """
    friction = vlm_payload.get("friction", {})
    actions = friction.get("proposed_actions", [])
    if action_index >= len(actions):
        log.warning("Action index %d out of range", action_index)
        return None
    chosen = actions[action_index]
    key = api_key or GEMINI_API_KEY
    if not key:
        log.warning("No API key for executor")
        return None
    log.info("Agent executing: %s", chosen.get("label", "?")[:80])
    # Build initial message with screenshots + task context
    initial_parts = _build_initial_parts(vlm_payload, chosen, history, current_screenshot)
    # Conversation history for the agent loop
    contents = [{"role": "user", "parts": initial_parts}]
    for step in range(MAX_AGENT_STEPS):
        log.debug("Agent step %d/%d", step + 1, MAX_AGENT_STEPS)
        payload = {
            "contents": contents,
            "tools": TOOLS,
            "generationConfig": {"temperature": 0.2, "maxOutputTokens": 8192},
        }
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                # Retry up to 3 times on 429 rate limits with exponential backoff.
                for attempt in range(3):
                    resp = await client.post(f"{EXECUTOR_URL}?key={key}", json=payload)
                    if resp.status_code == 429:
                        wait = 2 ** attempt
                        log.warning("Executor 429, retrying in %ds...", wait)
                        await asyncio.sleep(wait)
                        continue
                    resp.raise_for_status()
                    break
                else:
                    # All retries returned 429 — raise so the except below logs it.
                    resp.raise_for_status()
        except Exception:
            log.exception("Agent API call failed at step %d", step + 1)
            return None
        body = resp.json()
        candidate = body["candidates"][0]
        response_parts = candidate["content"]["parts"]
        # Add assistant response to conversation
        contents.append({"role": "model", "parts": response_parts})
        # Check for function calls
        function_calls = [p for p in response_parts if "functionCall" in p]
        if not function_calls:
            # No tool calls — agent returned text, we're done
            text = "".join(p.get("text", "") for p in response_parts)
            log.info("Agent finished with text response (step %d)", step + 1)
            return text.strip() if text.strip() else "Done."
        # Execute each tool call
        tool_results = []
        done_summary = None
        for fc_part in function_calls:
            fc = fc_part["functionCall"]
            name = fc["name"]
            args = fc.get("args", {})
            print(f" 🔧 {name}({_summarize_args(args)})")
            result = _execute_tool(name, args)
            tool_results.append({
                "functionResponse": {
                    "name": name,
                    "response": {"result": result},
                }
            })
            if name == "done":
                done_summary = result
        # Add tool results to conversation; even after done() the results are
        # appended so the transcript stays well-formed.
        contents.append({"role": "user", "parts": tool_results})
        if done_summary:
            log.info("Agent called done() at step %d: %s", step + 1, done_summary[:80])
            return done_summary
    log.warning("Agent hit step limit (%d)", MAX_AGENT_STEPS)
    return "Agent reached maximum steps without completing."
def _build_initial_parts(
    vlm_payload: dict,
    action: dict,
    history: HistoryBuffer | None,
    current_screenshot: bytes | None,
) -> list[dict]:
    """Build the initial message parts: screenshots + task prompt.

    Part order: labeled history screenshots (oldest first), the current
    screenshot when provided, then the instruction prompt text.
    """
    parts: list[dict] = []
    # Include screenshots so agent can read source content
    if history:
        entries = history.get_entries()
        for i, entry in enumerate(entries):
            b64 = base64.b64encode(entry.jpeg).decode()
            parts.append({"text": f"[Screenshot {i + 1}/{len(entries)}]"})
            parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    if current_screenshot:
        b64 = base64.b64encode(current_screenshot).decode()
        parts.append({"text": "[Current screenshot]"})
        parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    friction = vlm_payload.get("friction", {})
    prompt = f"""\
The user approved this action. Complete it using the tools available to you.
ACTION: {action.get('label', '')}
DETAILS: {action.get('details', '')}
Context:
User's task: {vlm_payload.get('inferred_task', '')}
Problem: {friction.get('description', '')}
Current state: {vlm_payload.get('checkpoint_note_update', '')}
Application: {vlm_payload.get('app_name', '')}
Source: {friction.get('source_context', '')}
Target: {friction.get('target_context', '')}
INSTRUCTIONS:
1. For BINARY files (PDFs, images, etc.): use your VISION. Read content directly
from the screenshots — this is your most reliable source for non-text files.
2. For TEXT files (code, markdown, configs, txt): use read_file to get exact content.
3. If you need a file but only know the filename (not the path), FIND IT FIRST:
- run_command("mdfind -name 'filename'") — fast macOS Spotlight search
- run_command("lsof -c AppName | grep filename") — find what file an app has open
Do NOT guess paths. Search first.
4. Choose the right output method:
- Binary format targets (docx, ppt, website forms, PDFs): use output() — user will copy/paste.
- Existing plain text files (code, markdown, config): use write_file() — modify directly.
- write_file() only works on files that ALREADY EXIST. Confirm the path with read_file first.
5. Use run_command to compile, test, or search for files. Never to write files.
6. Do NOT hallucinate content. If you can't read something, say so.
7. Call done() with a summary when the action is complete.
Working directory: {os.getcwd()}"""
    parts.append({"text": prompt})
    return parts
def _summarize_args(args: dict) -> str:
"""Short summary of tool args for terminal display."""
parts = []
for k, v in args.items():
sv = str(v)
if len(sv) > 50:
sv = sv[:47] + "..."
parts.append(f"{k}={sv}")
return ", ".join(parts)

415
argus/loop.py Normal file
View File

@@ -0,0 +1,415 @@
"""Main orchestrator loop — capture → analyze → notify → execute.
Three concurrent tasks:
1. VLM loop: capture screenshots, analyze, push results
2. Notification handler: show cards when friction detected, wait for response
3. Input reader: read user's terminal input (1=accept, 0=dismiss)
Swift portability notes:
- The VLM loop becomes a Timer or DispatchSourceTimer
- The notification handler becomes SwiftUI state updates
- The input reader becomes button tap handlers
- The three tasks map to Swift's structured concurrency (async let / TaskGroup)
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
from argus.backend import send_analysis
from argus.buffer import HistoryBuffer
from argus.capture import capture_screenshot
from argus.config import BUFFER_MAX_LEN, CAPTURE_INTERVAL_S, VLM_CALL_EVERY_N
from argus.executor import execute
from argus.notification import NotificationManager
from argus.session import SessionManager
from argus.vlm import TaskContext, VLMResult, analyze_screenshot
log = logging.getLogger(__name__)
# Track session actions already shown to avoid repeating
_handled_session_actions: set[str] = set()
def _session_action_key(sa) -> str:
return f"{sa.type}:{sa.session_id}"
def _show_session_card(title: str, body: str, options: list[str]) -> None:
"""Display a session-related card (not friction). Swift: native notification."""
print()
print("" + "" * 58 + "")
print(f"{title:<56}")
print("" + " " * 58 + "")
for line in body.split("\n"):
print(f"{line:<56}")
print("" + " " * 58 + "")
for i, opt in enumerate(options):
print(f"│ [{i + 1}] {opt:<53}")
print(f"│ [0] Not now{' ' * 44}")
print("" + "" * 58 + "")
print()
async def _wait_for_input() -> int:
"""Wait for a single integer input from stdin. Returns the number or 0."""
loop = asyncio.get_event_loop()
line = await loop.run_in_executor(None, sys.stdin.readline)
try:
return int(line.strip())
except (ValueError, EOFError):
return 0
async def _handle_session_action(sa, result, sessions: SessionManager, notifier: NotificationManager) -> None:
    """Handle VLM session_action output — show card and act on approval.

    Supported action types: "resume", "start_new", "complete", "switch".
    Each shows a card, blocks on terminal input (1 = accept, 0 = dismiss),
    then applies the action. In mock mode (sessions._mock) backend calls are
    skipped and only local session state is mutated.

    NOTE(review): `notifier` is unused in this function — confirm intended.

    Swift portability: becomes SwiftUI state changes + button handlers.
    """
    # Skip if we already showed this exact action
    key = _session_action_key(sa)
    if key in _handled_session_actions:
        return
    _handled_session_actions.add(key)
    if sa.type == "resume":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        checkpoint = session.checkpoint_note if session else ""
        task_title = session.task_title if session else sa.reason
        _show_session_card(
            f"📂 Resume: {task_title}",
            f"You left off: {checkpoint}" if checkpoint else sa.reason,
            ["Resume session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                # Real backend: fetch and print the rich resume card fields.
                resume_card = await sessions.get_resume_card(sa.session_id)
                if resume_card:
                    rc = resume_card.get("resume_card", {})
                    print(f" 💡 {rc.get('welcome_back', '')}")
                    print(f" {rc.get('you_were_doing', '')}")
                    print(f" {rc.get('next_step', '')}")
                    print(f" {rc.get('motivation', '')}")
            else:
                # Mock mode: mark the local session active.
                print(f" ✓ Resumed \"{task_title}\"")
                if checkpoint:
                    print(f" Last checkpoint: {checkpoint}")
                if session:
                    session.status = "active"
                    sessions._active_session = session
        else:
            print(" dismissed.")
    elif sa.type == "start_new":
        task = result.inferred_task
        _show_session_card(
            "🆕 New focus session",
            f"You're working on: {task}",
            ["Start focus session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                resp = await sessions.start_session(task)
                if resp:
                    print(f" ✓ Session started: {resp.get('id', '?')}")
            else:
                print(f" ✓ (mock) Session started for \"{task}\"")
        else:
            print(" dismissed.")
    elif sa.type == "complete":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "session"
        _show_session_card(
            f"✅ Complete: {task_title}",
            sa.reason,
            ["Complete session"],
        )
        choice = await _wait_for_input()
        # Only act when approved AND the VLM supplied a session id.
        if choice == 1 and sa.session_id:
            if not sessions._mock:
                ok = await sessions.end_session(sa.session_id)
                if ok:
                    print(f" ✓ Session completed")
            else:
                print(f" ✓ (mock) Session \"{task_title}\" completed")
            # Drop the completed session locally and clear it if it was active.
            sessions._sessions = [s for s in sessions._sessions if s.session_id != sa.session_id]
            if sessions._active_session and sessions._active_session.session_id == sa.session_id:
                sessions._active_session = None
        else:
            print(" dismissed.")
    elif sa.type == "switch":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "another task"
        _show_session_card(
            f"🔄 Switch to: {task_title}",
            sa.reason,
            [f"Switch to \"{task_title}\""],
        )
        choice = await _wait_for_input()
        if choice == 1:
            # Display-only acknowledgement; no backend call is made here.
            print(f" ✓ Switched to \"{task_title}\"")
        else:
            print(" dismissed.")
def _is_valid_session_id(session_id: str | None, sessions: SessionManager) -> bool:
"""Check if a session_id from VLM output actually exists in our sessions."""
if not session_id:
return False
return any(s.session_id == session_id for s in sessions.sessions)
async def run_loop(
    ctx: TaskContext,
    *,
    api_key: str | None = None,
    vlm_backend: str | None = None,
    jwt: str | None = None,
    base_url: str | None = None,
    dry_run: bool = False,
    max_iterations: int | None = None,
    on_result: callable | None = None,
    auto_execute: bool = False,
    mock_sessions: list | None = None,
) -> None:
    """Run the Argus VLM loop with notification and execution support.

    Captures a screenshot every CAPTURE_INTERVAL_S seconds, batches
    VLM_CALL_EVERY_N frames per VLM call, posts each analysis to the backend
    (unless dry_run), and — when auto_execute is on — drives session
    lifecycle prompts and the friction-card / executor flow in the terminal.

    Args:
        ctx: Task/session context.
        api_key: Gemini API key override.
        vlm_backend: "ollama" or "gemini".
        jwt: Backend JWT override.
        base_url: Backend base URL override.
        dry_run: If True, skip sending to backend (payloads are printed).
        max_iterations: Stop after N VLM iterations (None = forever).
        on_result: Optional callback(VLMResult) per analysis.
        auto_execute: If True, enable notification + executor flow.
        mock_sessions: If given, seed the SessionManager with these
            SessionInfo objects and disable backend refresh.
    """
    history = HistoryBuffer(image_maxlen=BUFFER_MAX_LEN)
    notifier = NotificationManager()
    sessions = SessionManager(jwt=jwt, base_url=base_url)
    iteration = 0
    log.info(
        "Argus loop starting — interval=%ds, session=%s, task=%s, executor=%s",
        CAPTURE_INTERVAL_S,
        ctx.session_id,
        ctx.task_title,
        "on" if auto_execute else "off",
    )
    # Load sessions — mock or from backend
    use_mock = mock_sessions is not None
    if use_mock:
        sessions._sessions = mock_sessions
        sessions._active_session = next((s for s in mock_sessions if s.status == "active"), None)
        sessions._mock = True  # prevent refresh from overwriting
        log.info("Loaded %d mock sessions", len(mock_sessions))
    else:
        sessions._mock = False
        try:
            await sessions.fetch_open_sessions()
        except Exception:
            # Non-fatal: the loop still works without session context.
            log.exception("Startup: failed to fetch sessions — continuing without session context")
    # Log attachment status so it's visible in the console
    if sessions.active:
        s = sessions.active
        log.info(
            "ATTACHED to session %s — task=%r last_app=%s checkpoint=%r",
            s.session_id[:8], s.task_title, s.last_app or "(unknown)", s.checkpoint_note or "(none)",
        )
    else:
        log.info("No active session found — running in monitoring-only mode (dry-run=%s)", dry_run)
    # Pending frames collected between VLM calls
    pending_frames: list[bytes] = []
    capture_count = 0
    try:
        while max_iterations is None or iteration < max_iterations:
            t0 = time.monotonic()
            try:
                # 0. Refresh sessions periodically
                await sessions.maybe_refresh()
                # 1. Capture screenshot every interval
                screenshot = capture_screenshot()
                pending_frames.append(screenshot)
                capture_count += 1
                log.debug("Captured frame %d (%d pending)", capture_count, len(pending_frames))
                # 2. Only call VLM every N captures; otherwise sleep out the
                #    remainder of the interval and capture again.
                if len(pending_frames) < VLM_CALL_EVERY_N:
                    elapsed = time.monotonic() - t0
                    sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
                    if sleep_for > 0:
                        await asyncio.sleep(sleep_for)
                    continue
                # ── VLM call with all pending frames ──
                iteration += 1
                # Push all pending frames into buffer (buffer keeps last BUFFER_MAX_LEN)
                for frame in pending_frames:
                    history.push(frame, "")  # summaries filled after VLM call
                pending_frames.clear()
                t_vlm = time.monotonic()
                result: VLMResult = await analyze_screenshot(
                    screenshot, ctx, history,
                    api_key=api_key,
                    backend=vlm_backend,
                    session_context=sessions.format_for_prompt(),
                )
                t_vlm_done = time.monotonic()
                payload = result.to_backend_payload(ctx.session_id)
                log.info(
                    "[%d] vlm=%.2fs frames=%d friction=%s summary=%s",
                    iteration,
                    t_vlm_done - t_vlm,
                    VLM_CALL_EVERY_N,
                    result.friction.type,
                    result.vlm_summary,
                )
                # Update last entry's summary now that we have it
                history.set_last_output(payload)
                # 4. Print / send to backend
                if dry_run:
                    print(json.dumps(payload, indent=2))
                else:
                    resp = await send_analysis(
                        result, ctx.session_id, jwt=jwt, base_url=base_url
                    )
                    log.debug("Backend response: %s", resp)
                if auto_execute:
                    sa = result.session_action
                    # Validate session_action — check that VLM output relates to the session.
                    # Match against filename, file stem (no extension), or task title.
                    if sa.type in ("resume", "switch") and sa.session_id:
                        session = next(
                            (s for s in sessions.sessions if s.session_id == sa.session_id), None
                        )
                        if session:
                            context = (result.vlm_summary + " " + result.inferred_task + " " + sa.reason).lower()
                            matches = []
                            if session.last_file:
                                matches.append(session.last_file.lower())
                                # Also check stem without extension (e.g. "receipt" from "receipt.pdf")
                                stem = session.last_file.rsplit(".", 1)[0].lower()
                                if stem:
                                    matches.append(stem)
                            if session.task_title:
                                # Check key words from task title
                                for word in session.task_title.lower().split():
                                    if len(word) > 3:  # skip short words
                                        matches.append(word)
                            if matches and not any(m in context for m in matches):
                                log.debug(
                                    "Suppressing session_action=%s — none of %s found in context",
                                    sa.type, matches,
                                )
                                sa.type = "none"
                    # Drop resume/switch actions whose session_id the VLM invented.
                    if not _is_valid_session_id(sa.session_id, sessions) and sa.type in ("resume", "switch"):
                        sa.type = "none"
                    # 5. Session actions take priority — but only if not already handled
                    session_handled = False
                    if sa.type != "none":
                        key = _session_action_key(sa)
                        if key not in _handled_session_actions:
                            await _handle_session_action(sa, result, sessions, notifier)
                            session_handled = True
                    # 6. Friction notification + executor
                    if not session_handled and notifier.should_notify(payload):
                        card = notifier.create_card(payload)
                        notifier.show_card_terminal(card)
                        choice = await _wait_for_input()
                        action_idx = choice - 1
                        if choice > 0:
                            print(f"\n⚡ Executing action {action_idx + 1}...")
                            summary = await execute(
                                payload, action_idx,
                                history=history,
                                current_screenshot=screenshot,
                                api_key=api_key,
                            )
                            # Emit a parseable JSON block so Swift can surface the result
                            # in the HUD and clear the executing spinner.
                            print(json.dumps({"exec_summary": summary or ""}, indent=2), flush=True)
                            if summary:
                                history.set_last_execution(summary)
                        else:
                            print("dismissed.")
                    # 7. Nudge — VLM decided a nudge is appropriate (only if nothing else fired)
                    elif not session_handled and result.gentle_nudge:
                        print(f"\n💛 {result.gentle_nudge}")
                    # 8. Fallback: suggest new session if VLM didn't
                    elif not session_handled and (
                        sa.type == "none"
                        and result.on_task
                        and not sessions.active
                        and sessions.should_suggest_new_session(result.inferred_task)
                    ):
                        task = result.inferred_task
                        print(f"\n🆕 You've been working on \"{task}\" — start a focus session?")
                        print(f" [1] Start [0] Not now")
                        card = notifier.create_card({
                            "friction": {
                                "type": "none",
                                "confidence": 1.0,
                                "description": f"New task detected: {task}",
                                "proposed_actions": [{"label": "Start focus session", "action_type": "other", "details": task}],
                            }
                        })
                        notifier.show_card_terminal(card)
                        accepted, _ = await notifier.wait_for_response(timeout=60.0)
                        if accepted:
                            resp = await sessions.start_session(task)
                            if resp:
                                print(f" ✓ Session started: {resp.get('id', '?')}")
                # Clear execution context after 3 iterations
                if history.get_last_execution() and iteration % 3 == 0:
                    history.clear_last_execution()
                # 9. Callback
                if on_result:
                    on_result(result)
            except Exception:
                # One bad iteration must not kill the monitoring loop.
                log.exception("Error in Argus loop iteration %d", iteration)
            # Sleep for remainder of capture interval
            elapsed = time.monotonic() - t0
            sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
            if sleep_for > 0:
                await asyncio.sleep(sleep_for)
    finally:
        pass

172
argus/notification.py Normal file
View File

@@ -0,0 +1,172 @@
"""Notification manager — decides when to show action cards to the user.
Only pushes a new notification when the proposed action meaningfully changes.
In production (Swift), this becomes a native macOS notification / floating card.
Here it's a terminal prompt for testing.
Swift portability notes:
- NotificationManager becomes a class with @Published properties
- show_card() triggers a SwiftUI overlay or UNUserNotificationCenter
- user_response() is wired to button taps instead of stdin
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass, field
log = logging.getLogger(__name__)
@dataclass
class ActionCard:
    """A proposed action shown to the user.

    Bundles everything the UI needs to render a friction card, plus the raw
    VLM payload so the executor can act on it if the user approves.
    All fields are populated by NotificationManager.create_card().
    """

    friction_type: str   # VLM friction "type" (e.g. "stalled", "repetitive_loop")
    description: str     # human-readable friction description from the VLM
    actions: list[dict]  # proposed_actions dicts: {"label", "action_type", "details"}
    source: str          # friction source_context (filename or app name)
    target: str          # friction target_context (filename or app name)
    vlm_payload: dict  # full VLM result for the executor
class NotificationManager:
    """Deduplicates notifications and manages the pending action card.

    Only shows a new card when the friction type or proposed action labels
    change from the previous notification.

    Swift portability: becomes a class with @Published properties; the
    terminal prompt/stdin flow becomes native UI + button tap handlers.
    """

    # Action types that the executor can actually perform
    ACTIONABLE_TYPES = {"auto_extract", "brain_dump", "auto_fill", "summarize"}

    def __init__(self):
        self._last_fingerprint: str = ""
        self._pending_card: ActionCard | None = None
        self._response_event: asyncio.Event = asyncio.Event()
        self._user_accepted: bool = False
        # Fix: wait_for_response() reads this attribute, so it must exist
        # even before the first submit_response() call.
        self._selected_index: int = -1

    def _fingerprint(self, vlm_payload: dict) -> str:
        """Generate a dedup key from friction type + action labels."""
        friction = vlm_payload.get("friction", {})
        ftype = friction.get("type", "none")
        labels = tuple(
            a.get("label", "") for a in friction.get("proposed_actions", [])
        )
        return f"{ftype}:{labels}"

    def should_notify(self, vlm_payload: dict) -> bool:
        """Check if this VLM result warrants a friction card (executor action).

        Only fires when proposed_actions contain something the executor can
        act on. Generic suggestions (action_type: "other") are nudges, not
        executor actions. A card identical to the last one shown (same
        friction type + labels) is suppressed.
        """
        friction = vlm_payload.get("friction", {})
        # No friction or no proposed actions → no notification
        if friction.get("type") == "none":
            return False
        actions = friction.get("proposed_actions", [])
        if not actions:
            return False
        # Only notify if at least one action is executor-actionable
        has_actionable = any(
            a.get("action_type") in self.ACTIONABLE_TYPES for a in actions
        )
        if not has_actionable:
            return False
        # Same as last notification → skip
        fp = self._fingerprint(vlm_payload)
        if fp == self._last_fingerprint:
            return False
        return True

    def create_card(self, vlm_payload: dict) -> ActionCard:
        """Create an action card from VLM output and mark it as pending.

        Also records the dedup fingerprint and resets the response state so
        a fresh wait_for_response() starts clean.
        """
        friction = vlm_payload.get("friction", {})
        card = ActionCard(
            friction_type=friction.get("type", ""),
            description=friction.get("description", ""),
            actions=friction.get("proposed_actions", []),
            source=friction.get("source_context", ""),
            target=friction.get("target_context", ""),
            vlm_payload=vlm_payload,
        )
        self._pending_card = card
        self._last_fingerprint = self._fingerprint(vlm_payload)
        self._response_event.clear()
        self._user_accepted = False
        return card

    def show_card_terminal(self, card: ActionCard) -> None:
        """Print the action card to terminal. (Swift: show native UI.)

        NOTE(review): several border prints below concatenate empty string
        literals — the box-drawing glyphs (┌ ─ ┐ └ ┘) appear to have been
        lost in transit (│ survives in the action rows). Confirm against the
        original source before restoring them; left byte-identical here.
        """
        print()
        print("" + "" * 58 + "")
        print(f"{'🔧 Friction detected':^58}")
        print("" + " " * 58 + "")
        desc_lines = _wrap(card.description, 56)
        for line in desc_lines:
            print(f"{line:<56}")
        print("" + " " * 58 + "")
        for i, action in enumerate(card.actions):
            label = action.get("label", "?")
            print(f"│ [{i + 1}] {label:<53}")
        print(f"│ [0] Not now{' ' * 44}")
        print("" + "" * 58 + "")
        print()

    async def wait_for_response(self, timeout: float = 30.0) -> tuple[bool, int]:
        """Wait for user response. Returns (accepted, action_index).

        On timeout the pending card is dropped and (False, -1) is returned.
        Swift portability: this becomes a button tap callback.
        """
        try:
            await asyncio.wait_for(self._response_event.wait(), timeout=timeout)
            return self._user_accepted, self._selected_index
        except asyncio.TimeoutError:
            log.debug("Notification timed out, dismissing")
            self._pending_card = None
            return False, -1

    def submit_response(self, choice: int) -> None:
        """Submit user's choice. Called from input reader task.

        choice is 1-based; 0 (or no pending card) means dismiss.
        Swift portability: called from button tap handler.
        """
        if choice > 0 and self._pending_card:
            self._user_accepted = True
            self._selected_index = choice - 1
        else:
            self._user_accepted = False
            self._selected_index = -1
        self._response_event.set()

    @property
    def pending(self) -> ActionCard | None:
        """The card currently awaiting a user decision, if any."""
        return self._pending_card

    def dismiss(self) -> None:
        """Dismiss current card without action and reset dedup state."""
        self._pending_card = None
        self._last_fingerprint = ""
def _wrap(text: str, width: int) -> list[str]:
"""Simple word wrap."""
words = text.split()
lines: list[str] = []
current = ""
for word in words:
if len(current) + len(word) + 1 <= width:
current = f"{current} {word}" if current else word
else:
if current:
lines.append(current)
current = word
if current:
lines.append(current)
return lines or [""]

3
argus/requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
httpx>=0.27
Pillow>=10.0
python-dotenv>=1.0

293
argus/session.py Normal file
View File

@@ -0,0 +1,293 @@
"""Session manager — fetches and caches session state from the backend.
Provides session context to the VLM prompt and handles session lifecycle
actions (start, resume, switch, complete).
Swift portability notes:
- SessionManager becomes an ObservableObject with @Published properties
- Backend calls use URLSession instead of httpx
- match_session() logic is identical
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass, field
import httpx
from argus.config import BACKEND_BASE_URL, BACKEND_JWT
log = logging.getLogger(__name__)
@dataclass
class SessionInfo:
    """Snapshot of one backend session, parsed from /sessions responses.

    Populated by SessionManager._parse_session() from the session JSON and
    its nested "checkpoint" object.
    """

    session_id: str
    task_id: str | None
    task_title: str      # from checkpoint["goal"]
    task_goal: str       # mirrors task_title (see _parse_session)
    status: str  # active | interrupted
    last_app: str        # checkpoint["active_app"]
    last_file: str       # checkpoint["active_file"]
    checkpoint_note: str  # checkpoint last_action_summary / last_vlm_summary
    started_at: str
    ended_at: str | None
    minutes_ago: int | None = None  # minutes since ended_at; None if still open
class SessionManager:
    """Caches open sessions and provides matching + lifecycle operations.

    Swift portability: becomes an ObservableObject with @Published
    properties; httpx calls become URLSession; matching logic is identical.
    """

    # Words too generic to count toward task-title overlap scoring.
    # Shared by match_session() and should_suggest_new_session() (was
    # previously duplicated inline in both methods).
    _STOPWORDS = frozenset(
        {"the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"}
    )

    def __init__(self, jwt: str | None = None, base_url: str | None = None):
        self._jwt = jwt or BACKEND_JWT
        self._base_url = base_url or BACKEND_BASE_URL
        self._sessions: list[SessionInfo] = []
        self._active_session: SessionInfo | None = None
        self._last_fetch: float = 0
        self._fetch_interval = 30.0  # re-fetch every 30s
        self._mock: bool = False
        # Track inferred_task stability for new session suggestion
        self._inferred_task_history: list[str] = []
        self._stable_threshold = 3  # consecutive matching inferred_tasks

    @property
    def active(self) -> SessionInfo | None:
        """The currently active session, or None."""
        return self._active_session

    @property
    def sessions(self) -> list[SessionInfo]:
        """All cached sessions (at most one with the /sessions/active API)."""
        return self._sessions

    def has_sessions(self) -> bool:
        """True when at least one session is cached."""
        return len(self._sessions) > 0

    # ── Backend communication ────────────────────────────────────────

    async def fetch_open_sessions(self) -> list[SessionInfo]:
        """Fetch the active session from backend (GET /sessions/active → single object or 404).

        Called on startup and periodically. On HTTP error or unreachable
        backend the previous cache is kept and returned.
        """
        url = f"{self._base_url}/sessions/active"
        headers = {}
        if self._jwt:
            headers["Authorization"] = f"Bearer {self._jwt}"
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.get(url, headers=headers)
                if resp.status_code == 404:
                    # No active session — clear cache
                    self._sessions = []
                    self._active_session = None
                    self._last_fetch = time.monotonic()
                    log.info("No active session on backend")
                    return []
                resp.raise_for_status()
                data = resp.json()
                session = self._parse_session(data)
                self._sessions = [session]
                self._active_session = session
                self._last_fetch = time.monotonic()
                log.info("Fetched active session: %s (%s)", session.session_id, session.task_title)
                return self._sessions
        except httpx.HTTPStatusError as e:
            log.warning("Failed to fetch sessions: %s", e.response.status_code)
            return self._sessions
        except httpx.RequestError as e:
            log.debug("Backend unreachable for sessions: %s", e)
            return self._sessions

    async def maybe_refresh(self) -> None:
        """Re-fetch if stale (older than _fetch_interval). Skips if using mock data."""
        if self._mock:
            return
        if time.monotonic() - self._last_fetch > self._fetch_interval:
            await self.fetch_open_sessions()

    async def start_session(self, task_title: str, task_goal: str = "") -> dict | None:
        """Start a new focus session. Returns session data or None.

        NOTE(review): task_title/task_goal are currently not sent — the
        payload is only {"platform": "mac"}; confirm whether the backend
        should receive them or create the task separately.
        """
        url = f"{self._base_url}/sessions/start"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        # If we have a task_title but no task_id, the backend should
        # create an ad-hoc session (or we create the task first).
        payload = {"platform": "mac"}
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json=payload, headers=headers)
                resp.raise_for_status()
                result = resp.json()
                await self.fetch_open_sessions()  # refresh
                log.info("Started session: %s", result.get("id"))
                return result
        except Exception:
            log.exception("Failed to start session")
            return None

    async def end_session(self, session_id: str, status: str = "completed") -> bool:
        """End a session. Returns True on success."""
        url = f"{self._base_url}/sessions/{session_id}/end"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json={"status": status}, headers=headers)
                resp.raise_for_status()
                await self.fetch_open_sessions()  # refresh
                log.info("Ended session %s (%s)", session_id, status)
                return True
        except Exception:
            log.exception("Failed to end session %s", session_id)
            return False

    async def get_resume_card(self, session_id: str) -> dict | None:
        """Fetch AI-generated resume card for a session, or None on failure."""
        url = f"{self._base_url}/sessions/{session_id}/resume"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        try:
            async with httpx.AsyncClient(timeout=15.0) as client:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                return resp.json()
        except Exception:
            log.exception("Failed to get resume card for %s", session_id)
            return None

    # ── Session matching ─────────────────────────────────────────────

    def match_session(self, inferred_task: str, app_name: str) -> SessionInfo | None:
        """Find an open session that matches the current screen state.

        Scoring: +2 per non-stopword word shared between inferred_task and
        the session title, +3 for an app match, +5 when the session's
        last_file appears in inferred_task. Returns the best match only if
        its score is at least 4, else None.
        """
        if not inferred_task or not self._sessions:
            return None
        inferred_lower = inferred_task.lower()
        best_match: SessionInfo | None = None
        best_score = 0
        for session in self._sessions:
            score = 0
            title_lower = session.task_title.lower()
            # Check for keyword overlap between inferred_task and session task_title
            inferred_words = set(inferred_lower.split())
            title_words = set(title_lower.split())
            overlap = inferred_words & title_words
            # Filter out common words
            overlap -= self._STOPWORDS
            score += len(overlap) * 2
            # App match
            if app_name and session.last_app and app_name.lower() in session.last_app.lower():
                score += 3
            # File match — check if session's last_file appears in inferred_task
            if session.last_file and session.last_file.lower() in inferred_lower:
                score += 5
            if score > best_score:
                best_score = score
                best_match = session
        # Require minimum score to avoid false matches
        if best_score >= 4:
            return best_match
        return None

    def should_suggest_new_session(self, inferred_task: str) -> bool:
        """Check if we should suggest starting a new session.

        Returns True if inferred_task has been stable for N iterations
        and doesn't match any existing session. An empty inferred_task
        resets the stability history.
        """
        if not inferred_task:
            self._inferred_task_history.clear()
            return False
        self._inferred_task_history.append(inferred_task)
        # Keep only recent history
        if len(self._inferred_task_history) > self._stable_threshold + 2:
            self._inferred_task_history = self._inferred_task_history[-self._stable_threshold - 2:]
        if len(self._inferred_task_history) < self._stable_threshold:
            return False
        # Check if the last N inferred tasks are "similar" (share key words)
        recent = self._inferred_task_history[-self._stable_threshold:]
        first_words = set(recent[0].lower().split()) - self._STOPWORDS
        all_similar = all(
            len(first_words & set(t.lower().split())) >= len(first_words) * 0.5
            for t in recent[1:]
        )
        if not all_similar:
            return False
        # Make sure it doesn't match an existing session
        matched = self.match_session(inferred_task, "")
        return matched is None

    # ── Prompt formatting ────────────────────────────────────────────

    def format_for_prompt(self) -> str:
        """Format open sessions for injection into the VLM prompt.

        Includes session_id so VLM can reference the exact ID in session_action.
        """
        if not self._sessions:
            return "(no open sessions)"
        lines: list[str] = []
        for s in self._sessions:
            status_tag = f"[{s.status}]"
            ago = f" (paused {s.minutes_ago}m ago)" if s.minutes_ago else ""
            line = f" session_id=\"{s.session_id}\" {status_tag} \"{s.task_title}\" — last in {s.last_app}"
            if s.last_file:
                line += f"/{s.last_file}"
            if s.checkpoint_note:
                line += f", \"{s.checkpoint_note}\""
            line += ago
            lines.append(line)
        return "\n".join(lines)

    # ── Internal ─────────────────────────────────────────────────────

    def _parse_session(self, data: dict) -> SessionInfo:
        """Build a SessionInfo from the backend session JSON (+ its checkpoint)."""
        checkpoint = data.get("checkpoint", {}) or {}
        ended = data.get("ended_at")
        minutes_ago = None
        if ended:
            import datetime
            try:
                ended_dt = datetime.datetime.fromisoformat(ended.replace("Z", "+00:00"))
                now = datetime.datetime.now(datetime.timezone.utc)
                minutes_ago = int((now - ended_dt).total_seconds() / 60)
            except (ValueError, TypeError):
                pass
        # SessionOut has no nested "task" object — task title is stored in checkpoint["goal"]
        # by POST /sessions/start when a task_id is provided.
        task_title = checkpoint.get("goal", "")
        checkpoint_note = checkpoint.get("last_action_summary", checkpoint.get("last_vlm_summary", ""))
        return SessionInfo(
            session_id=str(data.get("id", "")),
            task_id=data.get("task_id"),
            task_title=task_title,
            task_goal=task_title,
            status=data.get("status", ""),
            last_app=checkpoint.get("active_app", ""),
            last_file=checkpoint.get("active_file", ""),
            checkpoint_note=checkpoint_note,
            started_at=data.get("started_at", ""),
            ended_at=ended,
            minutes_ago=minutes_ago,
        )

442
argus/vlm.py Normal file
View File

@@ -0,0 +1,442 @@
"""VLM client — supports Ollama (local, default) and Gemini (cloud fallback).
Sends the current screenshot plus text summaries of recent analyses.
Parses the structured JSON response.
"""
from __future__ import annotations
import asyncio
import base64
import json
import logging
import re
from dataclasses import dataclass, field
import httpx
from argus.buffer import HistoryBuffer
from argus.config import (
GEMINI_API_KEY,
GEMINI_URL,
OLLAMA_BASE_URL,
OLLAMA_MODEL,
VLM_BACKEND,
)
log = logging.getLogger(__name__)
# ── Task context passed in from the session ──────────────────────────────
@dataclass
class StepInfo:
    """One step of the user's task plan, rendered into the VLM prompt."""

    id: str              # step UUID, echoed back by the VLM in steps_completed
    sort_order: int      # number shown when the step list is rendered
    title: str
    status: str  # pending | in_progress | done | skipped
    checkpoint_note: str | None = None  # latest progress note for this step
@dataclass
class TaskContext:
    """Context about the user's current task/session, fed into the VLM prompt."""

    task_title: str = ""
    task_goal: str = ""
    steps: list[StepInfo] = field(default_factory=list)
    window_title: str = ""  # frontmost window title as reported by the OS
    session_id: str = ""    # backend session id used in the analysis payload
# ── VLM response schema ─────────────────────────────────────────────────
@dataclass
class FrictionInfo:
    """The "friction" sub-object of a VLM analysis response."""

    type: str = "none"  # repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none
    confidence: float = 0.0  # VLM-reported confidence, 0.0-1.0
    description: str | None = None  # what the user is struggling with
    proposed_actions: list[dict] = field(default_factory=list)  # {"label", "action_type", "details"} dicts
    source_context: str | None = None  # filename (or app name) the friction reads from
    target_context: str | None = None  # filename (or app name) the friction writes to
@dataclass
class SessionAction:
    """Session lifecycle action suggested by the VLM."""

    type: str = "none"  # resume | switch | complete | start_new | none
    session_id: str | None = None  # exact backend session id, or None for start_new/none
    reason: str = ""  # VLM's justification for the suggested action
@dataclass
class VLMResult:
    """Parsed result of one VLM analysis call.

    Field meanings follow the JSON schema emitted by build_system_prompt().
    """

    on_task: bool = True
    current_step_id: str | None = None
    inferred_task: str = ""  # what the user is actually working on
    checkpoint_note_update: str | None = None  # progress made across the frames
    steps_completed: list[str] = field(default_factory=list)  # step UUIDs
    friction: FrictionInfo = field(default_factory=FrictionInfo)
    session_action: SessionAction = field(default_factory=SessionAction)
    intent: str | None = None  # skimming | engaged | unclear
    distraction_type: str | None = None  # app_switch | browsing | idle
    app_name: str = ""  # primary visible application
    confidence: float = 0.0
    gentle_nudge: str | None = None  # nudge text if distracted, else None
    vlm_summary: str = ""  # 1-sentence summary of what changed across frames

    def to_backend_payload(self, session_id: str) -> dict:
        """Serialize to the JSON shape expected by POST /distractions/analyze-result."""
        return {
            "session_id": session_id,
            "on_task": self.on_task,
            "current_step_id": self.current_step_id,
            "inferred_task": self.inferred_task,
            "checkpoint_note_update": self.checkpoint_note_update,
            "steps_completed": self.steps_completed,
            "friction": {
                "type": self.friction.type,
                "confidence": self.friction.confidence,
                "description": self.friction.description,
                "proposed_actions": self.friction.proposed_actions,
                "source_context": self.friction.source_context,
                "target_context": self.friction.target_context,
            },
            "session_action": {
                "type": self.session_action.type,
                "session_id": self.session_action.session_id,
                "reason": self.session_action.reason,
            },
            "intent": self.intent,
            "distraction_type": self.distraction_type,
            "app_name": self.app_name,
            "confidence": self.confidence,
            "gentle_nudge": self.gentle_nudge,
            "vlm_summary": self.vlm_summary,
        }
# ── Prompt construction ──────────────────────────────────────────────────
def _format_steps(steps: list[StepInfo]) -> str:
lines: list[str] = []
for s in steps:
marker = {"pending": "", "in_progress": "", "done": "", "skipped": ""}.get(
s.status, "?"
)
line = f" {marker} [{s.status}] (id={s.id}) {s.sort_order}. {s.title}"
if s.checkpoint_note:
line += f" — checkpoint: {s.checkpoint_note}"
lines.append(line)
return "\n".join(lines) if lines else " (no steps)"
def build_system_prompt(ctx: TaskContext, history: HistoryBuffer, session_context: str = "") -> str:
    """Assemble the full system prompt for one VLM analysis call.

    Combines the step list, screenshot-history summaries, the previous
    analysis (for self-correction), the last executor action (to avoid
    re-flagging solved friction), and the open-session context. The returned
    string instructs the model to reply with a single JSON object.
    """
    history_text = history.format_for_prompt()
    steps_text = _format_steps(ctx.steps)
    prev_output = history.format_last_output()
    # Optional section: the model's own previous JSON, for refinement.
    prev_section = ""
    if prev_output:
        prev_section = f"""
Your previous analysis (refine or correct this based on new evidence):
{prev_output}
If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it with new observations.
"""
    # Optional section: a just-completed executor action suppresses re-flagging.
    execution = history.get_last_execution()
    exec_section = ""
    if execution:
        exec_section = f"""
IMPORTANT — An AI agent just completed an action for the user:
{execution}
This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
"""
    return f"""\
You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
## How to read the screenshots
You receive screenshots in chronological order (oldest first, newest last).
You receive ~5 frames spanning ~20 seconds (one frame every 4 seconds). This means:
- 2 unchanged frames = 8+ seconds idle. That's significant.
- 3+ unchanged frames = 12-20 seconds idle. The user is stuck or distracted.
- If ALL frames are identical, the user has been idle for 20 seconds — definitely flag it.
- If the user wrote code and then 2+ frames show no changes, they are STUCK NOW.
Do NOT wait for many frames to flag problems. React fast.
Your PRIMARY signal is the DIFFERENCES between consecutive frames.
Where the screen CHANGED = where the user's ATTENTION is.
Where the screen is STATIC = background noise. Ignore it unless the user interacts with it.
Diff signals and what they mean:
- New text appearing / cursor advancing → user is actively typing (THIS is their task)
- Window or tab switch → context change, could be reference lookup or distraction
- Same content, no pixel changes → stalled, idle, or reading
- Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
- Scroll position change → reading or browsing
- Error message that APPEARED between frames → user just triggered it, relevant
- Error message that was ALREADY THERE in all frames → stale, ignore it
## Task inference
Infer the user's current task from what they are ACTIVELY DOING across the frames.
Do NOT assume static content (old terminal output, background panels, stale errors)
is the task. The region of the screen where pixels are changing IS the task.
CRITICAL — looking at something ≠ working on something:
- User switches to Preview/browser/another app and just LOOKS → this is NOT a new task.
It could be a distraction, a quick reference, or idle browsing.
- User switches to another app AND starts TYPING/EDITING → this might be a new task.
- If the user has an active session and switches away WITHOUT typing in the new app,
they are DISTRACTED from their session, not starting a new task.
- Only infer a new task when there is clear evidence of productive work (typing, editing,
cursor movement between frames) in the new context.
- A single app switch is NEVER enough to infer a new task. Wait for active work.
If an explicit task is provided below, use it. Otherwise, infer from the screenshots.
Task: {ctx.task_title}
Goal: {ctx.task_goal}
Steps:
{steps_text}
Window title reported by OS: {ctx.window_title}
## Open sessions from backend (use EXACT session_id values below)
{session_context if session_context else "(no open sessions — suggest start_new if user is working on something)"}
Session matching rules — be STRICT:
- A session matches ONLY if the user is actively editing the session's last_file.
Being in the same app (e.g. VS Code) is NOT enough. The user must be typing/editing
in the specific file listed in the session (e.g. solution.cpp).
- Chatting in a sidebar, reading logs, or browsing in the same app ≠ working on the session.
- If the user is in VS Code but editing a DIFFERENT file or chatting in a panel,
they are NOT on-task for the session. That's a distraction.
If the session's file IS being actively edited, output session_action: resume with the EXACT session_id.
If the user moved to a different open session's file, output session_action: switch with the EXACT session_id.
IMPORTANT: If you propose a friction action (e.g. auto_extract) that relates to an existing
session's task, ALSO output session_action: resume with that session's ID. The friction action
and session resume should go together — don't propose work related to a session without
linking it to the session.
If the user finished and closed the session's file, output session_action: complete with the EXACT session_id.
If no sessions exist but the user is actively working, output session_action: start_new (session_id: null).
NEVER invent session IDs. Use only the IDs listed above or null.
{prev_section}{exec_section}
Screenshot timeline:
{history_text}
## What to analyze
1. INFERRED TASK: What is the user actually working on right now? Base this on where
the screen is changing, not on static content.
2. CHECKPOINT: What specific progress did the user make across these frames?
Describe what changed (e.g., "typed 3 new lines of C++ code", "scrolled to next section").
3. FRICTION DETECTION: Is the user stuck in any of these patterns?
- REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
- STALLED: No meaningful pixel changes across 2+ frames, OR user wrote code then
deleted/undid it (write-then-delete = struggle, NOT "refining")
- TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
- CONTEXT_OVERHEAD: Many windows open, visibly searching across them
- TASK_RESUMPTION: User just returned to a task from earlier (check text history)
IMPORTANT signals to catch IMMEDIATELY (do NOT wait many frames):
- User wrote code/text then deleted it → STUCK, not refining. Flag stalled.
- User idle for 2+ frames after deletion → definitely stuck. Flag stalled.
- User switching between source doc and target file repeatedly → TEDIOUS_MANUAL.
This is NOT "fluent workflow." If the user is copying data from one place to
another by switching windows, flag it on the SECOND switch. Don't wait.
- Code is incomplete/wrong and user stopped typing → need help.
4. INTENT: If viewing informational content, is the user skimming, engaged, or unclear?
5. PROPOSED ACTION: If friction detected, suggest what the AI could DO.
Be SPECIFIC: "Extract full text from writeup.pdf into transcript.md" not "Summarize text".
The label should be a concrete verb phrase the user can approve with one tap.
CRITICAL: The "details" field is the executor agent's instruction manual. Write it as
a natural language spec — the executor has vision too, so tell it where to look and
what to do, not the raw data:
Bad: "Extract data from the document"
Bad: "Help with the code"
Good: "User is writing a report in report.md and has a PDF open with source data. They are manually copying table values from the PDF into markdown. Extract the table from the PDF (visible in screenshots), format as a markdown table matching the style in report.md, and append to the file."
Good: "User is implementing quicksort in solution.cpp but has been idle after writing the function signature. They appear stuck on the partition logic. Provide a working C++ quicksort implementation that fits their existing code structure."
Respond ONLY with JSON:
{{
"on_task": true,
"current_step_id": "step UUID or null",
"inferred_task": "what the user is actually working on, based on screen diffs",
"checkpoint_note_update": "what changed across these frames specifically",
"steps_completed": ["UUIDs"],
"friction": {{
"type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
"confidence": 0.0-1.0,
"description": "what the user is struggling with, based on diff evidence",
"proposed_actions": [
{{"label": "specific verb phrase: what to do, from where, to where", "action_type": "auto_extract | brain_dump | auto_fill | summarize | other", "details": "Natural language spec for executor. Include: (1) what the user wants done, (2) where to look in the screenshots, (3) EXACT format to use — quote what the user already wrote so the executor matches it (e.g. if user wrote '3 tacos with steak' in plain text, say 'format as plain text lines like the user already started, NOT JSON'), (4) target file. The executor has vision to read screenshots — tell it WHERE to look, not the raw data."}}
],
"source_context": "just the filename if visible (e.g. writeup.pdf), or app name if no file",
"target_context": "just the filename if visible (e.g. transcript.md), or app name if no file"
}},
"session_action": {{
"type": "resume | switch | complete | start_new | none",
"session_id": "uuid of matching session, or null for start_new/none",
"reason": "why this session action is suggested"
}},
"intent": "skimming | engaged | unclear | null",
"distraction_type": "app_switch | browsing | idle | null",
"app_name": "primary visible application",
"confidence": 0.0-1.0,
"gentle_nudge": "nudge text if distracted, null otherwise",
"vlm_summary": "1-sentence description of what CHANGED across the frames (not what's static)"
}}"""
# ── Gemini API call ──────────────────────────────────────────────────────
def _extract_json(text: str) -> dict:
"""Extract JSON from Gemini response, handling markdown code fences."""
text = text.strip()
# Strip ```json ... ``` wrappers
m = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
if m:
text = m.group(1).strip()
return json.loads(text)
def _parse_vlm_response(raw: dict) -> VLMResult:
    """Convert the raw JSON dict from the VLM into a typed VLMResult.

    Every field falls back to a safe default when absent, so a partially
    well-formed response still produces a usable result.
    """
    f = raw.get("friction", {})
    sa = raw.get("session_action", {})
    return VLMResult(
        on_task=raw.get("on_task", True),
        current_step_id=raw.get("current_step_id"),
        inferred_task=raw.get("inferred_task", ""),
        checkpoint_note_update=raw.get("checkpoint_note_update"),
        steps_completed=raw.get("steps_completed", []),
        friction=FrictionInfo(
            type=f.get("type", "none"),
            confidence=f.get("confidence", 0.0),
            description=f.get("description"),
            proposed_actions=f.get("proposed_actions", []),
            source_context=f.get("source_context"),
            target_context=f.get("target_context"),
        ),
        session_action=SessionAction(
            type=sa.get("type", "none"),
            session_id=sa.get("session_id"),
            reason=sa.get("reason", ""),
        ),
        intent=raw.get("intent"),
        distraction_type=raw.get("distraction_type"),
        app_name=raw.get("app_name", ""),
        confidence=raw.get("confidence", 0.0),
        gentle_nudge=raw.get("gentle_nudge"),
        vlm_summary=raw.get("vlm_summary", ""),
    )
async def _call_ollama(system_prompt: str, b64_images: list[str]) -> str:
    """Send the prompt and screenshot sequence to the local Ollama VLM.

    Returns the raw text content of the model's reply message.
    """
    request_body = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": "/no_think\nAnalyze this screenshot sequence now.",
                "images": b64_images,
            },
        ],
        "stream": False,
        # keep_alive=-1 keeps the model resident in memory between calls
        "keep_alive": -1,
        "options": {"temperature": 0.2},
    }
    # Local inference can be slow, hence the generous 300s timeout.
    async with httpx.AsyncClient(timeout=300.0) as client:
        response = await client.post(f"{OLLAMA_BASE_URL}/api/chat", json=request_body)
        response.raise_for_status()
        return response.json()["message"]["content"]
async def _call_gemini(system_prompt: str, b64_images: list[str], api_key: str) -> str:
    """Call Gemini Vision API with multiple images and return raw text.

    Retries up to 3 times with exponential backoff (1s, 2s, 4s) when the
    API responds 429 (rate limited).

    Raises:
        httpx.HTTPStatusError: on a non-retryable HTTP error, or when all
            retry attempts are exhausted.
    """
    # Build parts: interleave images with labels, newest last
    parts: list[dict] = []
    total = len(b64_images)
    for i, b64 in enumerate(b64_images):
        parts.append({"text": f"[Screenshot {i + 1}/{total}]"})
        parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    parts.append({"text": "Analyze this screenshot sequence now."})
    payload = {
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "contents": [{"parts": parts}],
        "generationConfig": {
            "temperature": 0.2,
            "maxOutputTokens": 4096,
        },
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(3):
            resp = await client.post(f"{GEMINI_URL}?key={api_key}", json=payload)
            if resp.status_code == 429:
                wait = 2 ** attempt
                log.warning("Gemini 429 rate limited, retrying in %ds...", wait)
                await asyncio.sleep(wait)
                continue
            resp.raise_for_status()
            break
        else:
            # All attempts were rate limited; surface the last 429.
            resp.raise_for_status()
    body = resp.json()
    # Bug fix: concatenate ALL text parts. Previously each iteration
    # overwrote `text`, so only the last text part of a multi-part
    # response survived. (Renamed from `parts` to avoid shadowing the
    # request parts list above.)
    reply_parts = body["candidates"][0]["content"]["parts"]
    return "".join(part["text"] for part in reply_parts if "text" in part)
async def analyze_screenshot(
    screenshot_jpeg: bytes,
    ctx: TaskContext,
    history: HistoryBuffer,
    *,
    api_key: str | None = None,
    backend: str | None = None,
    session_context: str = "",
) -> VLMResult:
    """Analyze a screenshot sequence via Ollama or Gemini.

    Sends all buffered screenshots + the current one as images (oldest first).

    Raises:
        RuntimeError: if the Gemini backend is selected but no API key is set.
        ValueError: if the backend name is not "ollama" or "gemini".
    """
    chosen = backend or VLM_BACKEND
    prompt = build_system_prompt(ctx, history, session_context=session_context)
    # Encode the buffered frames first (oldest first), then the current frame.
    b64_images = [
        base64.b64encode(entry.jpeg).decode() for entry in history.get_entries()
    ]
    b64_images.append(base64.b64encode(screenshot_jpeg).decode())
    log.debug("Sending %d screenshots to %s", len(b64_images), chosen)
    if chosen == "ollama":
        text = await _call_ollama(prompt, b64_images)
    elif chosen == "gemini":
        key = api_key or GEMINI_API_KEY
        if not key:
            raise RuntimeError("GEMINI_API_KEY not set")
        text = await _call_gemini(prompt, b64_images, key)
    else:
        raise ValueError(f"Unknown VLM backend: {chosen}")
    log.debug("VLM raw response: %s", text)
    return _parse_vlm_response(_extract_json(text))