diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7ab48c8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +# Python venv — run `python3 -m venv .venv && pip install -r argus/requirements.txt` to recreate +.venv/ + +# Python bytecode +argus/__pycache__/ +argus/**/__pycache__/ +*.pyc +*.pyo + +# macOS +.DS_Store diff --git a/FloatingHUDView.swift b/FloatingHUDView.swift index 2ecc4a3..1b55421 100644 --- a/FloatingHUDView.swift +++ b/FloatingHUDView.swift @@ -18,6 +18,7 @@ struct FloatingHUDView: View { .animation(.spring(duration: 0.3), value: session.proactiveCard?.id) .animation(.spring(duration: 0.3), value: session.isExecuting) .animation(.spring(duration: 0.3), value: session.executorOutput?.title) + .animation(.spring(duration: 0.3), value: session.monitoringError) } // MARK: - Header @@ -58,6 +59,12 @@ struct FloatingHUDView: View { @ViewBuilder private var content: some View { + // Error / warning banner — shown above all other content when monitoring has a problem + if let error = session.monitoringError { + MonitoringErrorBanner(message: error) + .transition(.move(edge: .top).combined(with: .opacity)) + } + // Executor output sticky card (highest priority — persists until dismissed) if let output = session.executorOutput { ExecutorOutputCard(title: output.title, content: output.content) { @@ -83,13 +90,23 @@ struct FloatingHUDView: View { .transition(.move(edge: .top).combined(with: .opacity)) } // Latest VLM summary (idle state) - else { - Text(session.latestVlmSummary ?? 
"Monitoring your screen…") - .font(.caption) - .foregroundStyle(.secondary) - .fixedSize(horizontal: false, vertical: true) - .padding(14) - .transition(.opacity) + else if session.monitoringError == nil { + VStack(alignment: .leading, spacing: 4) { + if let task = session.latestInferredTask, !task.isEmpty { + Text(task) + .font(.caption.bold()) + .foregroundStyle(.primary) + .fixedSize(horizontal: false, vertical: true) + .lineLimit(2) + } + Text(session.latestVlmSummary ?? "Monitoring your screen…") + .font(.caption) + .foregroundStyle(.secondary) + .fixedSize(horizontal: false, vertical: true) + .lineLimit(3) + } + .padding(14) + .transition(.opacity) } } } @@ -128,58 +145,147 @@ private struct HUDCardView: View { .buttonStyle(.plain) } - // Action buttons — only for VLM friction with proposed actions - if case .vlmFriction(_, _, let actions) = card.source, !actions.isEmpty { - VStack(alignment: .leading, spacing: 6) { - ForEach(Array(actions.prefix(2).enumerated()), id: \.offset) { index, action in - Button { - session.approveProactiveCard(actionIndex: index) - } label: { - VStack(alignment: .leading, spacing: 2) { - Text(action.label) - .font(.caption.bold()) - .lineLimit(2) - .multilineTextAlignment(.leading) - if let details = action.details, !details.isEmpty { - Text(details) - .font(.caption2) - .foregroundStyle(.purple.opacity(0.7)) - .lineLimit(2) - .multilineTextAlignment(.leading) - } - } - .frame(maxWidth: .infinity, alignment: .leading) - .padding(.horizontal, 10) - .padding(.vertical, 6) - .background(Color.purple.opacity(0.12)) - .clipShape(.rect(cornerRadius: 8)) - } - .buttonStyle(.plain) - .foregroundStyle(.purple) - } - - Button("Not now — I'm good") { session.dismissProactiveCard() } - .font(.caption) - .foregroundStyle(.secondary) - .buttonStyle(.plain) - .padding(.top, 2) - } - } + // Action buttons + actionButtons } .padding(14) .background(Color.purple.opacity(0.07)) } + @ViewBuilder + private var actionButtons: some View { + switch 
card.source { + case .vlmFriction(_, _, let actions) where !actions.isEmpty: + VStack(alignment: .leading, spacing: 6) { + ForEach(Array(actions.prefix(2).enumerated()), id: \.offset) { index, action in + Button { + session.approveProactiveCard(actionIndex: index) + } label: { + VStack(alignment: .leading, spacing: 2) { + Text(action.label) + .font(.caption.bold()) + .lineLimit(2) + .multilineTextAlignment(.leading) + if let details = action.details, !details.isEmpty { + Text(details) + .font(.caption2) + .foregroundStyle(.purple.opacity(0.7)) + .lineLimit(2) + .multilineTextAlignment(.leading) + } + } + .frame(maxWidth: .infinity, alignment: .leading) + .padding(.horizontal, 10) + .padding(.vertical, 6) + .background(Color.purple.opacity(0.12)) + .clipShape(.rect(cornerRadius: 8)) + } + .buttonStyle(.plain) + .foregroundStyle(.purple) + } + notNowButton + } + + case .sessionAction(let type, _, _, _, _): + VStack(alignment: .leading, spacing: 6) { + Button { + session.approveProactiveCard(actionIndex: 0) + } label: { + Text(sessionActionButtonLabel(type)) + .font(.caption.bold()) + .frame(maxWidth: .infinity, alignment: .leading) + .padding(.horizontal, 10) + .padding(.vertical, 6) + .background(Color.purple.opacity(0.12)) + .clipShape(.rect(cornerRadius: 8)) + } + .buttonStyle(.plain) + .foregroundStyle(.purple) + notNowButton + } + + default: + EmptyView() + } + } + + private var notNowButton: some View { + Button("Not now — I'm good") { session.dismissProactiveCard() } + .font(.caption) + .foregroundStyle(.secondary) + .buttonStyle(.plain) + .padding(.top, 2) + } + + private func sessionActionButtonLabel(_ type: String) -> String { + switch type { + case "resume": return "Resume session" + case "switch": return "Switch to this task" + case "complete": return "Mark complete" + case "start_new": return "Start focus session" + default: return "OK" + } + } + private var bodyText: String { switch card.source { case .vlmFriction(_, let description, _): return 
description ?? "I noticed something that might be slowing you down." case .appSwitchLoop(let apps, let count): return "You've switched between \(apps.joined(separator: " ↔ ")) \(count)× — are you stuck?" + case .sessionAction(_, _, let checkpoint, let reason, _): + if !checkpoint.isEmpty { return "Left off: \(checkpoint)" } + return reason.isEmpty ? "Argus noticed a session change." : reason } } } +// MARK: - Monitoring Error Banner + +private struct MonitoringErrorBanner: View { + let message: String + @Environment(SessionManager.self) private var session + + private var isRestarting: Bool { message.contains("restarting") } + + var body: some View { + VStack(alignment: .leading, spacing: 8) { + HStack(spacing: 8) { + Image(systemName: isRestarting ? "arrow.clockwise" : "exclamationmark.triangle.fill") + .font(.caption) + .foregroundStyle(isRestarting ? .orange : .red) + .symbolEffect(.pulse, isActive: isRestarting) + + Text(message) + .font(.caption) + .foregroundStyle(isRestarting ? .orange : .red) + .fixedSize(horizontal: false, vertical: true) + + Spacer(minLength: 0) + } + + if !isRestarting { + Button("Retry") { session.retryMonitoring() } + .font(.caption.bold()) + .foregroundStyle(.white) + .padding(.horizontal, 12) + .padding(.vertical, 5) + .background(Color.red.opacity(0.8)) + .clipShape(.rect(cornerRadius: 6)) + .buttonStyle(.plain) + } + } + .padding(12) + .background(isRestarting ? Color.orange.opacity(0.08) : Color.red.opacity(0.08)) + .overlay( + Rectangle() + .frame(width: 3) + .foregroundStyle(isRestarting ? Color.orange : Color.red), + alignment: .leading + ) + } +} + // MARK: - Executor Output Sticky Card private struct ExecutorOutputCard: View { diff --git a/GeminiVLMClient.swift b/GeminiVLMClient.swift new file mode 100644 index 0000000..d32536a --- /dev/null +++ b/GeminiVLMClient.swift @@ -0,0 +1,278 @@ +// GeminiVLMClient.swift — Native Swift Gemini Vision API client +// Ports the Python argus VLM analysis (vlm.py) directly into Swift. 
+// No subprocess required: screenshots go straight from ScreenCaptureKit → Gemini → UI. + +import Foundation + +struct GeminiVLMClient { + + private static let apiBase = "https://generativelanguage.googleapis.com/v1beta/models" + private static let model = "gemini-3.1-pro-preview" + + let apiKey: String + + // MARK: - Public + + /// Analyze a sequence of JPEG frames and return a structured distraction analysis. + /// - Parameters: + /// - frames: JPEG screenshot frames, oldest first, newest last. + /// - taskTitle: Current task title (empty if no session). + /// - taskGoal: Task description / goal. + /// - steps: Active step list for the current task. + /// - windowTitle: Frontmost app name from NSWorkspace. + /// - recentSummaries: Rolling summaries from previous analyses (temporal context). + func analyze( + frames: [Data], + taskTitle: String, + taskGoal: String, + steps: [Step], + windowTitle: String, + recentSummaries: [String] + ) async throws -> DistractionAnalysisResponse { + let prompt = buildPrompt( + taskTitle: taskTitle, + taskGoal: taskGoal, + steps: steps, + windowTitle: windowTitle, + recentSummaries: recentSummaries + ) + let raw = try await callGemini(prompt: prompt, frames: frames) + return try parseResponse(raw) + } + + // MARK: - Prompt Builder (ported from vlm.py build_system_prompt) + + private func buildPrompt( + taskTitle: String, + taskGoal: String, + steps: [Step], + windowTitle: String, + recentSummaries: [String] + ) -> String { + let stepsText: String + if steps.isEmpty { + stepsText = " (no steps defined)" + } else { + stepsText = steps.map { s in + let marker: String + switch s.status { + case "pending": marker = "○" + case "in_progress": marker = "►" + case "done": marker = "✓" + default: marker = "?" + } + var line = " \(marker) [\(s.status)] (id=\(s.id)) \(s.sortOrder). 
\(s.title)" + if let note = s.checkpointNote { line += " — checkpoint: \(note)" } + return line + }.joined(separator: "\n") + } + + let historyText: String + if recentSummaries.isEmpty { + historyText = " (no previous frames)" + } else { + historyText = recentSummaries.enumerated() + .map { i, s in " [frame \(i + 1)] \(s)" } + .joined(separator: "\n") + } + + return """ + You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots. + + ## How to read the screenshots + + You receive screenshots in chronological order (oldest first, newest last). + Each frame is ~5 seconds apart. This means: + - 2 unchanged frames = ~10 seconds idle — significant. + - 3 unchanged frames = ~15 seconds idle — user is stuck or distracted. + - If ALL frames are identical, the user has been idle for 15+ seconds — flag it. + + Your PRIMARY signal is the DIFFERENCES between consecutive frames. + Where the screen CHANGED = where attention is. Static areas = ignore. + + Diff signals and what they mean: + - New text appearing / cursor advancing → user is actively typing (this IS their task) + - Window or tab switch → context change, could be reference or distraction + - Same content, no pixel changes → stalled, idle, or reading + - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer) + - Error message that APPEARED between frames → user just triggered it, relevant + - Error message already in ALL frames → stale, ignore + + CRITICAL — looking at something ≠ working on something: + - User switches to browser/another app and just LOOKS → distraction or quick reference. + - User switches and starts TYPING/EDITING → might be a new task. + - If the user has an active session and switches away WITHOUT typing in the new app, + they are DISTRACTED from their session, not starting a new task. + - A single app switch is NEVER enough to infer a new task. Wait for active work. + + ## Current task context + + Task: \(taskTitle.isEmpty ? 
"(no active task)" : taskTitle) + Goal: \(taskGoal.isEmpty ? taskTitle : taskGoal) + Steps: + \(stepsText) + Window title (OS): \(windowTitle.isEmpty ? "(unknown)" : windowTitle) + + ## Recent screen history (for temporal context) + \(historyText) + + ## What to output + + Analyze the screenshots and return JSON with EXACTLY this structure (no extra fields, no markdown): + { + "on_task": true, + "current_step_id": "step UUID or null", + "inferred_task": "what the user is actually working on based on screen diffs", + "checkpoint_note_update": "what specifically changed across these frames", + "steps_completed": [], + "friction": { + "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none", + "confidence": 0.0, + "description": "what the user is struggling with", + "proposed_actions": [ + { + "label": "specific verb phrase: what to do", + "action_type": "auto_extract | brain_dump | other", + "details": "natural language spec for what action to take" + } + ], + "source_context": "filename or app name, or null", + "target_context": "filename or app name, or null" + }, + "session_action": { + "type": "none", + "session_id": null, + "reason": "" + }, + "intent": "skimming | engaged | unclear | null", + "distraction_type": "app_switch | browsing | idle | null", + "app_name": "primary visible application", + "confidence": 0.8, + "gentle_nudge": "short nudge message if distracted but no friction action applies, otherwise null", + "vlm_summary": "1-sentence description of what CHANGED across the frames (not what is static)" + } + + FRICTION DETECTION rules: + - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually) + - STALLED: No meaningful pixel changes across 2+ frames; or user wrote then deleted + - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand) + - CONTEXT_OVERHEAD: Many windows open, visibly searching across them + - TASK_RESUMPTION: User just returned to a task they 
were working on earlier + + If friction confidence < 0.5, set type to "none". + Only set gentle_nudge when user is off-task AND no actionable friction applies. + """ + } + + // MARK: - Action Executor + + /// Execute a user-approved proactive action and return a plain-text result. + func executeAction( + label: String, + actionType: String, + details: String, + screenshot: Data? + ) async throws -> String { + let taskInstruction: String + switch actionType { + case "auto_extract": + taskInstruction = "Extract the relevant data from the screenshot and present it concisely as plain text." + case "brain_dump": + taskInstruction = "Format this as a short brain-dump note the user should add to their task list." + default: + taskInstruction = "Provide 2–3 concrete next steps the user can take right now." + } + let prompt = """ + You are a productivity assistant. The user approved this action: "\(label)" + Details: \(details.isEmpty ? "(none)" : details) + \(taskInstruction) + Be specific and brief (3–5 sentences max). No markdown, no preamble, plain text only. + """ + let frames: [Data] = screenshot.map { [$0] } ?? [] + return try await callGemini(prompt: prompt, frames: frames) + } + + // MARK: - Gemini REST API Call + + private func callGemini(prompt: String, frames: [Data]) async throws -> String { + let urlStr = "\(Self.apiBase)/\(Self.model):generateContent?key=\(apiKey)" + guard let url = URL(string: urlStr) else { throw URLError(.badURL) } + + // Build content parts: label + image for each frame, then instruction + var parts: [[String: Any]] = [] + let total = frames.count + for (i, frame) in frames.enumerated() { + parts.append(["text": "[Screenshot \(i + 1)/\(total) — \((total - i) * 5)s ago]"]) + parts.append([ + "inlineData": [ + "mimeType": "image/jpeg", + "data": frame.base64EncodedString() + ] + ]) + } + parts.append(["text": "Analyze this screenshot sequence now. 
Reply with ONLY valid JSON — no markdown, no code fences."]) + + let body: [String: Any] = [ + "systemInstruction": ["parts": [["text": prompt]]], + "contents": [["parts": parts]], + "generationConfig": [ + "temperature": 0.2, + "maxOutputTokens": 1024 + ] + ] + + var request = URLRequest(url: url) + request.httpMethod = "POST" + request.setValue("application/json", forHTTPHeaderField: "Content-Type") + request.httpBody = try JSONSerialization.data(withJSONObject: body) + request.timeoutInterval = 60 + + let (data, response) = try await URLSession.shared.data(for: request) + + if let http = response as? HTTPURLResponse, http.statusCode != 200 { + let msg = String(data: data, encoding: .utf8) ?? "HTTP \(http.statusCode)" + print("[GeminiVLM] API error \(http.statusCode): \(msg)") + throw URLError(.badServerResponse) + } + + guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any], + let candidates = json["candidates"] as? [[String: Any]], + let first = candidates.first, + let content = first["content"] as? [String: Any], + let contentParts = content["parts"] as? [[String: Any]], + let text = contentParts.first?["text"] as? String + else { + let raw = String(data: data, encoding: .utf8) ?? "" + print("[GeminiVLM] Unexpected response shape: \(raw.prefix(300))") + throw URLError(.cannotParseResponse) + } + + print("[GeminiVLM] Response (\(text.count) chars): \(text.prefix(200))") + return text + } + + // MARK: - Response Parsing + + private func parseResponse(_ text: String) throws -> DistractionAnalysisResponse { + var cleaned = text.trimmingCharacters(in: .whitespacesAndNewlines) + // Strip ```json ... ``` or ``` ... ``` fences + if cleaned.hasPrefix("```") { + let lines = cleaned.components(separatedBy: "\n") + cleaned = lines.dropFirst().joined(separator: "\n") + if let backtickRange = cleaned.range(of: "```") { + cleaned = String(cleaned[..? + private func req( _ path: String, method: String = "GET", body: Data? 
= nil, contentType: String = "application/json", auth: Bool = true, - timeout: TimeInterval = 30 + timeout: TimeInterval = 30, + isRetry: Bool = false ) async throws -> Data { guard let url = URL(string: base + path) else { throw NetworkError.unknown(URLError(.badURL)) @@ -75,6 +97,17 @@ final class APIClient { throw NetworkError.unknown(URLError(.badServerResponse)) } guard http.statusCode < 400 else { + if http.statusCode == 401 && auth && !isRetry { + // Try to silently refresh the access token, then retry once + let refreshed = await refreshAccessToken() + if refreshed { + return try await req(path, method: method, body: body, + contentType: contentType, auth: auth, + timeout: timeout, isRetry: true) + } + // Refresh also failed — force logout + await MainActor.run { AuthManager.shared.handleSessionExpired() } + } let msg = (try? JSONDecoder().decode(APIErrorResponse.self, from: data))?.detail ?? String(data: data, encoding: .utf8) ?? "Unknown error" @@ -83,6 +116,32 @@ final class APIClient { return data } + /// Refreshes the access token. Concurrent callers share one in-flight request. + private func refreshAccessToken() async -> Bool { + if let existing = activeRefreshTask { return await existing.value } + let task = Task { + defer { self.activeRefreshTask = nil } + guard let refresh = TokenStore.shared.refreshToken else { return false } + do { + let body = try JSONSerialization.data(withJSONObject: ["refresh_token": refresh]) + guard let url = URL(string: base + "/auth/refresh") else { return false } + var req = URLRequest(url: url) + req.httpMethod = "POST" + req.setValue("application/json", forHTTPHeaderField: "Content-Type") + req.httpBody = body + req.timeoutInterval = 30 + let (data, res) = try await urlSession.data(for: req) + guard let http = res as? 
HTTPURLResponse, http.statusCode == 200 else { return false } + let auth = try self.decode(AuthResponse.self, from: data) + TokenStore.shared.token = auth.accessToken + TokenStore.shared.refreshToken = auth.refreshToken + return true + } catch { return false } + } + activeRefreshTask = task + return await task.value + } + private func decode<T: Decodable>(_ type: T.Type, from data: Data) throws -> T { let decoder = JSONDecoder() decoder.dateDecodingStrategy = .iso8601 @@ -206,6 +265,16 @@ final class APIClient { // MARK: - Sessions + /// Returns the currently active session, or nil if none (404). + func getActiveSession() async throws -> FocusSession? { + do { + let data = try await req("/sessions/active") + return try decode(FocusSession.self, from: data) + } catch NetworkError.httpError(404, _) { + return nil + } + } + func startSession(taskId: String?) async throws -> FocusSession { var dict: [String: Any] = ["platform": "mac"] if let tid = taskId { dict["task_id"] = tid } @@ -268,9 +337,35 @@ final class APIClient { _ = try await req("/distractions/app-activity", method: "POST", body: body) } - // MARK: - Distraction / Screenshot Analysis - // Note: spec primary endpoint is POST /distractions/analyze-result (device-side VLM, JSON only). - // Backend currently implements analyze-screenshot (legacy fallback) — using that until analyze-result is deployed. + // MARK: - Distraction / VLM Analysis + + /// Post a VLM analysis result (from GeminiVLMClient) to the backend. + /// This updates the session checkpoint so the backend has the latest on_task / friction data. + func postAnalysisResult(_ result: DistractionAnalysisResponse, sessionId: String) async throws { + var payload: [String: Any] = [ + "session_id": sessionId, + "on_task": result.onTask, + "confidence": result.confidence, + "vlm_summary": result.vlmSummary ?? 
"", + "steps_completed": result.stepsCompleted, + ] + if let stepId = result.currentStepId { payload["current_step_id"] = stepId } + if let note = result.checkpointNoteUpdate { payload["checkpoint_note_update"] = note } + if let app = result.appName { payload["app_name"] = app } + if let nudge = result.gentleNudge { payload["gentle_nudge"] = nudge } + if let friction = result.friction { + payload["friction"] = [ + "type": friction.type, + "confidence": friction.confidence, + "description": friction.description as Any, + "proposed_actions": friction.proposedActions.map { + ["label": $0.label, "action_type": $0.actionType, "details": $0.details as Any] + }, + ] + } + let body = try JSONSerialization.data(withJSONObject: payload) + _ = try await req("/distractions/analyze-result", method: "POST", body: body) + } func analyzeScreenshot( imageData: Data, diff --git a/LockInBro/AuthManager.swift b/LockInBro/AuthManager.swift index 57221a4..1692410 100644 --- a/LockInBro/AuthManager.swift +++ b/LockInBro/AuthManager.swift @@ -22,6 +22,7 @@ final class AuthManager { do { let response = try await APIClient.shared.login(email: email, password: password) TokenStore.shared.token = response.accessToken + TokenStore.shared.refreshToken = response.refreshToken currentUser = response.user isLoggedIn = true } catch { @@ -40,6 +41,7 @@ final class AuthManager { displayName: displayName ) TokenStore.shared.token = response.accessToken + TokenStore.shared.refreshToken = response.refreshToken currentUser = response.user isLoggedIn = true } catch { @@ -58,6 +60,7 @@ final class AuthManager { fullName: fullName ) TokenStore.shared.token = response.accessToken + TokenStore.shared.refreshToken = response.refreshToken currentUser = response.user isLoggedIn = true } catch { @@ -67,8 +70,20 @@ final class AuthManager { } func logout() { - TokenStore.shared.token = nil + SessionManager.shared.stopMonitoring() + TokenStore.shared.clear() currentUser = nil isLoggedIn = false + errorMessage = nil 
+ } + + /// Called by APIClient when the server returns 401 and the refresh token is also dead. + func handleSessionExpired() { + guard isLoggedIn else { return } + SessionManager.shared.stopMonitoring() + TokenStore.shared.clear() + currentUser = nil + isLoggedIn = false + errorMessage = "Your session expired — please log in again." + } } diff --git a/LockInBro/FocusSessionView.swift b/LockInBro/FocusSessionView.swift index 3343a7e..20639cd 100644 --- a/LockInBro/FocusSessionView.swift +++ b/LockInBro/FocusSessionView.swift @@ -401,6 +401,8 @@ private struct ProactiveCardView: View { return description ?? "I noticed something that might be slowing you down." case .appSwitchLoop(let apps, let count): return "You've switched between \(apps.joined(separator: " ↔ ")) \(count)× in a row — are you stuck?" + case .sessionAction(_, _, let checkpoint, let reason, _): + return checkpoint.isEmpty ? reason : "Left off: \(checkpoint)" } } } diff --git a/LockInBro/LockInBroApp.swift b/LockInBro/LockInBroApp.swift index 32326b0..feeb7ce 100644 --- a/LockInBro/LockInBroApp.swift +++ b/LockInBro/LockInBroApp.swift @@ -2,8 +2,24 @@ import SwiftUI +// MARK: - AppDelegate (monitoring cleanup on quit) + +final class AppDelegate: NSObject, NSApplicationDelegate { + /// Called on normal quits (Cmd+Q, quit Apple event); NOT guaranteed for SIGTERM, + /// and never for force quit (SIGKILL) or crashes — TODO confirm SIGTERM handling if needed. + /// Ensures screen monitoring is fully stopped before the process exits. + func applicationWillTerminate(_ notification: Notification) { + // applicationWillTerminate runs on the main thread, so we can safely + // call @MainActor methods synchronously via assumeIsolated. 
+ MainActor.assumeIsolated { + SessionManager.shared.stopMonitoring() + } + } +} + @main struct LockInBroApp: App { + @NSApplicationDelegateAdaptor(AppDelegate.self) var appDelegate + @State private var auth = AuthManager.shared @State private var session = SessionManager.shared @@ -13,9 +29,11 @@ struct LockInBroApp: App { ContentView() .environment(auth) .environment(session) - .onChange(of: session.isSessionActive) { _, isActive in - if isActive { + .onChange(of: auth.isLoggedIn, initial: true) { _, loggedIn in + if loggedIn { + // Show HUD and start always-on monitoring as soon as user logs in FloatingPanelController.shared.show(session: session) + Task { await session.startMonitoring() } } else { FloatingPanelController.shared.close() } diff --git a/LockInBro/Models.swift b/LockInBro/Models.swift index 71297d6..4902847 100644 --- a/LockInBro/Models.swift +++ b/LockInBro/Models.swift @@ -109,6 +109,32 @@ struct Step: Identifiable, Codable, Hashable { // MARK: - Focus Session +/// Subset of the JSONB checkpoint dict stored on the backend session record. +/// Populated by argus when it POSTs to /distractions/analyze-result. +struct SessionCheckpoint: Codable { + /// Written by POST /distractions/analyze-result (argus live mode). + let lastVlmSummary: String? + /// Written by POST /distractions/analyze-screenshot (Swift fallback). + let lastScreenshotAnalysis: String? + /// Concise summary of the last completed action. + let lastActionSummary: String? + /// Frontmost application name at last checkpoint. + let activeApp: String? + /// Running count of distractions logged during this session. + let distractionCount: Int? + + /// Returns whichever VLM summary field is populated, preferring the most recent. + var vlmSummary: String? { lastVlmSummary ?? 
lastScreenshotAnalysis } + + enum CodingKeys: String, CodingKey { + case lastVlmSummary = "last_vlm_summary" + case lastScreenshotAnalysis = "last_screenshot_analysis" + case lastActionSummary = "last_action_summary" + case activeApp = "active_app" + case distractionCount = "distraction_count" + } +} + struct FocusSession: Identifiable, Codable { let id: String let userId: String @@ -117,9 +143,11 @@ struct FocusSession: Identifiable, Codable { let startedAt: String var endedAt: String? var status: String + /// Live checkpoint data written by argus (nil when no checkpoint exists yet). + var checkpoint: SessionCheckpoint? enum CodingKeys: String, CodingKey { - case id, platform, status + case id, platform, status, checkpoint case userId = "user_id" case taskId = "task_id" case startedAt = "started_at" @@ -144,8 +172,10 @@ struct BrainDumpResponse: Codable { struct ParsedTask: Codable, Identifiable { // local UUID for list identity before saving var localId: String = UUID().uuidString - var id: String { localId } + var id: String { taskId ?? localId } + /// Set by backend when the brain-dump endpoint creates the task automatically. + let taskId: String? let title: String let description: String? let priority: Int @@ -154,6 +184,7 @@ struct ParsedTask: Codable, Identifiable { let tags: [String] enum CodingKeys: String, CodingKey { + case taskId = "task_id" case title, description, priority, deadline, tags case estimatedMinutes = "estimated_minutes" } @@ -208,14 +239,29 @@ struct FrictionInfo: Codable { var isResumption: Bool { type == "task_resumption" } } +/// Session lifecycle action suggested by the VLM (new argus feature). +struct SessionAction: Codable { + /// resume | switch | complete | start_new | none + let type: String + let sessionId: String? + let reason: String? + + enum CodingKeys: String, CodingKey { + case type, reason + case sessionId = "session_id" + } +} + struct DistractionAnalysisResponse: Codable { let onTask: Bool let currentStepId: String? 
+ let inferredTask: String? let checkpointNoteUpdate: String? let stepsCompleted: [String] // Upgraded Argus prompt fields (nil when backend uses legacy prompt) let friction: FrictionInfo? - let intent: String? // skimming | engaged | unclear | null + let sessionAction: SessionAction? // new argus: session lifecycle suggestions + let intent: String? // skimming | engaged | unclear | null let distractionType: String? let appName: String? let confidence: Double @@ -225,9 +271,11 @@ struct DistractionAnalysisResponse: Codable { enum CodingKeys: String, CodingKey { case onTask = "on_task" case currentStepId = "current_step_id" + case inferredTask = "inferred_task" case checkpointNoteUpdate = "checkpoint_note_update" case stepsCompleted = "steps_completed" case friction, intent + case sessionAction = "session_action" case distractionType = "distraction_type" case appName = "app_name" case confidence @@ -298,6 +346,8 @@ struct ProactiveCard: Identifiable { case vlmFriction(frictionType: String, description: String?, actions: [ProposedAction]) /// Heuristic app-switch loop detected by NSWorkspace observer (fallback when VLM hasn't returned friction yet). case appSwitchLoop(apps: [String], switchCount: Int) + /// VLM suggests a session lifecycle action (new argus: resume, switch, complete, start_new). + case sessionAction(type: String, taskTitle: String, checkpoint: String, reason: String, sessionId: String?) } let id = UUID() @@ -316,6 +366,14 @@ struct ProactiveCard: Identifiable { } case .appSwitchLoop: return "Repetitive Pattern Detected" + case .sessionAction(let type, let taskTitle, _, _, _): + switch type { + case "resume": return "Resume: \(taskTitle)" + case "switch": return "Switch to: \(taskTitle)" + case "complete": return "Done with \(taskTitle)?" + case "start_new": return "Start a Focus Session?" 
+ default: return "Session Suggestion" + } } } @@ -332,6 +390,14 @@ struct ProactiveCard: Identifiable { } case .appSwitchLoop: return "arrow.triangle.2.circlepath" + case .sessionAction(let type, _, _, _, _): + switch type { + case "resume": return "arrow.counterclockwise.circle" + case "switch": return "arrow.left.arrow.right" + case "complete": return "checkmark.circle" + case "start_new": return "plus.circle" + default: return "circle" + } } } } diff --git a/LockInBro/SessionManager.swift b/LockInBro/SessionManager.swift index 4fb02d9..dfa4945 100644 --- a/LockInBro/SessionManager.swift +++ b/LockInBro/SessionManager.swift @@ -1,4 +1,6 @@ -// SessionManager.swift — Focus session state, screenshot engine, distraction detection +// SessionManager.swift — Focus session state, native VLM screen analysis +// Screenshot capture → Gemini Vision API → apply results to UI + post to backend. +// No Python subprocess. No external process management. import AppKit import SwiftUI @@ -25,30 +27,27 @@ final class SessionManager { var errorMessage: String? var isLoading: Bool = false - // Proactive agent + // VLM / proactive agent var proactiveCard: ProactiveCard? - /// Set when the user approves a proposed action — shown as a confirmation toast - var approvedActionLabel: String? - /// Latest one-sentence summary from the VLM, shown in the floating HUD var latestVlmSummary: String? - /// True while the argus executor is running an approved action + var latestInferredTask: String? var isExecuting: Bool = false - /// Result produced by the executor's output() tool — shown as a sticky card in the HUD var executorOutput: (title: String, content: String)? + var monitoringError: String? // Screenshot engine var isCapturing: Bool = false - private var captureTask: Task? + @ObservationIgnored private var captureTask: Task? 
private let captureInterval: TimeInterval = 5.0 - // Rolling screenshot history buffer (max 4 entries, ~20-second window) - // Provides temporal context to the VLM so it can detect patterns across captures. - private struct ScreenshotHistoryEntry { - let summary: String // vlm_summary text from the previous analysis - let timestamp: Date - } - @ObservationIgnored private var screenshotHistory: [ScreenshotHistoryEntry] = [] + // Frame buffer — accumulate N frames before calling VLM for temporal diff context + @ObservationIgnored private var frameBuffer: [Data] = [] + private let framesPerVLMCall = 3 + + // Rolling summary history fed as context into subsequent VLM calls + private struct HistoryEntry { let summary: String; let timestamp: Date } + @ObservationIgnored private var screenshotHistory: [HistoryEntry] = [] // App switch tracking @ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = [] @@ -56,15 +55,8 @@ final class SessionManager { @ObservationIgnored private var lastApp: (name: String, bundleId: String) = ("", "") @ObservationIgnored private var lastAppEnteredAt: Date = Date() - // Argus subprocess (device-side VLM) - @ObservationIgnored private var argusProcess: Process? - @ObservationIgnored private var argusReadTask: Task? - @ObservationIgnored private var argusStdinPipe: Pipe? - /// Whether the current proactive card came from VLM (needs argus stdin response) vs local heuristic - @ObservationIgnored private var proactiveCardNeedsArgusResponse = false + // Proactive card auto-dismiss timer @ObservationIgnored private var proactiveCardTimer: Task? - private let argusPythonPath = "/Users/joyzhuo/miniconda3/envs/gmr/bin/python3" - private let argusRepoPath = "/Users/joyzhuo/yhack/lockinbro-argus" private init() {} @@ -83,9 +75,96 @@ final class SessionManager { return Date().timeIntervalSince(start) } + // MARK: - Monitoring Lifecycle + + /// Immediately shuts down all monitoring without making any API calls. 
+ func stopMonitoring() { + stopCapture() + stopAppObserver() + proactiveCardTimer?.cancel() + proactiveCardTimer = nil + activeSession = nil + activeTask = nil + activeSteps = [] + isSessionActive = false + sessionStartDate = nil + lastNudge = nil + resumeCard = nil + showingResumeCard = false + proactiveCard = nil + latestVlmSummary = nil + latestInferredTask = nil + isExecuting = false + executorOutput = nil + monitoringError = nil + screenshotHistory = [] + frameBuffer = [] + persistedSessionId = nil + } + + /// Called once after login. Auto-resumes any existing active session and starts the capture loop. + func startMonitoring() async { + guard TokenStore.shared.token != nil else { return } + guard !isCapturing else { return } + + monitoringError = nil + await requestNotificationPermission() + + // Silent preflight — never shows UI; only request permission if not yet granted. + if !CGPreflightScreenCaptureAccess() { + CGRequestScreenCaptureAccess() + monitoringError = "Screen Recording permission required — enable in System Settings → Privacy & Security → Screen Recording, then tap Retry" + return + } + + do { + if let existing = try await APIClient.shared.getActiveSession() { + await autoResumeSession(existing) + } else { + startCapture() + startAppObserver() + } + } catch { + startCapture() + startAppObserver() + } + } + + /// Silently resume an active session found on the backend (no loading UI shown). 
+ private func autoResumeSession(_ session: FocusSession) async { + activeSession = session + persistedSessionId = session.id + isSessionActive = true + sessionStartDate = Date() + distractionCount = 0 + lastNudge = nil + screenshotHistory = [] + frameBuffer = [] + + if let taskId = session.taskId { + do { + let tasks = try await APIClient.shared.getTasks() + activeTask = tasks.first(where: { $0.id == taskId }) + if let task = activeTask { + let steps = try await APIClient.shared.getSteps(taskId: task.id) + activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder } + currentStepIndex = activeSteps.firstIndex(where: { $0.isActive }) + ?? activeSteps.firstIndex(where: { $0.status == "pending" }) + ?? 0 + } + } catch {} + } + + let shortId = String(session.id.prefix(8)) + let taskLabel = activeTask?.title ?? "(no task)" + latestVlmSummary = "Resumed session \(shortId) · \(taskLabel)" + + startCapture() + startAppObserver() + } + // MARK: - Session Lifecycle - // Persisted so we can end a stale session after an app restart private var persistedSessionId: String? { get { UserDefaults.standard.string(forKey: "lockInBro.lastSessionId") } set { @@ -98,18 +177,16 @@ final class SessionManager { isLoading = true errorMessage = nil do { - let session: FocusSession - do { - session = try await APIClient.shared.startSession(taskId: task?.id) - } catch NetworkError.httpError(409, _) { - // End whichever session is active — prefer the locally known one, - // fall back to the last persisted ID (survives app restarts) - let staleId = activeSession?.id ?? persistedSessionId - if let id = staleId { - _ = try? await APIClient.shared.endSession(sessionId: id, status: "completed") - } - session = try await APIClient.shared.startSession(taskId: task?.id) + // End any existing session first + var staleId: String? = activeSession?.id ?? persistedSessionId + if staleId == nil { + staleId = (try? await APIClient.shared.getActiveSession())?.id } + if let id = staleId { + _ = try? 
await APIClient.shared.endSession(sessionId: id, status: "completed") + } + + let session = try await APIClient.shared.startSession(taskId: task?.id) activeSession = session persistedSessionId = session.id activeTask = task @@ -119,20 +196,22 @@ final class SessionManager { sessionStartDate = Date() distractionCount = 0 lastNudge = nil + screenshotHistory = [] + frameBuffer = [] if let task { let steps = try await APIClient.shared.getSteps(taskId: task.id) activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder } - // Pick first in-progress or first pending step currentStepIndex = activeSteps.firstIndex(where: { $0.isActive }) ?? activeSteps.firstIndex(where: { $0.status == "pending" }) ?? 0 } - screenshotHistory = [] await requestNotificationPermission() - startArgus(session: session, task: task) - startAppObserver() + // Restart capture loop (in case it wasn't running or was in monitoring-only mode) + stopCapture() + startCapture() + if appSwitchObserver == nil { startAppObserver() } } catch { errorMessage = error.localizedDescription } @@ -140,7 +219,6 @@ final class SessionManager { } func endSession(status: String = "completed") async { - stopArgus() stopCapture() stopAppObserver() if let session = activeSession { @@ -155,14 +233,21 @@ final class SessionManager { resumeCard = nil showingResumeCard = false proactiveCard = nil - approvedActionLabel = nil latestVlmSummary = nil + latestInferredTask = nil isExecuting = false executorOutput = nil proactiveCardTimer?.cancel() proactiveCardTimer = nil screenshotHistory = [] + frameBuffer = [] persistedSessionId = nil + + // Keep the capture loop running for app-switch heuristics + if TokenStore.shared.token != nil { + startCapture() + startAppObserver() + } } func fetchResumeCard() async { @@ -183,7 +268,6 @@ final class SessionManager { if let idx = activeSteps.firstIndex(where: { $0.id == updated.id }) { activeSteps[idx] = updated } - // Advance to next pending if let next = activeSteps.firstIndex(where: { 
$0.status == "pending" }) { currentStepIndex = next } @@ -192,12 +276,19 @@ final class SessionManager { } } + // MARK: - Retry (HUD Retry button) + + func retryMonitoring() { + monitoringError = nil + frameBuffer = [] + stopCapture() + startCapture() + if appSwitchObserver == nil { startAppObserver() } + } + // MARK: - Proactive Card Lifecycle - /// Show a proactive card and start the 15-second auto-dismiss timer. - /// - Parameter vlmCard: Pass true when the card came from VLM so argus gets a stdin response on dismiss. - private func showProactiveCard(_ card: ProactiveCard, vlmCard: Bool = false) { - proactiveCardNeedsArgusResponse = vlmCard + private func showProactiveCard(_ card: ProactiveCard) { proactiveCardTimer?.cancel() withAnimation { proactiveCard = card } @@ -208,31 +299,44 @@ final class SessionManager { } } - /// Dismiss the current card (user tapped "Not now" or 15s elapsed). func dismissProactiveCard() { proactiveCardTimer?.cancel() proactiveCardTimer = nil withAnimation { proactiveCard = nil } - if proactiveCardNeedsArgusResponse { sendArgusResponse(0) } - proactiveCardNeedsArgusResponse = false } - /// Approve action at the given index (0-based). Argus stdin uses 1-based (1 = action 0). func approveProactiveCard(actionIndex: Int) { proactiveCardTimer?.cancel() proactiveCardTimer = nil + let card = proactiveCard withAnimation { proactiveCard = nil } - if proactiveCardNeedsArgusResponse { - sendArgusResponse(actionIndex + 1) - isExecuting = true + guard case .vlmFriction(_, _, let actions) = card?.source, + actionIndex < actions.count else { return } + let action = actions[actionIndex] + isExecuting = true + Task { + do { + let screenshot = await captureScreen() + let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? "" + guard !geminiKey.isEmpty else { + isExecuting = false + executorOutput = (title: action.label, content: action.details ?? 
"Action approved.") + return + } + let client = GeminiVLMClient(apiKey: geminiKey) + let result = try await client.executeAction( + label: action.label, + actionType: action.actionType, + details: action.details ?? "", + screenshot: screenshot + ) + isExecuting = false + executorOutput = (title: action.label, content: result) + } catch { + isExecuting = false + executorOutput = (title: action.label, content: action.details ?? "Couldn't complete automatically.") + } } - proactiveCardNeedsArgusResponse = false - } - - private func sendArgusResponse(_ choice: Int) { - guard let pipe = argusStdinPipe, - let data = "\(choice)\n".data(using: .utf8) else { return } - try? pipe.fileHandleForWriting.write(contentsOf: data) } // MARK: - App Switch Observer @@ -269,7 +373,7 @@ final class SessionManager { guard name != lastApp.name else { return } - // Log previous app's dwell time to backend (fire-and-forget) + // Log previous app dwell time to backend let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt))) let prev = lastApp if let session = activeSession, !prev.name.isEmpty { @@ -289,159 +393,36 @@ final class SessionManager { appSwitches.append((name: name, bundleId: bundleId, time: now)) if appSwitches.count > 30 { appSwitches.removeFirst() } - // Only trigger card during active session and when none is already showing guard isSessionActive, proactiveCard == nil else { return } if let loop = detectRepetitiveLoop() { - showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)), vlmCard: false) + showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count))) } } - // Detects a back-and-forth pattern between exactly 2 apps within a 5-minute window. - // Requires 3 full cycles (6 consecutive alternating switches) to avoid false positives. private func detectRepetitiveLoop() -> (apps: [String], count: Int)? 
{ let cutoff = Date().addingTimeInterval(-300) let recent = appSwitches.filter { $0.time > cutoff }.map(\.name) guard recent.count >= 6 else { return nil } - let last6 = Array(recent.suffix(6)) guard Set(last6).count == 2 else { return nil } - - // Strictly alternating — no two consecutive identical app names for i in 1.. framesPerVLMCall { frameBuffer.removeFirst() } - // Inject rolling history so the VLM has temporal context across captures. - // Only summaries (text) are sent — not the raw images — to keep token cost low. - if !screenshotHistory.isEmpty { - let iso = ISO8601DateFormatter() - context["screenshot_history"] = screenshotHistory.map { entry in - ["summary": entry.summary, "timestamp": iso.string(from: entry.timestamp)] - } + // Only call VLM once we have a full batch for temporal diff analysis + guard frameBuffer.count >= framesPerVLMCall else { return } + + let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? "" + guard !geminiKey.isEmpty else { + print("[VLM] No Gemini API key set — skipping analysis") + return } - do { - let result = try await APIClient.shared.analyzeScreenshot( - imageData: imageData, - windowTitle: windowTitle, - sessionId: session.id, - taskContext: context - ) + let client = GeminiVLMClient(apiKey: geminiKey) + let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? "" + let recentSummaries = screenshotHistory.map(\.summary) + let frames = frameBuffer // snapshot before async gap - // Append this result's summary to the rolling buffer (max 4 entries) - if let summary = result.vlmSummary { - screenshotHistory.append(ScreenshotHistoryEntry(summary: summary, timestamp: Date())) + do { + print("[VLM] Calling Gemini with \(frames.count) frames…") + let result = try await client.analyze( + frames: frames, + taskTitle: activeTask?.title ?? "", + taskGoal: activeTask?.description ?? 
"", + steps: activeSteps, + windowTitle: windowTitle, + recentSummaries: recentSummaries + ) + print("[VLM] Result: on_task=\(result.onTask), friction=\(result.friction?.type ?? "none"), summary=\(result.vlmSummary ?? "")") + + // Append to rolling summary history + if let summary = result.vlmSummary, !summary.isEmpty { + screenshotHistory.append(HistoryEntry(summary: summary, timestamp: Date())) if screenshotHistory.count > 4 { screenshotHistory.removeFirst() } } + // Clear frame buffer — next batch starts fresh + frameBuffer.removeAll() + + monitoringError = nil applyDistractionResult(result) + + // Post result to backend (fire-and-forget) + if let session = activeSession { + Task { + try? await APIClient.shared.postAnalysisResult(result, sessionId: session.id) + } + } } catch { - // Silent fail — don't interrupt the user + print("[VLM] Analysis error: \(error)") + // Don't surface transient errors — the next attempt will retry automatically } } + // MARK: - Screen Capture + private func captureScreen() async -> Data? { + guard CGPreflightScreenCaptureAccess() else { return nil } do { let content = try await SCShareableContent.current guard let display = content.displays.first else { return nil } - let config = SCStreamConfiguration() config.width = 1280 config.height = 720 - let filter = SCContentFilter(display: display, excludingWindows: []) let image = try await SCScreenshotManager.captureImage( - contentFilter: filter, - configuration: config - ) + contentFilter: filter, configuration: config) return cgImageToJPEG(image) } catch { return nil @@ -518,29 +520,13 @@ final class SessionManager { return jpeg } - private func buildTaskContext() -> [String: Any] { - var ctx: [String: Any] = [:] - guard let task = activeTask else { return ctx } - ctx["task_title"] = task.title - ctx["task_goal"] = task.description ?? 
task.title - ctx["steps"] = activeSteps.map { step -> [String: Any] in - var s: [String: Any] = [ - "id": step.id, - "sort_order": step.sortOrder, - "title": step.title, - "status": step.status - ] - if let note = step.checkpointNote { s["checkpoint_note"] = note } - return s - } - return ctx - } + // MARK: - Apply VLM Result private func applyDistractionResult(_ result: DistractionAnalysisResponse) { - // 0. Store latest summary for the floating HUD if let summary = result.vlmSummary { latestVlmSummary = summary } + if let task = result.inferredTask, !task.isEmpty { latestInferredTask = task } - // 1. Apply step side-effects (always) + // Apply step side-effects for completedId in result.stepsCompleted { if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) { activeSteps[idx].status = "done" @@ -556,22 +542,25 @@ final class SessionManager { currentStepIndex = idx } - // 2. Notification priority (design spec §1.5): - // Proactive friction help → Context resume → Gentle nudge - // NEVER nudge when the system could help instead. 
- if let friction = result.friction, friction.isActionable { - if friction.isResumption { - // Task resumption detected — auto-surface resume card without button press - Task { await fetchResumeCard() } - } else if proactiveCard == nil { - showProactiveCard(ProactiveCard(source: .vlmFriction( - frictionType: friction.type, - description: friction.description, - actions: friction.proposedActions - )), vlmCard: true) + // Notification priority: friction card (formal or has actions) → nudge + if let friction = result.friction { + let shouldShow = friction.isActionable || !friction.proposedActions.isEmpty + if shouldShow { + if friction.isResumption { + Task { await fetchResumeCard() } + } else if proactiveCard == nil { + showProactiveCard(ProactiveCard(source: .vlmFriction( + frictionType: friction.type, + description: friction.description, + actions: friction.proposedActions + ))) + } + } else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge { + distractionCount += 1 + lastNudge = nudge + sendNudgeNotification(nudge) } } else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge { - // Only nudge if VLM found no actionable friction distractionCount += 1 lastNudge = nudge sendNudgeNotification(nudge) diff --git a/README.md b/README.md new file mode 100644 index 0000000..8b95fca --- /dev/null +++ b/README.md @@ -0,0 +1,104 @@ +# LockInBro + +ADHD-friendly macOS focus assistant. Monitors your screen with a local VLM agent (Argus), detects friction and distractions, and nudges you back on track. + +## Requirements + +- macOS 14+ +- Xcode 16+ +- Python 3.11+ (system or Homebrew) + +## Setup + +### 1. Python environment + +Create the venv once from the project root: + +```bash +cd ~/yhack/LockInBro +python3 -m venv .venv +.venv/bin/pip install -r argus/requirements.txt +``` + +### 2. 
API keys + +Create a `.env` file in the project root (next to `argus/`): + +```bash +cp argus/.env.example .env # if example exists, otherwise create manually +``` + +`.env` contents: + +``` +GEMINI_API_KEY=your_gemini_api_key_here +BACKEND_BASE_URL=https://wahwa.com/api/v1 +``` + +The Gemini API key is also set at runtime from the app's Settings screen — the `.env` is only needed if running Argus directly from the command line. + +### 3. Xcode permissions + +In Xcode → Signing & Capabilities, ensure the app has: +- **Screen Recording** — required for screenshot capture +- **App Sandbox** disabled (or `com.apple.security.screen-recording` entitlement added) + +### 4. Run + +Open `LockInBro.xcodeproj` in Xcode and press **Run** (⌘R). + +On first launch: +- Grant **Screen Recording** permission when prompted +- Log in or register via the app +- Enter your Gemini API key in Settings + +--- + +## Argus (VLM agent) + +The `argus/` directory contains the Python screen-analysis agent. It runs as a subprocess of the Swift app — you do not need to launch it manually. + +### Running Argus directly (for debugging) + +```bash +cd ~/yhack/LockInBro +.venv/bin/python3 -m argus \ + --vlm gemini \ + --gemini-key YOUR_KEY \ + --dry-run \ + --task-title "debug run" +``` + +### Recreating the venv + +```bash +rm -rf .venv +python3 -m venv .venv +.venv/bin/pip install -r argus/requirements.txt +``` + +--- + +## Project structure + +``` +LockInBro/ +├── .venv/ # Python venv (gitignored) +├── .env # API keys (gitignored) +├── argus/ # VLM agent (Python) +│ ├── requirements.txt +│ ├── capture.py # Reads screenshots from /tmp/lockinbro_capture.jpg +│ ├── loop.py # Main analysis loop +│ ├── vlm.py # Gemini / Ollama client +│ └── ... +├── LockInBro/ # Swift app source +│ ├── SessionManager.swift +│ ├── APIClient.swift +│ └── ... +└── LockInBro.xcodeproj +``` + +## Backend + +Deployed at `https://wahwa.com/api/v1` (FastAPI on DigitalOcean). 
+Source: `~/yhack/lockinbro-api` diff --git a/argus/__init__.py b/argus/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/argus/__main__.py b/argus/__main__.py new file mode 100644 index 0000000..a7e8d9d --- /dev/null +++ b/argus/__main__.py @@ -0,0 +1,113 @@ +"""CLI entry point: python -m argus [options]""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import logging +import sys + +from argus.config import CAPTURE_INTERVAL_S +from argus.loop import run_loop +from argus.vlm import StepInfo, TaskContext + + +def _parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="argus", + description="Argus VLM — proactive focus assistant screen analyzer", + ) + p.add_argument("--session-id", default="00000000-0000-0000-0000-000000000000") + p.add_argument("--task-title", default="(no task)") + p.add_argument("--task-goal", default="") + p.add_argument( + "--steps-json", + default=None, + help='JSON array of steps: [{"id":"...", "sort_order":1, "title":"...", "status":"pending"}]', + ) + p.add_argument("--window-title", default="") + p.add_argument("--vlm", choices=["ollama", "gemini"], default=None, help="VLM backend (default: ollama)") + p.add_argument("--gemini-key", default=None, help="Override GEMINI_API_KEY env var") + p.add_argument("--jwt", default=None, help="Override BACKEND_JWT env var") + p.add_argument("--backend-url", default=None, help="Override BACKEND_BASE_URL env var") + p.add_argument("--dry-run", action="store_true", help="Print JSON instead of POSTing") + p.add_argument("--execute", action="store_true", help="Enable notification + executor flow") + p.add_argument("--mock-sessions", default=None, help='JSON array of mock sessions: [{"task_title":"...", "status":"interrupted", "last_app":"VS Code", "last_file":"solution.cpp", "checkpoint_note":"stuck on impl"}]') + p.add_argument("--iterations", type=int, default=None, help="Stop after N iterations") + p.add_argument("-v", 
"--verbose", action="store_true") + return p.parse_args() + + +def main() -> None: + args = _parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s %(levelname)-5s %(name)s %(message)s", + datefmt="%H:%M:%S", + ) + # httpx logs every request/response at INFO — suppress to WARNING to avoid noisy 404s + logging.getLogger("httpx").setLevel(logging.WARNING) + logging.getLogger("httpcore").setLevel(logging.WARNING) + + steps: list[StepInfo] = [] + if args.steps_json: + for s in json.loads(args.steps_json): + steps.append( + StepInfo( + id=s["id"], + sort_order=s["sort_order"], + title=s["title"], + status=s.get("status", "pending"), + checkpoint_note=s.get("checkpoint_note"), + ) + ) + + ctx = TaskContext( + task_title=args.task_title, + task_goal=args.task_goal, + steps=steps, + window_title=args.window_title, + session_id=args.session_id, + ) + + # Parse mock sessions if provided + mock_sessions = None + if args.mock_sessions: + from argus.session import SessionInfo + mock_sessions = [] + for i, s in enumerate(json.loads(args.mock_sessions)): + mock_sessions.append( + SessionInfo( + session_id=s.get("session_id", f"mock-{i}"), + task_id=s.get("task_id"), + task_title=s.get("task_title", ""), + task_goal=s.get("task_goal", ""), + status=s.get("status", "interrupted"), + last_app=s.get("last_app", ""), + last_file=s.get("last_file", ""), + checkpoint_note=s.get("checkpoint_note", ""), + started_at=s.get("started_at", ""), + ended_at=s.get("ended_at"), + minutes_ago=s.get("minutes_ago", 30), + ) + ) + + asyncio.run( + run_loop( + ctx, + api_key=args.gemini_key, + vlm_backend=args.vlm, + jwt=args.jwt, + base_url=args.backend_url, + dry_run=args.dry_run, + max_iterations=args.iterations, + auto_execute=args.execute, + mock_sessions=mock_sessions, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/argus/backend.py b/argus/backend.py new file mode 100644 index 0000000..b0cb84d --- /dev/null +++ 
b/argus/backend.py @@ -0,0 +1,50 @@ +"""Backend API client — sends VLM analysis results to the LockInBro server. + +POST /distractions/analyze-result with the enriched JSON payload. +""" + +from __future__ import annotations + +import logging + +import httpx + +from argus.config import BACKEND_BASE_URL, BACKEND_JWT +from argus.vlm import VLMResult + +log = logging.getLogger(__name__) + + +async def send_analysis( + result: VLMResult, + session_id: str, + *, + jwt: str | None = None, + base_url: str | None = None, +) -> dict: + """Post VLM analysis result to the backend. + + Returns the backend response body on success, or an error dict. + """ + url = f"{base_url or BACKEND_BASE_URL}/distractions/analyze-result" + token = jwt or BACKEND_JWT + payload = result.to_backend_payload(session_id) + + headers = {} + if token: + headers["Authorization"] = f"Bearer {token}" + + log.info("Sending analysis to %s on_task=%s", url, result.on_task) + log.debug("Payload: %s", payload) + + try: + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.post(url, json=payload, headers=headers) + resp.raise_for_status() + return resp.json() + except httpx.HTTPStatusError as exc: + log.error("Backend returned %s: %s", exc.response.status_code, exc.response.text) + return {"error": str(exc), "status_code": exc.response.status_code} + except httpx.RequestError as exc: + log.error("Backend unreachable: %s", exc) + return {"error": str(exc)} diff --git a/argus/buffer.py b/argus/buffer.py new file mode 100644 index 0000000..19f4df2 --- /dev/null +++ b/argus/buffer.py @@ -0,0 +1,123 @@ +"""Rolling history buffer for VLM screenshot analysis. 
+ +Two tiers: + - Image buffer: deque(maxlen=4) of recent screenshots sent as images + - Text history: deque(maxlen=12) of older VLM summaries + previous outputs + for extended context (what happened 30-60s ago) and self-refinement +""" + +from __future__ import annotations + +import time +from collections import deque +from dataclasses import dataclass, field + + +@dataclass +class BufferEntry: + jpeg: bytes + vlm_summary: str + timestamp: float = field(default_factory=time.time) + + +@dataclass +class TextEntry: + vlm_summary: str + timestamp: float + + +class HistoryBuffer: + def __init__(self, image_maxlen: int = 4, text_maxlen: int = 12): + self._images: deque[BufferEntry] = deque(maxlen=image_maxlen) + self._text_history: deque[TextEntry] = deque(maxlen=text_maxlen) + self._last_output: dict | None = None + self._last_execution: str | None = None + + def push(self, jpeg: bytes, vlm_summary: str) -> None: + now = time.time() + # When an image entry gets evicted from the image buffer, + # it's already captured in text_history, so nothing extra needed. 
+ self._images.append(BufferEntry(jpeg=jpeg, vlm_summary=vlm_summary, timestamp=now)) + self._text_history.append(TextEntry(vlm_summary=vlm_summary, timestamp=now)) + + def set_last_output(self, output: dict) -> None: + """Store the previous VLM JSON output for self-refinement.""" + self._last_output = output + + def set_last_execution(self, summary: str | None) -> None: + """Store the result of the last executor action.""" + self._last_execution = summary + + def get_last_execution(self) -> str | None: + return self._last_execution + + def clear_last_execution(self) -> None: + self._last_execution = None + + def get_entries(self) -> list[BufferEntry]: + """Return image entries oldest-first.""" + return list(self._images) + + def format_for_prompt(self) -> str: + """Format the full timeline: text history + image labels.""" + now = time.time() + lines: list[str] = [] + + # Text-only history (entries older than what's in the image buffer) + image_timestamps = {round(e.timestamp, 2) for e in self._images} + older = [ + e for e in self._text_history + if round(e.timestamp, 2) not in image_timestamps + ] + older = [e for e in older if e.vlm_summary] # skip empty summaries + if older: + lines.append("Older context (text only, no images):") + for entry in older: + ago = int(now - entry.timestamp) + lines.append(f" - [{ago}s ago] {entry.vlm_summary}") + lines.append("") + + # Image timeline + n = len(self._images) + if n > 0: + lines.append(f"Recent screenshots ({n} prior + 1 current = {n + 1} images):") + for i, entry in enumerate(self._images): + ago = int(now - entry.timestamp) + lines.append(f" - Screenshot {i + 1}/{n + 1}: [{ago}s ago]") + lines.append(f" - Screenshot {n + 1}/{n + 1}: [now] (current)") + else: + lines.append("Screenshots:") + lines.append(" - Screenshot 1/1: [now] (current, first capture)") + + return "\n".join(lines) + + def format_last_output(self) -> str: + """Format previous VLM output for self-refinement context.""" + if not self._last_output: + 
return "" + + import json + # Only include the key fields, not the full blob + prev = self._last_output + parts = [ + f" on_task: {prev.get('on_task')}", + f" app: {prev.get('app_name')}", + f" friction: {prev.get('friction', {}).get('type')}", + f" summary: {prev.get('vlm_summary')}", + ] + note = prev.get("checkpoint_note_update") + if note: + parts.append(f" checkpoint: {note}") + desc = prev.get("friction", {}).get("description") + if desc: + parts.append(f" friction_desc: {desc}") + return "\n".join(parts) + + def __len__(self) -> int: + return len(self._images) + + def clear(self) -> None: + self._images.clear() + self._text_history.clear() + self._last_output = None + self._last_execution = None diff --git a/argus/capture.py b/argus/capture.py new file mode 100644 index 0000000..ff6a9ad --- /dev/null +++ b/argus/capture.py @@ -0,0 +1,80 @@ +"""Screenshot capture for macOS. + +Reads a JPEG file written by the Swift host app (LockInBro.app), which holds +the Screen Recording TCC permission. The Swift app writes atomically to +/tmp/lockinbro_capture.jpg every ~5s using ScreenCaptureKit. + +capture_screenshot() blocks up to 15 seconds waiting for a fresh file, +then raises RuntimeError if nothing arrives — so loop.py can handle it +gracefully without falling back to the screencapture CLI (which would fail +with a permissions error when called from a Python subprocess). 
+""" + +from __future__ import annotations + +import io +import os +import time + +from PIL import Image + +from argus.config import SCREENSHOT_JPEG_QUALITY, SCREENSHOT_MAX_WIDTH + +_SWIFT_FRAME_PATH = "/tmp/lockinbro_capture.jpg" +_SWIFT_FRAME_MAX_AGE_S = 10.0 # treat as stale if older than this +_WAIT_TIMEOUT_S = 15.0 # how long to wait for Swift to write the first frame +_WAIT_POLL_S = 0.5 # polling interval while waiting + + +def _to_jpeg(img: Image.Image) -> bytes: + """Convert a PIL Image to JPEG bytes, handling RGBA → RGB.""" + if img.width > SCREENSHOT_MAX_WIDTH: + ratio = SCREENSHOT_MAX_WIDTH / img.width + img = img.resize( + (SCREENSHOT_MAX_WIDTH, int(img.height * ratio)), + Image.LANCZOS, + ) + if img.mode == "RGBA": + img = img.convert("RGB") + buf = io.BytesIO() + img.save(buf, format="JPEG", quality=SCREENSHOT_JPEG_QUALITY) + return buf.getvalue() + + +def _read_swift_frame() -> bytes | None: + """Return JPEG bytes from the Swift-provided file if it exists and is fresh.""" + try: + age = time.time() - os.path.getmtime(_SWIFT_FRAME_PATH) + if age >= _SWIFT_FRAME_MAX_AGE_S: + return None + with open(_SWIFT_FRAME_PATH, "rb") as f: + data = f.read() + img = Image.open(io.BytesIO(data)) + img.load() + return _to_jpeg(img) + except Exception: + return None + + +def capture_screenshot() -> bytes: + """Return JPEG bytes for the current screen. + + Blocks up to _WAIT_TIMEOUT_S seconds waiting for the Swift app to write + a fresh frame. Raises RuntimeError if no frame arrives in time. + """ + deadline = time.time() + _WAIT_TIMEOUT_S + while time.time() < deadline: + frame = _read_swift_frame() + if frame is not None: + return frame + time.sleep(_WAIT_POLL_S) + + raise RuntimeError( + f"No screenshot received from LockInBro.app within {_WAIT_TIMEOUT_S}s. " + "Make sure the app is running and has Screen Recording permission." 
+ ) + + +def load_image_file(path: str) -> bytes: + """Load an image file and return JPEG bytes (for testing).""" + return _to_jpeg(Image.open(path)) diff --git a/argus/config.py b/argus/config.py new file mode 100644 index 0000000..0a7ac8a --- /dev/null +++ b/argus/config.py @@ -0,0 +1,28 @@ +import os +from pathlib import Path + +from dotenv import load_dotenv + +load_dotenv(Path(__file__).resolve().parent.parent / ".env", override=True) + +GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "") +GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-pro") +GEMINI_URL = ( + f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent" +) + +# Ollama (local VLM — default) +OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434") +OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen3.5:9b") + +# VLM backend: "ollama" (default) or "gemini" +VLM_BACKEND = os.environ.get("VLM_BACKEND", "ollama") + +BACKEND_BASE_URL = os.environ.get("BACKEND_BASE_URL", "https://wahwa.com/api/v1") +BACKEND_JWT = os.environ.get("BACKEND_JWT", "") + +CAPTURE_INTERVAL_S = 2.5 # screenshot every 2.5s +BUFFER_MAX_LEN = 4 # 4 frames accumulated +VLM_CALL_EVERY_N = 4 # call VLM every 4 captures (= every 10s) +SCREENSHOT_JPEG_QUALITY = 50 +SCREENSHOT_MAX_WIDTH = 1280 diff --git a/argus/executor.py b/argus/executor.py new file mode 100644 index 0000000..9491fcb --- /dev/null +++ b/argus/executor.py @@ -0,0 +1,384 @@ +"""Agentic executor — uses Gemini 2.5 Pro with tool use to complete actions. + +When the user approves a proposed action, the executor gets: + - The recent screenshots (so it can read source content) + - Tool access: read_file, write_file, run_command + - A loop: Gemini proposes tool calls → we execute → feed results back → repeat + +This is a full agent loop, not a single-shot LLM call. 
+ +Swift portability notes: + - The agent loop is the same HTTP pattern (Gemini function calling API) + - Tool implementations map to FileManager, NSTask, NSPasteboard + - The loop structure is identical in Swift async/await +""" + +from __future__ import annotations + +import asyncio +import base64 +import logging +import os +import subprocess + +import httpx + +from argus.buffer import HistoryBuffer +from argus.config import GEMINI_API_KEY + +log = logging.getLogger(__name__) + +EXECUTOR_MODEL = os.environ.get("EXECUTOR_MODEL", "gemini-2.5-pro") +EXECUTOR_URL = ( + f"https://generativelanguage.googleapis.com/v1beta/models/{EXECUTOR_MODEL}:generateContent" +) + +MAX_AGENT_STEPS = 10 + +# ── Tool definitions (Gemini function calling format) ──────────────────── + +TOOLS = [ + { + "functionDeclarations": [ + { + "name": "read_file", + "description": "Read the contents of a file. Use this to inspect source files, code, configs, etc.", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Absolute or relative file path to read", + } + }, + "required": ["path"], + }, + }, + { + "name": "output", + "description": "Display content to the user as a sticky note. Use for: extracted text from PDFs/images, form fill suggestions, content the user needs to paste into binary formats (docx, ppt, websites). The user will copy/paste from this.", + "parameters": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Short title (e.g. 'Extracted receipt', 'Form values')", + }, + "content": { + "type": "string", + "description": "The full content to display", + }, + }, + "required": ["title", "content"], + }, + }, + { + "name": "write_file", + "description": "Write content to an EXISTING plain text file (code, markdown, config, txt). Only use when: (1) you have confirmed the file path via read_file or mdfind, AND (2) the file is a plain text format you can write correctly. 
NEVER use for binary formats (docx, ppt, pdf, images).", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Absolute file path (must have been confirmed to exist first)", + }, + "content": { + "type": "string", + "description": "Complete file content to write", + }, + }, + "required": ["path", "content"], + }, + }, + { + "name": "run_command", + "description": "Run a shell command and return its output. Use for compiling, testing, listing files, etc.", + "parameters": { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The shell command to execute", + } + }, + "required": ["command"], + }, + }, + { + "name": "done", + "description": "Signal that the task is complete. Call this AFTER using output() to present results. Summary should describe what was shown to the user, not file operations.", + "parameters": { + "type": "object", + "properties": { + "summary": { + "type": "string", + "description": "Brief summary of what was done", + } + }, + "required": ["summary"], + }, + }, + ] + } +] + +# ── Tool implementations ───────────────────────────────────────────────── + + +def _exec_read_file(path: str) -> str: + resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path) + try: + with open(resolved, "r") as f: + content = f.read() + log.info(" read_file: %s (%d bytes)", resolved, len(content)) + return content + except OSError as e: + return f"Error reading {resolved}: {e}" + + +def _exec_write_file(path: str, content: str) -> str: + resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path) + # Safety: only write to existing text files + if not os.path.exists(resolved): + return f"Error: {resolved} does not exist. Use output() instead for new content." + try: + with open(resolved, "r"): + pass # confirm it's readable as text + except (OSError, UnicodeDecodeError): + return f"Error: {resolved} is not a readable text file. 
Use output() instead." + try: + with open(resolved, "w") as f: + f.write(content) + log.info(" write_file: %s (%d bytes)", resolved, len(content)) + return f"Successfully wrote {len(content)} bytes to {resolved}" + except OSError as e: + return f"Error writing {resolved}: {e}" + + +def _exec_output(title: str, content: str) -> str: + """Display content to the user via terminal. + Swift portability: becomes a sticky note / floating card UI. + """ + print() + print(f"┌── 📋 {title} " + "─" * max(0, 50 - len(title)) + "┐") + for line in content.split("\n"): + print(f"│ {line}") + print(f"└" + "─" * 58 + "┘") + print() + log.info(" output: %s (%d chars)", title, len(content)) + return f"Displayed '{title}' to user ({len(content)} chars)" + + +def _exec_run_command(command: str) -> str: + log.info(" run_command: %s", command[:80]) + try: + result = subprocess.run( + command, shell=True, capture_output=True, text=True, timeout=30 + ) + output = result.stdout + if result.stderr: + output += "\nSTDERR:\n" + result.stderr + if result.returncode != 0: + output += f"\n(exit code {result.returncode})" + return output[:4000] # cap output length + except subprocess.TimeoutExpired: + return "Error: command timed out after 30s" + + +def _execute_tool(name: str, args: dict) -> str: + if name == "read_file": + return _exec_read_file(args["path"]) + elif name == "output": + return _exec_output(args["title"], args["content"]) + elif name == "write_file": + return _exec_write_file(args["path"], args["content"]) + elif name == "run_command": + return _exec_run_command(args["command"]) + elif name == "done": + return args.get("summary", "Done.") + else: + return f"Unknown tool: {name}" + + +# ── Agent loop ─────────────────────────────────────────────────────────── + + +async def execute( + vlm_payload: dict, + action_index: int = 0, + *, + history: HistoryBuffer | None = None, + current_screenshot: bytes | None = None, + api_key: str | None = None, +) -> str | None: + """Run the agentic 
executor loop. + + The agent can read files, write files, and run commands to complete + the user's approved action. It loops until it calls done() or hits + the step limit. + + Returns a summary of what was done, or None on failure. + """ + friction = vlm_payload.get("friction", {}) + actions = friction.get("proposed_actions", []) + if action_index >= len(actions): + log.warning("Action index %d out of range", action_index) + return None + + chosen = actions[action_index] + key = api_key or GEMINI_API_KEY + if not key: + log.warning("No API key for executor") + return None + + log.info("Agent executing: %s", chosen.get("label", "?")[:80]) + + # Build initial message with screenshots + task context + initial_parts = _build_initial_parts(vlm_payload, chosen, history, current_screenshot) + + # Conversation history for the agent loop + contents = [{"role": "user", "parts": initial_parts}] + + for step in range(MAX_AGENT_STEPS): + log.debug("Agent step %d/%d", step + 1, MAX_AGENT_STEPS) + + payload = { + "contents": contents, + "tools": TOOLS, + "generationConfig": {"temperature": 0.2, "maxOutputTokens": 8192}, + } + + try: + async with httpx.AsyncClient(timeout=120.0) as client: + for attempt in range(3): + resp = await client.post(f"{EXECUTOR_URL}?key={key}", json=payload) + if resp.status_code == 429: + wait = 2 ** attempt + log.warning("Executor 429, retrying in %ds...", wait) + await asyncio.sleep(wait) + continue + resp.raise_for_status() + break + else: + resp.raise_for_status() + except Exception: + log.exception("Agent API call failed at step %d", step + 1) + return None + + body = resp.json() + candidate = body["candidates"][0] + response_parts = candidate["content"]["parts"] + + # Add assistant response to conversation + contents.append({"role": "model", "parts": response_parts}) + + # Check for function calls + function_calls = [p for p in response_parts if "functionCall" in p] + + if not function_calls: + # No tool calls — agent returned text, we're done + 
text = "".join(p.get("text", "") for p in response_parts) + log.info("Agent finished with text response (step %d)", step + 1) + return text.strip() if text.strip() else "Done." + + # Execute each tool call + tool_results = [] + done_summary = None + + for fc_part in function_calls: + fc = fc_part["functionCall"] + name = fc["name"] + args = fc.get("args", {}) + + print(f" 🔧 {name}({_summarize_args(args)})") + result = _execute_tool(name, args) + + tool_results.append({ + "functionResponse": { + "name": name, + "response": {"result": result}, + } + }) + + if name == "done": + done_summary = result + + # Add tool results to conversation + contents.append({"role": "user", "parts": tool_results}) + + if done_summary: + log.info("Agent called done() at step %d: %s", step + 1, done_summary[:80]) + return done_summary + + log.warning("Agent hit step limit (%d)", MAX_AGENT_STEPS) + return "Agent reached maximum steps without completing." + + +def _build_initial_parts( + vlm_payload: dict, + action: dict, + history: HistoryBuffer | None, + current_screenshot: bytes | None, +) -> list[dict]: + """Build the initial message parts: screenshots + task prompt.""" + parts: list[dict] = [] + + # Include screenshots so agent can read source content + if history: + entries = history.get_entries() + for i, entry in enumerate(entries): + b64 = base64.b64encode(entry.jpeg).decode() + parts.append({"text": f"[Screenshot {i + 1}/{len(entries)}]"}) + parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}}) + + if current_screenshot: + b64 = base64.b64encode(current_screenshot).decode() + parts.append({"text": "[Current screenshot]"}) + parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}}) + + friction = vlm_payload.get("friction", {}) + prompt = f"""\ +The user approved this action. Complete it using the tools available to you. 
+ +ACTION: {action.get('label', '')} +DETAILS: {action.get('details', '')} + +Context: + User's task: {vlm_payload.get('inferred_task', '')} + Problem: {friction.get('description', '')} + Current state: {vlm_payload.get('checkpoint_note_update', '')} + Application: {vlm_payload.get('app_name', '')} + Source: {friction.get('source_context', '')} + Target: {friction.get('target_context', '')} + +INSTRUCTIONS: +1. For BINARY files (PDFs, images, etc.): use your VISION. Read content directly + from the screenshots — this is your most reliable source for non-text files. +2. For TEXT files (code, markdown, configs, txt): use read_file to get exact content. +3. If you need a file but only know the filename (not the path), FIND IT FIRST: + - run_command("mdfind -name 'filename'") — fast macOS Spotlight search + - run_command("lsof -c AppName | grep filename") — find what file an app has open + Do NOT guess paths. Search first. +4. Choose the right output method: + - Binary format targets (docx, ppt, website forms, PDFs): use output() — user will copy/paste. + - Existing plain text files (code, markdown, config): use write_file() — modify directly. + - write_file() only works on files that ALREADY EXIST. Confirm the path with read_file first. +5. Use run_command to compile, test, or search for files. Never to write files. +6. Do NOT hallucinate content. If you can't read something, say so. +7. Call done() with a summary when the action is complete. + +Working directory: {os.getcwd()}""" + + parts.append({"text": prompt}) + return parts + + +def _summarize_args(args: dict) -> str: + """Short summary of tool args for terminal display.""" + parts = [] + for k, v in args.items(): + sv = str(v) + if len(sv) > 50: + sv = sv[:47] + "..." 
+ parts.append(f"{k}={sv}") + return ", ".join(parts) diff --git a/argus/loop.py b/argus/loop.py new file mode 100644 index 0000000..7288dd6 --- /dev/null +++ b/argus/loop.py @@ -0,0 +1,415 @@ +"""Main orchestrator loop — capture → analyze → notify → execute. + +Three concurrent tasks: + 1. VLM loop: capture screenshots, analyze, push results + 2. Notification handler: show cards when friction detected, wait for response + 3. Input reader: read user's terminal input (1=accept, 0=dismiss) + +Swift portability notes: + - The VLM loop becomes a Timer or DispatchSourceTimer + - The notification handler becomes SwiftUI state updates + - The input reader becomes button tap handlers + - The three tasks map to Swift's structured concurrency (async let / TaskGroup) +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import sys +import time + +from argus.backend import send_analysis +from argus.buffer import HistoryBuffer +from argus.capture import capture_screenshot +from argus.config import BUFFER_MAX_LEN, CAPTURE_INTERVAL_S, VLM_CALL_EVERY_N +from argus.executor import execute +from argus.notification import NotificationManager +from argus.session import SessionManager +from argus.vlm import TaskContext, VLMResult, analyze_screenshot + +log = logging.getLogger(__name__) + + +# Track session actions already shown to avoid repeating +_handled_session_actions: set[str] = set() + + +def _session_action_key(sa) -> str: + return f"{sa.type}:{sa.session_id}" + + +def _show_session_card(title: str, body: str, options: list[str]) -> None: + """Display a session-related card (not friction). 
Swift: native notification.""" + print() + print("┌" + "─" * 58 + "┐") + print(f"│ {title:<56} │") + print("│" + " " * 58 + "│") + for line in body.split("\n"): + print(f"│ {line:<56} │") + print("│" + " " * 58 + "│") + for i, opt in enumerate(options): + print(f"│ [{i + 1}] {opt:<53} │") + print(f"│ [0] Not now{' ' * 44} │") + print("└" + "─" * 58 + "┘") + print() + + +async def _wait_for_input() -> int: + """Wait for a single integer input from stdin. Returns the number or 0.""" + loop = asyncio.get_event_loop() + line = await loop.run_in_executor(None, sys.stdin.readline) + try: + return int(line.strip()) + except (ValueError, EOFError): + return 0 + + +async def _handle_session_action(sa, result, sessions: SessionManager, notifier: NotificationManager) -> None: + """Handle VLM session_action output — show card and act on approval. + + Swift portability: becomes SwiftUI state changes + button handlers. + """ + # Skip if we already showed this exact action + key = _session_action_key(sa) + if key in _handled_session_actions: + return + _handled_session_actions.add(key) + + if sa.type == "resume": + session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None) + checkpoint = session.checkpoint_note if session else "" + task_title = session.task_title if session else sa.reason + + _show_session_card( + f"📂 Resume: {task_title}", + f"You left off: {checkpoint}" if checkpoint else sa.reason, + ["Resume session"], + ) + choice = await _wait_for_input() + if choice == 1: + if not sessions._mock: + resume_card = await sessions.get_resume_card(sa.session_id) + if resume_card: + rc = resume_card.get("resume_card", {}) + print(f" 💡 {rc.get('welcome_back', '')}") + print(f" {rc.get('you_were_doing', '')}") + print(f" {rc.get('next_step', '')}") + print(f" {rc.get('motivation', '')}") + else: + print(f" ✓ Resumed \"{task_title}\"") + if checkpoint: + print(f" Last checkpoint: {checkpoint}") + if session: + session.status = "active" + 
sessions._active_session = session + else: + print(" dismissed.") + + elif sa.type == "start_new": + task = result.inferred_task + _show_session_card( + "🆕 New focus session", + f"You're working on: {task}", + ["Start focus session"], + ) + choice = await _wait_for_input() + if choice == 1: + if not sessions._mock: + resp = await sessions.start_session(task) + if resp: + print(f" ✓ Session started: {resp.get('id', '?')}") + else: + print(f" ✓ (mock) Session started for \"{task}\"") + else: + print(" dismissed.") + + elif sa.type == "complete": + session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None) + task_title = session.task_title if session else "session" + + _show_session_card( + f"✅ Complete: {task_title}", + sa.reason, + ["Complete session"], + ) + choice = await _wait_for_input() + if choice == 1 and sa.session_id: + if not sessions._mock: + ok = await sessions.end_session(sa.session_id) + if ok: + print(f" ✓ Session completed") + else: + print(f" ✓ (mock) Session \"{task_title}\" completed") + sessions._sessions = [s for s in sessions._sessions if s.session_id != sa.session_id] + if sessions._active_session and sessions._active_session.session_id == sa.session_id: + sessions._active_session = None + else: + print(" dismissed.") + + elif sa.type == "switch": + session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None) + task_title = session.task_title if session else "another task" + _show_session_card( + f"🔄 Switch to: {task_title}", + sa.reason, + [f"Switch to \"{task_title}\""], + ) + choice = await _wait_for_input() + if choice == 1: + print(f" ✓ Switched to \"{task_title}\"") + else: + print(" dismissed.") + + +def _is_valid_session_id(session_id: str | None, sessions: SessionManager) -> bool: + """Check if a session_id from VLM output actually exists in our sessions.""" + if not session_id: + return False + return any(s.session_id == session_id for s in sessions.sessions) + + + + +async def 
run_loop( + ctx: TaskContext, + *, + api_key: str | None = None, + vlm_backend: str | None = None, + jwt: str | None = None, + base_url: str | None = None, + dry_run: bool = False, + max_iterations: int | None = None, + on_result: None | (callable) = None, + auto_execute: bool = False, + mock_sessions: list | None = None, +) -> None: + """Run the Argus VLM loop with notification and execution support. + + Args: + ctx: Task/session context. + api_key: Gemini API key override. + vlm_backend: "ollama" or "gemini". + jwt: Backend JWT override. + base_url: Backend base URL override. + dry_run: If True, skip sending to backend. + max_iterations: Stop after N iterations (None = forever). + on_result: Optional callback(VLMResult) per analysis. + auto_execute: If True, enable notification + executor flow. + """ + history = HistoryBuffer(image_maxlen=BUFFER_MAX_LEN) + notifier = NotificationManager() + sessions = SessionManager(jwt=jwt, base_url=base_url) + iteration = 0 + + log.info( + "Argus loop starting — interval=%ds, session=%s, task=%s, executor=%s", + CAPTURE_INTERVAL_S, + ctx.session_id, + ctx.task_title, + "on" if auto_execute else "off", + ) + + # Load sessions — mock or from backend + use_mock = mock_sessions is not None + if use_mock: + sessions._sessions = mock_sessions + sessions._active_session = next((s for s in mock_sessions if s.status == "active"), None) + sessions._mock = True # prevent refresh from overwriting + log.info("Loaded %d mock sessions", len(mock_sessions)) + else: + sessions._mock = False + try: + await sessions.fetch_open_sessions() + except Exception: + log.exception("Startup: failed to fetch sessions — continuing without session context") + + # Log attachment status so it's visible in the console + if sessions.active: + s = sessions.active + log.info( + "ATTACHED to session %s — task=%r last_app=%s checkpoint=%r", + s.session_id[:8], s.task_title, s.last_app or "(unknown)", s.checkpoint_note or "(none)", + ) + else: + log.info("No active 
session found — running in monitoring-only mode (dry-run=%s)", dry_run) + + + # Pending frames collected between VLM calls + pending_frames: list[bytes] = [] + capture_count = 0 + + try: + while max_iterations is None or iteration < max_iterations: + t0 = time.monotonic() + + try: + # 0. Refresh sessions periodically + await sessions.maybe_refresh() + + # 1. Capture screenshot every interval + screenshot = capture_screenshot() + pending_frames.append(screenshot) + capture_count += 1 + log.debug("Captured frame %d (%d pending)", capture_count, len(pending_frames)) + + # 2. Only call VLM every N captures + if len(pending_frames) < VLM_CALL_EVERY_N: + elapsed = time.monotonic() - t0 + sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed) + if sleep_for > 0: + await asyncio.sleep(sleep_for) + continue + + # ── VLM call with all pending frames ── + iteration += 1 + + # Push all pending frames into buffer (buffer keeps last BUFFER_MAX_LEN) + for frame in pending_frames: + history.push(frame, "") # summaries filled after VLM call + pending_frames.clear() + + t_vlm = time.monotonic() + result: VLMResult = await analyze_screenshot( + screenshot, ctx, history, + api_key=api_key, + backend=vlm_backend, + session_context=sessions.format_for_prompt(), + ) + t_vlm_done = time.monotonic() + payload = result.to_backend_payload(ctx.session_id) + + log.info( + "[%d] vlm=%.2fs frames=%d friction=%s summary=%s", + iteration, + t_vlm_done - t_vlm, + VLM_CALL_EVERY_N, + result.friction.type, + result.vlm_summary, + ) + + # Update last entry's summary now that we have it + history.set_last_output(payload) + + # 4. Print / send to backend + if dry_run: + print(json.dumps(payload, indent=2)) + else: + resp = await send_analysis( + result, ctx.session_id, jwt=jwt, base_url=base_url + ) + log.debug("Backend response: %s", resp) + + if auto_execute: + sa = result.session_action + + # Validate session_action — check that VLM output relates to the session. 
+ # Match against filename, file stem (no extension), or task title. + if sa.type in ("resume", "switch") and sa.session_id: + session = next( + (s for s in sessions.sessions if s.session_id == sa.session_id), None + ) + if session: + context = (result.vlm_summary + " " + result.inferred_task + " " + sa.reason).lower() + matches = [] + if session.last_file: + matches.append(session.last_file.lower()) + # Also check stem without extension (e.g. "receipt" from "receipt.pdf") + stem = session.last_file.rsplit(".", 1)[0].lower() + if stem: + matches.append(stem) + if session.task_title: + # Check key words from task title + for word in session.task_title.lower().split(): + if len(word) > 3: # skip short words + matches.append(word) + if matches and not any(m in context for m in matches): + log.debug( + "Suppressing session_action=%s — none of %s found in context", + sa.type, matches, + ) + sa.type = "none" + + if not _is_valid_session_id(sa.session_id, sessions) and sa.type in ("resume", "switch"): + sa.type = "none" + + # 5. Session actions take priority — but only if not already handled + session_handled = False + if sa.type != "none": + key = _session_action_key(sa) + if key not in _handled_session_actions: + await _handle_session_action(sa, result, sessions, notifier) + session_handled = True + + # 6. Friction notification + executor + if not session_handled and notifier.should_notify(payload): + card = notifier.create_card(payload) + notifier.show_card_terminal(card) + + choice = await _wait_for_input() + action_idx = choice - 1 + + if choice > 0: + print(f"\n⚡ Executing action {action_idx + 1}...") + summary = await execute( + payload, action_idx, + history=history, + current_screenshot=screenshot, + api_key=api_key, + ) + # Emit a parseable JSON block so Swift can surface the result + # in the HUD and clear the executing spinner. 
+ print(json.dumps({"exec_summary": summary or ""}, indent=2), flush=True) + if summary: + history.set_last_execution(summary) + else: + print("dismissed.") + + # 7. Nudge — VLM decided a nudge is appropriate (only if nothing else fired) + elif not session_handled and result.gentle_nudge: + print(f"\n💛 {result.gentle_nudge}") + + # 8. Fallback: suggest new session if VLM didn't + elif not session_handled and ( + sa.type == "none" + and result.on_task + and not sessions.active + and sessions.should_suggest_new_session(result.inferred_task) + ): + task = result.inferred_task + print(f"\n🆕 You've been working on \"{task}\" — start a focus session?") + print(f" [1] Start [0] Not now") + card = notifier.create_card({ + "friction": { + "type": "none", + "confidence": 1.0, + "description": f"New task detected: {task}", + "proposed_actions": [{"label": "Start focus session", "action_type": "other", "details": task}], + } + }) + notifier.show_card_terminal(card) + accepted, _ = await notifier.wait_for_response(timeout=60.0) + if accepted: + resp = await sessions.start_session(task) + if resp: + print(f" ✓ Session started: {resp.get('id', '?')}") + + # Clear execution context after 3 iterations + if history.get_last_execution() and iteration % 3 == 0: + history.clear_last_execution() + + # 9. Callback + if on_result: + on_result(result) + + except Exception: + log.exception("Error in Argus loop iteration %d", iteration) + + # Sleep for remainder of capture interval + elapsed = time.monotonic() - t0 + sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed) + if sleep_for > 0: + await asyncio.sleep(sleep_for) + finally: + pass diff --git a/argus/notification.py b/argus/notification.py new file mode 100644 index 0000000..7ad6272 --- /dev/null +++ b/argus/notification.py @@ -0,0 +1,172 @@ +"""Notification manager — decides when to show action cards to the user. + +Only pushes a new notification when the proposed action meaningfully changes. 
+In production (Swift), this becomes a native macOS notification / floating card. +Here it's a terminal prompt for testing. + +Swift portability notes: + - NotificationManager becomes a class with @Published properties + - show_card() triggers a SwiftUI overlay or UNUserNotificationCenter + - user_response() is wired to button taps instead of stdin +""" + +from __future__ import annotations + +import asyncio +import logging +from dataclasses import dataclass, field + +log = logging.getLogger(__name__) + + +@dataclass +class ActionCard: + """A proposed action shown to the user.""" + friction_type: str + description: str + actions: list[dict] + source: str + target: str + vlm_payload: dict # full VLM result for the executor + + +class NotificationManager: + """Deduplicates notifications and manages the pending action card. + + Only shows a new card when the friction type or proposed action labels + change from the previous notification. + """ + + def __init__(self): + self._last_fingerprint: str = "" + self._pending_card: ActionCard | None = None + self._response_event: asyncio.Event = asyncio.Event() + self._user_accepted: bool = False + + def _fingerprint(self, vlm_payload: dict) -> str: + """Generate a dedup key from friction type + action labels.""" + friction = vlm_payload.get("friction", {}) + ftype = friction.get("type", "none") + labels = tuple( + a.get("label", "") for a in friction.get("proposed_actions", []) + ) + return f"{ftype}:{labels}" + + # Action types that the executor can actually perform + ACTIONABLE_TYPES = {"auto_extract", "brain_dump", "auto_fill", "summarize"} + + def should_notify(self, vlm_payload: dict) -> bool: + """Check if this VLM result warrants a friction card (executor action). + + Only fires when proposed_actions contain something the executor can + act on. Generic suggestions (action_type: "other") are nudges, not + executor actions. 
+ """ + friction = vlm_payload.get("friction", {}) + + # No friction or no proposed actions → no notification + if friction.get("type") == "none": + return False + actions = friction.get("proposed_actions", []) + if not actions: + return False + + # Only notify if at least one action is executor-actionable + has_actionable = any( + a.get("action_type") in self.ACTIONABLE_TYPES for a in actions + ) + if not has_actionable: + return False + + # Same as last notification → skip + fp = self._fingerprint(vlm_payload) + if fp == self._last_fingerprint: + return False + + return True + + def create_card(self, vlm_payload: dict) -> ActionCard: + """Create an action card from VLM output and mark it as pending.""" + friction = vlm_payload.get("friction", {}) + card = ActionCard( + friction_type=friction.get("type", ""), + description=friction.get("description", ""), + actions=friction.get("proposed_actions", []), + source=friction.get("source_context", ""), + target=friction.get("target_context", ""), + vlm_payload=vlm_payload, + ) + self._pending_card = card + self._last_fingerprint = self._fingerprint(vlm_payload) + self._response_event.clear() + self._user_accepted = False + return card + + def show_card_terminal(self, card: ActionCard) -> None: + """Print the action card to terminal. (Swift: show native UI.)""" + print() + print("┌" + "─" * 58 + "┐") + print(f"│ {'🔧 Friction detected':^58} │") + print("│" + " " * 58 + "│") + desc_lines = _wrap(card.description, 56) + for line in desc_lines: + print(f"│ {line:<56} │") + print("│" + " " * 58 + "│") + for i, action in enumerate(card.actions): + label = action.get("label", "?") + print(f"│ [{i + 1}] {label:<53} │") + print(f"│ [0] Not now{' ' * 44} │") + print("└" + "─" * 58 + "┘") + print() + + async def wait_for_response(self, timeout: float = 30.0) -> tuple[bool, int]: + """Wait for user response. Returns (accepted, action_index). + + Swift portability: this becomes a button tap callback. 
+ """ + try: + await asyncio.wait_for(self._response_event.wait(), timeout=timeout) + return self._user_accepted, self._selected_index + except asyncio.TimeoutError: + log.debug("Notification timed out, dismissing") + self._pending_card = None + return False, -1 + + def submit_response(self, choice: int) -> None: + """Submit user's choice. Called from input reader task. + + Swift portability: called from button tap handler. + """ + if choice > 0 and self._pending_card: + self._user_accepted = True + self._selected_index = choice - 1 + else: + self._user_accepted = False + self._selected_index = -1 + self._response_event.set() + + @property + def pending(self) -> ActionCard | None: + return self._pending_card + + def dismiss(self) -> None: + """Dismiss current card without action.""" + self._pending_card = None + self._last_fingerprint = "" + + +def _wrap(text: str, width: int) -> list[str]: + """Simple word wrap.""" + words = text.split() + lines: list[str] = [] + current = "" + for word in words: + if len(current) + len(word) + 1 <= width: + current = f"{current} {word}" if current else word + else: + if current: + lines.append(current) + current = word + if current: + lines.append(current) + return lines or [""] diff --git a/argus/requirements.txt b/argus/requirements.txt new file mode 100644 index 0000000..4c563f9 --- /dev/null +++ b/argus/requirements.txt @@ -0,0 +1,3 @@ +httpx>=0.27 +Pillow>=10.0 +python-dotenv>=1.0 diff --git a/argus/session.py b/argus/session.py new file mode 100644 index 0000000..d847203 --- /dev/null +++ b/argus/session.py @@ -0,0 +1,293 @@ +"""Session manager — fetches and caches session state from the backend. + +Provides session context to the VLM prompt and handles session lifecycle +actions (start, resume, switch, complete). 
+ +Swift portability notes: + - SessionManager becomes an ObservableObject with @Published properties + - Backend calls use URLSession instead of httpx + - match_session() logic is identical +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass, field + +import httpx + +from argus.config import BACKEND_BASE_URL, BACKEND_JWT + +log = logging.getLogger(__name__) + + +@dataclass +class SessionInfo: + session_id: str + task_id: str | None + task_title: str + task_goal: str + status: str # active | interrupted + last_app: str + last_file: str + checkpoint_note: str + started_at: str + ended_at: str | None + minutes_ago: int | None = None + + +class SessionManager: + """Caches open sessions and provides matching + lifecycle operations.""" + + def __init__(self, jwt: str | None = None, base_url: str | None = None): + self._jwt = jwt or BACKEND_JWT + self._base_url = base_url or BACKEND_BASE_URL + self._sessions: list[SessionInfo] = [] + self._active_session: SessionInfo | None = None + self._last_fetch: float = 0 + self._fetch_interval = 30.0 # re-fetch every 30s + self._mock: bool = False + + # Track inferred_task stability for new session suggestion + self._inferred_task_history: list[str] = [] + self._stable_threshold = 3 # consecutive matching inferred_tasks + + @property + def active(self) -> SessionInfo | None: + return self._active_session + + @property + def sessions(self) -> list[SessionInfo]: + return self._sessions + + def has_sessions(self) -> bool: + return len(self._sessions) > 0 + + # ── Backend communication ──────────────────────────────────────── + + async def fetch_open_sessions(self) -> list[SessionInfo]: + """Fetch the active session from backend (GET /sessions/active → single object or 404). + Called on startup and periodically. 
+ """ + url = f"{self._base_url}/sessions/active" + headers = {} + if self._jwt: + headers["Authorization"] = f"Bearer {self._jwt}" + + try: + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.get(url, headers=headers) + + if resp.status_code == 404: + # No active session — clear cache + self._sessions = [] + self._active_session = None + self._last_fetch = time.monotonic() + log.info("No active session on backend") + return [] + + resp.raise_for_status() + data = resp.json() + session = self._parse_session(data) + self._sessions = [session] + self._active_session = session + self._last_fetch = time.monotonic() + log.info("Fetched active session: %s (%s)", session.session_id, session.task_title) + return self._sessions + + except httpx.HTTPStatusError as e: + log.warning("Failed to fetch sessions: %s", e.response.status_code) + return self._sessions + except httpx.RequestError as e: + log.debug("Backend unreachable for sessions: %s", e) + return self._sessions + + async def maybe_refresh(self) -> None: + """Re-fetch if stale. Skips if using mock data.""" + if self._mock: + return + if time.monotonic() - self._last_fetch > self._fetch_interval: + await self.fetch_open_sessions() + + async def start_session(self, task_title: str, task_goal: str = "") -> dict | None: + """Start a new focus session. Returns session data or None.""" + url = f"{self._base_url}/sessions/start" + headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {} + + # If we have a task_title but no task_id, the backend should + # create an ad-hoc session (or we create the task first). 
+ payload = {"platform": "mac"} + + try: + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.post(url, json=payload, headers=headers) + resp.raise_for_status() + result = resp.json() + await self.fetch_open_sessions() # refresh + log.info("Started session: %s", result.get("id")) + return result + except Exception: + log.exception("Failed to start session") + return None + + async def end_session(self, session_id: str, status: str = "completed") -> bool: + """End a session. Returns True on success.""" + url = f"{self._base_url}/sessions/{session_id}/end" + headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {} + + try: + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.post(url, json={"status": status}, headers=headers) + resp.raise_for_status() + await self.fetch_open_sessions() # refresh + log.info("Ended session %s (%s)", session_id, status) + return True + except Exception: + log.exception("Failed to end session %s", session_id) + return False + + async def get_resume_card(self, session_id: str) -> dict | None: + """Fetch AI-generated resume card for a session.""" + url = f"{self._base_url}/sessions/{session_id}/resume" + headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {} + + try: + async with httpx.AsyncClient(timeout=15.0) as client: + resp = await client.get(url, headers=headers) + resp.raise_for_status() + return resp.json() + except Exception: + log.exception("Failed to get resume card for %s", session_id) + return None + + # ── Session matching ───────────────────────────────────────────── + + def match_session(self, inferred_task: str, app_name: str) -> SessionInfo | None: + """Find an open session that matches the current screen state. + Returns the best match or None. 
+ """ + if not inferred_task or not self._sessions: + return None + + inferred_lower = inferred_task.lower() + + best_match: SessionInfo | None = None + best_score = 0 + + for session in self._sessions: + score = 0 + title_lower = session.task_title.lower() + + # Check for keyword overlap between inferred_task and session task_title + inferred_words = set(inferred_lower.split()) + title_words = set(title_lower.split()) + overlap = inferred_words & title_words + # Filter out common words + overlap -= {"the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"} + score += len(overlap) * 2 + + # App match + if app_name and session.last_app and app_name.lower() in session.last_app.lower(): + score += 3 + + # File match — check if session's last_file appears in inferred_task + if session.last_file and session.last_file.lower() in inferred_lower: + score += 5 + + if score > best_score: + best_score = score + best_match = session + + # Require minimum score to avoid false matches + if best_score >= 4: + return best_match + return None + + def should_suggest_new_session(self, inferred_task: str) -> bool: + """Check if we should suggest starting a new session. + Returns True if inferred_task has been stable for N iterations + and doesn't match any existing session. 
+ """ + if not inferred_task: + self._inferred_task_history.clear() + return False + + self._inferred_task_history.append(inferred_task) + # Keep only recent history + if len(self._inferred_task_history) > self._stable_threshold + 2: + self._inferred_task_history = self._inferred_task_history[-self._stable_threshold - 2:] + + if len(self._inferred_task_history) < self._stable_threshold: + return False + + # Check if the last N inferred tasks are "similar" (share key words) + recent = self._inferred_task_history[-self._stable_threshold:] + first_words = set(recent[0].lower().split()) - {"the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"} + all_similar = all( + len(first_words & set(t.lower().split())) >= len(first_words) * 0.5 + for t in recent[1:] + ) + + if not all_similar: + return False + + # Make sure it doesn't match an existing session + matched = self.match_session(inferred_task, "") + return matched is None + + # ── Prompt formatting ──────────────────────────────────────────── + + def format_for_prompt(self) -> str: + """Format open sessions for injection into the VLM prompt. + Includes session_id so VLM can reference the exact ID in session_action. 
+ """ + if not self._sessions: + return "(no open sessions)" + + lines: list[str] = [] + for s in self._sessions: + status_tag = f"[{s.status}]" + ago = f" (paused {s.minutes_ago}m ago)" if s.minutes_ago else "" + line = f" session_id=\"{s.session_id}\" {status_tag} \"{s.task_title}\" — last in {s.last_app}" + if s.last_file: + line += f"/{s.last_file}" + if s.checkpoint_note: + line += f", \"{s.checkpoint_note}\"" + line += ago + lines.append(line) + return "\n".join(lines) + + # ── Internal ───────────────────────────────────────────────────── + + def _parse_session(self, data: dict) -> SessionInfo: + checkpoint = data.get("checkpoint", {}) or {} + ended = data.get("ended_at") + minutes_ago = None + if ended: + import datetime + try: + ended_dt = datetime.datetime.fromisoformat(ended.replace("Z", "+00:00")) + now = datetime.datetime.now(datetime.timezone.utc) + minutes_ago = int((now - ended_dt).total_seconds() / 60) + except (ValueError, TypeError): + pass + + # SessionOut has no nested "task" object — task title is stored in checkpoint["goal"] + # by POST /sessions/start when a task_id is provided. + task_title = checkpoint.get("goal", "") + checkpoint_note = checkpoint.get("last_action_summary", checkpoint.get("last_vlm_summary", "")) + + return SessionInfo( + session_id=str(data.get("id", "")), + task_id=data.get("task_id"), + task_title=task_title, + task_goal=task_title, + status=data.get("status", ""), + last_app=checkpoint.get("active_app", ""), + last_file=checkpoint.get("active_file", ""), + checkpoint_note=checkpoint_note, + started_at=data.get("started_at", ""), + ended_at=ended, + minutes_ago=minutes_ago, + ) diff --git a/argus/vlm.py b/argus/vlm.py new file mode 100644 index 0000000..63e0f01 --- /dev/null +++ b/argus/vlm.py @@ -0,0 +1,442 @@ +"""VLM client — supports Ollama (local, default) and Gemini (cloud fallback). + +Sends the current screenshot plus text summaries of recent analyses. +Parses the structured JSON response. 
+""" + +from __future__ import annotations + +import asyncio +import base64 +import json +import logging +import re +from dataclasses import dataclass, field + +import httpx + +from argus.buffer import HistoryBuffer +from argus.config import ( + GEMINI_API_KEY, + GEMINI_URL, + OLLAMA_BASE_URL, + OLLAMA_MODEL, + VLM_BACKEND, +) + +log = logging.getLogger(__name__) + +# ── Task context passed in from the session ────────────────────────────── + + +@dataclass +class StepInfo: + id: str + sort_order: int + title: str + status: str # pending | in_progress | done | skipped + checkpoint_note: str | None = None + + +@dataclass +class TaskContext: + """Context about the user's current task/session, fed into the VLM prompt.""" + + task_title: str = "" + task_goal: str = "" + steps: list[StepInfo] = field(default_factory=list) + window_title: str = "" + session_id: str = "" + + +# ── VLM response schema ───────────────────────────────────────────────── + + +@dataclass +class FrictionInfo: + type: str = "none" + confidence: float = 0.0 + description: str | None = None + proposed_actions: list[dict] = field(default_factory=list) + source_context: str | None = None + target_context: str | None = None + + +@dataclass +class SessionAction: + type: str = "none" # resume | switch | complete | start_new | none + session_id: str | None = None + reason: str = "" + + +@dataclass +class VLMResult: + on_task: bool = True + current_step_id: str | None = None + inferred_task: str = "" + checkpoint_note_update: str | None = None + steps_completed: list[str] = field(default_factory=list) + friction: FrictionInfo = field(default_factory=FrictionInfo) + session_action: SessionAction = field(default_factory=SessionAction) + intent: str | None = None + distraction_type: str | None = None + app_name: str = "" + confidence: float = 0.0 + gentle_nudge: str | None = None + vlm_summary: str = "" + + def to_backend_payload(self, session_id: str) -> dict: + """Serialize to the JSON shape expected by 
POST /distractions/analyze-result.""" + return { + "session_id": session_id, + "on_task": self.on_task, + "current_step_id": self.current_step_id, + "inferred_task": self.inferred_task, + "checkpoint_note_update": self.checkpoint_note_update, + "steps_completed": self.steps_completed, + "friction": { + "type": self.friction.type, + "confidence": self.friction.confidence, + "description": self.friction.description, + "proposed_actions": self.friction.proposed_actions, + "source_context": self.friction.source_context, + "target_context": self.friction.target_context, + }, + "session_action": { + "type": self.session_action.type, + "session_id": self.session_action.session_id, + "reason": self.session_action.reason, + }, + "intent": self.intent, + "distraction_type": self.distraction_type, + "app_name": self.app_name, + "confidence": self.confidence, + "gentle_nudge": self.gentle_nudge, + "vlm_summary": self.vlm_summary, + } + + +# ── Prompt construction ────────────────────────────────────────────────── + + +def _format_steps(steps: list[StepInfo]) -> str: + lines: list[str] = [] + for s in steps: + marker = {"pending": "○", "in_progress": "►", "done": "✓", "skipped": "–"}.get( + s.status, "?" + ) + line = f" {marker} [{s.status}] (id={s.id}) {s.sort_order}. {s.title}" + if s.checkpoint_note: + line += f" — checkpoint: {s.checkpoint_note}" + lines.append(line) + return "\n".join(lines) if lines else " (no steps)" + + +def build_system_prompt(ctx: TaskContext, history: HistoryBuffer, session_context: str = "") -> str: + history_text = history.format_for_prompt() + steps_text = _format_steps(ctx.steps) + prev_output = history.format_last_output() + + prev_section = "" + if prev_output: + prev_section = f""" +Your previous analysis (refine or correct this based on new evidence): +{prev_output} +If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it with new observations. 
+""" + + execution = history.get_last_execution() + exec_section = "" + if execution: + exec_section = f""" +IMPORTANT — An AI agent just completed an action for the user: + {execution} +This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT. +""" + + return f"""\ +You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots. + +## How to read the screenshots + +You receive screenshots in chronological order (oldest first, newest last). +You receive ~5 frames spanning ~20 seconds (one frame every 4 seconds). This means: + - 2 unchanged frames = 8+ seconds idle. That's significant. + - 3+ unchanged frames = 12-20 seconds idle. The user is stuck or distracted. + - If ALL frames are identical, the user has been idle for 20 seconds — definitely flag it. + - If the user wrote code and then 2+ frames show no changes, they are STUCK NOW. +Do NOT wait for many frames to flag problems. React fast. + +Your PRIMARY signal is the DIFFERENCES between consecutive frames. +Where the screen CHANGED = where the user's ATTENTION is. +Where the screen is STATIC = background noise. Ignore it unless the user interacts with it. + +Diff signals and what they mean: + - New text appearing / cursor advancing → user is actively typing (THIS is their task) + - Window or tab switch → context change, could be reference lookup or distraction + - Same content, no pixel changes → stalled, idle, or reading + - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer) + - Scroll position change → reading or browsing + - Error message that APPEARED between frames → user just triggered it, relevant + - Error message that was ALREADY THERE in all frames → stale, ignore it + +## Task inference + +Infer the user's current task from what they are ACTIVELY DOING across the frames. +Do NOT assume static content (old terminal output, background panels, stale errors) +is the task. 
The region of the screen where pixels are changing IS the task. + +CRITICAL — looking at something ≠ working on something: + - User switches to Preview/browser/another app and just LOOKS → this is NOT a new task. + It could be a distraction, a quick reference, or idle browsing. + - User switches to another app AND starts TYPING/EDITING → this might be a new task. + - If the user has an active session and switches away WITHOUT typing in the new app, + they are DISTRACTED from their session, not starting a new task. + - Only infer a new task when there is clear evidence of productive work (typing, editing, + cursor movement between frames) in the new context. + - A single app switch is NEVER enough to infer a new task. Wait for active work. + +If an explicit task is provided below, use it. Otherwise, infer from the screenshots. + Task: {ctx.task_title} + Goal: {ctx.task_goal} + Steps: +{steps_text} + Window title reported by OS: {ctx.window_title} + +## Open sessions from backend (use EXACT session_id values below) +{session_context if session_context else "(no open sessions — suggest start_new if user is working on something)"} +Session matching rules — be STRICT: + - A session matches ONLY if the user is actively editing the session's last_file. + Being in the same app (e.g. VS Code) is NOT enough. The user must be typing/editing + in the specific file listed in the session (e.g. solution.cpp). + - Chatting in a sidebar, reading logs, or browsing in the same app ≠ working on the session. + - If the user is in VS Code but editing a DIFFERENT file or chatting in a panel, + they are NOT on-task for the session. That's a distraction. +If the session's file IS being actively edited, output session_action: resume with the EXACT session_id. +If the user moved to a different open session's file, output session_action: switch with the EXACT session_id. +IMPORTANT: If you propose a friction action (e.g. 
auto_extract) that relates to an existing +session's task, ALSO output session_action: resume with that session's ID. The friction action +and session resume should go together — don't propose work related to a session without +linking it to the session. +If the user finished and closed the session's file, output session_action: complete with the EXACT session_id. +If no sessions exist but the user is actively working, output session_action: start_new (session_id: null). +NEVER invent session IDs. Use only the IDs listed above or null. +{prev_section}{exec_section} +Screenshot timeline: +{history_text} + +## What to analyze + +1. INFERRED TASK: What is the user actually working on right now? Base this on where + the screen is changing, not on static content. +2. CHECKPOINT: What specific progress did the user make across these frames? + Describe what changed (e.g., "typed 3 new lines of C++ code", "scrolled to next section"). +3. FRICTION DETECTION: Is the user stuck in any of these patterns? + - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually) + - STALLED: No meaningful pixel changes across 2+ frames, OR user wrote code then + deleted/undid it (write-then-delete = struggle, NOT "refining") + - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand) + - CONTEXT_OVERHEAD: Many windows open, visibly searching across them + - TASK_RESUMPTION: User just returned to a task from earlier (check text history) + + IMPORTANT signals to catch IMMEDIATELY (do NOT wait many frames): + - User wrote code/text then deleted it → STUCK, not refining. Flag stalled. + - User idle for 2+ frames after deletion → definitely stuck. Flag stalled. + - User switching between source doc and target file repeatedly → TEDIOUS_MANUAL. + This is NOT "fluent workflow." If the user is copying data from one place to + another by switching windows, flag it on the SECOND switch. Don't wait. 
+ - Code is incomplete/wrong and user stopped typing → need help. +4. INTENT: If viewing informational content, is the user skimming, engaged, or unclear? +5. PROPOSED ACTION: If friction detected, suggest what the AI could DO. + Be SPECIFIC: "Extract full text from writeup.pdf into transcript.md" not "Summarize text". + The label should be a concrete verb phrase the user can approve with one tap. + + CRITICAL: The "details" field is the executor agent's instruction manual. Write it as + a natural language spec — the executor has vision too, so tell it where to look and + what to do, not the raw data: + Bad: "Extract data from the document" + Bad: "Help with the code" + Good: "User is writing a report in report.md and has a PDF open with source data. They are manually copying table values from the PDF into markdown. Extract the table from the PDF (visible in screenshots), format as a markdown table matching the style in report.md, and append to the file." + Good: "User is implementing quicksort in solution.cpp but has been idle after writing the function signature. They appear stuck on the partition logic. Provide a working C++ quicksort implementation that fits their existing code structure." + +Respond ONLY with JSON: +{{ + "on_task": true, + "current_step_id": "step UUID or null", + "inferred_task": "what the user is actually working on, based on screen diffs", + "checkpoint_note_update": "what changed across these frames specifically", + "steps_completed": ["UUIDs"], + "friction": {{ + "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none", + "confidence": 0.0-1.0, + "description": "what the user is struggling with, based on diff evidence", + "proposed_actions": [ + {{"label": "specific verb phrase: what to do, from where, to where", "action_type": "auto_extract | brain_dump | auto_fill | summarize | other", "details": "Natural language spec for executor. 
Include: (1) what the user wants done, (2) where to look in the screenshots, (3) EXACT format to use — quote what the user already wrote so the executor matches it (e.g. if user wrote '3 tacos with steak' in plain text, say 'format as plain text lines like the user already started, NOT JSON'), (4) target file. The executor has vision to read screenshots — tell it WHERE to look, not the raw data."}} + ], + "source_context": "just the filename if visible (e.g. writeup.pdf), or app name if no file", + "target_context": "just the filename if visible (e.g. transcript.md), or app name if no file" + }}, + "session_action": {{ + "type": "resume | switch | complete | start_new | none", + "session_id": "uuid of matching session, or null for start_new/none", + "reason": "why this session action is suggested" + }}, + "intent": "skimming | engaged | unclear | null", + "distraction_type": "app_switch | browsing | idle | null", + "app_name": "primary visible application", + "confidence": 0.0-1.0, + "gentle_nudge": "nudge text if distracted, null otherwise", + "vlm_summary": "1-sentence description of what CHANGED across the frames (not what's static)" +}}""" + + +# ── Gemini API call ────────────────────────────────────────────────────── + + +def _extract_json(text: str) -> dict: + """Extract JSON from Gemini response, handling markdown code fences.""" + text = text.strip() + # Strip ```json ... 
``` wrappers + m = re.search(r"```(?:json)?\s*([\s\S]*?)```", text) + if m: + text = m.group(1).strip() + return json.loads(text) + + +def _parse_vlm_response(raw: dict) -> VLMResult: + """Parse the raw JSON dict into a VLMResult dataclass.""" + friction_raw = raw.get("friction", {}) + friction = FrictionInfo( + type=friction_raw.get("type", "none"), + confidence=friction_raw.get("confidence", 0.0), + description=friction_raw.get("description"), + proposed_actions=friction_raw.get("proposed_actions", []), + source_context=friction_raw.get("source_context"), + target_context=friction_raw.get("target_context"), + ) + sa_raw = raw.get("session_action", {}) + session_action = SessionAction( + type=sa_raw.get("type", "none"), + session_id=sa_raw.get("session_id"), + reason=sa_raw.get("reason", ""), + ) + return VLMResult( + on_task=raw.get("on_task", True), + current_step_id=raw.get("current_step_id"), + inferred_task=raw.get("inferred_task", ""), + checkpoint_note_update=raw.get("checkpoint_note_update"), + steps_completed=raw.get("steps_completed", []), + friction=friction, + session_action=session_action, + intent=raw.get("intent"), + distraction_type=raw.get("distraction_type"), + app_name=raw.get("app_name", ""), + confidence=raw.get("confidence", 0.0), + gentle_nudge=raw.get("gentle_nudge"), + vlm_summary=raw.get("vlm_summary", ""), + ) + + +async def _call_ollama(system_prompt: str, b64_images: list[str]) -> str: + """Call Ollama local VLM with multiple images and return raw text.""" + payload = { + "model": OLLAMA_MODEL, + "messages": [ + {"role": "system", "content": system_prompt}, + { + "role": "user", + "content": "/no_think\nAnalyze this screenshot sequence now.", + "images": b64_images, + }, + ], + "stream": False, + "keep_alive": -1, + "options": {"temperature": 0.2}, + } + + async with httpx.AsyncClient(timeout=300.0) as client: + resp = await client.post(f"{OLLAMA_BASE_URL}/api/chat", json=payload) + resp.raise_for_status() + + body = resp.json() + 
return body["message"]["content"] + + +async def _call_gemini(system_prompt: str, b64_images: list[str], api_key: str) -> str: + """Call Gemini Vision API with multiple images and return raw text.""" + # Build parts: interleave images with labels, newest last + parts: list[dict] = [] + total = len(b64_images) + for i, b64 in enumerate(b64_images): + parts.append({"text": f"[Screenshot {i + 1}/{total}]"}) + parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}}) + parts.append({"text": "Analyze this screenshot sequence now."}) + + payload = { + "systemInstruction": {"parts": [{"text": system_prompt}]}, + "contents": [{"parts": parts}], + "generationConfig": { + "temperature": 0.2, + "maxOutputTokens": 4096, + }, + } + + async with httpx.AsyncClient(timeout=60.0) as client: + for attempt in range(3): + resp = await client.post(f"{GEMINI_URL}?key={api_key}", json=payload) + if resp.status_code == 429: + wait = 2 ** attempt + log.warning("Gemini 429 rate limited, retrying in %ds...", wait) + await asyncio.sleep(wait) + continue + resp.raise_for_status() + break + else: + resp.raise_for_status() + + body = resp.json() + parts = body["candidates"][0]["content"]["parts"] + text = "" + for part in parts: + if "text" in part: + text = part["text"] + return text + + +async def analyze_screenshot( + screenshot_jpeg: bytes, + ctx: TaskContext, + history: HistoryBuffer, + *, + api_key: str | None = None, + backend: str | None = None, + session_context: str = "", +) -> VLMResult: + """Analyze a screenshot sequence via Ollama or Gemini. + + Sends all buffered screenshots + the current one as images (oldest first). 
+ """ + which = backend or VLM_BACKEND + system_prompt = build_system_prompt(ctx, history, session_context=session_context) + + # Build image list: buffered (oldest first) + current + b64_images: list[str] = [] + for entry in history.get_entries(): + b64_images.append(base64.b64encode(entry.jpeg).decode()) + b64_images.append(base64.b64encode(screenshot_jpeg).decode()) + + log.debug("Sending %d screenshots to %s", len(b64_images), which) + + if which == "ollama": + text = await _call_ollama(system_prompt, b64_images) + elif which == "gemini": + key = api_key or GEMINI_API_KEY + if not key: + raise RuntimeError("GEMINI_API_KEY not set") + text = await _call_gemini(system_prompt, b64_images, key) + else: + raise ValueError(f"Unknown VLM backend: {which}") + + log.debug("VLM raw response: %s", text) + parsed = _extract_json(text) + return _parse_vlm_response(parsed)