From 483e3c1d00723fe30aa46638930421eb31e8bd73 Mon Sep 17 00:00:00 2001 From: pulipakaa24 Date: Wed, 1 Apr 2026 16:10:30 -0500 Subject: [PATCH] new macOS version --- FloatingHUDView.swift | 284 +++- FloatingPanel.swift | 19 +- GeminiVLMClient.swift | 727 ++++++++-- HistoryBuffer.swift | 189 +++ LockInBro.xcodeproj/project.pbxproj | 18 +- .../xcschemes/xcschememanagement.plist | 14 + .../xcschemes/xcschememanagement.plist | 14 + LockInBro/APIClient.swift | 48 +- LockInBro/FocusSessionView.swift | 4 +- LockInBro/LockInBro.entitlements | 4 - LockInBro/MenuBarView.swift | 30 + LockInBro/Models.swift | 77 +- LockInBro/SessionManager.swift | 1193 +++++++++++++++-- 13 files changed, 2302 insertions(+), 319 deletions(-) create mode 100644 HistoryBuffer.swift create mode 100644 LockInBro.xcodeproj/xcuserdata/adipu.xcuserdatad/xcschemes/xcschememanagement.plist create mode 100644 LockInBro.xcodeproj/xcuserdata/zehuaw.xcuserdatad/xcschemes/xcschememanagement.plist diff --git a/FloatingHUDView.swift b/FloatingHUDView.swift index 1b55421..38a7860 100644 --- a/FloatingHUDView.swift +++ b/FloatingHUDView.swift @@ -1,5 +1,7 @@ // FloatingHUDView.swift — Content for the always-on-top focus HUD panel +// All notifications (friction, nudges, resume) render here — not in system notifications. +import AppKit import SwiftUI struct FloatingHUDView: View { @@ -19,23 +21,24 @@ struct FloatingHUDView: View { .animation(.spring(duration: 0.3), value: session.isExecuting) .animation(.spring(duration: 0.3), value: session.executorOutput?.title) .animation(.spring(duration: 0.3), value: session.monitoringError) + .animation(.spring(duration: 0.3), value: session.nudgeMessage) + .animation(.spring(duration: 0.3), value: session.showingResumeCard) } // MARK: - Header private var header: some View { HStack(spacing: 8) { - Image(systemName: "eye.fill") - .foregroundStyle(.blue) + Image(systemName: session.isSessionActive ? "eye.fill" : "eye") + .foregroundStyle(session.isSessionActive ? .blue : .secondary) .font(.caption) - Text(session.activeTask?.title ?? "Focus Session") + Text(session.activeTask?.title ?? (session.isSessionActive ? "Focus Session" : "Argus Monitoring")) .font(.caption.bold()) .lineLimit(1) Spacer() - // Pulse dot — green when capturing, orange when executing if session.isExecuting { Image(systemName: "bolt.fill") .font(.caption2) @@ -55,24 +58,24 @@ struct FloatingHUDView: View { .padding(.vertical, 10) } - // MARK: - Content + // MARK: - Content (priority order) @ViewBuilder private var content: some View { - // Error / warning banner — shown above all other content when monitoring has a problem + // 1. Error / warning banner if let error = session.monitoringError { MonitoringErrorBanner(message: error) .transition(.move(edge: .top).combined(with: .opacity)) } - // Executor output sticky card (highest priority — persists until dismissed) + // 2. Executor output sticky card (highest priority — persists until dismissed) if let output = session.executorOutput { ExecutorOutputCard(title: output.title, content: output.content) { session.executorOutput = nil } .transition(.move(edge: .top).combined(with: .opacity)) } - // Executing spinner + // 3. Executing spinner else if session.isExecuting { HStack(spacing: 10) { ProgressView() @@ -84,34 +87,196 @@ struct FloatingHUDView: View { .padding(14) .transition(.opacity) } - // Proactive friction card + // 4. Resume card (shown in HUD, not as system overlay) + else if session.showingResumeCard, let card = session.resumeCard { + ResumeCardView(card: card) + .transition(.move(edge: .top).combined(with: .opacity)) + } + // 5. Proactive friction / session action card else if let card = session.proactiveCard { HUDCardView(card: card) .transition(.move(edge: .top).combined(with: .opacity)) } - // Latest VLM summary (idle state) + // 6. Nudge card (amber, shown in HUD instead of system notification) + else if let nudge = session.nudgeMessage { + NudgeCardView(message: nudge) + .transition(.move(edge: .top).combined(with: .opacity)) + } + // 7. Idle state — latest VLM summary else if session.monitoringError == nil { - VStack(alignment: .leading, spacing: 4) { - if let task = session.latestInferredTask, !task.isEmpty { + IdleSummaryView() + .transition(.opacity) + } + } +} + +// MARK: - Nudge Card (amber — replaces UNUserNotificationCenter) + +private struct NudgeCardView: View { + let message: String + @Environment(SessionManager.self) private var session + + var body: some View { + VStack(alignment: .leading, spacing: 8) { + HStack(alignment: .top, spacing: 8) { + Image(systemName: "lightbulb.fill") + .foregroundStyle(.orange) + .font(.caption) + + Text(message) + .font(.caption) + .foregroundStyle(.primary) + .fixedSize(horizontal: false, vertical: true) + .lineLimit(4) + + Spacer(minLength: 0) + + Button { session.dismissNudge() } label: { + Image(systemName: "xmark") + .font(.caption2.bold()) + .foregroundStyle(.secondary) + } + .buttonStyle(.plain) + } + } + .padding(12) + .background(Color.orange.opacity(0.08)) + .overlay( + Rectangle() + .frame(width: 3) + .foregroundStyle(Color.orange), + alignment: .leading + ) + } +} + +// MARK: - Resume Card (warm welcome-back in HUD) + +private struct ResumeCardView: View { + let card: ResumeCard + @Environment(SessionManager.self) private var session + + var body: some View { + VStack(alignment: .leading, spacing: 8) { + HStack(spacing: 8) { + Image(systemName: "arrow.counterclockwise.circle.fill") + .foregroundStyle(.blue) + .font(.caption) + Text(card.welcomeBack) + .font(.caption.bold()) + .foregroundStyle(.blue) + Spacer() + Button { session.showingResumeCard = false } label: { + Image(systemName: "xmark") + .font(.caption2.bold()) + .foregroundStyle(.secondary) + } + .buttonStyle(.plain) + } + + Text(card.youWereDoing) + .font(.caption) + .foregroundStyle(.primary) + .fixedSize(horizontal: false, vertical: true) + + Text(card.nextStep) + .font(.caption) + .foregroundStyle(.secondary) + .fixedSize(horizontal: false, vertical: true) + + Text(card.motivation) + .font(.caption.italic()) + .foregroundStyle(.blue.opacity(0.8)) + .fixedSize(horizontal: false, vertical: true) + + Button("Got it — let's go") { + session.showingResumeCard = false + } + .font(.caption.bold()) + .foregroundStyle(.white) + .padding(.horizontal, 12) + .padding(.vertical, 5) + .background(Color.blue) + .clipShape(.rect(cornerRadius: 6)) + .buttonStyle(.plain) + } + .padding(14) + .background(Color.blue.opacity(0.07)) + } +} + +// MARK: - Idle Summary View + +private struct IdleSummaryView: View { + @Environment(SessionManager.self) private var session + + var body: some View { + VStack(alignment: .leading, spacing: 8) { + // Step progress — only when session is active with steps + if session.isSessionActive && session.totalSteps > 0 { + HStack(spacing: 6) { + Image(systemName: "checklist") + .font(.caption2) + .foregroundStyle(.blue) + Text("Step \(min(session.completedCount + 1, session.totalSteps))/\(session.totalSteps): \(session.currentStep?.title ?? "In progress")") + .font(.caption) + .foregroundStyle(.blue) + .lineLimit(1) + } + Divider() + } + + // Inferred task + if let task = session.latestInferredTask, !task.isEmpty { + VStack(alignment: .leading, spacing: 2) { + Text("DOING NOW") + .font(.system(size: 9, weight: .semibold)) + .foregroundStyle(.secondary) + .tracking(0.5) Text(task) .font(.caption.bold()) .foregroundStyle(.primary) .fixedSize(horizontal: false, vertical: true) .lineLimit(2) } + } + + // App badge + VLM summary + HStack(alignment: .top, spacing: 6) { + if let app = session.latestAppName, !app.isEmpty { + Text(app) + .font(.system(size: 9, weight: .medium)) + .foregroundStyle(.purple) + .padding(.horizontal, 5) + .padding(.vertical, 2) + .background(Color.purple.opacity(0.1)) + .clipShape(.capsule) + .lineLimit(1) + } Text(session.latestVlmSummary ?? "Monitoring your screen…") .font(.caption) .foregroundStyle(.secondary) .fixedSize(horizontal: false, vertical: true) .lineLimit(3) } - .padding(14) - .transition(.opacity) + + // Distraction count badge + if session.isSessionActive && session.distractionCount > 0 { + HStack(spacing: 4) { + Image(systemName: "exclamationmark.triangle") + .font(.system(size: 9)) + .foregroundStyle(.orange) + Text("\(session.distractionCount) distraction\(session.distractionCount == 1 ? "" : "s") this session") + .font(.system(size: 9)) + .foregroundStyle(.orange) + } + } } + .padding(14) } } -// MARK: - HUD Card (friction + proposed actions) +// MARK: - HUD Card (friction + proposed actions / session actions) private struct HUDCardView: View { let card: ProactiveCard @@ -145,7 +310,6 @@ private struct HUDCardView: View { .buttonStyle(.plain) } - // Action buttons actionButtons } .padding(14) @@ -161,24 +325,15 @@ private struct HUDCardView: View { Button { session.approveProactiveCard(actionIndex: index) } label: { - VStack(alignment: .leading, spacing: 2) { - Text(action.label) - .font(.caption.bold()) - .lineLimit(2) - .multilineTextAlignment(.leading) - if let details = action.details, !details.isEmpty { - Text(details) - .font(.caption2) - .foregroundStyle(.purple.opacity(0.7)) - .lineLimit(2) - .multilineTextAlignment(.leading) - } - } - .frame(maxWidth: .infinity, alignment: .leading) - .padding(.horizontal, 10) - .padding(.vertical, 6) - .background(Color.purple.opacity(0.12)) - .clipShape(.rect(cornerRadius: 8)) + Text(action.label) + .font(.caption.bold()) + .lineLimit(2) + .multilineTextAlignment(.leading) + .frame(maxWidth: .infinity, alignment: .leading) + .padding(.horizontal, 10) + .padding(.vertical, 6) + .background(Color.purple.opacity(0.12)) + .clipShape(.rect(cornerRadius: 8)) } .buttonStyle(.plain) .foregroundStyle(.purple) @@ -186,7 +341,7 @@ private struct HUDCardView: View { notNowButton } - case .sessionAction(let type, _, _, _, _): + case .sessionAction(let type, _, _, _, _, _): VStack(alignment: .leading, spacing: 6) { Button { session.approveProactiveCard(actionIndex: 0) @@ -201,6 +356,26 @@ private struct HUDCardView: View { } .buttonStyle(.plain) .foregroundStyle(.purple) + + notNowButton + } + + case .appSwitchLoop: + VStack(alignment: .leading, spacing: 6) { + Button { + session.approveProactiveCard(actionIndex: 0) + } label: { + Text("Help me with this") + .font(.caption.bold()) + .frame(maxWidth: .infinity, alignment: .leading) + .padding(.horizontal, 10) + .padding(.vertical, 6) + .background(Color.purple.opacity(0.12)) + .clipShape(.rect(cornerRadius: 8)) + } + .buttonStyle(.plain) + .foregroundStyle(.purple) + notNowButton } @@ -222,7 +397,7 @@ private struct HUDCardView: View { case "resume": return "Resume session" case "switch": return "Switch to this task" case "complete": return "Mark complete" - case "start_new": return "Start focus session" + case "start_new": return "Create task + start focus session" default: return "OK" } } @@ -233,7 +408,7 @@ private struct HUDCardView: View { return description ?? "I noticed something that might be slowing you down." case .appSwitchLoop(let apps, let count): return "You've switched between \(apps.joined(separator: " ↔ ")) \(count)× — are you stuck?" - case .sessionAction(_, _, let checkpoint, let reason, _): + case .sessionAction(_, _, let checkpoint, let reason, _, _): if !checkpoint.isEmpty { return "Left off: \(checkpoint)" } return reason.isEmpty ? "Argus noticed a session change." : reason } @@ -293,6 +468,13 @@ private struct ExecutorOutputCard: View { let content: String let onDismiss: () -> Void + @State private var copied = false + + private var maxScrollHeight: CGFloat { + let screenHeight = NSScreen.main?.visibleFrame.height ?? 800 + return max(120, screenHeight - 157) + } + var body: some View { VStack(alignment: .leading, spacing: 8) { HStack(spacing: 6) { @@ -316,16 +498,34 @@ private struct ExecutorOutputCard: View { Text(content) .font(.caption) .foregroundStyle(.primary) + .textSelection(.enabled) .fixedSize(horizontal: false, vertical: true) .frame(maxWidth: .infinity, alignment: .leading) } - .frame(maxHeight: 120) + .frame(maxHeight: maxScrollHeight) - Button("Dismiss") { onDismiss() } - .font(.caption) - .foregroundStyle(.secondary) + HStack { + Button("Dismiss") { onDismiss() } + .font(.caption) + .foregroundStyle(.secondary) + .buttonStyle(.plain) + Spacer() + Button { + NSPasteboard.general.clearContents() + NSPasteboard.general.setString(content, forType: .string) + copied = true + Task { + try? await Task.sleep(for: .seconds(2)) + copied = false + } + } label: { + Label(copied ? "Copied!" : "Copy", systemImage: copied ? "checkmark" : "doc.on.doc") + .font(.caption.bold()) + .foregroundStyle(copied ? AnyShapeStyle(.secondary) : AnyShapeStyle(Color.green)) + } .buttonStyle(.plain) - .frame(maxWidth: .infinity, alignment: .trailing) + .animation(.easeInOut(duration: 0.15), value: copied) + } } .padding(14) .background(Color.green.opacity(0.07)) diff --git a/FloatingPanel.swift b/FloatingPanel.swift index 2fe6818..dd6fe24 100644 --- a/FloatingPanel.swift +++ b/FloatingPanel.swift @@ -32,6 +32,11 @@ final class FloatingPanel: NSPanel { // Don't activate the app when clicked (user keeps focus on their work) becomesKeyOnlyIfNeeded = true } + + // Allow the panel to become key so buttons inside it can receive clicks. + // Combined with .nonactivatingPanel, this lets buttons work without + // stealing focus from the user's active app. + override var canBecomeKey: Bool { true } } // MARK: - Controller @@ -49,14 +54,20 @@ final class FloatingPanelController { let p = FloatingPanel() let hud = FloatingHUDView() .environment(session) - p.contentView = NSHostingView(rootView: hud) - // Position: top-right of the main screen, just below the menu bar + // NSHostingController gives proper preferredContentSize tracking so the + // panel auto-resizes as SwiftUI content grows or shrinks. + let controller = NSHostingController(rootView: hud) + p.contentViewController = controller + + // Position: top-right of the main screen, just below the menu bar. + // Anchor the top edge so the panel grows downward as content expands. if let screen = NSScreen.main { let margin: CGFloat = 16 let x = screen.visibleFrame.maxX - 320 - margin - let y = screen.visibleFrame.maxY - 160 - margin - p.setFrameOrigin(NSPoint(x: x, y: y)) + // Place top edge just below the menu bar + let topY = screen.visibleFrame.maxY - margin + p.setFrameTopLeftPoint(NSPoint(x: x, y: topY)) } else { p.center() } diff --git a/GeminiVLMClient.swift b/GeminiVLMClient.swift index d32536a..99cb7ac 100644 --- a/GeminiVLMClient.swift +++ b/GeminiVLMClient.swift @@ -1,5 +1,5 @@ -// GeminiVLMClient.swift — Native Swift Gemini Vision API client -// Ports the Python argus VLM analysis (vlm.py) directly into Swift. +// GeminiVLMClient.swift — Native Swift Gemini Vision API client + Agentic Executor +// Ports the Python argus VLM analysis (vlm.py) and executor (executor.py) into Swift. // No subprocess required: screenshots go straight from ScreenCaptureKit → Gemini → UI. import Foundation @@ -7,36 +7,81 @@ import Foundation struct GeminiVLMClient { private static let apiBase = "https://generativelanguage.googleapis.com/v1beta/models" - private static let model = "gemini-3.1-pro-preview" + private static let analysisModel = "gemini-3-flash-preview" + private static let executorModel = "gemini-3-flash-preview" let apiKey: String - // MARK: - Public + // MARK: - Files API Upload + + /// Upload a single JPEG frame to the Gemini Files API. + /// Returns the file URI which can be reused in subsequent VLM requests, + /// avoiding redundant base64 re-encoding of frames already seen by the model. + func uploadFrame(_ data: Data) async throws -> String { + let urlStr = "https://generativelanguage.googleapis.com/upload/v1beta/files?uploadType=multipart&key=\(apiKey)" + guard let url = URL(string: urlStr) else { throw URLError(.badURL) } + + let boundary = "frameboundary-\(UUID().uuidString.prefix(16))" + var body = Data() + let meta = "{\"file\":{\"display_name\":\"frame\"}}" + body.append("--\(boundary)\r\nContent-Type: application/json; charset=UTF-8\r\n\r\n\(meta)\r\n".data(using: .utf8)!) + body.append("--\(boundary)\r\nContent-Type: image/jpeg\r\n\r\n".data(using: .utf8)!) + body.append(data) + body.append("\r\n--\(boundary)--\r\n".data(using: .utf8)!) + + var request = URLRequest(url: url) + request.httpMethod = "POST" + request.setValue("multipart/related; boundary=\(boundary)", forHTTPHeaderField: "Content-Type") + request.httpBody = body + request.timeoutInterval = 30 + + let (responseData, response) = try await URLSession.shared.data(for: request) + if let http = response as? HTTPURLResponse, http.statusCode != 200 { + let msg = String(data: responseData, encoding: .utf8) ?? "HTTP \(http.statusCode)" + print("[GeminiFiles] Upload failed \(http.statusCode): \(msg.prefix(200))") + throw URLError(.badServerResponse) + } + guard let json = try JSONSerialization.jsonObject(with: responseData) as? [String: Any], + let file = json["file"] as? [String: Any], + let uri = file["uri"] as? String + else { + let raw = String(data: responseData, encoding: .utf8) ?? "" + print("[GeminiFiles] Unexpected upload response: \(raw.prefix(200))") + throw URLError(.cannotParseResponse) + } + print("[GeminiFiles] Uploaded \(data.count / 1024)KB → \(uri.suffix(20))") + return uri + } + + // MARK: - VLM Analysis /// Analyze a sequence of JPEG frames and return a structured distraction analysis. - /// - Parameters: - /// - frames: JPEG screenshot frames, oldest first, newest last. - /// - taskTitle: Current task title (empty if no session). - /// - taskGoal: Task description / goal. - /// - steps: Active step list for the current task. - /// - windowTitle: Frontmost app name from NSWorkspace. - /// - recentSummaries: Rolling summaries from previous analyses (temporal context). + /// Pass `fileUris` (parallel to `frames`) to use Gemini Files API URIs for frames that + /// were already uploaded — avoids re-sending base64 for the 3 frames carried over from + /// the previous rolling-window call. Nil entries fall back to inline base64. func analyze( frames: [Data], + fileUris: [String?] = [], taskTitle: String, taskGoal: String, steps: [Step], windowTitle: String, - recentSummaries: [String] + historyContext: String, + sessionContext: String, + lastOutputContext: String, + executionContext: String ) async throws -> DistractionAnalysisResponse { let prompt = buildPrompt( taskTitle: taskTitle, taskGoal: taskGoal, steps: steps, windowTitle: windowTitle, - recentSummaries: recentSummaries + historyContext: historyContext, + sessionContext: sessionContext, + lastOutputContext: lastOutputContext, + executionContext: executionContext ) - let raw = try await callGemini(prompt: prompt, frames: frames) + let raw = try await callGemini(prompt: prompt, frames: frames, fileUris: fileUris, maxOutputTokens: 1024) return try parseResponse(raw) } @@ -47,11 +92,14 @@ struct GeminiVLMClient { taskGoal: String, steps: [Step], windowTitle: String, - recentSummaries: [String] + historyContext: String, + sessionContext: String, + lastOutputContext: String, + executionContext: String ) -> String { let stepsText: String if steps.isEmpty { - stepsText = " (no steps defined)" + stepsText = " (no steps)" } else { stepsText = steps.map { s in let marker: String @@ -67,14 +115,12 @@ struct GeminiVLMClient { }.joined(separator: "\n") } - let historyText: String - if recentSummaries.isEmpty { - historyText = " (no previous frames)" - } else { - historyText = recentSummaries.enumerated() - .map { i, s in " [frame \(i + 1)] \(s)" } - .joined(separator: "\n") - } + let sessionSection = sessionContext.isEmpty + ? "(no open sessions — suggest start_new if user is actively working on something)" + : sessionContext + + let prevSection = lastOutputContext.isEmpty ? "" : "\n\(lastOutputContext)" + let execSection = executionContext.isEmpty ? "" : "\n\(executionContext)" return """ You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots. @@ -82,142 +128,460 @@ struct GeminiVLMClient { ## How to read the screenshots You receive screenshots in chronological order (oldest first, newest last). - Each frame is ~5 seconds apart. This means: - - 2 unchanged frames = ~10 seconds idle — significant. - - 3 unchanged frames = ~15 seconds idle — user is stuck or distracted. - - If ALL frames are identical, the user has been idle for 15+ seconds — flag it. + You receive ~4 frames spanning ~20 seconds (one frame every 5 seconds). This means: + - 2 unchanged frames = 10+ seconds idle. That's significant. + - 3+ unchanged frames = 15-20 seconds idle. The user is stuck or distracted. + - If ALL frames are identical, the user has been idle for 20 seconds — definitely flag it. + - If the user wrote code/text and then 2+ frames show no changes, they are STUCK NOW. + Do NOT wait for many frames to flag problems. React fast. Your PRIMARY signal is the DIFFERENCES between consecutive frames. - Where the screen CHANGED = where attention is. Static areas = ignore. + Where the screen CHANGED = where the user's ATTENTION is. + Where the screen is STATIC = background noise. Ignore it. Diff signals and what they mean: - - New text appearing / cursor advancing → user is actively typing (this IS their task) + - New text appearing / cursor advancing → user is actively typing (THIS is their task) - Window or tab switch → context change, could be reference or distraction - Same content, no pixel changes → stalled, idle, or reading - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer) + - Scroll position change → reading or browsing - Error message that APPEARED between frames → user just triggered it, relevant - - Error message already in ALL frames → stale, ignore + - Error message that was ALREADY THERE in all frames → stale, ignore it CRITICAL — looking at something ≠ working on something: - User switches to browser/another app and just LOOKS → distraction or quick reference. - User switches and starts TYPING/EDITING → might be a new task. - If the user has an active session and switches away WITHOUT typing in the new app, they are DISTRACTED from their session, not starting a new task. + - Only infer a new task when there is clear evidence of productive work (typing, editing, + cursor movement between frames) in the new context. - A single app switch is NEVER enough to infer a new task. Wait for active work. - ## Current task context + ## Current state: \(taskTitle.isEmpty ? "MONITORING MODE (no active focus session)" : "FOCUS SESSION on \"\(taskTitle)\"") - Task: \(taskTitle.isEmpty ? "(no active task)" : taskTitle) - Goal: \(taskGoal.isEmpty ? taskTitle : taskGoal) - Steps: - \(stepsText) + \(taskTitle.isEmpty ? "" : "Task: \(taskTitle)\nGoal: \(taskGoal.isEmpty ? taskTitle : taskGoal)\nSteps:\n\(stepsText)") Window title (OS): \(windowTitle.isEmpty ? "(unknown)" : windowTitle) - ## Recent screen history (for temporal context) - \(historyText) + \(taskTitle.isEmpty ? """ + You are in MONITORING MODE — no focus session is active. + Rules for monitoring mode: + - NEVER send notification type "nudge". Nudges are only for active focus sessions. + - Instead, suggest session_action: start_new or resume if the user is actively working. + - If the user is browsing, idle, or doing casual stuff, set notification type "none". + - Do NOT nag the user about incomplete tasks. Only suggest sessions when you see ACTIVE WORK. + """ : """ + IMPORTANT — Do NOT force-fit everything to the current task: + - The current task is what the user WAS working on. They may have MOVED ON. + - If the screen shows UNRELATED work (different app, different topic, different file), + the user is NOT on this task. Set on_task: false. + - If the user has been doing unrelated work for multiple frames, suggest + session_action: complete (they're done) or session_action: start_new (new work). + - Do NOT interpret browsing YouTube, checking email, or working on a different project + as "related to" the current task just because a session is active. + - Your job is to OBSERVE what the user is doing, not to anchor to the current task. + """) - ## What to output + ## Open sessions and tasks from backend (use EXACT IDs below) + \(sessionSection) - Analyze the screenshots and return JSON with EXACTLY this structure (no extra fields, no markdown): + Session & task matching rules: + - A session matches ONLY if the user is actively EDITING the session's last_file. + Being in the same app (e.g. VS Code) is NOT enough — must be typing/editing the specific file. + - If the session's file IS being actively edited → session_action: resume with EXACT session_id. + - If the user moved to a different open session's file → session_action: switch with EXACT session_id. + - If the session's task appears DONE → session_action: complete with EXACT session_id. + Completion = the task's GOAL is visibly achieved on screen, NOT "all steps checked off." + Steps are AI-generated approximations. A commit, successful build, or "fixed" message + means the task is done regardless of how many steps are still marked pending. + - If the user is working on something matching an UNSTARTED TASK (listed above with task_id), + output session_action: start_new with task_id set to that task's ID. This starts a session + linked to the existing task instead of creating a new one. + - If the user is working on something that matches NO existing session or task, + output session_action: start_new with session_id: null AND task_id: null. + - NEVER invent IDs. Use only the IDs listed above or null. + \(prevSection)\(execSection) + ## Recent screen history (temporal context) + \(historyContext) + + ## What to analyze + + 1. INFERRED TASK: What is the user working on right now? Base this on where pixels changed. + 2. CHECKPOINT: What specific progress did the user make across these frames? + 3. STEP COMPLETION — be AGGRESSIVE about marking steps done: + - Steps are AI-generated APPROXIMATIONS, not a rigid checklist. + - The user might solve the entire task in fewer steps than listed. + - If the screen shows the task's GOAL is achieved (e.g., code compiles, commit succeeded, + file is saved, output looks correct), mark ALL remaining steps as done via steps_completed. + - Look for completion signals: "committed", "fixed", "done", "success", green checkmarks, + successful build output, "pushed", merged PR, closed issue. + - A single action (like an AI agent fixing a bug) can complete multiple steps at once. + - When in doubt about whether a step is done, CHECK THE SCREEN — if the end result is + visible and correct, the intermediate steps don't matter. + 4. TASK/SESSION COMPLETION — detect when the WHOLE task is done: + - If you can see the task's goal is achieved on screen, output session_action: complete. + - Do NOT wait for all steps to be individually checked off. Steps are suggestions. + - Completion signals: successful commit/push, "fixed", moving on to unrelated work, + closing the relevant files, terminal showing success. + - If an AI agent (like Claude Code) just solved the problem and committed, the task is DONE. + 5. FRICTION DETECTION: Is the user stuck in any of these patterns? + - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually) + - STALLED: No meaningful pixel changes across 2+ frames, OR user wrote then deleted/undid + (write-then-delete = struggle, NOT "refining") + - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand) + - CONTEXT_OVERHEAD: Many windows open, visibly searching across them + - TASK_RESUMPTION: User just returned to a task from earlier + IMPORTANT signals to catch IMMEDIATELY: + - User wrote code/text then deleted it → STUCK. Flag stalled. + - User switching between source doc and target file repeatedly → TEDIOUS_MANUAL. + Flag it on the SECOND switch. Don't wait. + 6. NOTIFICATION: Decide what to show the user: + - "none" — user is productively working + - "nudge" — user is idle/distracted, set message to a short reminder + - "friction" — user is stuck and an AI agent can take a concrete action + ONLY use "friction" when proposed_actions has a specific, executable task with a target + 7. PROPOSED ACTION (only when notification.type = "friction"): + The "details" field is the executor agent's full instruction: + Bad: "Extract data from the document" + Good: "User is copying table values from a PDF into markdown. Extract the table from the PDF + (visible in screenshots), format as a markdown table matching the style already in the + file, and append to report.md. The user has been writing plain text tables — match that style." + + Respond ONLY with JSON (no markdown fences): { "on_task": true, "current_step_id": "step UUID or null", - "inferred_task": "what the user is actually working on based on screen diffs", - "checkpoint_note_update": "what specifically changed across these frames", + "inferred_task": "what the user is actually working on, based on screen diffs", + "checkpoint_note_update": "what changed across these frames specifically", "steps_completed": [], "friction": { "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none", "confidence": 0.0, - "description": "what the user is struggling with", + "description": "what the user is struggling with, based on diff evidence", "proposed_actions": [ { - "label": "specific verb phrase: what to do", - "action_type": "auto_extract | brain_dump | other", - "details": "natural language spec for what action to take" + "label": "specific verb phrase the user can approve with one tap", + "details": "Natural language spec: (1) what to do, (2) where to look in screenshots, (3) EXACT format matching what the user already wrote, (4) target file. Concrete enough for an agent to execute without asking questions." } ], - "source_context": "filename or app name, or null", - "target_context": "filename or app name, or null" + "source_context": "filename if visible, or app name", + "target_context": "filename if visible, or app name" }, "session_action": { - "type": "none", - "session_id": null, - "reason": "" + "type": "resume | switch | complete | start_new | none", + "session_id": "uuid of matching session, or null for start_new/none", + "task_id": "uuid of matching unstarted task (for start_new only), or null", + "reason": "why this session action is suggested" + }, + "notification": { + "type": "none | nudge | friction", + "message": "nudge text if type=nudge, null otherwise" }, "intent": "skimming | engaged | unclear | null", "distraction_type": "app_switch | browsing | idle | null", "app_name": "primary visible application", "confidence": 0.8, - "gentle_nudge": "short nudge message if distracted but no friction action applies, otherwise null", "vlm_summary": "1-sentence description of what CHANGED across the frames (not what is static)" } - - FRICTION DETECTION rules: - - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually) - - STALLED: No meaningful pixel changes across 2+ frames; or user wrote then deleted - - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand) - - CONTEXT_OVERHEAD: Many windows open, visibly searching across them - - TASK_RESUMPTION: User just returned to a task they were working on earlier - - If friction confidence < 0.5, set type to "none". - Only set gentle_nudge when user is off-task AND no actionable friction applies. """ } - // MARK: - Action Executor + // MARK: - Agentic Executor (ported from executor.py) - /// Execute a user-approved proactive action and return a plain-text result. + /// Execute a user-approved proactive action using a multi-step agent loop + /// with Gemini function calling. Returns the final output/summary. func executeAction( label: String, - actionType: String, details: String, - screenshot: Data? + frames: [Data], + onToolCall: (@Sendable (String, String) -> Void)? = nil ) async throws -> String { - let taskInstruction: String - switch actionType { - case "auto_extract": - taskInstruction = "Extract the relevant data from the screenshot and present it concisely as plain text." - case "brain_dump": - taskInstruction = "Format this as a short brain-dump note the user should add to their task list." - default: - taskInstruction = "Provide 2–3 concrete next steps the user can take right now." - } - let prompt = """ - You are a productivity assistant. The user approved this action: "\(label)" - Details: \(details.isEmpty ? "(none)" : details) - \(taskInstruction) - Be specific and brief (3–5 sentences max). No markdown, no preamble, plain text only. + let systemPrompt = """ + You are a productivity assistant executing a task the user approved. + Action: "\(label)" + Spec: \(details.isEmpty ? "(none provided)" : details) + + INSTRUCTIONS: + 1. For BINARY files (PDFs, images, etc.): use your VISION. Read content directly + from the screenshots — this is your most reliable source for non-text files. + 2. For TEXT files (code, markdown, configs, txt): use read_file to get exact content. + 3. If you need a file but only know the filename (not the path), FIND IT FIRST: + - run_command("mdfind -name 'filename'") — fast macOS Spotlight search + - run_command("lsof -c AppName | grep filename") — find what file an app has open + Do NOT guess paths. Search first. + 4. Choose the right output method: + - write_file(): For existing text files where the modification is clear and the + file location is known — code files (cpp, py, js, etc.), markdown, configs. + Read the file first, then write the updated version. + NEVER create new files. NEVER write to files you haven't read first. + - output(): For everything else — extracted data from PDFs/images, content for + binary targets (docx, ppt, forms, websites), or when you're unsure where to + put the result. User will review and copy/paste. + 5. Use run_command to compile, test, or search for files. Never to write files. + 6. Do NOT hallucinate content. If you can't read something, say so. + 7. Call done() with a summary when the action is complete. """ - let frames: [Data] = screenshot.map { [$0] } ?? [] - return try await callGemini(prompt: prompt, frames: frames) - } - // MARK: - Gemini REST API Call - - private func callGemini(prompt: String, frames: [Data]) async throws -> String { - let urlStr = "\(Self.apiBase)/\(Self.model):generateContent?key=\(apiKey)" - guard let url = URL(string: urlStr) else { throw URLError(.badURL) } - - // Build content parts: label + image for each frame, then instruction - var parts: [[String: Any]] = [] - let total = frames.count + // Build initial user message with screenshots + var userParts: [[String: Any]] = [] for (i, frame) in frames.enumerated() { - parts.append(["text": "[Screenshot \(i + 1)/\(total) — \((total - i) * 5)s ago]"]) - parts.append([ + userParts.append(["text": "[Screenshot \(i + 1)/\(frames.count)]"]) + userParts.append([ "inlineData": [ "mimeType": "image/jpeg", "data": frame.base64EncodedString() ] ]) } - parts.append(["text": "Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences."]) + userParts.append(["text": "Execute the action now. Use the tools available to you."]) + + var messages: [[String: Any]] = [ + ["role": "user", "parts": userParts] + ] + + let maxSteps = 10 + var filesRead: Set = [] + var outputResult: String? + var doneSummary: String? + + for step in 0.. String { + let expandedPath = NSString(string: path).expandingTildeInPath + guard FileManager.default.fileExists(atPath: expandedPath) else { + return "ERROR: File not found: \(path)" + } + guard FileManager.default.isReadableFile(atPath: expandedPath) else { + return "ERROR: Cannot read file: \(path)" + } + do { + let content = try String(contentsOfFile: expandedPath, encoding: .utf8) + // Truncate very large files + if content.count > 50_000 { + return String(content.prefix(50_000)) + "\n\n[TRUNCATED — file is \(content.count) characters]" + } + return content + } catch { + return "ERROR: \(error.localizedDescription)" + } + } + + nonisolated private func executeWriteFile(path: String, content: String) -> String { + let expandedPath = NSString(string: path).expandingTildeInPath + guard FileManager.default.fileExists(atPath: expandedPath) else { + return "ERROR: File does not exist: \(path). Cannot create new files." + } + do { + try content.write(toFile: expandedPath, atomically: true, encoding: .utf8) + return "OK — wrote \(content.count) characters to \(path)" + } catch { + return "ERROR: \(error.localizedDescription)" + } + } + + nonisolated private func executeRunCommand(command: String) async -> String { + // Safety: block obviously destructive commands + let dangerous = ["rm -rf /", "rm -rf ~", "mkfs", "dd if=", "> /dev/"] + for d in dangerous where command.contains(d) { + return "ERROR: Blocked dangerous command." + } + + return await withCheckedContinuation { continuation in + let process = Process() + process.executableURL = URL(fileURLWithPath: "/bin/zsh") + process.arguments = ["-c", command] + + let stdout = Pipe() + let stderr = Pipe() + process.standardOutput = stdout + process.standardError = stderr + + var hasResumed = false + + // Timeout after 30 seconds + let timeoutWork = DispatchWorkItem { + guard !hasResumed else { return } + hasResumed = true + process.terminate() + continuation.resume(returning: "ERROR: Command timed out after 30s.") + } + DispatchQueue.global().asyncAfter(deadline: .now() + 30, execute: timeoutWork) + + process.terminationHandler = { _ in + timeoutWork.cancel() + guard !hasResumed else { return } + hasResumed = true + + let outData = stdout.fileHandleForReading.readDataToEndOfFile() + let errData = stderr.fileHandleForReading.readDataToEndOfFile() + let out = String(data: outData, encoding: .utf8) ?? "" + let err = String(data: errData, encoding: .utf8) ?? "" + + var result = "" + if !out.isEmpty { result += out } + if !err.isEmpty { result += (result.isEmpty ? "" : "\n") + "STDERR: " + err } + if result.isEmpty { result = "(no output)" } + + if result.count > 10_000 { + result = String(result.prefix(10_000)) + "\n\n[TRUNCATED]" + } + + if process.terminationStatus != 0 { + result += "\n(exit code: \(process.terminationStatus))" + } + + continuation.resume(returning: result) + } + + do { + try process.run() + } catch { + timeoutWork.cancel() + guard !hasResumed else { return } + hasResumed = true + continuation.resume(returning: "ERROR: \(error.localizedDescription)") + } + } + } + + // MARK: - Gemini API: Analysis (no tools) + + private func callGemini( + prompt: String, + frames: [Data], + fileUris: [String?] = [], + finalInstruction: String = "Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences.", + maxOutputTokens: Int = 1024 + ) async throws -> String { + let urlStr = "\(Self.apiBase)/\(Self.analysisModel):generateContent?key=\(apiKey)" + guard let url = URL(string: urlStr) else { throw URLError(.badURL) } + + var parts: [[String: Any]] = [] + let total = frames.count + var inlineCount = 0 + var uriCount = 0 + for (i, frame) in frames.enumerated() { + let age = (total - i) * 5 // approximate seconds ago + parts.append(["text": "[Screenshot \(i + 1)/\(total) — \(age)s ago]"]) + let uri = i < fileUris.count ? fileUris[i] : nil + if let uri { + // Use Files API URI — no re-upload of this frame's bytes + parts.append(["fileData": ["mimeType": "image/jpeg", "fileUri": uri]]) + uriCount += 1 + } else { + // Fallback to inline base64 (newest frame, or upload not yet complete) + parts.append(["inlineData": ["mimeType": "image/jpeg", "data": frame.base64EncodedString()]]) + inlineCount += 1 + } + } + print("[GeminiVLM] Sending \(uriCount) URI frames + \(inlineCount) inline frames") + parts.append(["text": finalInstruction]) let body: [String: Any] = [ "systemInstruction": ["parts": [["text": prompt]]], "contents": [["parts": parts]], "generationConfig": [ "temperature": 0.2, - "maxOutputTokens": 1024 + "maxOutputTokens": maxOutputTokens ] ] @@ -247,15 +611,134 @@ struct GeminiVLMClient { throw URLError(.cannotParseResponse) } + if let reason = first["finishReason"] as? String, reason != "STOP" { + print("[GeminiVLM] finishReason=\(reason) — response may be truncated") + } + print("[GeminiVLM] Response (\(text.count) chars): \(text.prefix(200))") return text } + // MARK: - Gemini API: Executor (with function calling) + + /// Gemini function calling tool declarations for the agentic executor. + private var executorTools: [[String: Any]] { + [[ + "functionDeclarations": [ + [ + "name": "read_file", + "description": "Read a plain text file. Returns the file contents as a string.", + "parameters": [ + "type": "object", + "properties": [ + "path": ["type": "string", "description": "Absolute file path to read"] + ], + "required": ["path"] + ] + ], + [ + "name": "write_file", + "description": "Write content to an existing plain text file. You MUST call read_file on this path first. Cannot create new files.", + "parameters": [ + "type": "object", + "properties": [ + "path": ["type": "string", "description": "Absolute file path (must already exist)"], + "content": ["type": "string", "description": "Full file content to write"] + ], + "required": ["path", "content"] + ] + ], + [ + "name": "run_command", + "description": "Execute a shell command and return stdout/stderr. Use for compilation, testing, file discovery (mdfind, lsof). Do not use to write files.", + "parameters": [ + "type": "object", + "properties": [ + "command": ["type": "string", "description": "Shell command to execute"] + ], + "required": ["command"] + ] + ], + [ + "name": "output", + "description": "Display content to the user in a sticky note card. Use for extracted data from PDFs/images, content for binary targets, or when unsure where to put results.", + "parameters": [ + "type": "object", + "properties": [ + "title": ["type": "string", "description": "Card title"], + "content": ["type": "string", "description": "Content to display"] + ], + "required": ["title", "content"] + ] + ], + [ + "name": "done", + "description": "Signal that the action is complete. Always call this when finished.", + "parameters": [ + "type": "object", + "properties": [ + "summary": ["type": "string", "description": "Brief summary of what was done"] + ], + "required": ["summary"] + ] + ] + ] + ]] + } + + /// Call Gemini with function calling enabled. Returns raw response Data. + private func callGeminiWithTools( + systemPrompt: String, + messages: [[String: Any]], + maxOutputTokens: Int = 4096 + ) async throws -> Data { + let urlStr = "\(Self.apiBase)/\(Self.executorModel):generateContent?key=\(apiKey)" + guard let url = URL(string: urlStr) else { throw URLError(.badURL) } + + let body: [String: Any] = [ + "systemInstruction": ["parts": [["text": systemPrompt]]], + "tools": executorTools, + "contents": messages, + "generationConfig": [ + "temperature": 0.2, + "maxOutputTokens": maxOutputTokens + ] + ] + + var request = URLRequest(url: url) + request.httpMethod = "POST" + request.setValue("application/json", forHTTPHeaderField: "Content-Type") + request.httpBody = try JSONSerialization.data(withJSONObject: body) + request.timeoutInterval = 120 + + let (data, response) = try await URLSession.shared.data(for: request) + + if let http = response as? HTTPURLResponse, http.statusCode == 429 { + // Rate limited — wait and retry once + print("[Executor] Rate limited (429) — retrying in 5s") + try await Task.sleep(for: .seconds(5)) + let (retryData, retryResponse) = try await URLSession.shared.data(for: request) + if let retryHttp = retryResponse as? HTTPURLResponse, retryHttp.statusCode != 200 { + let msg = String(data: retryData, encoding: .utf8) ?? "HTTP \(retryHttp.statusCode)" + print("[Executor] Retry failed: \(msg)") + throw URLError(.badServerResponse) + } + return retryData + } + + if let http = response as? HTTPURLResponse, http.statusCode != 200 { + let msg = String(data: data, encoding: .utf8) ?? "HTTP \(http.statusCode)" + print("[Executor] API error \(http.statusCode): \(msg)") + throw URLError(.badServerResponse) + } + + return data + } + // MARK: - Response Parsing private func parseResponse(_ text: String) throws -> DistractionAnalysisResponse { var cleaned = text.trimmingCharacters(in: .whitespacesAndNewlines) - // Strip ```json ... ``` or ``` ... ``` fences if cleaned.hasPrefix("```") { let lines = cleaned.components(separatedBy: "\n") cleaned = lines.dropFirst().joined(separator: "\n") @@ -264,15 +747,57 @@ struct GeminiVLMClient { } cleaned = cleaned.trimmingCharacters(in: .whitespacesAndNewlines) } - // Find JSON object boundaries robustly - guard let start = cleaned.firstIndex(of: "{"), - let end = cleaned.lastIndex(of: "}") else { + + guard let start = cleaned.firstIndex(of: "{") else { throw URLError(.cannotParseResponse) } + + guard let end = cleaned.lastIndex(of: "}") else { + print("[GeminiVLM] Truncated JSON — attempting partial field extraction") + return partialFallback(from: String(cleaned[start...])) + } + let jsonStr = String(cleaned[start...end]) guard let jsonData = jsonStr.data(using: .utf8) else { throw URLError(.cannotParseResponse) } - return try JSONDecoder().decode(DistractionAnalysisResponse.self, from: jsonData) + + do { + return try JSONDecoder().decode(DistractionAnalysisResponse.self, from: jsonData) + } catch { + print("[GeminiVLM] Decode error: \(error) — attempting partial field extraction") + return partialFallback(from: jsonStr) + } + } + + private func partialFallback(from jsonText: String) -> DistractionAnalysisResponse { + let onTask = !jsonText.contains("\"on_task\": false") && !jsonText.contains("\"on_task\":false") + let inferredTask = regexExtract(#""inferred_task"\s*:\s*"((?:[^"\\]|\\.)*)""#, from: jsonText) + let vlmSummary = regexExtract(#""vlm_summary"\s*:\s*"((?:[^"\\]|\\.)*)""#, from: jsonText) + let appName = regexExtract(#""app_name"\s*:\s*"((?:[^"\\]|\\.)*)""#, from: jsonText) + print("[GeminiVLM] Partial recovery — on_task=\(onTask) task=\(inferredTask ?? "nil")") + return DistractionAnalysisResponse( + onTask: onTask, + currentStepId: nil, + inferredTask: inferredTask, + checkpointNoteUpdate: nil, + stepsCompleted: [], + friction: nil, + sessionAction: nil, + notification: nil, + intent: nil, + distractionType: nil, + appName: appName, + confidence: 0.0, + vlmSummary: vlmSummary + ) + } + + private func regexExtract(_ pattern: String, from text: String) -> String? { + guard let regex = try? NSRegularExpression(pattern: pattern), + let match = regex.firstMatch(in: text, range: NSRange(text.startIndex..., in: text)), + let range = Range(match.range(at: 1), in: text) + else { return nil } + return String(text[range]) } } diff --git a/HistoryBuffer.swift b/HistoryBuffer.swift new file mode 100644 index 0000000..9aec0a5 --- /dev/null +++ b/HistoryBuffer.swift @@ -0,0 +1,189 @@ +// HistoryBuffer.swift — Two-tier rolling history for VLM temporal context +// Ports Python argus buffer.py: image tier (recent frames) + text tier (older summaries). +// The VLM sees recent images directly AND gets text context for events 30-60s ago. + +import Foundation + +/// A single buffered screenshot frame with its VLM summary. +struct BufferEntry: Sendable { + let imageData: Data // JPEG bytes + var summary: String // VLM-generated summary (populated after analysis) + let timestamp: Date + var fileUri: String? // Gemini Files API URI (set async after upload; nil = use inline) +} + +/// A text-only summary from an older analysis (images already evicted). +struct TextEntry: Sendable { + let summary: String + let timestamp: Date +} + +/// Two-tier rolling buffer that provides temporal context to the VLM. +/// +/// - **Image tier:** Last N frames (JPEG + summary + timestamp). Sent as images. +/// - **Text tier:** Older summaries that rolled off the image buffer. Sent as text. +/// - **Last output:** Previous VLM JSON result for self-refinement. +/// - **Last execution:** Executor action summary to prevent re-flagging. +/// +/// Only accessed from `SessionManager` on the main actor — no concurrent access. +@MainActor +final class HistoryBuffer { + + private let imageMaxLen: Int + private let textMaxLen: Int + + /// Recent frames — sent as images to the VLM. + private(set) var images: [BufferEntry] = [] + + /// Older summaries — sent as text context. + private(set) var textHistory: [TextEntry] = [] + + /// Full VLM JSON output from last analysis (for self-refinement). + private(set) var lastOutput: String = "" + + /// Summary of last executor action (prevents re-flagging same friction). + private(set) var lastExecution: String = "" + + /// Counter for how many VLM calls since execution was set (clear after 3). + private var executionAge: Int = 0 + + init(imageMaxLen: Int = 4, textMaxLen: Int = 12) { + self.imageMaxLen = imageMaxLen + self.textMaxLen = textMaxLen + } + + // MARK: - Push / Update + + /// Add a new frame to the image buffer. If the buffer is full, the oldest + /// frame's summary is promoted to the text tier before eviction. + func push(imageData: Data, summary: String = "") { + let entry = BufferEntry(imageData: imageData, summary: summary, timestamp: Date()) + + if images.count >= imageMaxLen { + // Promote oldest image's summary to text tier (if non-empty) + let evicted = images.removeFirst() + if !evicted.summary.isEmpty { + textHistory.append(TextEntry(summary: evicted.summary, timestamp: evicted.timestamp)) + if textHistory.count > textMaxLen { + textHistory.removeFirst() + } + } + } + images.append(entry) + } + + /// Update the summary on the most recent image entry (called after VLM returns). + func updateLastSummary(_ summary: String) { + guard !images.isEmpty else { return } + images[images.count - 1].summary = summary + } + + /// Store the Gemini Files API URI for the frame with the given timestamp. + /// Called asynchronously after upload completes — safe because pushes happen at 5s intervals. + func updateFileUri(_ uri: String, forTimestamp ts: Date) { + guard let idx = images.firstIndex(where: { abs($0.timestamp.timeIntervalSince(ts)) < 1.0 }) else { return } + images[idx].fileUri = uri + } + + /// Store the full VLM JSON output for self-refinement on the next call. + func setLastOutput(_ json: String) { + lastOutput = json + } + + /// Store executor action summary. Cleared automatically after 3 VLM iterations. + func setLastExecution(_ summary: String) { + lastExecution = summary + executionAge = 0 + } + + /// Tick execution age — call after each VLM analysis. Clears after 3. + func tickExecutionAge() { + if !lastExecution.isEmpty { + executionAge += 1 + if executionAge >= 3 { + lastExecution = "" + executionAge = 0 + } + } + } + + /// Get all buffered JPEG frames (for sending to VLM as images). + var frameData: [Data] { + images.map(\.imageData) + } + + /// File URIs parallel to frameData — nil means fall back to inline base64 for that frame. + var fileUris: [String?] { + images.map(\.fileUri) + } + + /// Get recent summaries as strings (for recentSummaries parameter). + var recentSummaries: [String] { + images.compactMap { $0.summary.isEmpty ? nil : $0.summary } + } + + /// Clear all state (e.g., on session end). + func clear() { + images.removeAll() + textHistory.removeAll() + lastOutput = "" + lastExecution = "" + executionAge = 0 + } + + // MARK: - Prompt Formatting + + /// Build the temporal context section for the VLM prompt. + /// Returns a formatted string with older text context + image labels. + func formatForPrompt() -> String { + var lines: [String] = [] + + // Older text-only context (no images — just summaries) + if !textHistory.isEmpty { + lines.append("Older context (text only, no images):") + for entry in textHistory { + let age = Int(Date().timeIntervalSince(entry.timestamp)) + lines.append(" - [\(age)s ago] \(entry.summary)") + } + lines.append("") + } + + // Recent image labels (these accompany the actual images sent to the VLM) + if !images.isEmpty { + let total = images.count + lines.append("Recent screenshots (\(total) frames, newest last):") + for (i, entry) in images.enumerated() { + let age = Int(Date().timeIntervalSince(entry.timestamp)) + let isCurrent = (i == images.count - 1) + let label = " - Screenshot \(i + 1)/\(total): [\(isCurrent ? "now" : "\(age)s ago")]" + if !entry.summary.isEmpty { + lines.append("\(label) \(entry.summary)") + } else { + lines.append(label) + } + } + } + + return lines.isEmpty ? "(no previous context)" : lines.joined(separator: "\n") + } + + /// Format the last VLM output for self-refinement injection into the prompt. + func formatLastOutput() -> String { + guard !lastOutput.isEmpty else { return "" } + return """ + Your previous analysis (refine or correct this based on new evidence): + \(lastOutput) + If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it. + """ + } + + /// Format execution context for injection into the prompt. + func formatLastExecution() -> String { + guard !lastExecution.isEmpty else { return "" } + return """ + IMPORTANT — An AI agent just completed an action for the user: + \(lastExecution) + This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT. + """ + } +} diff --git a/LockInBro.xcodeproj/project.pbxproj b/LockInBro.xcodeproj/project.pbxproj index 36f188a..d126db4 100644 --- a/LockInBro.xcodeproj/project.pbxproj +++ b/LockInBro.xcodeproj/project.pbxproj @@ -8,6 +8,7 @@ /* Begin PBXBuildFile section */ FF341F642F7932FA00B5716A /* GeminiVLMClient.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */; }; + FF341F662F793A0000B5716A /* HistoryBuffer.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF341F652F793A0000B5716A /* HistoryBuffer.swift */; }; FF935B1E2F78A83100ED3330 /* SpeakerKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1D2F78A83100ED3330 /* SpeakerKit */; }; FF935B202F78A83100ED3330 /* TTSKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1F2F78A83100ED3330 /* TTSKit */; }; FF935B222F78A83100ED3330 /* WhisperKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B212F78A83100ED3330 /* WhisperKit */; }; @@ -18,6 +19,7 @@ /* Begin PBXFileReference section */ FF3296C22F785B3300C734EB /* LockInBro.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LockInBro.app; sourceTree = BUILT_PRODUCTS_DIR; }; FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GeminiVLMClient.swift; sourceTree = ""; }; + FF341F652F793A0000B5716A /* HistoryBuffer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HistoryBuffer.swift; sourceTree = ""; }; FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingPanel.swift; sourceTree = ""; }; FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingHUDView.swift; sourceTree = ""; }; /* End PBXFileReference section */ @@ -52,6 +54,7 @@ FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */, FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */, FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */, + FF341F652F793A0000B5716A /* HistoryBuffer.swift */, ); sourceTree = ""; }; @@ -146,6 +149,7 @@ FF935B262F78D0BF00ED3330 /* FloatingHUDView.swift in Sources */, FF341F642F7932FA00B5716A /* GeminiVLMClient.swift in Sources */, FF935B242F78D0AA00ED3330 /* FloatingPanel.swift in Sources */, + FF341F662F793A0000B5716A /* HistoryBuffer.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -277,12 +281,10 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_ENTITLEMENTS = LockInBro/LockInBro.entitlements; CODE_SIGN_IDENTITY = "Apple Development"; - "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; - CODE_SIGN_STYLE = Manual; + CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = ""; - "DEVELOPMENT_TEAM[sdk=macosx*]" = YK2DB9NT3S; + DEVELOPMENT_TEAM = YK2DB9NT3S; ENABLE_APP_SANDBOX = NO; ENABLE_PREVIEWS = YES; ENABLE_USER_SELECTED_FILES = readonly; @@ -299,7 +301,6 @@ PRODUCT_BUNDLE_IDENTIFIER = com.adipu.LockInBro; PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; - "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "Joy Zhuo"; REGISTER_APP_GROUPS = YES; STRING_CATALOG_GENERATE_SYMBOLS = YES; SWIFT_APPROACHABLE_CONCURRENCY = YES; @@ -317,12 +318,10 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_ENTITLEMENTS = LockInBro/LockInBro.entitlements; CODE_SIGN_IDENTITY = "Apple Development"; - "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; - CODE_SIGN_STYLE = Manual; + CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = ""; - "DEVELOPMENT_TEAM[sdk=macosx*]" = YK2DB9NT3S; + DEVELOPMENT_TEAM = YK2DB9NT3S; ENABLE_APP_SANDBOX = NO; ENABLE_PREVIEWS = YES; ENABLE_USER_SELECTED_FILES = readonly; @@ -339,7 +338,6 @@ PRODUCT_BUNDLE_IDENTIFIER = com.adipu.LockInBro; PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; - "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "Joy Zhuo"; REGISTER_APP_GROUPS = YES; STRING_CATALOG_GENERATE_SYMBOLS = YES; SWIFT_APPROACHABLE_CONCURRENCY = YES; diff --git a/LockInBro.xcodeproj/xcuserdata/adipu.xcuserdatad/xcschemes/xcschememanagement.plist b/LockInBro.xcodeproj/xcuserdata/adipu.xcuserdatad/xcschemes/xcschememanagement.plist new file mode 100644 index 0000000..1e305ad --- /dev/null +++ b/LockInBro.xcodeproj/xcuserdata/adipu.xcuserdatad/xcschemes/xcschememanagement.plist @@ -0,0 +1,14 @@ + + + + + SchemeUserState + + LockInBro.xcscheme_^#shared#^_ + + orderHint + 0 + + + + diff --git a/LockInBro.xcodeproj/xcuserdata/zehuaw.xcuserdatad/xcschemes/xcschememanagement.plist b/LockInBro.xcodeproj/xcuserdata/zehuaw.xcuserdatad/xcschemes/xcschememanagement.plist new file mode 100644 index 0000000..1e305ad --- /dev/null +++ b/LockInBro.xcodeproj/xcuserdata/zehuaw.xcuserdatad/xcschemes/xcschememanagement.plist @@ -0,0 +1,14 @@ + + + + + SchemeUserState + + LockInBro.xcscheme_^#shared#^_ + + orderHint + 0 + + + + diff --git a/LockInBro/APIClient.swift b/LockInBro/APIClient.swift index 754e193..06df5bc 100644 --- a/LockInBro/APIClient.swift +++ b/LockInBro/APIClient.swift @@ -265,6 +265,26 @@ final class APIClient { // MARK: - Sessions + /// Returns all active + interrupted sessions (for VLM session context). + func getOpenSessions() async throws -> [OpenSession] { + do { + let data = try await req("/sessions/open") + return try decode([OpenSession].self, from: data) + } catch NetworkError.httpError(404, _) { + return [] + } + } + + /// Create a task detected by the VLM from screen analysis. + func createVLMTask(title: String) async throws -> AppTask { + let body = try JSONSerialization.data(withJSONObject: [ + "title": title, + "source": "vlm_detected" + ]) + let data = try await req("/tasks", method: "POST", body: body) + return try decode(AppTask.self, from: data) + } + /// Returns the currently active session, or nil if none (404). func getActiveSession() async throws -> FocusSession? { do { @@ -318,6 +338,28 @@ final class APIClient { _ = try await req("/sessions/\(sessionId)/checkpoint", method: "POST", body: body) } + // MARK: - Nudge (cross-device) + + /// Send a focus-session nudge via the backend push pipeline to all signed-in devices. + func sendNudge( + sessionId: String, + title: String, + body: String, + nudgeNumber: Int, + lastStep: String?, + nextStep: String? + ) async throws { + var dict: [String: Any] = [ + "title": title, + "body": body, + "nudge_number": nudgeNumber, + ] + if let ls = lastStep { dict["last_step"] = ls } + if let ns = nextStep { dict["next_step"] = ns } + let bodyData = try JSONSerialization.data(withJSONObject: dict) + _ = try await req("/sessions/\(sessionId)/nudge", method: "POST", body: bodyData) + } + // MARK: - App Activity func appActivity( @@ -352,14 +394,16 @@ final class APIClient { if let stepId = result.currentStepId { payload["current_step_id"] = stepId } if let note = result.checkpointNoteUpdate { payload["checkpoint_note_update"] = note } if let app = result.appName { payload["app_name"] = app } - if let nudge = result.gentleNudge { payload["gentle_nudge"] = nudge } + if let notif = result.notification { + payload["notification"] = ["type": notif.type, "message": notif.message as Any] + } if let friction = result.friction { payload["friction"] = [ "type": friction.type, "confidence": friction.confidence, "description": friction.description as Any, "proposed_actions": friction.proposedActions.map { - ["label": $0.label, "action_type": $0.actionType, "details": $0.details as Any] + ["label": $0.label, "details": $0.details as Any] }, ] } diff --git a/LockInBro/FocusSessionView.swift b/LockInBro/FocusSessionView.swift index 20639cd..fcad53d 100644 --- a/LockInBro/FocusSessionView.swift +++ b/LockInBro/FocusSessionView.swift @@ -83,7 +83,7 @@ struct FocusSessionView: View { } // Latest nudge - if let nudge = session.lastNudge { + if let nudge = session.nudgeMessage { NudgeCard(message: nudge) } @@ -401,7 +401,7 @@ private struct ProactiveCardView: View { return description ?? "I noticed something that might be slowing you down." case .appSwitchLoop(let apps, let count): return "You've switched between \(apps.joined(separator: " ↔ ")) \(count)× in a row — are you stuck?" - case .sessionAction(_, _, let checkpoint, let reason, _): + case .sessionAction(_, _, let checkpoint, let reason, _, _): return checkpoint.isEmpty ? reason : "Left off: \(checkpoint)" } } diff --git a/LockInBro/LockInBro.entitlements b/LockInBro/LockInBro.entitlements index 5f10c1e..e89b7f3 100644 --- a/LockInBro/LockInBro.entitlements +++ b/LockInBro/LockInBro.entitlements @@ -2,10 +2,6 @@ - com.apple.developer.applesignin - - Default - com.apple.security.app-sandbox diff --git a/LockInBro/MenuBarView.swift b/LockInBro/MenuBarView.swift index bdc1854..4527798 100644 --- a/LockInBro/MenuBarView.swift +++ b/LockInBro/MenuBarView.swift @@ -18,6 +18,11 @@ struct MenuBarView: View { Divider() + // Settings + settingsSection + + Divider() + // Bottom HStack { Text(auth.currentUser?.displayName ?? auth.currentUser?.email ?? "LockInBro") @@ -120,6 +125,31 @@ struct MenuBarView: View { } .padding(.vertical, 4) } + private var settingsSection: some View { + HStack(spacing: 8) { + Image(systemName: "bell.badge") + .foregroundStyle(.secondary) + .frame(width: 16) + Text("Nudge after") + .font(.caption) + .foregroundStyle(.secondary) + Spacer() + Picker("", selection: Binding( + get: { Int(session.distractionThresholdSeconds) }, + set: { session.distractionThresholdSeconds = TimeInterval($0) } + )) { + Text("1 min").tag(60) + Text("2 min").tag(120) + Text("3 min").tag(180) + Text("5 min").tag(300) + } + .pickerStyle(.menu) + .frame(width: 80) + .font(.caption) + } + .padding(.horizontal, 12) + .padding(.vertical, 6) + } } // MARK: - Menu Bar Button diff --git a/LockInBro/Models.swift b/LockInBro/Models.swift index 4902847..55d7f24 100644 --- a/LockInBro/Models.swift +++ b/LockInBro/Models.swift @@ -208,14 +208,8 @@ struct StepPlanResponse: Codable { /// A single action the proactive agent can take on the user's behalf. struct ProposedAction: Codable { - let label: String // e.g. "Extract all 14 events" - let actionType: String // e.g. "auto_extract", "brain_dump" - let details: String? - - enum CodingKeys: String, CodingKey { - case label, details - case actionType = "action_type" - } + let label: String // e.g. "Extract all 14 events into transcript.md" + let details: String? // Executor instruction spec (not shown as UI text) } /// Friction pattern detected by the upgraded Argus VLM prompt. @@ -244,14 +238,25 @@ struct SessionAction: Codable { /// resume | switch | complete | start_new | none let type: String let sessionId: String? + /// When start_new matches an existing task, the VLM sets this to the task's ID. + let taskId: String? let reason: String? enum CodingKeys: String, CodingKey { case type, reason case sessionId = "session_id" + case taskId = "task_id" } } +/// VLM-decided notification intent — replaces the old gentle_nudge field. +struct VLMNotification: Codable { + /// "none" | "nudge" | "friction" + let type: String + /// Populated when type == "nudge"; nil otherwise. + let message: String? +} + struct DistractionAnalysisResponse: Codable { let onTask: Bool let currentStepId: String? @@ -260,12 +265,13 @@ struct DistractionAnalysisResponse: Codable { let stepsCompleted: [String] // Upgraded Argus prompt fields (nil when backend uses legacy prompt) let friction: FrictionInfo? - let sessionAction: SessionAction? // new argus: session lifecycle suggestions + let sessionAction: SessionAction? + /// VLM explicitly decides what to show: none | nudge | friction + let notification: VLMNotification? let intent: String? // skimming | engaged | unclear | null let distractionType: String? let appName: String? let confidence: Double - let gentleNudge: String? let vlmSummary: String? enum CodingKeys: String, CodingKey { @@ -274,12 +280,11 @@ struct DistractionAnalysisResponse: Codable { case inferredTask = "inferred_task" case checkpointNoteUpdate = "checkpoint_note_update" case stepsCompleted = "steps_completed" - case friction, intent + case friction, notification, intent case sessionAction = "session_action" case distractionType = "distraction_type" case appName = "app_name" case confidence - case gentleNudge = "gentle_nudge" case vlmSummary = "vlm_summary" } } @@ -347,11 +352,14 @@ struct ProactiveCard: Identifiable { /// Heuristic app-switch loop detected by NSWorkspace observer (fallback when VLM hasn't returned friction yet). case appSwitchLoop(apps: [String], switchCount: Int) /// VLM suggests a session lifecycle action (new argus: resume, switch, complete, start_new). - case sessionAction(type: String, taskTitle: String, checkpoint: String, reason: String, sessionId: String?) + /// taskId: if start_new matches an existing unstarted task, this is set so we link instead of creating. + case sessionAction(type: String, taskTitle: String, checkpoint: String, reason: String, sessionId: String?, taskId: String?) } let id = UUID() let source: Source + /// For start_new: an existing task from the database that matches the inferred work. + var matchedTask: AppTask? /// Human-readable title for the card header. var title: String { @@ -366,7 +374,7 @@ struct ProactiveCard: Identifiable { } case .appSwitchLoop: return "Repetitive Pattern Detected" - case .sessionAction(let type, let taskTitle, _, _, _): + case .sessionAction(let type, let taskTitle, _, _, _, _): switch type { case "resume": return "Resume: \(taskTitle)" case "switch": return "Switch to: \(taskTitle)" @@ -390,7 +398,7 @@ struct ProactiveCard: Identifiable { } case .appSwitchLoop: return "arrow.triangle.2.circlepath" - case .sessionAction(let type, _, _, _, _): + case .sessionAction(let type, _, _, _, _, _): switch type { case "resume": return "arrow.counterclockwise.circle" case "switch": return "arrow.left.arrow.right" @@ -402,6 +410,45 @@ struct ProactiveCard: Identifiable { } } +// MARK: - Open Sessions (GET /sessions/open — for VLM session context) + +struct OpenSessionTask: Codable { + let title: String + let goal: String? +} + +struct OpenSessionCheckpoint: Codable { + let activeApp: String? + let activeFile: String? + let currentStepId: String? + let lastActionSummary: String? + + enum CodingKeys: String, CodingKey { + case activeApp = "active_app" + case activeFile = "active_file" + case currentStepId = "current_step_id" + case lastActionSummary = "last_action_summary" + } +} + +struct OpenSession: Identifiable, Codable { + let id: String + let taskId: String? + let task: OpenSessionTask? + let status: String // active | interrupted + let platform: String + let startedAt: String + let endedAt: String? + let checkpoint: OpenSessionCheckpoint? + + enum CodingKeys: String, CodingKey { + case id, task, status, platform, checkpoint + case taskId = "task_id" + case startedAt = "started_at" + case endedAt = "ended_at" + } +} + // MARK: - API Error struct APIErrorResponse: Codable { diff --git a/LockInBro/SessionManager.swift b/LockInBro/SessionManager.swift index dfa4945..7ee570b 100644 --- a/LockInBro/SessionManager.swift +++ b/LockInBro/SessionManager.swift @@ -4,9 +4,14 @@ import AppKit import SwiftUI -import UserNotifications import ScreenCaptureKit +/// Background monitoring vs focus session state. +enum MonitoringState { + case monitoring // VLM running, no focus session — Argus background mode + case focusSession // Linked to a specific task, tracking steps, distraction timer active +} + @Observable @MainActor final class SessionManager { @@ -14,40 +19,46 @@ final class SessionManager { // MARK: - State + var monitoringState: MonitoringState = .monitoring + var activeSession: FocusSession? var activeTask: AppTask? var activeSteps: [Step] = [] var currentStepIndex: Int = 0 - var isSessionActive: Bool = false + /// Computed for backward compatibility with UI bindings. + var isSessionActive: Bool { monitoringState == .focusSession } var sessionStartDate: Date? var distractionCount: Int = 0 - var lastNudge: String? - var resumeCard: ResumeCard? - var showingResumeCard: Bool = false var errorMessage: String? var isLoading: Bool = false + // Resume card (shown in HUD, not system notification) + var resumeCard: ResumeCard? + var showingResumeCard: Bool = false + // VLM / proactive agent var proactiveCard: ProactiveCard? var latestVlmSummary: String? var latestInferredTask: String? + var latestAppName: String? var isExecuting: Bool = false var executorOutput: (title: String, content: String)? var monitoringError: String? + // Nudge — shown in HUD as amber card, NOT system notification + var nudgeMessage: String? + @ObservationIgnored private var nudgeTimer: Task? + // Screenshot engine var isCapturing: Bool = false @ObservationIgnored private var captureTask: Task? private let captureInterval: TimeInterval = 5.0 - // Frame buffer — accumulate N frames before calling VLM for temporal diff context - @ObservationIgnored private var frameBuffer: [Data] = [] - private let framesPerVLMCall = 3 - - // Rolling summary history fed as context into subsequent VLM calls - private struct HistoryEntry { let summary: String; let timestamp: Date } - @ObservationIgnored private var screenshotHistory: [HistoryEntry] = [] + // Two-tier history buffer (replaces flat frameBuffer + screenshotHistory) + @ObservationIgnored private var historyBuffer = HistoryBuffer(imageMaxLen: 4, textMaxLen: 12) + @ObservationIgnored private var savedFramesForExecutor: [Data] = [] + private let framesPerVLMCall = 4 // App switch tracking @ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = [] @@ -58,6 +69,47 @@ final class SessionManager { // Proactive card auto-dismiss timer @ObservationIgnored private var proactiveCardTimer: Task? + // Open sessions cache (for VLM session context injection) + @ObservationIgnored private var openSessions: [OpenSession] = [] + @ObservationIgnored private var lastOpenSessionsFetch: Date? + /// Adaptive: 10s during focus session, 30s during monitoring + private var sessionsFetchInterval: TimeInterval { monitoringState == .focusSession ? 10.0 : 30.0 } + + // Cross-device sync: track last known active session to detect remote changes + @ObservationIgnored private var lastKnownActiveSessionId: String? + + // Incomplete tasks cache (tasks without active sessions — for VLM task matching) + @ObservationIgnored private var incompleteTasks: [AppTask] = [] + + // Task caching (original's sophisticated matching) + @ObservationIgnored private var cachedTasks: [AppTask] = [] + @ObservationIgnored private var lastTasksFetch: Date? + + // Inferred task stability tracking (for VLM-initiated task creation) + @ObservationIgnored private var inferredTaskHistory: [String] = [] + private let stableTaskThreshold = 3 + + // Notification dedup — fingerprint-based, prevents spam + @ObservationIgnored private var lastFrictionFingerprint: String = "" + + // Session action dedup — tracks session IDs we've already shown cards for + @ObservationIgnored private var handledSessionActions: Set = [] + + // Distraction timer (active only during focus session) + @ObservationIgnored private var continuousOffTaskStart: Date? + @ObservationIgnored private var consecutiveNudgeCount: Int = 0 + @ObservationIgnored private var lastNudgeSentAt: Date? + private let maxConsecutiveNudges = 5 + + /// Configurable distraction threshold (default 2 minutes). + var distractionThresholdSeconds: TimeInterval { + get { + let stored = UserDefaults.standard.double(forKey: "lockInBro.distractionThreshold") + return stored > 0 ? stored : 120 + } + set { UserDefaults.standard.set(newValue, forKey: "lockInBro.distractionThreshold") } + } + private init() {} // MARK: - Computed @@ -77,74 +129,102 @@ final class SessionManager { // MARK: - Monitoring Lifecycle - /// Immediately shuts down all monitoring without making any API calls. func stopMonitoring() { stopCapture() stopAppObserver() proactiveCardTimer?.cancel() proactiveCardTimer = nil + nudgeTimer?.cancel() + nudgeTimer = nil activeSession = nil activeTask = nil activeSteps = [] - isSessionActive = false + monitoringState = .monitoring sessionStartDate = nil - lastNudge = nil resumeCard = nil showingResumeCard = false proactiveCard = nil + nudgeMessage = nil latestVlmSummary = nil latestInferredTask = nil + latestAppName = nil isExecuting = false executorOutput = nil monitoringError = nil - screenshotHistory = [] - frameBuffer = [] + historyBuffer.clear() + + savedFramesForExecutor = [] persistedSessionId = nil + openSessions = [] + incompleteTasks = [] + cachedTasks = [] + lastTasksFetch = nil + lastOpenSessionsFetch = nil + inferredTaskHistory = [] + lastFrictionFingerprint = "" + handledSessionActions = [] + distractionCount = 0 + continuousOffTaskStart = nil + consecutiveNudgeCount = 0 + lastNudgeSentAt = nil + lastKnownActiveSessionId = nil } - /// Called once after login. Auto-resumes any existing active session and starts the capture loop. + /// Called once after login. Starts VLM monitoring only — no focus session. + /// The VLM observes the screen and suggests resume/switch/start_new when appropriate. func startMonitoring() async { guard TokenStore.shared.token != nil else { return } guard !isCapturing else { return } monitoringError = nil - await requestNotificationPermission() - // Silent preflight — never shows UI; only request permission if not yet granted. if !CGPreflightScreenCaptureAccess() { CGRequestScreenCaptureAccess() monitoringError = "Screen Recording permission required — enable in System Settings → Privacy & Security → Screen Recording, then tap Retry" return } - do { - if let existing = try await APIClient.shared.getActiveSession() { - await autoResumeSession(existing) - } else { - startCapture() - startAppObserver() - } - } catch { - startCapture() - startAppObserver() + // End any stale active session FIRST — app always starts in monitoring mode. + // The interrupted session will appear in open sessions for the VLM to suggest resuming. + if let stale = try? await APIClient.shared.getActiveSession() { + print("[Startup] Found stale session \(stale.id.prefix(8)) — interrupting. VLM will decide what to do.") + _ = try? await APIClient.shared.endSession(sessionId: stale.id, status: "interrupted") } + + // Start VLM capture + app observer — monitoring mode only, no session + startCapture() + startAppObserver() + + // Fetch tasks and open sessions so VLM has context for matching + await fetchTasksIfNeeded() + await fetchOpenSessions() + await fetchIncompleteTasks() } - /// Silently resume an active session found on the backend (no loading UI shown). private func autoResumeSession(_ session: FocusSession) async { activeSession = session persistedSessionId = session.id - isSessionActive = true + monitoringState = .focusSession sessionStartDate = Date() distractionCount = 0 - lastNudge = nil - screenshotHistory = [] - frameBuffer = [] + nudgeMessage = nil + historyBuffer.clear() + + continuousOffTaskStart = nil + consecutiveNudgeCount = 0 + lastNudgeSentAt = nil if let taskId = session.taskId { do { - let tasks = try await APIClient.shared.getTasks() - activeTask = tasks.first(where: { $0.id == taskId }) + if cachedTasks.isEmpty { await fetchTasksIfNeeded() } + activeTask = cachedTasks.first(where: { $0.id == taskId }) + + if activeTask == nil { + cachedTasks = try await APIClient.shared.getTasks() + lastTasksFetch = Date() + activeTask = cachedTasks.first(where: { $0.id == taskId }) + } + if let task = activeTask { let steps = try await APIClient.shared.getSteps(taskId: task.id) activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder } @@ -155,12 +235,318 @@ final class SessionManager { } catch {} } - let shortId = String(session.id.prefix(8)) - let taskLabel = activeTask?.title ?? "(no task)" - latestVlmSummary = "Resumed session \(shortId) · \(taskLabel)" + // Build an informative resume summary for the HUD + if let task = activeTask { + let completed = activeSteps.filter(\.isDone).count + let total = activeSteps.count + var summary = "Resumed: \(task.title)" + if total > 0 { + summary += " (\(completed)/\(total) steps done)" + if let step = currentStep?.title { + summary += " — next: \(step)" + } + } + latestVlmSummary = summary + latestInferredTask = task.title + } else { + let checkpoint = session.checkpoint?.vlmSummary ?? session.checkpoint?.lastActionSummary + if let info = checkpoint, !info.isEmpty { + latestVlmSummary = "Resumed session — \(info)" + } else { + latestVlmSummary = "Resumed session — monitoring your screen…" + } + } - startCapture() - startAppObserver() + if !isCapturing { startCapture() } + if appSwitchObserver == nil { startAppObserver() } + } + + // MARK: - Open Sessions + Task List + Context + + private func fetchOpenSessions() async { + guard TokenStore.shared.token != nil else { return } + do { + openSessions = try await APIClient.shared.getOpenSessions() + lastOpenSessionsFetch = Date() + detectCrossDeviceChanges() + } catch {} + } + + /// Fetch incomplete tasks that don't already have open sessions. + /// These are tasks the user created (brain dump, manual, etc.) but hasn't started working on. + private func fetchIncompleteTasks() async { + guard TokenStore.shared.token != nil else { return } + do { + let allTasks = try await APIClient.shared.getTasks() + let sessionTaskIds = Set(openSessions.compactMap(\.taskId)) + incompleteTasks = allTasks.filter { task in + !task.isDone && !sessionTaskIds.contains(task.id) + } + } catch {} + } + + /// Fetch tasks from the backend (cached for 30s). + private func fetchTasksIfNeeded() async { + if let last = lastTasksFetch, Date().timeIntervalSince(last) < 30 { return } + do { + cachedTasks = try await APIClient.shared.getTasks() + lastTasksFetch = Date() + } catch {} + } + + private func maybeRefreshSessions() async { + guard let lastFetch = lastOpenSessionsFetch else { + await fetchOpenSessions() + await fetchIncompleteTasks() + return + } + if Date().timeIntervalSince(lastFetch) > sessionsFetchInterval { + await fetchOpenSessions() + await fetchIncompleteTasks() + } + } + + // MARK: - Cross-Device Session Sync + + /// Detect when another device ends a focus session we're tracking locally. + /// Does NOT auto-resume — the VLM decides whether to suggest resume/switch/start_new. + private func detectCrossDeviceChanges() { + let serverActiveSession = openSessions.first(where: { $0.status == "active" }) + + // Our local focus session was ended by another device → drop to monitoring + if monitoringState == .focusSession, + let localId = activeSession?.id, + serverActiveSession?.id != localId { + print("[CrossDevice] Session \(localId.prefix(8)) ended by another device") + handleRemoteSessionEnd() + return + } + + // Track what's active on server (for context only — VLM decides what to do) + lastKnownActiveSessionId = serverActiveSession?.id + } + + /// Handle a session that was ended by another device. + private func handleRemoteSessionEnd() { + activeSession = nil + activeTask = nil + activeSteps = [] + monitoringState = .monitoring + sessionStartDate = nil + distractionCount = 0 + nudgeMessage = nil + resumeCard = nil + showingResumeCard = false + proactiveCard = nil + proactiveCardTimer?.cancel() + proactiveCardTimer = nil + nudgeTimer?.cancel() + nudgeTimer = nil + continuousOffTaskStart = nil + consecutiveNudgeCount = 0 + lastNudgeSentAt = nil + persistedSessionId = nil + inferredTaskHistory = [] + latestVlmSummary = "Focus session ended on another device" + // VLM capture continues in monitoring mode + } + + private func formatSessionContext() -> String { + var sections: [String] = [] + + // Open sessions (active + interrupted) + if !openSessions.isEmpty { + let formatter = ISO8601DateFormatter() + let lines: [String] = openSessions.map { s in + let statusTag = "[\(s.status)]" + let taskTitle = s.task?.title ?? "(no task)" + var line = " session_id=\"\(s.id)\" \(statusTag) \"\(taskTitle)\"" + + let app = s.checkpoint?.activeApp ?? "" + if !app.isEmpty { line += " — last in \(app)" } + + let file = s.checkpoint?.activeFile ?? "" + if !file.isEmpty { line += "/\(file)" } + + let note = s.checkpoint?.lastActionSummary ?? "" + if !note.isEmpty { line += ", \"\(note)\"" } + + if let endedAt = s.endedAt, + let date = formatter.date(from: endedAt) { + let minutes = Int(Date().timeIntervalSince(date) / 60) + if minutes > 0 { line += " (paused \(minutes)m ago)" } + } + return line + } + sections.append("Open sessions:\n" + lines.joined(separator: "\n")) + } + + // Incomplete tasks without sessions (from brain dump, manual creation, etc.) + if !incompleteTasks.isEmpty { + let lines: [String] = incompleteTasks.prefix(10).map { t in + var line = " task_id=\"\(t.id)\" [\(t.status)] \"\(t.title)\"" + if let desc = t.description, !desc.isEmpty { + line += " — \(desc.prefix(80))" + } + return line + } + sections.append("Unstarted tasks (no session yet — use task_id in session_action if user is working on one):\n" + lines.joined(separator: "\n")) + } + + if sections.isEmpty { + return "(no open sessions or tasks — suggest start_new if user is actively working on something)" + } + return sections.joined(separator: "\n\n") + } + + // MARK: - Notification Deduplication + + private func frictionFingerprint(_ friction: FrictionInfo) -> String { + let labels = friction.proposedActions.map(\.label).sorted().joined(separator: "|") + return "\(friction.type):\(labels)" + } + + private func shouldNotify(friction: FrictionInfo) -> Bool { + guard friction.isActionable else { return false } + let fingerprint = frictionFingerprint(friction) + if fingerprint == lastFrictionFingerprint { return false } + lastFrictionFingerprint = fingerprint + return true + } + + // MARK: - Task Matching + + /// Find the best matching existing task by comparing inferred task + VLM summary against all non-done tasks. + /// Uses keyword overlap scoring against task title, description, and tags. + func findMatchingTask(for inferredTask: String, vlmSummary: String = "", appName: String = "") -> AppTask? { + let stopWords: Set = ["the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with", "my", + "this", "that", "user", "working", "screen", "currently", "appears", "be", "has", + "are", "was", "been", "being", "it", "its", "at", "by", "from", "not", "but"] + + let combined = "\(inferredTask) \(vlmSummary) \(appName)".lowercased() + let searchWords = Set(combined.split(separator: " ").map(String.init)).subtracting(stopWords) + guard !searchWords.isEmpty else { return nil } + + var bestMatch: AppTask? + var bestScore = 0 + + for task in cachedTasks where task.status != "done" { + var score = 0 + + let titleWords = Set(task.title.lowercased().split(separator: " ").map(String.init)).subtracting(stopWords) + score += searchWords.intersection(titleWords).count * 3 + + if let desc = task.description?.lowercased() { + let descWords = Set(desc.split(separator: " ").map(String.init)).subtracting(stopWords) + score += searchWords.intersection(descWords).count + } + + for tag in task.tags { + if combined.contains(tag.lowercased()) { + score += 2 + } + } + + if combined.contains(task.title.lowercased()) { + score += 10 + } + + if score > bestScore { + bestScore = score + bestMatch = task + } + } + + return bestScore >= 4 ? bestMatch : nil + } + + /// Try to match an inferred task string to an existing incomplete task by keyword overlap. + /// Simpler version for VLM prompt injection. + private func matchInferredTaskToExisting(_ inferredTask: String) -> AppTask? { + let stopWords: Set = ["the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"] + let inferredWords = Set(inferredTask.lowercased().split(separator: " ").map(String.init)) + .subtracting(stopWords) + guard !inferredWords.isEmpty else { return nil } + + var bestMatch: (task: AppTask, score: Int)? + for task in incompleteTasks { + let taskWords = Set(task.title.lowercased().split(separator: " ").map(String.init)) + .subtracting(stopWords) + let overlap = inferredWords.intersection(taskWords).count + if overlap >= 2, overlap > (bestMatch?.score ?? 0) { + bestMatch = (task, overlap) + } + } + return bestMatch?.task + } + + // MARK: - Inferred Task Stability + + private func shouldSuggestNewSession(_ inferredTask: String) -> Bool { + guard !inferredTask.isEmpty, !isSessionActive else { + inferredTaskHistory = [] + return false + } + + inferredTaskHistory.append(inferredTask) + if inferredTaskHistory.count > stableTaskThreshold + 2 { + inferredTaskHistory = Array(inferredTaskHistory.suffix(stableTaskThreshold + 2)) + } + guard inferredTaskHistory.count >= stableTaskThreshold else { return false } + + let recent = Array(inferredTaskHistory.suffix(stableTaskThreshold)) + let stopWords: Set = ["the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"] + let firstWords = Set(recent[0].lowercased().split(separator: " ").map(String.init)) + .subtracting(stopWords) + guard !firstWords.isEmpty else { return false } + + let allSimilar = recent.dropFirst().allSatisfy { task in + let words = Set(task.lowercased().split(separator: " ").map(String.init)) + let overlap = firstWords.intersection(words) + return Double(overlap.count) >= Double(firstWords.count) * 0.5 + } + return allSimilar + } + + // MARK: - VLM-Initiated Task Creation + + func createVLMTaskAndSession(inferredTask: String) async { + isLoading = true + errorMessage = nil + do { + if let stale = activeSession { + _ = try? await APIClient.shared.endSession(sessionId: stale.id, status: "completed") + } + + let task = try await APIClient.shared.createVLMTask(title: inferredTask) + let plan = try? await APIClient.shared.planTask(taskId: task.id) + let steps = (plan?.steps ?? []).sorted { $0.sortOrder < $1.sortOrder } + + let session = try await APIClient.shared.startSession(taskId: task.id) + activeSession = session + persistedSessionId = session.id + activeTask = task + activeSteps = steps + currentStepIndex = 0 + monitoringState = .focusSession + sessionStartDate = Date() + distractionCount = 0 + inferredTaskHistory = [] + historyBuffer.clear() + + continuousOffTaskStart = nil + consecutiveNudgeCount = 0 + lastNudgeSentAt = nil + + await fetchOpenSessions() + + stopCapture() + startCapture() + if appSwitchObserver == nil { startAppObserver() } + } catch { + errorMessage = error.localizedDescription + } + isLoading = false } // MARK: - Session Lifecycle @@ -177,7 +563,6 @@ final class SessionManager { isLoading = true errorMessage = nil do { - // End any existing session first var staleId: String? = activeSession?.id ?? persistedSessionId if staleId == nil { staleId = (try? await APIClient.shared.getActiveSession())?.id @@ -192,12 +577,15 @@ final class SessionManager { activeTask = task activeSteps = [] currentStepIndex = 0 - isSessionActive = true + monitoringState = .focusSession sessionStartDate = Date() distractionCount = 0 - lastNudge = nil - screenshotHistory = [] - frameBuffer = [] + nudgeMessage = nil + historyBuffer.clear() + + continuousOffTaskStart = nil + consecutiveNudgeCount = 0 + lastNudgeSentAt = nil if let task { let steps = try await APIClient.shared.getSteps(taskId: task.id) @@ -207,8 +595,6 @@ final class SessionManager { ?? 0 } - await requestNotificationPermission() - // Restart capture loop (in case it wasn't running or was in monitoring-only mode) stopCapture() startCapture() if appSwitchObserver == nil { startAppObserver() } @@ -218,32 +604,120 @@ final class SessionManager { isLoading = false } + /// Start a session attached to an existing task from the database. + func startSessionWithExistingTask(_ task: AppTask) async { + isLoading = true + errorMessage = nil + do { + if let stale = activeSession { + _ = try? await APIClient.shared.endSession(sessionId: stale.id, status: "completed") + } + + let session = try await APIClient.shared.startSession(taskId: task.id) + activeSession = session + persistedSessionId = session.id + activeTask = task + monitoringState = .focusSession + sessionStartDate = Date() + distractionCount = 0 + inferredTaskHistory = [] + historyBuffer.clear() + + continuousOffTaskStart = nil + consecutiveNudgeCount = 0 + lastNudgeSentAt = nil + + let steps = try await APIClient.shared.getSteps(taskId: task.id) + activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder } + currentStepIndex = activeSteps.firstIndex(where: { $0.isActive }) + ?? activeSteps.firstIndex(where: { $0.status == "pending" }) + ?? 0 + + await fetchOpenSessions() + + stopCapture() + startCapture() + if appSwitchObserver == nil { startAppObserver() } + } catch { + errorMessage = error.localizedDescription + } + isLoading = false + } + + // MARK: - Context Checkpointing + + /// Save a context checkpoint before ending/interrupting a session. + /// Captures the latest VLM state so the resume card has rich context later. + private func saveCheckpoint(for sessionId: String) async { + let stepId = currentStep?.id + let summary = latestVlmSummary + let inferred = latestInferredTask + let app = latestAppName ?? NSWorkspace.shared.frontmostApplication?.localizedName + + var actionSummary = "" + if let inferred, !inferred.isEmpty { actionSummary = inferred } + if let summary, !summary.isEmpty { + actionSummary += actionSummary.isEmpty ? summary : " — \(summary)" + } + + let nextUp: String? = { + guard let step = currentStep else { return nil } + if step.isDone { + return activeSteps.first(where: { $0.status == "pending" })?.title + } + return step.title + }() + + try? await APIClient.shared.checkpointSession( + sessionId: sessionId, + currentStepId: stepId, + lastActionSummary: actionSummary.isEmpty ? nil : String(actionSummary.prefix(500)), + nextUp: nextUp, + goal: activeTask?.description ?? activeTask?.title, + activeApp: app, + lastScreenshotAnalysis: summary, + distractionCount: distractionCount + ) + } + func endSession(status: String = "completed") async { stopCapture() stopAppObserver() if let session = activeSession { + await saveCheckpoint(for: session.id) _ = try? await APIClient.shared.endSession(sessionId: session.id, status: status) } activeSession = nil activeTask = nil activeSteps = [] - isSessionActive = false + monitoringState = .monitoring sessionStartDate = nil - lastNudge = nil resumeCard = nil showingResumeCard = false proactiveCard = nil + nudgeMessage = nil latestVlmSummary = nil latestInferredTask = nil + latestAppName = nil isExecuting = false executorOutput = nil proactiveCardTimer?.cancel() proactiveCardTimer = nil - screenshotHistory = [] - frameBuffer = [] - persistedSessionId = nil + nudgeTimer?.cancel() + nudgeTimer = nil + historyBuffer.clear() - // Keep the capture loop running for app-switch heuristics + savedFramesForExecutor = [] + persistedSessionId = nil + inferredTaskHistory = [] + lastFrictionFingerprint = "" + handledSessionActions = [] + continuousOffTaskStart = nil + consecutiveNudgeCount = 0 + lastNudgeSentAt = nil + lastKnownActiveSessionId = nil + + // Keep capture loop running for always-on monitoring if TokenStore.shared.token != nil { startCapture() startAppObserver() @@ -280,62 +754,219 @@ final class SessionManager { func retryMonitoring() { monitoringError = nil - frameBuffer = [] + historyBuffer.clear() + stopCapture() startCapture() if appSwitchObserver == nil { startAppObserver() } } + // MARK: - Nudge Lifecycle (shown in HUD) + + private func showNudge(_ message: String) { + nudgeTimer?.cancel() + distractionCount += 1 + withAnimation { nudgeMessage = message } + + nudgeTimer = Task { [weak self] in + try? await Task.sleep(for: .seconds(12)) + guard !Task.isCancelled, let self else { return } + await MainActor.run { withAnimation { self.nudgeMessage = nil } } + } + } + + func dismissNudge() { + nudgeTimer?.cancel() + nudgeTimer = nil + withAnimation { nudgeMessage = nil } + } + // MARK: - Proactive Card Lifecycle - private func showProactiveCard(_ card: ProactiveCard) { + private func markSessionActionHandled(_ card: ProactiveCard?) { + if case .sessionAction(_, _, _, _, let sessionId, _) = card?.source, let sessionId { + handledSessionActions.insert(sessionId) + } + } + + private func showProactiveCard(_ card: ProactiveCard, autoDismiss: Bool = true) { proactiveCardTimer?.cancel() withAnimation { proactiveCard = card } - proactiveCardTimer = Task { [weak self] in - try? await Task.sleep(for: .seconds(15)) - guard !Task.isCancelled, let self else { return } - await MainActor.run { self.dismissProactiveCard() } + // Session action cards persist — they require user input. + // Only friction/nudge cards auto-dismiss. + if autoDismiss { + proactiveCardTimer = Task { [weak self] in + try? await Task.sleep(for: .seconds(30)) + guard !Task.isCancelled, let self else { return } + await MainActor.run { self.dismissProactiveCard() } + } } } func dismissProactiveCard() { + markSessionActionHandled(proactiveCard) proactiveCardTimer?.cancel() proactiveCardTimer = nil withAnimation { proactiveCard = nil } } func approveProactiveCard(actionIndex: Int) { + markSessionActionHandled(proactiveCard) proactiveCardTimer?.cancel() proactiveCardTimer = nil let card = proactiveCard withAnimation { proactiveCard = nil } - guard case .vlmFriction(_, _, let actions) = card?.source, - actionIndex < actions.count else { return } - let action = actions[actionIndex] - isExecuting = true - Task { - do { - let screenshot = await captureScreen() - let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? "" - guard !geminiKey.isEmpty else { + + switch card?.source { + case .vlmFriction(_, _, let actions): + guard actionIndex < actions.count else { return } + let action = actions[actionIndex] + isExecuting = true + Task { + do { + let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? "" + guard !geminiKey.isEmpty else { + isExecuting = false + executorOutput = (title: action.label, content: action.details ?? "Action approved.") + return + } + let client = GeminiVLMClient(apiKey: geminiKey) + // Send all buffered frames to the executor for vision context + var frames = historyBuffer.frameData + if let fresh = await captureScreen() { frames.append(fresh) } + savedFramesForExecutor = frames + let result = try await client.executeAction( + label: action.label, + details: action.details ?? "", + frames: frames, + onToolCall: { name, args in + print("[Executor] \(name): \(args)") + } + ) isExecuting = false - executorOutput = (title: action.label, content: action.details ?? "Action approved.") - return + executorOutput = (title: action.label, content: result) + historyBuffer.setLastExecution("Completed: \(action.label). \(result.prefix(200))") + } catch { + isExecuting = false + executorOutput = (title: action.label, content: action.details ?? "Couldn't complete automatically.") } - let client = GeminiVLMClient(apiKey: geminiKey) - let result = try await client.executeAction( - label: action.label, - actionType: action.actionType, - details: action.details ?? "", - screenshot: screenshot - ) - isExecuting = false - executorOutput = (title: action.label, content: result) - } catch { - isExecuting = false - executorOutput = (title: action.label, content: action.details ?? "Couldn't complete automatically.") } + + case .appSwitchLoop(let apps, _): + isExecuting = true + Task { + do { + let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? "" + guard !geminiKey.isEmpty else { + isExecuting = false + executorOutput = (title: "App Switch Help", content: "No Gemini API key set.") + return + } + let client = GeminiVLMClient(apiKey: geminiKey) + let frames = historyBuffer.frameData + let label = "Help with \(apps.joined(separator: " ↔ ")) switching" + let details = """ + The user is repeatedly switching between \(apps.joined(separator: " and ")). \ + Look at the screenshots to understand what they're trying to do across these apps. \ + If they're copying data between them, extract and format the data. \ + If they're looking something up, find and summarize the answer. \ + If they're comparing content, create a consolidated view. \ + Use output() to show the result. + """ + let result = try await client.executeAction( + label: label, + details: details, + frames: frames, + onToolCall: { name, args in + print("[Executor] \(name): \(args)") + } + ) + isExecuting = false + executorOutput = (title: label, content: result) + historyBuffer.setLastExecution("Completed: \(label). \(result.prefix(200))") + } catch { + isExecuting = false + executorOutput = (title: "App Switch Help", content: "Couldn't analyze — \(error.localizedDescription)") + } + } + + case .sessionAction(let type, let taskTitle, _, _, let sessionId, let taskId): + let matchedTask = card?.matchedTask + switch type { + case "start_new": + if actionIndex == 0 && matchedTask != nil { + Task { await startSessionWithExistingTask(matchedTask!) } + } else if actionIndex == 0 && matchedTask == nil { + if let taskId, let existingTask = incompleteTasks.first(where: { $0.id == taskId }) { + Task { await startSession(task: existingTask) } + } else { + Task { await startSession(task: nil) } + } + } else if actionIndex == 1 { + Task { await createVLMTaskAndSession(inferredTask: taskTitle) } + } else if actionIndex == 2 { + Task { await startSession(task: nil) } + } + + case "resume": + guard let sessionId else { break } + Task { + if let openSession = openSessions.first(where: { $0.id == sessionId }) { + let focusSession = FocusSession( + id: openSession.id, + userId: "", + taskId: openSession.taskId, + platform: openSession.platform, + startedAt: openSession.startedAt, + endedAt: nil, + status: "active", + checkpoint: nil + ) + await autoResumeSession(focusSession) + await fetchResumeCard() + } + } + + case "switch": + guard let sessionId else { break } + Task { + if let current = activeSession { + await saveCheckpoint(for: current.id) + _ = try? await APIClient.shared.endSession(sessionId: current.id, status: "interrupted") + } + if let openSession = openSessions.first(where: { $0.id == sessionId }) { + let focusSession = FocusSession( + id: openSession.id, + userId: "", + taskId: openSession.taskId, + platform: openSession.platform, + startedAt: openSession.startedAt, + endedAt: nil, + status: "active", + checkpoint: nil + ) + await autoResumeSession(focusSession) + } + } + + case "complete": + guard let sessionId else { break } + Task { + await saveCheckpoint(for: sessionId) + _ = try? await APIClient.shared.endSession(sessionId: sessionId, status: "completed") + if activeSession?.id == sessionId { + await endSession(status: "completed") + } + await fetchOpenSessions() + } + + default: + break + } + + default: + break } } @@ -373,7 +1004,6 @@ final class SessionManager { guard name != lastApp.name else { return } - // Log previous app dwell time to backend let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt))) let prev = lastApp if let session = activeSession, !prev.name.isEmpty { @@ -418,7 +1048,6 @@ final class SessionManager { isCapturing = true captureTask = Task { [weak self] in guard let self else { return } - // Capture immediately, then repeat on interval await self.captureAndAnalyze() while !Task.isCancelled { try? await Task.sleep(for: .seconds(self.captureInterval)) @@ -436,77 +1065,243 @@ final class SessionManager { /// Capture one frame, buffer it, and call VLM every `framesPerVLMCall` frames. private func captureAndAnalyze() async { - guard let imageData = await captureScreen() else { return } + let t0 = Date() + let ts = { "[t+\(String(format: "%.2f", Date().timeIntervalSince(t0)))s]" } - frameBuffer.append(imageData) - // Keep buffer bounded — rolling window of most recent frames - if frameBuffer.count > framesPerVLMCall { frameBuffer.removeFirst() } + print("\n[TIMING] ── captureAndAnalyze start at \(t0.formatted(.dateTime.hour().minute().second()))") - // Only call VLM once we have a full batch for temporal diff analysis - guard frameBuffer.count >= framesPerVLMCall else { return } + // 1. Capture screenshot + let tCapture = Date() + guard let imageData = await captureScreen() else { + print("[TIMING] captureScreen failed (no permission?)") + return + } + print("[TIMING] \(ts()) captureScreen done — \(imageData.count / 1024)KB (\(String(format: "%.2f", Date().timeIntervalSince(tCapture)))s)") + + // 2. Push into the rolling buffer (oldest auto-evicted when full) + historyBuffer.push(imageData: imageData) + let bufferSize = historyBuffer.images.count + let frameTimestamp = historyBuffer.images.last!.timestamp + print("[TIMING] \(ts()) buffer=\(bufferSize)/\(framesPerVLMCall) frames") let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? "" + + // 2a. Upload this frame to Files API in the background so its URI is ready next cycle. + // The 3 older frames in the rolling buffer were uploaded in previous cycles and already + // have URIs — only the newest frame (just pushed) needs uploading now. + if !geminiKey.isEmpty { + let capturedData = imageData + let capturedTs = frameTimestamp + Task { @MainActor [weak self] in + guard let self else { return } + let client = GeminiVLMClient(apiKey: geminiKey) + if let uri = try? await client.uploadFrame(capturedData) { + self.historyBuffer.updateFileUri(uri, forTimestamp: capturedTs) + } + } + } + + // Warm-up: wait until buffer is full (~20s at 5s/frame) before first inference + guard bufferSize >= framesPerVLMCall else { + print("[TIMING] \(ts()) warming up buffer — skip VLM this cycle") + return + } + guard !geminiKey.isEmpty else { print("[VLM] No Gemini API key set — skipping analysis") return } + // 3. Snapshot the current rolling window for this VLM call + let frames = historyBuffer.frameData + let fileUris = historyBuffer.fileUris + + // 4. Refresh open sessions + build prompt context + await maybeRefreshSessions() + + let tPrompt = Date() let client = GeminiVLMClient(apiKey: geminiKey) let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? "" - let recentSummaries = screenshotHistory.map(\.summary) - let frames = frameBuffer // snapshot before async gap + let historyCtx = historyBuffer.formatForPrompt() + let sessionCtx = formatSessionContext() + let lastOutputCtx = historyBuffer.formatLastOutput() + let execCtx = historyBuffer.formatLastExecution() + let totalBytes = frames.reduce(0) { $0 + $1.count } + print("[TIMING] \(ts()) prompt context ready — \(frames.count) frames \(totalBytes / 1024)KB sessions=\(openSessions.count) (\(String(format: "%.3f", Date().timeIntervalSince(tPrompt)))s)") + // 5. Gemini API call + let tVlm = Date() + print("[TIMING] \(ts()) → sending to Gemini…") do { - print("[VLM] Calling Gemini with \(frames.count) frames…") let result = try await client.analyze( frames: frames, + fileUris: fileUris, taskTitle: activeTask?.title ?? "", taskGoal: activeTask?.description ?? "", steps: activeSteps, windowTitle: windowTitle, - recentSummaries: recentSummaries + historyContext: historyCtx, + sessionContext: sessionCtx, + lastOutputContext: lastOutputCtx, + executionContext: execCtx ) - print("[VLM] Result: on_task=\(result.onTask), friction=\(result.friction?.type ?? "none"), summary=\(result.vlmSummary ?? "")") + print("[TIMING] \(ts()) ← Gemini response received (\(String(format: "%.2f", Date().timeIntervalSince(tVlm)))s)") + print("[VLM] on_task=\(result.onTask) notification=\(result.notification?.type ?? "none") friction=\(result.friction?.type ?? "none") | task: \(result.inferredTask ?? "") | \(result.vlmSummary ?? "")") - // Append to rolling summary history + // 6. Update history buffer + let tApply = Date() if let summary = result.vlmSummary, !summary.isEmpty { - screenshotHistory.append(HistoryEntry(summary: summary, timestamp: Date())) - if screenshotHistory.count > 4 { screenshotHistory.removeFirst() } + historyBuffer.updateLastSummary(summary) } - // Clear frame buffer — next batch starts fresh - frameBuffer.removeAll() + // Store full VLM result for self-refinement (encode key fields as JSON) + let selfRefinementJSON = buildSelfRefinementJSON(result) + historyBuffer.setLastOutput(selfRefinementJSON) + + // Tick execution age (clears after 3 iterations) + historyBuffer.tickExecutionAge() + + // Save frames for executor context + savedFramesForExecutor = frames monitoringError = nil applyDistractionResult(result) - // Post result to backend (fire-and-forget) - if let session = activeSession { - Task { - try? await APIClient.shared.postAnalysisResult(result, sessionId: session.id) + // Auto-link: if session is active but has NO task, try to match against database tasks + if isSessionActive, + activeTask == nil, + proactiveCard == nil, + let inferredTask = result.inferredTask, !inferredTask.isEmpty { + await fetchTasksIfNeeded() + if let matched = findMatchingTask( + for: inferredTask, + vlmSummary: result.vlmSummary ?? "", + appName: result.appName ?? "" + ) { + var card = ProactiveCard(source: .sessionAction( + type: "start_new", + taskTitle: matched.title, + checkpoint: "", + reason: "Want to start a focus session for \"\(matched.title)\"?", + sessionId: nil, + taskId: nil + )) + card.matchedTask = matched + showProactiveCard(card, autoDismiss: false) } } + + // Fallback: if VLM didn't suggest a session but task is stable, suggest start_new + if result.sessionAction?.type == "none" || result.sessionAction == nil, + !isSessionActive, + let inferredTask = result.inferredTask, !inferredTask.isEmpty, + shouldSuggestNewSession(inferredTask), + proactiveCard == nil { + await fetchTasksIfNeeded() + let inferredWork = inferredTask + let matched = findMatchingTask( + for: inferredWork, + vlmSummary: result.vlmSummary ?? "", + appName: result.appName ?? "" + ) + let matchedTaskFromIncomplete = matchInferredTaskToExisting(inferredWork) + let reason: String + if let matched { + reason = "Want to start a focus session for \"\(matched.title)\"?" + } else { + reason = "You've been working on this — want to start a focus session?" + } + var card = ProactiveCard(source: .sessionAction( + type: "start_new", + taskTitle: matched?.title ?? inferredWork, + checkpoint: "", + reason: reason, + sessionId: nil, + taskId: matchedTaskFromIncomplete?.id + )) + card.matchedTask = matched + showProactiveCard(card, autoDismiss: false) + } + print("[TIMING] \(ts()) applyDistractionResult done (\(String(format: "%.3f", Date().timeIntervalSince(tApply)))s)") + + // 7. Backend post + live checkpoint (fire-and-forget) + if let session = activeSession { + let stepId = currentStep?.id + let app = latestAppName + let summary = latestVlmSummary + let dCount = distractionCount + print("[TIMING] \(ts()) posting result + checkpoint to backend…") + Task { + let tBackend = Date() + try? await APIClient.shared.postAnalysisResult(result, sessionId: session.id) + // Keep checkpoint fresh so resume card always has latest context + try? await APIClient.shared.checkpointSession( + sessionId: session.id, + currentStepId: stepId, + activeApp: app, + lastScreenshotAnalysis: summary, + distractionCount: dCount + ) + print("[TIMING] backend POST + checkpoint done (\(String(format: "%.2f", Date().timeIntervalSince(tBackend)))s)") + } + } + + print("[TIMING] ── total cycle: \(String(format: "%.2f", Date().timeIntervalSince(t0)))s") } catch { - print("[VLM] Analysis error: \(error)") - // Don't surface transient errors — the next attempt will retry automatically + print("[TIMING] \(ts()) ✗ Gemini error after \(String(format: "%.2f", Date().timeIntervalSince(tVlm)))s: \(error)") } } + /// Build a concise JSON string of the VLM result for self-refinement context. + private func buildSelfRefinementJSON(_ result: DistractionAnalysisResponse) -> String { + var dict: [String: Any] = [ + "on_task": result.onTask, + "confidence": result.confidence + ] + if let task = result.inferredTask { dict["inferred_task"] = task } + if let summary = result.vlmSummary { dict["vlm_summary"] = summary } + if let friction = result.friction { + dict["friction_type"] = friction.type + if let desc = friction.description { dict["friction_description"] = desc } + } + if let sa = result.sessionAction { + dict["session_action"] = sa.type + } + if let app = result.appName { dict["app_name"] = app } + + guard let data = try? JSONSerialization.data(withJSONObject: dict, options: [.sortedKeys]), + let str = String(data: data, encoding: .utf8) else { + return result.vlmSummary ?? "" + } + return str + } + // MARK: - Screen Capture private func captureScreen() async -> Data? { guard CGPreflightScreenCaptureAccess() else { return nil } do { + let t0 = Date() let content = try await SCShareableContent.current + print("[TIMING] captureScreen: SCShareableContent \(String(format: "%.3f", Date().timeIntervalSince(t0)))s") + guard let display = content.displays.first else { return nil } let config = SCStreamConfiguration() config.width = 1280 config.height = 720 let filter = SCContentFilter(display: display, excludingWindows: []) + + let t1 = Date() let image = try await SCScreenshotManager.captureImage( contentFilter: filter, configuration: config) - return cgImageToJPEG(image) + print("[TIMING] captureScreen: captureImage \(String(format: "%.3f", Date().timeIntervalSince(t1)))s") + + let t2 = Date() + let jpeg = cgImageToJPEG(image) + print("[TIMING] captureScreen: JPEG encode \(String(format: "%.3f", Date().timeIntervalSince(t2)))s → \((jpeg?.count ?? 0) / 1024)KB") + return jpeg } catch { + print("[TIMING] captureScreen: error \(error)") return nil } } @@ -525,6 +1320,7 @@ final class SessionManager { private func applyDistractionResult(_ result: DistractionAnalysisResponse) { if let summary = result.vlmSummary { latestVlmSummary = summary } if let task = result.inferredTask, !task.isEmpty { latestInferredTask = task } + if let app = result.appName, !app.isEmpty { latestAppName = app } // Apply step side-effects for completedId in result.stepsCompleted { @@ -542,48 +1338,167 @@ final class SessionManager { currentStepIndex = idx } - // Notification priority: friction card (formal or has actions) → nudge - if let friction = result.friction { - let shouldShow = friction.isActionable || !friction.proposedActions.isEmpty - if shouldShow { + // Trust VLM's explicit notification decision + switch result.notification?.type { + case "friction": + if let friction = result.friction { if friction.isResumption { - Task { await fetchResumeCard() } - } else if proactiveCard == nil { + // Task resumption → show resume card (skip if already handled) + if let sa = result.sessionAction, sa.type == "resume", let sessionId = sa.sessionId, + !handledSessionActions.contains(sessionId) { + showProactiveCard(ProactiveCard(source: .sessionAction( + type: "resume", + taskTitle: result.inferredTask ?? "Previous task", + checkpoint: friction.description ?? "", + reason: sa.reason ?? "You appear to be returning to this task.", + sessionId: sessionId, + taskId: nil + ))) + } else { + Task { await fetchResumeCard() } + } + } else if shouldNotify(friction: friction), proactiveCard == nil { showProactiveCard(ProactiveCard(source: .vlmFriction( frictionType: friction.type, description: friction.description, actions: friction.proposedActions ))) } - } else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge { - distractionCount += 1 - lastNudge = nudge - sendNudgeNotification(nudge) } - } else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge { - distractionCount += 1 - lastNudge = nudge - sendNudgeNotification(nudge) + case "nudge": + // Only show nudges during active focus sessions — not in monitoring mode. + // In monitoring mode the VLM should suggest start_new/resume, not nag. + if isSessionActive, let nudge = result.notification?.message, !nudge.isEmpty { + showNudge(nudge) + } + default: + break + } + + // Handle session lifecycle suggestions + if let sa = result.sessionAction, sa.type != "none" { + switch sa.type { + case "start_new" where !isSessionActive && proactiveCard == nil: + let inferredWork = result.inferredTask ?? "your current work" + let matchedTaskId = sa.taskId ?? matchInferredTaskToExisting(inferredWork)?.id + let matchedTitle = (matchedTaskId != nil ? incompleteTasks.first(where: { $0.id == matchedTaskId })?.title : nil) ?? inferredWork + // Also try the sophisticated matcher + var card = ProactiveCard(source: .sessionAction( + type: "start_new", + taskTitle: matchedTitle, + checkpoint: "", + reason: sa.reason ?? "You appear to be actively working — want to start a focus session?", + sessionId: nil, + taskId: matchedTaskId + )) + card.matchedTask = findMatchingTask( + for: inferredWork, + vlmSummary: result.vlmSummary ?? "", + appName: result.appName ?? "" + ) + showProactiveCard(card, autoDismiss: false) + case "resume" where proactiveCard == nil && !handledSessionActions.contains(sa.sessionId ?? ""): + let taskTitle = openSessions.first(where: { $0.id == sa.sessionId })?.task?.title + ?? result.inferredTask ?? "Previous task" + let checkpoint = openSessions.first(where: { $0.id == sa.sessionId })? + .checkpoint?.lastActionSummary ?? "" + showProactiveCard(ProactiveCard(source: .sessionAction( + type: "resume", + taskTitle: taskTitle, + checkpoint: checkpoint, + reason: sa.reason ?? "You appear to be returning to this task.", + sessionId: sa.sessionId, + taskId: nil + ))) + case "switch" where proactiveCard == nil && !handledSessionActions.contains(sa.sessionId ?? ""): + let taskTitle = openSessions.first(where: { $0.id == sa.sessionId })?.task?.title + ?? result.inferredTask ?? "Another task" + showProactiveCard(ProactiveCard(source: .sessionAction( + type: "switch", + taskTitle: taskTitle, + checkpoint: "", + reason: sa.reason ?? "You seem to have moved to a different task.", + sessionId: sa.sessionId, + taskId: nil + )), autoDismiss: false) + case "complete" where proactiveCard == nil: + let taskTitle = activeTask?.title ?? result.inferredTask ?? "Current task" + showProactiveCard(ProactiveCard(source: .sessionAction( + type: "complete", + taskTitle: taskTitle, + checkpoint: "", + reason: sa.reason ?? "It looks like you've finished this task.", + sessionId: sa.sessionId ?? activeSession?.id, + taskId: nil + )), autoDismiss: false) + default: + break + } + } + + // MARK: Distraction Timer (focus session only) + if monitoringState == .focusSession { + if result.onTask { + if continuousOffTaskStart != nil { + print("[Distraction] User back on task — timer reset (was \(consecutiveNudgeCount) nudges)") + } + continuousOffTaskStart = nil + consecutiveNudgeCount = 0 + } else { + if continuousOffTaskStart == nil { + continuousOffTaskStart = Date() + } + + let offTaskDuration = Date().timeIntervalSince(continuousOffTaskStart!) + let canNudge = lastNudgeSentAt == nil || + Date().timeIntervalSince(lastNudgeSentAt!) >= distractionThresholdSeconds + + if offTaskDuration >= distractionThresholdSeconds && canNudge { + if consecutiveNudgeCount < maxConsecutiveNudges { + consecutiveNudgeCount += 1 + lastNudgeSentAt = Date() + continuousOffTaskStart = Date() + + let nudgeContent = buildNudgeContent() + showNudge(nudgeContent.body) + + // Backend push to all other devices + if let sessionId = activeSession?.id { + Task { + _ = try? await APIClient.shared.sendNudge( + sessionId: sessionId, + title: nudgeContent.title, + body: nudgeContent.body, + nudgeNumber: consecutiveNudgeCount, + lastStep: nudgeContent.lastStep, + nextStep: nudgeContent.nextStep + ) + } + } + print("[Distraction] Nudge \(consecutiveNudgeCount)/\(maxConsecutiveNudges) sent") + } + + if consecutiveNudgeCount >= maxConsecutiveNudges { + print("[Distraction] 5 nudges reached — auto-stopping session") + Task { await endSession(status: "auto_stopped") } + } + } + } } } - // MARK: - Notifications + // MARK: - Nudge Content - private func sendNudgeNotification(_ nudge: String) { - let content = UNMutableNotificationContent() - content.title = "Hey, quick check-in!" - content.body = nudge - content.sound = .default - let req = UNNotificationRequest( - identifier: UUID().uuidString, - content: content, - trigger: nil - ) - UNUserNotificationCenter.current().add(req) - } + private func buildNudgeContent() -> (title: String, body: String, lastStep: String?, nextStep: String?) { + let lastCompleted = activeSteps.last(where: { $0.isDone })?.title + let nextStepTitle = currentStep?.title - private func requestNotificationPermission() async { - try? await UNUserNotificationCenter.current() - .requestAuthorization(options: [.alert, .sound]) + let title = "Hey, quick check-in!" + var body = "" + if let last = lastCompleted { body += "Done: \(last). " } + if let next = nextStepTitle { body += "Next up: \(next). " } + body += "You got this!" + + return (title, body, lastCompleted, nextStepTitle) } }