include argus workflow

This commit is contained in:
joyzhuo
2026-03-29 06:29:18 -04:00
parent 275a53ab40
commit 56673078f5
23 changed files with 3098 additions and 307 deletions

11
.gitignore vendored Normal file
View File

@@ -0,0 +1,11 @@
# Python venv — run `python3 -m venv .venv && .venv/bin/pip install -r argus/requirements.txt` to recreate
.venv/
# Python bytecode
argus/__pycache__/
argus/**/__pycache__/
*.pyc
*.pyo
# macOS
.DS_Store

View File

@@ -18,6 +18,7 @@ struct FloatingHUDView: View {
.animation(.spring(duration: 0.3), value: session.proactiveCard?.id) .animation(.spring(duration: 0.3), value: session.proactiveCard?.id)
.animation(.spring(duration: 0.3), value: session.isExecuting) .animation(.spring(duration: 0.3), value: session.isExecuting)
.animation(.spring(duration: 0.3), value: session.executorOutput?.title) .animation(.spring(duration: 0.3), value: session.executorOutput?.title)
.animation(.spring(duration: 0.3), value: session.monitoringError)
} }
// MARK: - Header // MARK: - Header
@@ -58,6 +59,12 @@ struct FloatingHUDView: View {
@ViewBuilder @ViewBuilder
private var content: some View { private var content: some View {
// Error / warning banner shown above all other content when monitoring has a problem
if let error = session.monitoringError {
MonitoringErrorBanner(message: error)
.transition(.move(edge: .top).combined(with: .opacity))
}
// Executor output sticky card (highest priority persists until dismissed) // Executor output sticky card (highest priority persists until dismissed)
if let output = session.executorOutput { if let output = session.executorOutput {
ExecutorOutputCard(title: output.title, content: output.content) { ExecutorOutputCard(title: output.title, content: output.content) {
@@ -83,11 +90,21 @@ struct FloatingHUDView: View {
.transition(.move(edge: .top).combined(with: .opacity)) .transition(.move(edge: .top).combined(with: .opacity))
} }
// Latest VLM summary (idle state) // Latest VLM summary (idle state)
else { else if session.monitoringError == nil {
VStack(alignment: .leading, spacing: 4) {
if let task = session.latestInferredTask, !task.isEmpty {
Text(task)
.font(.caption.bold())
.foregroundStyle(.primary)
.fixedSize(horizontal: false, vertical: true)
.lineLimit(2)
}
Text(session.latestVlmSummary ?? "Monitoring your screen…") Text(session.latestVlmSummary ?? "Monitoring your screen…")
.font(.caption) .font(.caption)
.foregroundStyle(.secondary) .foregroundStyle(.secondary)
.fixedSize(horizontal: false, vertical: true) .fixedSize(horizontal: false, vertical: true)
.lineLimit(3)
}
.padding(14) .padding(14)
.transition(.opacity) .transition(.opacity)
} }
@@ -128,8 +145,17 @@ private struct HUDCardView: View {
.buttonStyle(.plain) .buttonStyle(.plain)
} }
// Action buttons only for VLM friction with proposed actions // Action buttons
if case .vlmFriction(_, _, let actions) = card.source, !actions.isEmpty { actionButtons
}
.padding(14)
.background(Color.purple.opacity(0.07))
}
@ViewBuilder
private var actionButtons: some View {
switch card.source {
case .vlmFriction(_, _, let actions) where !actions.isEmpty:
VStack(alignment: .leading, spacing: 6) { VStack(alignment: .leading, spacing: 6) {
ForEach(Array(actions.prefix(2).enumerated()), id: \.offset) { index, action in ForEach(Array(actions.prefix(2).enumerated()), id: \.offset) { index, action in
Button { Button {
@@ -157,18 +183,49 @@ private struct HUDCardView: View {
.buttonStyle(.plain) .buttonStyle(.plain)
.foregroundStyle(.purple) .foregroundStyle(.purple)
} }
notNowButton
}
case .sessionAction(let type, _, _, _, _):
VStack(alignment: .leading, spacing: 6) {
Button {
session.approveProactiveCard(actionIndex: 0)
} label: {
Text(sessionActionButtonLabel(type))
.font(.caption.bold())
.frame(maxWidth: .infinity, alignment: .leading)
.padding(.horizontal, 10)
.padding(.vertical, 6)
.background(Color.purple.opacity(0.12))
.clipShape(.rect(cornerRadius: 8))
}
.buttonStyle(.plain)
.foregroundStyle(.purple)
notNowButton
}
default:
EmptyView()
}
}
private var notNowButton: some View {
Button("Not now — I'm good") { session.dismissProactiveCard() } Button("Not now — I'm good") { session.dismissProactiveCard() }
.font(.caption) .font(.caption)
.foregroundStyle(.secondary) .foregroundStyle(.secondary)
.buttonStyle(.plain) .buttonStyle(.plain)
.padding(.top, 2) .padding(.top, 2)
} }
private func sessionActionButtonLabel(_ type: String) -> String {
switch type {
case "resume": return "Resume session"
case "switch": return "Switch to this task"
case "complete": return "Mark complete"
case "start_new": return "Start focus session"
default: return "OK"
} }
} }
.padding(14)
.background(Color.purple.opacity(0.07))
}
private var bodyText: String { private var bodyText: String {
switch card.source { switch card.source {
@@ -176,10 +233,59 @@ private struct HUDCardView: View {
return description ?? "I noticed something that might be slowing you down." return description ?? "I noticed something that might be slowing you down."
case .appSwitchLoop(let apps, let count): case .appSwitchLoop(let apps, let count):
return "You've switched between \(apps.joined(separator: "")) \(count)× — are you stuck?" return "You've switched between \(apps.joined(separator: "")) \(count)× — are you stuck?"
case .sessionAction(_, _, let checkpoint, let reason, _):
if !checkpoint.isEmpty { return "Left off: \(checkpoint)" }
return reason.isEmpty ? "Argus noticed a session change." : reason
} }
} }
} }
// MARK: - Monitoring Error Banner
private struct MonitoringErrorBanner: View {
// Error / status text to display; also drives the banner's styling via `isRestarting`.
let message: String
@Environment(SessionManager.self) private var session
// Heuristic: treat the banner as a transient "restarting" state (orange, pulsing
// icon, no Retry button) when the message mentions "restarting".
// NOTE(review): substring matching on user-facing text is fragile — a typed
// error/restarting state on SessionManager would be safer; confirm the message
// wording is stable wherever monitoringError is set.
private var isRestarting: Bool { message.contains("restarting") }
var body: some View {
VStack(alignment: .leading, spacing: 8) {
HStack(spacing: 8) {
// Spinning-arrow icon while auto-restarting, warning triangle on hard failure.
Image(systemName: isRestarting ? "arrow.clockwise" : "exclamationmark.triangle.fill")
.font(.caption)
.foregroundStyle(isRestarting ? .orange : .red)
.symbolEffect(.pulse, isActive: isRestarting)
Text(message)
.font(.caption)
.foregroundStyle(isRestarting ? .orange : .red)
.fixedSize(horizontal: false, vertical: true)
Spacer(minLength: 0)
}
// Manual retry only applies to hard failures; while auto-restarting we just show status.
if !isRestarting {
Button("Retry") { session.retryMonitoring() }
.font(.caption.bold())
.foregroundStyle(.white)
.padding(.horizontal, 12)
.padding(.vertical, 5)
.background(Color.red.opacity(0.8))
.clipShape(.rect(cornerRadius: 6))
.buttonStyle(.plain)
}
}
.padding(12)
// Tint the whole banner to match severity (orange = transient, red = failure).
.background(isRestarting ? Color.orange.opacity(0.08) : Color.red.opacity(0.08))
// 3pt accent stripe on the leading edge, colored to match severity.
.overlay(
Rectangle()
.frame(width: 3)
.foregroundStyle(isRestarting ? Color.orange : Color.red),
alignment: .leading
)
}
}
// MARK: - Executor Output Sticky Card // MARK: - Executor Output Sticky Card
private struct ExecutorOutputCard: View { private struct ExecutorOutputCard: View {

278
GeminiVLMClient.swift Normal file
View File

@@ -0,0 +1,278 @@
// GeminiVLMClient.swift Native Swift Gemini Vision API client
// Ports the Python argus VLM analysis (vlm.py) directly into Swift.
// No subprocess required: screenshots go straight from ScreenCaptureKit to the Gemini API.
import Foundation
struct GeminiVLMClient {
    private static let apiBase = "https://generativelanguage.googleapis.com/v1beta/models"
    private static let model = "gemini-3.1-pro-preview"

    /// Gemini API key. Sent via the `x-goog-api-key` request header rather than a
    /// URL query parameter, so the secret never appears in URLs (which routinely
    /// end up in logs, proxies, and crash reports).
    let apiKey: String

    // MARK: - Public

    /// Analyze a sequence of JPEG frames and return a structured distraction analysis.
    /// - Parameters:
    ///   - frames: JPEG screenshot frames, oldest first, newest last.
    ///   - taskTitle: Current task title (empty if no session).
    ///   - taskGoal: Task description / goal.
    ///   - steps: Active step list for the current task.
    ///   - windowTitle: Frontmost app name from NSWorkspace.
    ///   - recentSummaries: Rolling summaries from previous analyses (temporal context).
    /// - Throws: `URLError` for transport failures, non-200 responses, or an
    ///   unparseable reply; `DecodingError` if the model's JSON does not match
    ///   `DistractionAnalysisResponse`.
    func analyze(
        frames: [Data],
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) async throws -> DistractionAnalysisResponse {
        let prompt = buildPrompt(
            taskTitle: taskTitle,
            taskGoal: taskGoal,
            steps: steps,
            windowTitle: windowTitle,
            recentSummaries: recentSummaries
        )
        let raw = try await callGemini(prompt: prompt, frames: frames)
        return try parseResponse(raw)
    }

    // MARK: - Prompt Builder (ported from vlm.py build_system_prompt)

    /// Builds the system prompt describing the frame sequence, task context,
    /// step list, and the exact JSON schema the model must return.
    private func buildPrompt(
        taskTitle: String,
        taskGoal: String,
        steps: [Step],
        windowTitle: String,
        recentSummaries: [String]
    ) -> String {
        let stepsText: String
        if steps.isEmpty {
            stepsText = " (no steps defined)"
        } else {
            stepsText = steps.map { s in
                // Visual status marker so the model can scan step state at a glance.
                // (Restored: the markers were previously empty strings, which made
                // all three known statuses render identically.)
                let marker: String
                switch s.status {
                case "pending": marker = "○"
                case "in_progress": marker = "▶"
                case "done": marker = "✓"
                default: marker = "?"
                }
                var line = " \(marker) [\(s.status)] (id=\(s.id)) \(s.sortOrder). \(s.title)"
                if let note = s.checkpointNote { line += " — checkpoint: \(note)" }
                return line
            }.joined(separator: "\n")
        }
        let historyText: String
        if recentSummaries.isEmpty {
            historyText = " (no previous frames)"
        } else {
            historyText = recentSummaries.enumerated()
                .map { i, s in " [frame \(i + 1)] \(s)" }
                .joined(separator: "\n")
        }
        return """
        You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
        ## How to read the screenshots
        You receive screenshots in chronological order (oldest first, newest last).
        Each frame is ~5 seconds apart. This means:
        - 2 unchanged frames = ~10 seconds idle — significant.
        - 3 unchanged frames = ~15 seconds idle — user is stuck or distracted.
        - If ALL frames are identical, the user has been idle for 15+ seconds — flag it.
        Your PRIMARY signal is the DIFFERENCES between consecutive frames.
        Where the screen CHANGED = where attention is. Static areas = ignore.
        Diff signals and what they mean:
        - New text appearing / cursor advancing → user is actively typing (this IS their task)
        - Window or tab switch → context change, could be reference or distraction
        - Same content, no pixel changes → stalled, idle, or reading
        - Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
        - Error message that APPEARED between frames → user just triggered it, relevant
        - Error message already in ALL frames → stale, ignore
        CRITICAL — looking at something ≠ working on something:
        - User switches to browser/another app and just LOOKS → distraction or quick reference.
        - User switches and starts TYPING/EDITING → might be a new task.
        - If the user has an active session and switches away WITHOUT typing in the new app,
        they are DISTRACTED from their session, not starting a new task.
        - A single app switch is NEVER enough to infer a new task. Wait for active work.
        ## Current task context
        Task: \(taskTitle.isEmpty ? "(no active task)" : taskTitle)
        Goal: \(taskGoal.isEmpty ? taskTitle : taskGoal)
        Steps:
        \(stepsText)
        Window title (OS): \(windowTitle.isEmpty ? "(unknown)" : windowTitle)
        ## Recent screen history (for temporal context)
        \(historyText)
        ## What to output
        Analyze the screenshots and return JSON with EXACTLY this structure (no extra fields, no markdown):
        {
        "on_task": true,
        "current_step_id": "step UUID or null",
        "inferred_task": "what the user is actually working on based on screen diffs",
        "checkpoint_note_update": "what specifically changed across these frames",
        "steps_completed": [],
        "friction": {
        "type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
        "confidence": 0.0,
        "description": "what the user is struggling with",
        "proposed_actions": [
        {
        "label": "specific verb phrase: what to do",
        "action_type": "auto_extract | brain_dump | other",
        "details": "natural language spec for what action to take"
        }
        ],
        "source_context": "filename or app name, or null",
        "target_context": "filename or app name, or null"
        },
        "session_action": {
        "type": "none",
        "session_id": null,
        "reason": ""
        },
        "intent": "skimming | engaged | unclear | null",
        "distraction_type": "app_switch | browsing | idle | null",
        "app_name": "primary visible application",
        "confidence": 0.8,
        "gentle_nudge": "short nudge message if distracted but no friction action applies, otherwise null",
        "vlm_summary": "1-sentence description of what CHANGED across the frames (not what is static)"
        }
        FRICTION DETECTION rules:
        - REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
        - STALLED: No meaningful pixel changes across 2+ frames; or user wrote then deleted
        - TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
        - CONTEXT_OVERHEAD: Many windows open, visibly searching across them
        - TASK_RESUMPTION: User just returned to a task they were working on earlier
        If friction confidence < 0.5, set type to "none".
        Only set gentle_nudge when user is off-task AND no actionable friction applies.
        """
    }

    // MARK: - Action Executor

    /// Execute a user-approved proactive action and return a plain-text result.
    /// - Parameters:
    ///   - label: The action label the user approved (quoted verbatim to the model).
    ///   - actionType: "auto_extract" | "brain_dump" | anything else — selects the instruction.
    ///   - details: Natural-language spec from the analysis step (may be empty).
    ///   - screenshot: Optional JPEG of the current screen for visual context.
    func executeAction(
        label: String,
        actionType: String,
        details: String,
        screenshot: Data?
    ) async throws -> String {
        let taskInstruction: String
        switch actionType {
        case "auto_extract":
            taskInstruction = "Extract the relevant data from the screenshot and present it concisely as plain text."
        case "brain_dump":
            taskInstruction = "Format this as a short brain-dump note the user should add to their task list."
        default:
            // Fixed: previously read "Provide 23 concrete next steps" (dropped dash).
            taskInstruction = "Provide 2-3 concrete next steps the user can take right now."
        }
        let prompt = """
        You are a productivity assistant. The user approved this action: "\(label)"
        Details: \(details.isEmpty ? "(none)" : details)
        \(taskInstruction)
        Be specific and brief (3-5 sentences max). No markdown, no preamble, plain text only.
        """
        // Wrap the optional screenshot into the frames array expected by callGemini.
        let frames: [Data] = screenshot.map { [$0] } ?? []
        return try await callGemini(prompt: prompt, frames: frames)
    }

    // MARK: - Gemini REST API Call

    /// POSTs the prompt + frames to the Gemini generateContent endpoint and
    /// returns the first candidate's raw text.
    /// - Throws: `URLError(.badServerResponse)` on non-200 status,
    ///   `URLError(.cannotParseResponse)` when the response shape is unexpected.
    private func callGemini(prompt: String, frames: [Data]) async throws -> String {
        // API key travels in a header (below), never in the URL.
        let urlStr = "\(Self.apiBase)/\(Self.model):generateContent"
        guard let url = URL(string: urlStr) else { throw URLError(.badURL) }
        // Build content parts: a caption + image for each frame, then the instruction.
        var parts: [[String: Any]] = []
        let total = frames.count
        for (i, frame) in frames.enumerated() {
            // Caption gives each frame's position and approximate age; frames are
            // captured ~5s apart with the newest last, so frame i is (total-i)*5s old.
            parts.append(["text": "[Screenshot \(i + 1)/\(total), \((total - i) * 5)s ago]"])
            parts.append([
                "inlineData": [
                    "mimeType": "image/jpeg",
                    "data": frame.base64EncodedString()
                ]
            ])
        }
        parts.append(["text": "Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences."])
        let body: [String: Any] = [
            // System prompt goes in systemInstruction; the frames + final
            // instruction form the single user turn.
            "systemInstruction": ["parts": [["text": prompt]]],
            "contents": [["parts": parts]],
            "generationConfig": [
                "temperature": 0.2,
                "maxOutputTokens": 1024
            ]
        ]
        var request = URLRequest(url: url)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        // Official header-based auth; keeps the key out of logged URLs.
        request.setValue(apiKey, forHTTPHeaderField: "x-goog-api-key")
        request.httpBody = try JSONSerialization.data(withJSONObject: body)
        request.timeoutInterval = 60
        let (data, response) = try await URLSession.shared.data(for: request)
        if let http = response as? HTTPURLResponse, http.statusCode != 200 {
            let msg = String(data: data, encoding: .utf8) ?? "HTTP \(http.statusCode)"
            print("[GeminiVLM] API error \(http.statusCode): \(msg)")
            throw URLError(.badServerResponse)
        }
        // Drill into candidates[0].content.parts[0].text — the standard
        // generateContent response shape.
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let candidates = json["candidates"] as? [[String: Any]],
              let first = candidates.first,
              let content = first["content"] as? [String: Any],
              let contentParts = content["parts"] as? [[String: Any]],
              let text = contentParts.first?["text"] as? String
        else {
            let raw = String(data: data, encoding: .utf8) ?? ""
            print("[GeminiVLM] Unexpected response shape: \(raw.prefix(300))")
            throw URLError(.cannotParseResponse)
        }
        print("[GeminiVLM] Response (\(text.count) chars): \(text.prefix(200))")
        return text
    }

    // MARK: - Response Parsing

    /// Parse the model's text reply into a `DistractionAnalysisResponse`.
    /// Tolerates markdown code fences and chatter surrounding the JSON object.
    private func parseResponse(_ text: String) throws -> DistractionAnalysisResponse {
        var cleaned = text.trimmingCharacters(in: .whitespacesAndNewlines)
        // Strip ```json ... ``` or ``` ... ``` fences
        if cleaned.hasPrefix("```") {
            let lines = cleaned.components(separatedBy: "\n")
            cleaned = lines.dropFirst().joined(separator: "\n")
            if let backtickRange = cleaned.range(of: "```") {
                cleaned = String(cleaned[..<backtickRange.lowerBound])
            }
            cleaned = cleaned.trimmingCharacters(in: .whitespacesAndNewlines)
        }
        // Find JSON object boundaries robustly: take the outermost { ... } span,
        // discarding any preamble/epilogue the model added despite instructions.
        guard let start = cleaned.firstIndex(of: "{"),
              let end = cleaned.lastIndex(of: "}") else {
            throw URLError(.cannotParseResponse)
        }
        let jsonStr = String(cleaned[start...end])
        guard let jsonData = jsonStr.data(using: .utf8) else {
            throw URLError(.cannotParseResponse)
        }
        return try JSONDecoder().decode(DistractionAnalysisResponse.self, from: jsonData)
    }
}

View File

@@ -7,6 +7,7 @@
objects = { objects = {
/* Begin PBXBuildFile section */ /* Begin PBXBuildFile section */
FF341F642F7932FA00B5716A /* GeminiVLMClient.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */; };
FF935B1E2F78A83100ED3330 /* SpeakerKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1D2F78A83100ED3330 /* SpeakerKit */; }; FF935B1E2F78A83100ED3330 /* SpeakerKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1D2F78A83100ED3330 /* SpeakerKit */; };
FF935B202F78A83100ED3330 /* TTSKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1F2F78A83100ED3330 /* TTSKit */; }; FF935B202F78A83100ED3330 /* TTSKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B1F2F78A83100ED3330 /* TTSKit */; };
FF935B222F78A83100ED3330 /* WhisperKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B212F78A83100ED3330 /* WhisperKit */; }; FF935B222F78A83100ED3330 /* WhisperKit in Frameworks */ = {isa = PBXBuildFile; productRef = FF935B212F78A83100ED3330 /* WhisperKit */; };
@@ -16,6 +17,7 @@
/* Begin PBXFileReference section */ /* Begin PBXFileReference section */
FF3296C22F785B3300C734EB /* LockInBro.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LockInBro.app; sourceTree = BUILT_PRODUCTS_DIR; }; FF3296C22F785B3300C734EB /* LockInBro.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LockInBro.app; sourceTree = BUILT_PRODUCTS_DIR; };
FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GeminiVLMClient.swift; sourceTree = "<group>"; };
FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingPanel.swift; sourceTree = "<group>"; }; FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingPanel.swift; sourceTree = "<group>"; };
FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingHUDView.swift; sourceTree = "<group>"; }; FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FloatingHUDView.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
@@ -49,6 +51,7 @@
FF3296C32F785B3300C734EB /* Products */, FF3296C32F785B3300C734EB /* Products */,
FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */, FF935B232F78D0AA00ED3330 /* FloatingPanel.swift */,
FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */, FF935B252F78D0BF00ED3330 /* FloatingHUDView.swift */,
FF341F632F7932FA00B5716A /* GeminiVLMClient.swift */,
); );
sourceTree = "<group>"; sourceTree = "<group>";
}; };
@@ -141,6 +144,7 @@
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
FF935B262F78D0BF00ED3330 /* FloatingHUDView.swift in Sources */, FF935B262F78D0BF00ED3330 /* FloatingHUDView.swift in Sources */,
FF341F642F7932FA00B5716A /* GeminiVLMClient.swift in Sources */,
FF935B242F78D0AA00ED3330 /* FloatingPanel.swift in Sources */, FF935B242F78D0AA00ED3330 /* FloatingPanel.swift in Sources */,
); );
runOnlyForDeploymentPostprocessing = 0; runOnlyForDeploymentPostprocessing = 0;

View File

@@ -24,16 +24,34 @@ enum NetworkError: Error, LocalizedError {
final class TokenStore { final class TokenStore {
static let shared = TokenStore() static let shared = TokenStore()
private let key = "lockInBro.jwt" private let accessKey = "lockInBro.jwt"
private let refreshKey = "lockInBro.refreshToken"
private init() {} private init() {}
var token: String? { var token: String? {
get { UserDefaults.standard.string(forKey: key) } get { UserDefaults.standard.string(forKey: accessKey) }
set { set {
if let v = newValue { UserDefaults.standard.set(v, forKey: key) } if let v = newValue { UserDefaults.standard.set(v, forKey: accessKey) }
else { UserDefaults.standard.removeObject(forKey: key) } else { UserDefaults.standard.removeObject(forKey: accessKey) }
} }
} }
var refreshToken: String? {
get { UserDefaults.standard.string(forKey: refreshKey) }
set {
if let v = newValue { UserDefaults.standard.set(v, forKey: refreshKey) }
else { UserDefaults.standard.removeObject(forKey: refreshKey) }
}
}
func clear() {
token = nil
refreshToken = nil
}
}
extension Notification.Name {
static let lockInBroAuthExpired = Notification.Name("lockInBroAuthExpired")
} }
// MARK: - APIClient // MARK: - APIClient
@@ -46,13 +64,17 @@ final class APIClient {
// MARK: Core Request // MARK: Core Request
// Coalesces concurrent 401-triggered refreshes into one request
private var activeRefreshTask: Task<Bool, Never>?
private func req( private func req(
_ path: String, _ path: String,
method: String = "GET", method: String = "GET",
body: Data? = nil, body: Data? = nil,
contentType: String = "application/json", contentType: String = "application/json",
auth: Bool = true, auth: Bool = true,
timeout: TimeInterval = 30 timeout: TimeInterval = 30,
isRetry: Bool = false
) async throws -> Data { ) async throws -> Data {
guard let url = URL(string: base + path) else { guard let url = URL(string: base + path) else {
throw NetworkError.unknown(URLError(.badURL)) throw NetworkError.unknown(URLError(.badURL))
@@ -75,6 +97,17 @@ final class APIClient {
throw NetworkError.unknown(URLError(.badServerResponse)) throw NetworkError.unknown(URLError(.badServerResponse))
} }
guard http.statusCode < 400 else { guard http.statusCode < 400 else {
if http.statusCode == 401 && auth && !isRetry {
// Try to silently refresh the access token, then retry once
let refreshed = await refreshAccessToken()
if refreshed {
return try await req(path, method: method, body: body,
contentType: contentType, auth: auth,
timeout: timeout, isRetry: true)
}
// Refresh also failed — force the user to log in again
await MainActor.run { AuthManager.shared.handleSessionExpired() }
}
let msg = (try? JSONDecoder().decode(APIErrorResponse.self, from: data))?.detail let msg = (try? JSONDecoder().decode(APIErrorResponse.self, from: data))?.detail
?? String(data: data, encoding: .utf8) ?? String(data: data, encoding: .utf8)
?? "Unknown error" ?? "Unknown error"
@@ -83,6 +116,32 @@ final class APIClient {
return data return data
} }
/// Refreshes the access token. Concurrent callers share one in-flight request.
private func refreshAccessToken() async -> Bool {
if let existing = activeRefreshTask { return await existing.value }
let task = Task<Bool, Never> {
defer { self.activeRefreshTask = nil }
guard let refresh = TokenStore.shared.refreshToken else { return false }
do {
let body = try JSONSerialization.data(withJSONObject: ["refresh_token": refresh])
guard let url = URL(string: base + "/auth/refresh") else { return false }
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.httpBody = body
req.timeoutInterval = 30
let (data, res) = try await urlSession.data(for: req)
guard let http = res as? HTTPURLResponse, http.statusCode == 200 else { return false }
let auth = try self.decode(AuthResponse.self, from: data)
TokenStore.shared.token = auth.accessToken
TokenStore.shared.refreshToken = auth.refreshToken
return true
} catch { return false }
}
activeRefreshTask = task
return await task.value
}
private func decode<T: Decodable>(_ type: T.Type, from data: Data) throws -> T { private func decode<T: Decodable>(_ type: T.Type, from data: Data) throws -> T {
let decoder = JSONDecoder() let decoder = JSONDecoder()
decoder.dateDecodingStrategy = .iso8601 decoder.dateDecodingStrategy = .iso8601
@@ -206,6 +265,16 @@ final class APIClient {
// MARK: - Sessions // MARK: - Sessions
/// Returns the currently active session, or nil if none (404).
func getActiveSession() async throws -> FocusSession? {
do {
let data = try await req("/sessions/active")
return try decode(FocusSession.self, from: data)
} catch NetworkError.httpError(404, _) {
return nil
}
}
func startSession(taskId: String?) async throws -> FocusSession { func startSession(taskId: String?) async throws -> FocusSession {
var dict: [String: Any] = ["platform": "mac"] var dict: [String: Any] = ["platform": "mac"]
if let tid = taskId { dict["task_id"] = tid } if let tid = taskId { dict["task_id"] = tid }
@@ -268,9 +337,35 @@ final class APIClient {
_ = try await req("/distractions/app-activity", method: "POST", body: body) _ = try await req("/distractions/app-activity", method: "POST", body: body)
} }
// MARK: - Distraction / Screenshot Analysis // MARK: - Distraction / VLM Analysis
// Note: spec primary endpoint is POST /distractions/analyze-result (device-side VLM, JSON only).
// Backend currently implements analyze-screenshot (legacy fallback) using that until analyze-result is deployed. /// Post a VLM analysis result (from GeminiVLMClient) to the backend.
/// This updates the session checkpoint so the backend has the latest on_task / friction data.
func postAnalysisResult(_ result: DistractionAnalysisResponse, sessionId: String) async throws {
var payload: [String: Any] = [
"session_id": sessionId,
"on_task": result.onTask,
"confidence": result.confidence,
"vlm_summary": result.vlmSummary ?? "",
"steps_completed": result.stepsCompleted,
]
if let stepId = result.currentStepId { payload["current_step_id"] = stepId }
if let note = result.checkpointNoteUpdate { payload["checkpoint_note_update"] = note }
if let app = result.appName { payload["app_name"] = app }
if let nudge = result.gentleNudge { payload["gentle_nudge"] = nudge }
if let friction = result.friction {
payload["friction"] = [
"type": friction.type,
"confidence": friction.confidence,
"description": friction.description as Any,
"proposed_actions": friction.proposedActions.map {
["label": $0.label, "action_type": $0.actionType, "details": $0.details as Any]
},
]
}
let body = try JSONSerialization.data(withJSONObject: payload)
_ = try await req("/distractions/analyze-result", method: "POST", body: body)
}
func analyzeScreenshot( func analyzeScreenshot(
imageData: Data, imageData: Data,

View File

@@ -22,6 +22,7 @@ final class AuthManager {
do { do {
let response = try await APIClient.shared.login(email: email, password: password) let response = try await APIClient.shared.login(email: email, password: password)
TokenStore.shared.token = response.accessToken TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user currentUser = response.user
isLoggedIn = true isLoggedIn = true
} catch { } catch {
@@ -40,6 +41,7 @@ final class AuthManager {
displayName: displayName displayName: displayName
) )
TokenStore.shared.token = response.accessToken TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user currentUser = response.user
isLoggedIn = true isLoggedIn = true
} catch { } catch {
@@ -58,6 +60,7 @@ final class AuthManager {
fullName: fullName fullName: fullName
) )
TokenStore.shared.token = response.accessToken TokenStore.shared.token = response.accessToken
TokenStore.shared.refreshToken = response.refreshToken
currentUser = response.user currentUser = response.user
isLoggedIn = true isLoggedIn = true
} catch { } catch {
@@ -67,8 +70,20 @@ final class AuthManager {
} }
func logout() { func logout() {
TokenStore.shared.token = nil SessionManager.shared.stopMonitoring()
TokenStore.shared.clear()
currentUser = nil currentUser = nil
isLoggedIn = false isLoggedIn = false
errorMessage = nil
}
/// Called by APIClient when the server returns 401 and the refresh token is also dead.
func handleSessionExpired() {
guard isLoggedIn else { return }
SessionManager.shared.stopMonitoring()
TokenStore.shared.clear()
currentUser = nil
isLoggedIn = false
errorMessage = "Your session expired — please log in again."
} }
} }

View File

@@ -401,6 +401,8 @@ private struct ProactiveCardView: View {
return description ?? "I noticed something that might be slowing you down." return description ?? "I noticed something that might be slowing you down."
case .appSwitchLoop(let apps, let count): case .appSwitchLoop(let apps, let count):
return "You've switched between \(apps.joined(separator: "")) \(count)× in a row — are you stuck?" return "You've switched between \(apps.joined(separator: "")) \(count)× in a row — are you stuck?"
case .sessionAction(_, _, let checkpoint, let reason, _):
return checkpoint.isEmpty ? reason : "Left off: \(checkpoint)"
} }
} }
} }

View File

@@ -2,8 +2,24 @@
import SwiftUI import SwiftUI
// MARK: - AppDelegate (subprocess cleanup on quit)
final class AppDelegate: NSObject, NSApplicationDelegate {
/// Called for normal quits (Cmd+Q), window close, and SIGTERM.
/// Ensures the argus subprocess is killed before the process exits.
/// NOTE(review): applicationWillTerminate does not run on force-quit (SIGKILL),
/// so a hard kill can still leave the subprocess orphaned.
func applicationWillTerminate(_ notification: Notification) {
// applicationWillTerminate runs on the main thread, so we can safely
// call @MainActor methods synchronously via assumeIsolated.
MainActor.assumeIsolated {
SessionManager.shared.stopMonitoring()
}
}
}
@main @main
struct LockInBroApp: App { struct LockInBroApp: App {
@NSApplicationDelegateAdaptor(AppDelegate.self) var appDelegate
@State private var auth = AuthManager.shared @State private var auth = AuthManager.shared
@State private var session = SessionManager.shared @State private var session = SessionManager.shared
@@ -13,9 +29,11 @@ struct LockInBroApp: App {
ContentView() ContentView()
.environment(auth) .environment(auth)
.environment(session) .environment(session)
.onChange(of: session.isSessionActive) { _, isActive in .onChange(of: auth.isLoggedIn, initial: true) { _, loggedIn in
if isActive { if loggedIn {
// Show HUD and start always-on monitoring as soon as user logs in
FloatingPanelController.shared.show(session: session) FloatingPanelController.shared.show(session: session)
Task { await session.startMonitoring() }
} else { } else {
FloatingPanelController.shared.close() FloatingPanelController.shared.close()
} }

View File

@@ -109,6 +109,32 @@ struct Step: Identifiable, Codable, Hashable {
// MARK: - Focus Session // MARK: - Focus Session
/// Subset of the JSONB checkpoint dict stored on the backend session record.
/// Populated by argus when it POSTs to /distractions/analyze-result.
struct SessionCheckpoint: Codable {
/// Written by POST /distractions/analyze-result (argus live mode).
let lastVlmSummary: String?
/// Written by POST /distractions/analyze-screenshot (Swift fallback).
let lastScreenshotAnalysis: String?
/// Concise summary of the last completed action.
let lastActionSummary: String?
/// Frontmost application name at last checkpoint.
let activeApp: String?
/// Running count of distractions logged during this session.
let distractionCount: Int?
/// Whichever VLM summary field is populated, preferring the argus live-mode
/// field over the Swift fallback. NOTE(review): this is source priority, not
/// recency — if both fields are set, the fallback value is ignored even when newer.
var vlmSummary: String? { lastVlmSummary ?? lastScreenshotAnalysis }
// Snake_case keys mirror the backend's JSONB checkpoint dict.
enum CodingKeys: String, CodingKey {
case lastVlmSummary = "last_vlm_summary"
case lastScreenshotAnalysis = "last_screenshot_analysis"
case lastActionSummary = "last_action_summary"
case activeApp = "active_app"
case distractionCount = "distraction_count"
}
}
struct FocusSession: Identifiable, Codable { struct FocusSession: Identifiable, Codable {
let id: String let id: String
let userId: String let userId: String
@@ -117,9 +143,11 @@ struct FocusSession: Identifiable, Codable {
let startedAt: String let startedAt: String
var endedAt: String? var endedAt: String?
var status: String var status: String
/// Live checkpoint data written by argus (nil when no checkpoint exists yet).
var checkpoint: SessionCheckpoint?
enum CodingKeys: String, CodingKey { enum CodingKeys: String, CodingKey {
case id, platform, status case id, platform, status, checkpoint
case userId = "user_id" case userId = "user_id"
case taskId = "task_id" case taskId = "task_id"
case startedAt = "started_at" case startedAt = "started_at"
@@ -144,8 +172,10 @@ struct BrainDumpResponse: Codable {
struct ParsedTask: Codable, Identifiable { struct ParsedTask: Codable, Identifiable {
// local UUID for list identity before saving // local UUID for list identity before saving
var localId: String = UUID().uuidString var localId: String = UUID().uuidString
var id: String { localId } var id: String { taskId ?? localId }
/// Set by backend when the brain-dump endpoint creates the task automatically.
let taskId: String?
let title: String let title: String
let description: String? let description: String?
let priority: Int let priority: Int
@@ -154,6 +184,7 @@ struct ParsedTask: Codable, Identifiable {
let tags: [String] let tags: [String]
enum CodingKeys: String, CodingKey { enum CodingKeys: String, CodingKey {
case taskId = "task_id"
case title, description, priority, deadline, tags case title, description, priority, deadline, tags
case estimatedMinutes = "estimated_minutes" case estimatedMinutes = "estimated_minutes"
} }
@@ -208,13 +239,28 @@ struct FrictionInfo: Codable {
var isResumption: Bool { type == "task_resumption" } var isResumption: Bool { type == "task_resumption" }
} }
/// Session lifecycle action suggested by the VLM (new argus feature).
struct SessionAction: Codable {
    /// Suggested lifecycle action: "resume" | "switch" | "complete" | "start_new" | "none".
    let type: String
    /// Backend session ID the suggestion refers to — presumably nil for
    /// "start_new"/"none"; confirm against the backend contract.
    let sessionId: String?
    /// Human-readable explanation from the VLM for why this action is suggested.
    let reason: String?

    /// Maps the backend's snake_case "session_id" key; the remaining keys match
    /// their property names directly.
    enum CodingKeys: String, CodingKey {
        case type, reason
        case sessionId = "session_id"
    }
}
struct DistractionAnalysisResponse: Codable { struct DistractionAnalysisResponse: Codable {
let onTask: Bool let onTask: Bool
let currentStepId: String? let currentStepId: String?
let inferredTask: String?
let checkpointNoteUpdate: String? let checkpointNoteUpdate: String?
let stepsCompleted: [String] let stepsCompleted: [String]
// Upgraded Argus prompt fields (nil when backend uses legacy prompt) // Upgraded Argus prompt fields (nil when backend uses legacy prompt)
let friction: FrictionInfo? let friction: FrictionInfo?
let sessionAction: SessionAction? // new argus: session lifecycle suggestions
let intent: String? // skimming | engaged | unclear | null let intent: String? // skimming | engaged | unclear | null
let distractionType: String? let distractionType: String?
let appName: String? let appName: String?
@@ -225,9 +271,11 @@ struct DistractionAnalysisResponse: Codable {
enum CodingKeys: String, CodingKey { enum CodingKeys: String, CodingKey {
case onTask = "on_task" case onTask = "on_task"
case currentStepId = "current_step_id" case currentStepId = "current_step_id"
case inferredTask = "inferred_task"
case checkpointNoteUpdate = "checkpoint_note_update" case checkpointNoteUpdate = "checkpoint_note_update"
case stepsCompleted = "steps_completed" case stepsCompleted = "steps_completed"
case friction, intent case friction, intent
case sessionAction = "session_action"
case distractionType = "distraction_type" case distractionType = "distraction_type"
case appName = "app_name" case appName = "app_name"
case confidence case confidence
@@ -298,6 +346,8 @@ struct ProactiveCard: Identifiable {
case vlmFriction(frictionType: String, description: String?, actions: [ProposedAction]) case vlmFriction(frictionType: String, description: String?, actions: [ProposedAction])
/// Heuristic app-switch loop detected by NSWorkspace observer (fallback when VLM hasn't returned friction yet). /// Heuristic app-switch loop detected by NSWorkspace observer (fallback when VLM hasn't returned friction yet).
case appSwitchLoop(apps: [String], switchCount: Int) case appSwitchLoop(apps: [String], switchCount: Int)
/// VLM suggests a session lifecycle action (new argus: resume, switch, complete, start_new).
case sessionAction(type: String, taskTitle: String, checkpoint: String, reason: String, sessionId: String?)
} }
let id = UUID() let id = UUID()
@@ -316,6 +366,14 @@ struct ProactiveCard: Identifiable {
} }
case .appSwitchLoop: case .appSwitchLoop:
return "Repetitive Pattern Detected" return "Repetitive Pattern Detected"
case .sessionAction(let type, let taskTitle, _, _, _):
switch type {
case "resume": return "Resume: \(taskTitle)"
case "switch": return "Switch to: \(taskTitle)"
case "complete": return "Done with \(taskTitle)?"
case "start_new": return "Start a Focus Session?"
default: return "Session Suggestion"
}
} }
} }
@@ -332,6 +390,14 @@ struct ProactiveCard: Identifiable {
} }
case .appSwitchLoop: case .appSwitchLoop:
return "arrow.triangle.2.circlepath" return "arrow.triangle.2.circlepath"
case .sessionAction(let type, _, _, _, _):
switch type {
case "resume": return "arrow.counterclockwise.circle"
case "switch": return "arrow.left.arrow.right"
case "complete": return "checkmark.circle"
case "start_new": return "plus.circle"
default: return "circle"
}
} }
} }
} }

View File

@@ -1,4 +1,6 @@
// SessionManager.swift Focus session state, screenshot engine, distraction detection // SessionManager.swift Focus session state, native VLM screen analysis
// Screenshot capture Gemini Vision API apply results to UI + post to backend.
// No Python subprocess. No external process management.
import AppKit import AppKit
import SwiftUI import SwiftUI
@@ -25,30 +27,27 @@ final class SessionManager {
var errorMessage: String? var errorMessage: String?
var isLoading: Bool = false var isLoading: Bool = false
// Proactive agent // VLM / proactive agent
var proactiveCard: ProactiveCard? var proactiveCard: ProactiveCard?
/// Set when the user approves a proposed action shown as a confirmation toast
var approvedActionLabel: String?
/// Latest one-sentence summary from the VLM, shown in the floating HUD
var latestVlmSummary: String? var latestVlmSummary: String?
/// True while the argus executor is running an approved action var latestInferredTask: String?
var isExecuting: Bool = false var isExecuting: Bool = false
/// Result produced by the executor's output() tool shown as a sticky card in the HUD
var executorOutput: (title: String, content: String)? var executorOutput: (title: String, content: String)?
var monitoringError: String?
// Screenshot engine // Screenshot engine
var isCapturing: Bool = false var isCapturing: Bool = false
private var captureTask: Task<Void, Never>? @ObservationIgnored private var captureTask: Task<Void, Never>?
private let captureInterval: TimeInterval = 5.0 private let captureInterval: TimeInterval = 5.0
// Rolling screenshot history buffer (max 4 entries, ~20-second window) // Frame buffer accumulate N frames before calling VLM for temporal diff context
// Provides temporal context to the VLM so it can detect patterns across captures. @ObservationIgnored private var frameBuffer: [Data] = []
private struct ScreenshotHistoryEntry { private let framesPerVLMCall = 3
let summary: String // vlm_summary text from the previous analysis
let timestamp: Date // Rolling summary history fed as context into subsequent VLM calls
} private struct HistoryEntry { let summary: String; let timestamp: Date }
@ObservationIgnored private var screenshotHistory: [ScreenshotHistoryEntry] = [] @ObservationIgnored private var screenshotHistory: [HistoryEntry] = []
// App switch tracking // App switch tracking
@ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = [] @ObservationIgnored private var appSwitches: [(name: String, bundleId: String, time: Date)] = []
@@ -56,15 +55,8 @@ final class SessionManager {
@ObservationIgnored private var lastApp: (name: String, bundleId: String) = ("", "") @ObservationIgnored private var lastApp: (name: String, bundleId: String) = ("", "")
@ObservationIgnored private var lastAppEnteredAt: Date = Date() @ObservationIgnored private var lastAppEnteredAt: Date = Date()
// Argus subprocess (device-side VLM) // Proactive card auto-dismiss timer
@ObservationIgnored private var argusProcess: Process?
@ObservationIgnored private var argusReadTask: Task<Void, Never>?
@ObservationIgnored private var argusStdinPipe: Pipe?
/// Whether the current proactive card came from VLM (needs argus stdin response) vs local heuristic
@ObservationIgnored private var proactiveCardNeedsArgusResponse = false
@ObservationIgnored private var proactiveCardTimer: Task<Void, Never>? @ObservationIgnored private var proactiveCardTimer: Task<Void, Never>?
private let argusPythonPath = "/Users/joyzhuo/miniconda3/envs/gmr/bin/python3"
private let argusRepoPath = "/Users/joyzhuo/yhack/lockinbro-argus"
private init() {} private init() {}
@@ -83,9 +75,96 @@ final class SessionManager {
return Date().timeIntervalSince(start) return Date().timeIntervalSince(start)
} }
// MARK: - Monitoring Lifecycle
/// Immediately shuts down all monitoring without making any API calls.
func stopMonitoring() {
stopCapture()
stopAppObserver()
proactiveCardTimer?.cancel()
proactiveCardTimer = nil
activeSession = nil
activeTask = nil
activeSteps = []
isSessionActive = false
sessionStartDate = nil
lastNudge = nil
resumeCard = nil
showingResumeCard = false
proactiveCard = nil
latestVlmSummary = nil
latestInferredTask = nil
isExecuting = false
executorOutput = nil
monitoringError = nil
screenshotHistory = []
frameBuffer = []
persistedSessionId = nil
}
/// Called once after login. Auto-resumes any existing active session and starts the capture loop.
func startMonitoring() async {
guard TokenStore.shared.token != nil else { return }
guard !isCapturing else { return }
monitoringError = nil
await requestNotificationPermission()
// Silent preflight never shows UI; only request permission if not yet granted.
if !CGPreflightScreenCaptureAccess() {
CGRequestScreenCaptureAccess()
monitoringError = "Screen Recording permission required — enable in System Settings → Privacy & Security → Screen Recording, then tap Retry"
return
}
do {
if let existing = try await APIClient.shared.getActiveSession() {
await autoResumeSession(existing)
} else {
startCapture()
startAppObserver()
}
} catch {
startCapture()
startAppObserver()
}
}
/// Silently resume an active session found on the backend (no loading UI shown).
private func autoResumeSession(_ session: FocusSession) async {
activeSession = session
persistedSessionId = session.id
isSessionActive = true
sessionStartDate = Date()
distractionCount = 0
lastNudge = nil
screenshotHistory = []
frameBuffer = []
if let taskId = session.taskId {
do {
let tasks = try await APIClient.shared.getTasks()
activeTask = tasks.first(where: { $0.id == taskId })
if let task = activeTask {
let steps = try await APIClient.shared.getSteps(taskId: task.id)
activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
?? activeSteps.firstIndex(where: { $0.status == "pending" })
?? 0
}
} catch {}
}
let shortId = String(session.id.prefix(8))
let taskLabel = activeTask?.title ?? "(no task)"
latestVlmSummary = "Resumed session \(shortId) · \(taskLabel)"
startCapture()
startAppObserver()
}
// MARK: - Session Lifecycle // MARK: - Session Lifecycle
// Persisted so we can end a stale session after an app restart
private var persistedSessionId: String? { private var persistedSessionId: String? {
get { UserDefaults.standard.string(forKey: "lockInBro.lastSessionId") } get { UserDefaults.standard.string(forKey: "lockInBro.lastSessionId") }
set { set {
@@ -98,18 +177,16 @@ final class SessionManager {
isLoading = true isLoading = true
errorMessage = nil errorMessage = nil
do { do {
let session: FocusSession // End any existing session first
do { var staleId: String? = activeSession?.id ?? persistedSessionId
session = try await APIClient.shared.startSession(taskId: task?.id) if staleId == nil {
} catch NetworkError.httpError(409, _) { staleId = (try? await APIClient.shared.getActiveSession())?.id
// End whichever session is active prefer the locally known one, }
// fall back to the last persisted ID (survives app restarts)
let staleId = activeSession?.id ?? persistedSessionId
if let id = staleId { if let id = staleId {
_ = try? await APIClient.shared.endSession(sessionId: id, status: "completed") _ = try? await APIClient.shared.endSession(sessionId: id, status: "completed")
} }
session = try await APIClient.shared.startSession(taskId: task?.id)
} let session = try await APIClient.shared.startSession(taskId: task?.id)
activeSession = session activeSession = session
persistedSessionId = session.id persistedSessionId = session.id
activeTask = task activeTask = task
@@ -119,20 +196,22 @@ final class SessionManager {
sessionStartDate = Date() sessionStartDate = Date()
distractionCount = 0 distractionCount = 0
lastNudge = nil lastNudge = nil
screenshotHistory = []
frameBuffer = []
if let task { if let task {
let steps = try await APIClient.shared.getSteps(taskId: task.id) let steps = try await APIClient.shared.getSteps(taskId: task.id)
activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder } activeSteps = steps.sorted { $0.sortOrder < $1.sortOrder }
// Pick first in-progress or first pending step
currentStepIndex = activeSteps.firstIndex(where: { $0.isActive }) currentStepIndex = activeSteps.firstIndex(where: { $0.isActive })
?? activeSteps.firstIndex(where: { $0.status == "pending" }) ?? activeSteps.firstIndex(where: { $0.status == "pending" })
?? 0 ?? 0
} }
screenshotHistory = []
await requestNotificationPermission() await requestNotificationPermission()
startArgus(session: session, task: task) // Restart capture loop (in case it wasn't running or was in monitoring-only mode)
startAppObserver() stopCapture()
startCapture()
if appSwitchObserver == nil { startAppObserver() }
} catch { } catch {
errorMessage = error.localizedDescription errorMessage = error.localizedDescription
} }
@@ -140,7 +219,6 @@ final class SessionManager {
} }
func endSession(status: String = "completed") async { func endSession(status: String = "completed") async {
stopArgus()
stopCapture() stopCapture()
stopAppObserver() stopAppObserver()
if let session = activeSession { if let session = activeSession {
@@ -155,14 +233,21 @@ final class SessionManager {
resumeCard = nil resumeCard = nil
showingResumeCard = false showingResumeCard = false
proactiveCard = nil proactiveCard = nil
approvedActionLabel = nil
latestVlmSummary = nil latestVlmSummary = nil
latestInferredTask = nil
isExecuting = false isExecuting = false
executorOutput = nil executorOutput = nil
proactiveCardTimer?.cancel() proactiveCardTimer?.cancel()
proactiveCardTimer = nil proactiveCardTimer = nil
screenshotHistory = [] screenshotHistory = []
frameBuffer = []
persistedSessionId = nil persistedSessionId = nil
// Keep the capture loop running for app-switch heuristics
if TokenStore.shared.token != nil {
startCapture()
startAppObserver()
}
} }
func fetchResumeCard() async { func fetchResumeCard() async {
@@ -183,7 +268,6 @@ final class SessionManager {
if let idx = activeSteps.firstIndex(where: { $0.id == updated.id }) { if let idx = activeSteps.firstIndex(where: { $0.id == updated.id }) {
activeSteps[idx] = updated activeSteps[idx] = updated
} }
// Advance to next pending
if let next = activeSteps.firstIndex(where: { $0.status == "pending" }) { if let next = activeSteps.firstIndex(where: { $0.status == "pending" }) {
currentStepIndex = next currentStepIndex = next
} }
@@ -192,12 +276,19 @@ final class SessionManager {
} }
} }
// MARK: - Retry (HUD Retry button)
func retryMonitoring() {
monitoringError = nil
frameBuffer = []
stopCapture()
startCapture()
if appSwitchObserver == nil { startAppObserver() }
}
// MARK: - Proactive Card Lifecycle // MARK: - Proactive Card Lifecycle
/// Show a proactive card and start the 15-second auto-dismiss timer. private func showProactiveCard(_ card: ProactiveCard) {
/// - Parameter vlmCard: Pass true when the card came from VLM so argus gets a stdin response on dismiss.
private func showProactiveCard(_ card: ProactiveCard, vlmCard: Bool = false) {
proactiveCardNeedsArgusResponse = vlmCard
proactiveCardTimer?.cancel() proactiveCardTimer?.cancel()
withAnimation { proactiveCard = card } withAnimation { proactiveCard = card }
@@ -208,31 +299,44 @@ final class SessionManager {
} }
} }
/// Dismiss the current card (user tapped "Not now" or 15s elapsed).
func dismissProactiveCard() { func dismissProactiveCard() {
proactiveCardTimer?.cancel() proactiveCardTimer?.cancel()
proactiveCardTimer = nil proactiveCardTimer = nil
withAnimation { proactiveCard = nil } withAnimation { proactiveCard = nil }
if proactiveCardNeedsArgusResponse { sendArgusResponse(0) }
proactiveCardNeedsArgusResponse = false
} }
/// Approve action at the given index (0-based). Argus stdin uses 1-based (1 = action 0).
func approveProactiveCard(actionIndex: Int) { func approveProactiveCard(actionIndex: Int) {
proactiveCardTimer?.cancel() proactiveCardTimer?.cancel()
proactiveCardTimer = nil proactiveCardTimer = nil
let card = proactiveCard
withAnimation { proactiveCard = nil } withAnimation { proactiveCard = nil }
if proactiveCardNeedsArgusResponse { guard case .vlmFriction(_, _, let actions) = card?.source,
sendArgusResponse(actionIndex + 1) actionIndex < actions.count else { return }
let action = actions[actionIndex]
isExecuting = true isExecuting = true
Task {
do {
let screenshot = await captureScreen()
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
guard !geminiKey.isEmpty else {
isExecuting = false
executorOutput = (title: action.label, content: action.details ?? "Action approved.")
return
}
let client = GeminiVLMClient(apiKey: geminiKey)
let result = try await client.executeAction(
label: action.label,
actionType: action.actionType,
details: action.details ?? "",
screenshot: screenshot
)
isExecuting = false
executorOutput = (title: action.label, content: result)
} catch {
isExecuting = false
executorOutput = (title: action.label, content: action.details ?? "Couldn't complete automatically.")
} }
proactiveCardNeedsArgusResponse = false
} }
private func sendArgusResponse(_ choice: Int) {
guard let pipe = argusStdinPipe,
let data = "\(choice)\n".data(using: .utf8) else { return }
try? pipe.fileHandleForWriting.write(contentsOf: data)
} }
// MARK: - App Switch Observer // MARK: - App Switch Observer
@@ -269,7 +373,7 @@ final class SessionManager {
guard name != lastApp.name else { return } guard name != lastApp.name else { return }
// Log previous app's dwell time to backend (fire-and-forget) // Log previous app dwell time to backend
let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt))) let duration = max(1, Int(now.timeIntervalSince(lastAppEnteredAt)))
let prev = lastApp let prev = lastApp
if let session = activeSession, !prev.name.isEmpty { if let session = activeSession, !prev.name.isEmpty {
@@ -289,159 +393,36 @@ final class SessionManager {
appSwitches.append((name: name, bundleId: bundleId, time: now)) appSwitches.append((name: name, bundleId: bundleId, time: now))
if appSwitches.count > 30 { appSwitches.removeFirst() } if appSwitches.count > 30 { appSwitches.removeFirst() }
// Only trigger card during active session and when none is already showing
guard isSessionActive, proactiveCard == nil else { return } guard isSessionActive, proactiveCard == nil else { return }
if let loop = detectRepetitiveLoop() { if let loop = detectRepetitiveLoop() {
showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)), vlmCard: false) showProactiveCard(ProactiveCard(source: .appSwitchLoop(apps: loop.apps, switchCount: loop.count)))
} }
} }
// Detects a back-and-forth pattern between exactly 2 apps within a 5-minute window.
// Requires 3 full cycles (6 consecutive alternating switches) to avoid false positives.
private func detectRepetitiveLoop() -> (apps: [String], count: Int)? { private func detectRepetitiveLoop() -> (apps: [String], count: Int)? {
let cutoff = Date().addingTimeInterval(-300) let cutoff = Date().addingTimeInterval(-300)
let recent = appSwitches.filter { $0.time > cutoff }.map(\.name) let recent = appSwitches.filter { $0.time > cutoff }.map(\.name)
guard recent.count >= 6 else { return nil } guard recent.count >= 6 else { return nil }
let last6 = Array(recent.suffix(6)) let last6 = Array(recent.suffix(6))
guard Set(last6).count == 2 else { return nil } guard Set(last6).count == 2 else { return nil }
// Strictly alternating no two consecutive identical app names
for i in 1..<last6.count { for i in 1..<last6.count {
if last6[i] == last6[i - 1] { return nil } if last6[i] == last6[i - 1] { return nil }
} }
return (apps: Array(Set(last6)).sorted(), count: 3) return (apps: Array(Set(last6)).sorted(), count: 3)
} }
// MARK: - Argus Subprocess (device-side VLM) // MARK: - Screenshot Capture Loop
/// Launch the argus Python daemon as a subprocess.
/// Argus captures screenshots itself, runs them through a local VLM (Ollama/Gemini),
/// posts results to the backend, and emits RESULT:{json} lines to stdout for Swift to consume.
/// Falls back to the internal `startCapture()` loop if the process cannot be launched.
private func startArgus(session: FocusSession, task: AppTask?) {
guard FileManager.default.fileExists(atPath: argusPythonPath),
FileManager.default.fileExists(atPath: argusRepoPath) else {
startCapture()
return
}
// Encode steps as JSON for --steps-json arg
var stepsJSONString = "[]"
if !activeSteps.isEmpty {
let stepsArray: [[String: Any]] = activeSteps.map { step in
var s: [String: Any] = [
"id": step.id,
"sort_order": step.sortOrder,
"title": step.title,
"status": step.status
]
if let note = step.checkpointNote { s["checkpoint_note"] = note }
return s
}
if let data = try? JSONSerialization.data(withJSONObject: stepsArray),
let str = String(data: data, encoding: .utf8) {
stepsJSONString = str
}
}
let jwt = TokenStore.shared.token ?? ""
let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
var arguments = [
"-m", "argus",
"--session-id", session.id,
"--task-title", task?.title ?? "(no task)",
"--task-goal", task?.description ?? "",
"--steps-json", stepsJSONString,
"--window-title", NSWorkspace.shared.frontmostApplication?.localizedName ?? "",
"--vlm", "gemini",
"--jwt", jwt,
"--backend-url", "https://wahwa.com/api/v1",
"--swift-ipc",
"--execute" // enables agentic executor; Swift sends 0/1/2 via stdin
]
if !geminiKey.isEmpty {
arguments += ["--gemini-key", geminiKey]
}
let process = Process()
process.executableURL = URL(fileURLWithPath: argusPythonPath)
process.currentDirectoryURL = URL(fileURLWithPath: argusRepoPath)
process.arguments = arguments
// Pipe stdout for RESULT:/STATUS:/EXEC_OUTPUT: lines
// stderr is NOT captured leaving it unset lets argus log to the system console
// without risk of the pipe buffer filling and blocking the process.
let stdoutPipe = Pipe()
let stdinPipe = Pipe()
process.standardOutput = stdoutPipe
process.standardInput = stdinPipe
do {
try process.run()
} catch {
startCapture()
return
}
argusProcess = process
argusStdinPipe = stdinPipe
isCapturing = true
// Read RESULT:/STATUS:/EXEC_OUTPUT: lines from argus stdout in a background task
let fileHandle = stdoutPipe.fileHandleForReading
argusReadTask = Task { [weak self] in
do {
for try await line in fileHandle.bytes.lines {
guard let self, !Task.isCancelled else { break }
if line.hasPrefix("RESULT:") {
let jsonStr = String(line.dropFirst("RESULT:".count))
if let data = jsonStr.data(using: .utf8),
let result = try? JSONDecoder().decode(DistractionAnalysisResponse.self, from: data) {
await MainActor.run { self.applyDistractionResult(result) }
}
} else if line.hasPrefix("STATUS:exec_done:") {
await MainActor.run { self.isExecuting = false }
} else if line.hasPrefix("EXEC_OUTPUT:") {
let jsonStr = String(line.dropFirst("EXEC_OUTPUT:".count))
if let data = jsonStr.data(using: .utf8),
let obj = try? JSONSerialization.jsonObject(with: data) as? [String: String],
let title = obj["title"], let content = obj["content"] {
await MainActor.run { self.executorOutput = (title: title, content: content) }
}
}
}
} catch {
// Pipe closed argus process ended
}
}
}
private func stopArgus() {
argusReadTask?.cancel()
argusReadTask = nil
argusStdinPipe = nil
if let proc = argusProcess {
proc.terminate()
argusProcess = nil
isCapturing = false
}
}
// MARK: - Screenshot Capture Loop (fallback when argus is unavailable)
private func startCapture() { private func startCapture() {
guard !isCapturing else { return }
isCapturing = true isCapturing = true
captureTask = Task { [weak self] in captureTask = Task { [weak self] in
guard let self else { return } guard let self else { return }
// Capture immediately on session start, then repeat on interval // Capture immediately, then repeat on interval
await self.captureAndAnalyze() await self.captureAndAnalyze()
while !Task.isCancelled && self.isSessionActive { while !Task.isCancelled {
try? await Task.sleep(for: .seconds(self.captureInterval)) try? await Task.sleep(for: .seconds(self.captureInterval))
guard !Task.isCancelled && self.isSessionActive else { break } guard !Task.isCancelled else { break }
await self.captureAndAnalyze() await self.captureAndAnalyze()
} }
} }
@@ -453,56 +434,77 @@ final class SessionManager {
isCapturing = false isCapturing = false
} }
/// Capture one frame, buffer it, and call VLM every `framesPerVLMCall` frames.
private func captureAndAnalyze() async { private func captureAndAnalyze() async {
guard let session = activeSession else { return }
guard let imageData = await captureScreen() else { return } guard let imageData = await captureScreen() else { return }
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? "Unknown" frameBuffer.append(imageData)
var context = buildTaskContext() // Keep buffer bounded rolling window of most recent frames
if frameBuffer.count > framesPerVLMCall { frameBuffer.removeFirst() }
// Inject rolling history so the VLM has temporal context across captures. // Only call VLM once we have a full batch for temporal diff analysis
// Only summaries (text) are sent not the raw images to keep token cost low. guard frameBuffer.count >= framesPerVLMCall else { return }
if !screenshotHistory.isEmpty {
let iso = ISO8601DateFormatter() let geminiKey = UserDefaults.standard.string(forKey: "geminiApiKey") ?? ""
context["screenshot_history"] = screenshotHistory.map { entry in guard !geminiKey.isEmpty else {
["summary": entry.summary, "timestamp": iso.string(from: entry.timestamp)] print("[VLM] No Gemini API key set — skipping analysis")
} return
} }
let client = GeminiVLMClient(apiKey: geminiKey)
let windowTitle = NSWorkspace.shared.frontmostApplication?.localizedName ?? ""
let recentSummaries = screenshotHistory.map(\.summary)
let frames = frameBuffer // snapshot before async gap
do { do {
let result = try await APIClient.shared.analyzeScreenshot( print("[VLM] Calling Gemini with \(frames.count) frames…")
imageData: imageData, let result = try await client.analyze(
frames: frames,
taskTitle: activeTask?.title ?? "",
taskGoal: activeTask?.description ?? "",
steps: activeSteps,
windowTitle: windowTitle, windowTitle: windowTitle,
sessionId: session.id, recentSummaries: recentSummaries
taskContext: context
) )
print("[VLM] Result: on_task=\(result.onTask), friction=\(result.friction?.type ?? "none"), summary=\(result.vlmSummary ?? "")")
// Append this result's summary to the rolling buffer (max 4 entries) // Append to rolling summary history
if let summary = result.vlmSummary { if let summary = result.vlmSummary, !summary.isEmpty {
screenshotHistory.append(ScreenshotHistoryEntry(summary: summary, timestamp: Date())) screenshotHistory.append(HistoryEntry(summary: summary, timestamp: Date()))
if screenshotHistory.count > 4 { screenshotHistory.removeFirst() } if screenshotHistory.count > 4 { screenshotHistory.removeFirst() }
} }
// Clear frame buffer next batch starts fresh
frameBuffer.removeAll()
monitoringError = nil
applyDistractionResult(result) applyDistractionResult(result)
// Post result to backend (fire-and-forget)
if let session = activeSession {
Task {
try? await APIClient.shared.postAnalysisResult(result, sessionId: session.id)
}
}
} catch { } catch {
// Silent fail don't interrupt the user print("[VLM] Analysis error: \(error)")
// Don't surface transient errors the next attempt will retry automatically
} }
} }
// MARK: - Screen Capture
private func captureScreen() async -> Data? { private func captureScreen() async -> Data? {
guard CGPreflightScreenCaptureAccess() else { return nil }
do { do {
let content = try await SCShareableContent.current let content = try await SCShareableContent.current
guard let display = content.displays.first else { return nil } guard let display = content.displays.first else { return nil }
let config = SCStreamConfiguration() let config = SCStreamConfiguration()
config.width = 1280 config.width = 1280
config.height = 720 config.height = 720
let filter = SCContentFilter(display: display, excludingWindows: []) let filter = SCContentFilter(display: display, excludingWindows: [])
let image = try await SCScreenshotManager.captureImage( let image = try await SCScreenshotManager.captureImage(
contentFilter: filter, contentFilter: filter, configuration: config)
configuration: config
)
return cgImageToJPEG(image) return cgImageToJPEG(image)
} catch { } catch {
return nil return nil
@@ -518,29 +520,13 @@ final class SessionManager {
return jpeg return jpeg
} }
private func buildTaskContext() -> [String: Any] { // MARK: - Apply VLM Result
var ctx: [String: Any] = [:]
guard let task = activeTask else { return ctx }
ctx["task_title"] = task.title
ctx["task_goal"] = task.description ?? task.title
ctx["steps"] = activeSteps.map { step -> [String: Any] in
var s: [String: Any] = [
"id": step.id,
"sort_order": step.sortOrder,
"title": step.title,
"status": step.status
]
if let note = step.checkpointNote { s["checkpoint_note"] = note }
return s
}
return ctx
}
private func applyDistractionResult(_ result: DistractionAnalysisResponse) { private func applyDistractionResult(_ result: DistractionAnalysisResponse) {
// 0. Store latest summary for the floating HUD
if let summary = result.vlmSummary { latestVlmSummary = summary } if let summary = result.vlmSummary { latestVlmSummary = summary }
if let task = result.inferredTask, !task.isEmpty { latestInferredTask = task }
// 1. Apply step side-effects (always) // Apply step side-effects
for completedId in result.stepsCompleted { for completedId in result.stepsCompleted {
if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) { if let idx = activeSteps.firstIndex(where: { $0.id == completedId }) {
activeSteps[idx].status = "done" activeSteps[idx].status = "done"
@@ -556,22 +542,25 @@ final class SessionManager {
currentStepIndex = idx currentStepIndex = idx
} }
// 2. Notification priority (design spec §1.5): // Notification priority: friction card (formal or has actions) nudge
// Proactive friction help Context resume Gentle nudge if let friction = result.friction {
// NEVER nudge when the system could help instead. let shouldShow = friction.isActionable || !friction.proposedActions.isEmpty
if let friction = result.friction, friction.isActionable { if shouldShow {
if friction.isResumption { if friction.isResumption {
// Task resumption detected auto-surface resume card without button press
Task { await fetchResumeCard() } Task { await fetchResumeCard() }
} else if proactiveCard == nil { } else if proactiveCard == nil {
showProactiveCard(ProactiveCard(source: .vlmFriction( showProactiveCard(ProactiveCard(source: .vlmFriction(
frictionType: friction.type, frictionType: friction.type,
description: friction.description, description: friction.description,
actions: friction.proposedActions actions: friction.proposedActions
)), vlmCard: true) )))
}
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
distractionCount += 1
lastNudge = nudge
sendNudgeNotification(nudge)
} }
} else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge { } else if !result.onTask, result.confidence > 0.7, let nudge = result.gentleNudge {
// Only nudge if VLM found no actionable friction
distractionCount += 1 distractionCount += 1
lastNudge = nudge lastNudge = nudge
sendNudgeNotification(nudge) sendNudgeNotification(nudge)

104
README.md Normal file
View File

@@ -0,0 +1,104 @@
# LockInBro
ADHD-friendly macOS focus assistant. Monitors your screen with a local VLM agent (Argus), detects friction and distractions, and nudges you back on track.
## Requirements
- macOS 14+
- Xcode 16+
- Python 3.11+ (system or Homebrew)
## Setup
### 1. Python environment
Create the venv once from the project root:
```bash
cd ~/yhack/LockInBro
python3 -m venv .venv
.venv/bin/pip install -r argus/requirements.txt
```
### 2. API keys
Create a `.env` file in the project root (next to `argus/`):
```bash
cp argus/.env.example .env # if example exists, otherwise create manually
```
`.env` contents:
```
GEMINI_API_KEY=your_gemini_api_key_here
BACKEND_BASE_URL=https://wahwa.com/api/v1
```
The Gemini API key is also set at runtime from the app's Settings screen — the `.env` is only needed if running Argus directly from the command line.
### 3. Xcode permissions
In Xcode → Signing & Capabilities, ensure the app has:
- **Screen Recording** — required for screenshot capture
- **App Sandbox** disabled — Screen Recording is granted through the user-approved privacy (TCC) prompt rather than a sandbox entitlement (there is no public `com.apple.security.screen-recording` entitlement key); verify your sandbox configuration allows screen capture before shipping
### 4. Run
Open `LockInBro.xcodeproj` in Xcode and press **Run** (⌘R).
On first launch:
- Grant **Screen Recording** permission when prompted
- Log in or register via the app
- Enter your Gemini API key in Settings
---
## Argus (VLM agent)
The `argus/` directory contains the Python screen-analysis agent. It runs as a subprocess of the Swift app — you do not need to launch it manually.
### Running Argus directly (for debugging)
```bash
cd ~/yhack/LockInBro
.venv/bin/python3 -m argus \
--vlm gemini \
--gemini-key YOUR_KEY \
--dry-run \
--task-title "debug run"
```
### Recreating the venv
```bash
rm -rf .venv
python3 -m venv .venv
.venv/bin/pip install -r argus/requirements.txt
```
---
## Project structure
```
LockInBro/
├── .venv/ # Python venv (gitignored)
├── .env # API keys (gitignored)
├── argus/ # VLM agent (Python)
│ ├── requirements.txt
│ ├── capture.py # Reads screenshots from /tmp/lockinbro_capture.jpg
│ ├── loop.py # Main analysis loop
│ ├── vlm.py # Gemini / Ollama client
│ └── ...
├── LockInBro/ # Swift app source
│ ├── SessionManager.swift
│ ├── APIClient.swift
│ └── ...
└── LockInBro.xcodeproj
```
## Backend
Deployed at `https://wahwa.com/api/v1` (FastAPI on DigitalOcean).
Source: `~/yhack/lockinbro-api`

0
argus/__init__.py Normal file
View File

113
argus/__main__.py Normal file
View File

@@ -0,0 +1,113 @@
"""CLI entry point: python -m argus [options]"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import sys
from argus.config import CAPTURE_INTERVAL_S
from argus.loop import run_loop
from argus.vlm import StepInfo, TaskContext
def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
prog="argus",
description="Argus VLM — proactive focus assistant screen analyzer",
)
p.add_argument("--session-id", default="00000000-0000-0000-0000-000000000000")
p.add_argument("--task-title", default="(no task)")
p.add_argument("--task-goal", default="")
p.add_argument(
"--steps-json",
default=None,
help='JSON array of steps: [{"id":"...", "sort_order":1, "title":"...", "status":"pending"}]',
)
p.add_argument("--window-title", default="")
p.add_argument("--vlm", choices=["ollama", "gemini"], default=None, help="VLM backend (default: ollama)")
p.add_argument("--gemini-key", default=None, help="Override GEMINI_API_KEY env var")
p.add_argument("--jwt", default=None, help="Override BACKEND_JWT env var")
p.add_argument("--backend-url", default=None, help="Override BACKEND_BASE_URL env var")
p.add_argument("--dry-run", action="store_true", help="Print JSON instead of POSTing")
p.add_argument("--execute", action="store_true", help="Enable notification + executor flow")
p.add_argument("--mock-sessions", default=None, help='JSON array of mock sessions: [{"task_title":"...", "status":"interrupted", "last_app":"VS Code", "last_file":"solution.cpp", "checkpoint_note":"stuck on impl"}]')
p.add_argument("--iterations", type=int, default=None, help="Stop after N iterations")
p.add_argument("-v", "--verbose", action="store_true")
return p.parse_args()
def main() -> None:
    """CLI entry point: configure logging, build the task context, run the loop."""
    opts = _parse_args()

    logging.basicConfig(
        level=logging.DEBUG if opts.verbose else logging.INFO,
        format="%(asctime)s %(levelname)-5s %(name)s %(message)s",
        datefmt="%H:%M:%S",
    )
    # httpx logs every request/response at INFO — suppress to WARNING to avoid noisy 404s
    for noisy_logger in ("httpx", "httpcore"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    # Decode --steps-json (when present) into StepInfo records.
    step_infos: list[StepInfo] = []
    if opts.steps_json:
        step_infos = [
            StepInfo(
                id=raw["id"],
                sort_order=raw["sort_order"],
                title=raw["title"],
                status=raw.get("status", "pending"),
                checkpoint_note=raw.get("checkpoint_note"),
            )
            for raw in json.loads(opts.steps_json)
        ]

    ctx = TaskContext(
        task_title=opts.task_title,
        task_goal=opts.task_goal,
        steps=step_infos,
        window_title=opts.window_title,
        session_id=opts.session_id,
    )

    # Decode --mock-sessions (when present) into SessionInfo records.
    mock_sessions = None
    if opts.mock_sessions:
        from argus.session import SessionInfo

        mock_sessions = [
            SessionInfo(
                session_id=raw.get("session_id", f"mock-{i}"),
                task_id=raw.get("task_id"),
                task_title=raw.get("task_title", ""),
                task_goal=raw.get("task_goal", ""),
                status=raw.get("status", "interrupted"),
                last_app=raw.get("last_app", ""),
                last_file=raw.get("last_file", ""),
                checkpoint_note=raw.get("checkpoint_note", ""),
                started_at=raw.get("started_at", ""),
                ended_at=raw.get("ended_at"),
                minutes_ago=raw.get("minutes_ago", 30),
            )
            for i, raw in enumerate(json.loads(opts.mock_sessions))
        ]

    asyncio.run(
        run_loop(
            ctx,
            api_key=opts.gemini_key,
            vlm_backend=opts.vlm,
            jwt=opts.jwt,
            base_url=opts.backend_url,
            dry_run=opts.dry_run,
            max_iterations=opts.iterations,
            auto_execute=opts.execute,
            mock_sessions=mock_sessions,
        )
    )


if __name__ == "__main__":
    main()

50
argus/backend.py Normal file
View File

@@ -0,0 +1,50 @@
"""Backend API client — sends VLM analysis results to the LockInBro server.
POST /distractions/analyze-result with the enriched JSON payload.
"""
from __future__ import annotations
import logging
import httpx
from argus.config import BACKEND_BASE_URL, BACKEND_JWT
from argus.vlm import VLMResult
log = logging.getLogger(__name__)
async def send_analysis(
    result: VLMResult,
    session_id: str,
    *,
    jwt: str | None = None,
    base_url: str | None = None,
) -> dict:
    """Post VLM analysis result to the backend.

    Returns the backend response body on success, or an error dict.
    """
    endpoint = f"{base_url or BACKEND_BASE_URL}/distractions/analyze-result"
    auth_token = jwt or BACKEND_JWT
    body = result.to_backend_payload(session_id)

    request_headers: dict[str, str] = {}
    if auth_token:
        request_headers["Authorization"] = f"Bearer {auth_token}"

    log.info("Sending analysis to %s on_task=%s", endpoint, result.on_task)
    log.debug("Payload: %s", body)
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(endpoint, json=body, headers=request_headers)
            resp.raise_for_status()
            return resp.json()
    except httpx.HTTPStatusError as exc:
        # Server answered with a 4xx/5xx — surface status code to the caller.
        log.error("Backend returned %s: %s", exc.response.status_code, exc.response.text)
        return {"error": str(exc), "status_code": exc.response.status_code}
    except httpx.RequestError as exc:
        # Network-level failure (DNS, refused connection, timeout).
        log.error("Backend unreachable: %s", exc)
        return {"error": str(exc)}

123
argus/buffer.py Normal file
View File

@@ -0,0 +1,123 @@
"""Rolling history buffer for VLM screenshot analysis.
Two tiers:
- Image buffer: deque(maxlen=4) of recent screenshots sent as images
- Text history: deque(maxlen=12) of older VLM summaries + previous outputs
for extended context (what happened 30-60s ago) and self-refinement
"""
from __future__ import annotations
import time
from collections import deque
from dataclasses import dataclass, field
@dataclass
class BufferEntry:
    """One screenshot kept in the image tier of the rolling buffer."""

    jpeg: bytes  # encoded screenshot bytes sent to the VLM
    vlm_summary: str  # VLM summary associated with this frame
    timestamp: float = field(default_factory=time.time)  # capture time (epoch seconds)


@dataclass
class TextEntry:
    """One VLM summary kept in the longer text-only tier."""

    vlm_summary: str
    timestamp: float


class HistoryBuffer:
    """Two-tier rolling history for VLM screenshot analysis.

    - Image tier: deque of the most recent screenshots (sent as images).
    - Text tier: longer deque of summaries for extended context (30-60s ago).
    Also tracks the previous VLM JSON output (for self-refinement) and the
    result of the last executor action.
    """

    def __init__(self, image_maxlen: int = 4, text_maxlen: int = 12):
        self._images: deque[BufferEntry] = deque(maxlen=image_maxlen)
        self._text_history: deque[TextEntry] = deque(maxlen=text_maxlen)
        self._last_output: dict | None = None
        self._last_execution: str | None = None

    def push(self, jpeg: bytes, vlm_summary: str) -> None:
        """Record a new frame + summary in both tiers (shared timestamp)."""
        now = time.time()
        # When an image entry gets evicted from the image buffer,
        # it's already captured in text_history, so nothing extra needed.
        self._images.append(BufferEntry(jpeg=jpeg, vlm_summary=vlm_summary, timestamp=now))
        self._text_history.append(TextEntry(vlm_summary=vlm_summary, timestamp=now))

    def set_last_output(self, output: dict) -> None:
        """Store the previous VLM JSON output for self-refinement."""
        self._last_output = output

    def set_last_execution(self, summary: str | None) -> None:
        """Store the result of the last executor action."""
        self._last_execution = summary

    def get_last_execution(self) -> str | None:
        return self._last_execution

    def clear_last_execution(self) -> None:
        self._last_execution = None

    def get_entries(self) -> list[BufferEntry]:
        """Return image entries oldest-first."""
        return list(self._images)

    def format_for_prompt(self) -> str:
        """Format the full timeline: text history + image labels."""
        now = time.time()
        lines: list[str] = []
        # Text-only history (entries older than what's in the image buffer).
        # NOTE: dedup is by timestamp rounded to 10ms — two pushes inside the
        # same 10ms window can collide; acceptable at the 2.5s capture cadence.
        image_timestamps = {round(e.timestamp, 2) for e in self._images}
        older = [
            e for e in self._text_history
            if round(e.timestamp, 2) not in image_timestamps
        ]
        older = [e for e in older if e.vlm_summary]  # skip empty summaries
        if older:
            lines.append("Older context (text only, no images):")
            for entry in older:
                ago = int(now - entry.timestamp)
                lines.append(f" - [{ago}s ago] {entry.vlm_summary}")
            lines.append("")
        # Image timeline
        n = len(self._images)
        if n > 0:
            lines.append(f"Recent screenshots ({n} prior + 1 current = {n + 1} images):")
            for i, entry in enumerate(self._images):
                ago = int(now - entry.timestamp)
                lines.append(f" - Screenshot {i + 1}/{n + 1}: [{ago}s ago]")
            lines.append(f" - Screenshot {n + 1}/{n + 1}: [now] (current)")
        else:
            lines.append("Screenshots:")
            lines.append(" - Screenshot 1/1: [now] (current, first capture)")
        return "\n".join(lines)

    def format_last_output(self) -> str:
        """Format previous VLM output for self-refinement context.

        Returns "" when no previous output has been stored.
        """
        if not self._last_output:
            return ""
        # Only include the key fields, not the full blob
        prev = self._last_output
        # "friction" may be present but explicitly None — the original
        # `prev.get('friction', {})` returned None in that case and crashed
        # on the chained .get(); normalize to a dict first.
        friction = prev.get("friction") or {}
        parts = [
            f" on_task: {prev.get('on_task')}",
            f" app: {prev.get('app_name')}",
            f" friction: {friction.get('type')}",
            f" summary: {prev.get('vlm_summary')}",
        ]
        note = prev.get("checkpoint_note_update")
        if note:
            parts.append(f" checkpoint: {note}")
        desc = friction.get("description")
        if desc:
            parts.append(f" friction_desc: {desc}")
        return "\n".join(parts)

    def __len__(self) -> int:
        return len(self._images)

    def clear(self) -> None:
        """Reset all tiers and cached outputs."""
        self._images.clear()
        self._text_history.clear()
        self._last_output = None
        self._last_execution = None

80
argus/capture.py Normal file
View File

@@ -0,0 +1,80 @@
"""Screenshot capture for macOS.
Reads a JPEG file written by the Swift host app (LockInBro.app), which holds
the Screen Recording TCC permission. The Swift app writes atomically to
/tmp/lockinbro_capture.jpg every ~5s using ScreenCaptureKit.
capture_screenshot() blocks up to 15 seconds waiting for a fresh file,
then raises RuntimeError if nothing arrives — so loop.py can handle it
gracefully without falling back to the screencapture CLI (which would fail
with a permissions error when called from a Python subprocess).
"""
from __future__ import annotations
import io
import os
import time
from PIL import Image
from argus.config import SCREENSHOT_JPEG_QUALITY, SCREENSHOT_MAX_WIDTH
_SWIFT_FRAME_PATH = "/tmp/lockinbro_capture.jpg"
_SWIFT_FRAME_MAX_AGE_S = 10.0 # treat as stale if older than this
_WAIT_TIMEOUT_S = 15.0 # how long to wait for Swift to write the first frame
_WAIT_POLL_S = 0.5 # polling interval while waiting
def _to_jpeg(img: Image.Image) -> bytes:
    """Convert a PIL Image to JPEG bytes, downscaling and normalizing mode.

    - Resizes to SCREENSHOT_MAX_WIDTH when wider, preserving aspect ratio.
    - Converts any mode JPEG cannot encode to RGB. The original only handled
      RGBA, so palette ("P") or grayscale-alpha ("LA") inputs crashed
      img.save() with "cannot write mode ... as JPEG".
    """
    if img.width > SCREENSHOT_MAX_WIDTH:
        ratio = SCREENSHOT_MAX_WIDTH / img.width
        img = img.resize(
            (SCREENSHOT_MAX_WIDTH, int(img.height * ratio)),
            Image.LANCZOS,  # alias of Image.Resampling.LANCZOS on Pillow >= 9.1
        )
    # JPEG supports RGB, L and CMYK natively; everything else → RGB.
    if img.mode not in ("RGB", "L", "CMYK"):
        img = img.convert("RGB")
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=SCREENSHOT_JPEG_QUALITY)
    return buf.getvalue()
def _read_swift_frame() -> bytes | None:
    """Return JPEG bytes from the Swift-provided file if it exists and is fresh."""
    try:
        mtime = os.path.getmtime(_SWIFT_FRAME_PATH)
        if time.time() - mtime >= _SWIFT_FRAME_MAX_AGE_S:
            return None  # stale frame — Swift app probably stopped writing
        with open(_SWIFT_FRAME_PATH, "rb") as fh:
            raw = fh.read()
        img = Image.open(io.BytesIO(raw))
        img.load()  # force full decode so truncated writes fail here
        return _to_jpeg(img)
    except Exception:
        # Best-effort by design: missing file, partial write, or decode
        # failure all mean "no fresh frame yet".
        return None
def capture_screenshot() -> bytes:
    """Return JPEG bytes for the current screen.

    Blocks up to _WAIT_TIMEOUT_S seconds waiting for the Swift app to write
    a fresh frame. Raises RuntimeError if no frame arrives in time.
    """
    give_up_at = time.time() + _WAIT_TIMEOUT_S
    while True:
        if time.time() >= give_up_at:
            raise RuntimeError(
                f"No screenshot received from LockInBro.app within {_WAIT_TIMEOUT_S}s. "
                "Make sure the app is running and has Screen Recording permission."
            )
        jpeg = _read_swift_frame()
        if jpeg is not None:
            return jpeg
        time.sleep(_WAIT_POLL_S)
def load_image_file(path: str) -> bytes:
    """Read an image from disk and re-encode it as JPEG bytes (test helper)."""
    img = Image.open(path)
    return _to_jpeg(img)

28
argus/config.py Normal file
View File

@@ -0,0 +1,28 @@
import os
from pathlib import Path
from dotenv import load_dotenv

# Load the project-root .env (one directory above argus/); override=True lets
# the file win over variables already present in the environment.
load_dotenv(Path(__file__).resolve().parent.parent / ".env", override=True)

# ── Gemini (cloud VLM) ───────────────────────────────────────────────────
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-pro")
# REST endpoint derived from the model name at import time — changing
# GEMINI_MODEL after import will NOT update this URL.
GEMINI_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent"
)

# Ollama (local VLM — default)
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen3.5:9b")
# VLM backend: "ollama" (default) or "gemini"
VLM_BACKEND = os.environ.get("VLM_BACKEND", "ollama")

# ── LockInBro backend API ────────────────────────────────────────────────
BACKEND_BASE_URL = os.environ.get("BACKEND_BASE_URL", "https://wahwa.com/api/v1")
BACKEND_JWT = os.environ.get("BACKEND_JWT", "")

# ── Capture cadence & encoding ───────────────────────────────────────────
CAPTURE_INTERVAL_S = 2.5  # screenshot every 2.5s
BUFFER_MAX_LEN = 4  # 4 frames accumulated
VLM_CALL_EVERY_N = 4  # call VLM every 4 captures (= every 10s)
SCREENSHOT_JPEG_QUALITY = 50
SCREENSHOT_MAX_WIDTH = 1280

384
argus/executor.py Normal file
View File

@@ -0,0 +1,384 @@
"""Agentic executor — uses Gemini 2.5 Pro with tool use to complete actions.
When the user approves a proposed action, the executor gets:
- The recent screenshots (so it can read source content)
- Tool access: read_file, write_file, run_command
- A loop: Gemini proposes tool calls → we execute → feed results back → repeat
This is a full agent loop, not a single-shot LLM call.
Swift portability notes:
- The agent loop is the same HTTP pattern (Gemini function calling API)
- Tool implementations map to FileManager, NSTask, NSPasteboard
- The loop structure is identical in Swift async/await
"""
from __future__ import annotations
import asyncio
import base64
import logging
import os
import subprocess
import httpx
from argus.buffer import HistoryBuffer
from argus.config import GEMINI_API_KEY
log = logging.getLogger(__name__)
# Model + endpoint for the executor agent (separate from the analysis VLM).
EXECUTOR_MODEL = os.environ.get("EXECUTOR_MODEL", "gemini-2.5-pro")
# Derived at import time — changing EXECUTOR_MODEL later will not update it.
EXECUTOR_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{EXECUTOR_MODEL}:generateContent"
)
# Hard cap on agent-loop iterations so a confused model cannot loop forever.
MAX_AGENT_STEPS = 10
# ── Tool definitions (Gemini function calling format) ────────────────────
# Declarations only — the matching implementations are the _exec_* functions
# below, dispatched via _execute_tool().
TOOLS = [
    {
        "functionDeclarations": [
            {
                "name": "read_file",
                "description": "Read the contents of a file. Use this to inspect source files, code, configs, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Absolute or relative file path to read",
                        }
                    },
                    "required": ["path"],
                },
            },
            {
                "name": "output",
                "description": "Display content to the user as a sticky note. Use for: extracted text from PDFs/images, form fill suggestions, content the user needs to paste into binary formats (docx, ppt, websites). The user will copy/paste from this.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "title": {
                            "type": "string",
                            "description": "Short title (e.g. 'Extracted receipt', 'Form values')",
                        },
                        "content": {
                            "type": "string",
                            "description": "The full content to display",
                        },
                    },
                    "required": ["title", "content"],
                },
            },
            {
                "name": "write_file",
                "description": "Write content to an EXISTING plain text file (code, markdown, config, txt). Only use when: (1) you have confirmed the file path via read_file or mdfind, AND (2) the file is a plain text format you can write correctly. NEVER use for binary formats (docx, ppt, pdf, images).",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Absolute file path (must have been confirmed to exist first)",
                        },
                        "content": {
                            "type": "string",
                            "description": "Complete file content to write",
                        },
                    },
                    "required": ["path", "content"],
                },
            },
            {
                "name": "run_command",
                "description": "Run a shell command and return its output. Use for compiling, testing, listing files, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "command": {
                            "type": "string",
                            "description": "The shell command to execute",
                        }
                    },
                    "required": ["command"],
                },
            },
            {
                "name": "done",
                "description": "Signal that the task is complete. Call this AFTER using output() to present results. Summary should describe what was shown to the user, not file operations.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "summary": {
                            "type": "string",
                            "description": "Brief summary of what was done",
                        }
                    },
                    "required": ["summary"],
                },
            },
        ]
    }
]
# ── Tool implementations ─────────────────────────────────────────────────
def _exec_read_file(path: str) -> str:
resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path)
try:
with open(resolved, "r") as f:
content = f.read()
log.info(" read_file: %s (%d bytes)", resolved, len(content))
return content
except OSError as e:
return f"Error reading {resolved}: {e}"
def _exec_write_file(path: str, content: str) -> str:
resolved = path if os.path.isabs(path) else os.path.join(os.getcwd(), path)
# Safety: only write to existing text files
if not os.path.exists(resolved):
return f"Error: {resolved} does not exist. Use output() instead for new content."
try:
with open(resolved, "r"):
pass # confirm it's readable as text
except (OSError, UnicodeDecodeError):
return f"Error: {resolved} is not a readable text file. Use output() instead."
try:
with open(resolved, "w") as f:
f.write(content)
log.info(" write_file: %s (%d bytes)", resolved, len(content))
return f"Successfully wrote {len(content)} bytes to {resolved}"
except OSError as e:
return f"Error writing {resolved}: {e}"
def _exec_output(title: str, content: str) -> str:
    """Display content to the user via terminal.

    Swift portability: becomes a sticky note / floating card UI.
    Returns a confirmation string fed back to the agent.
    """
    # NOTE(review): the box-drawing characters in this function were corrupted
    # to empty string literals in the source; reconstructed below — confirm
    # the glyph choices against the original rendering.
    print()
    print(f"┌── 📋 {title} " + "─" * max(0, 50 - len(title)) + "┐")
    for line in content.split("\n"):
        print(f"│ {line}")
    print("└" + "─" * 58 + "┘")
    print()
    log.info(" output: %s (%d chars)", title, len(content))
    return f"Displayed '{title}' to user ({len(content)} chars)"
def _exec_run_command(command: str) -> str:
    """Run `command` through the shell and return combined stdout/stderr,
    annotated with the exit code when non-zero and capped at 4000 chars."""
    log.info(" run_command: %s", command[:80])
    try:
        proc = subprocess.run(
            command, shell=True, capture_output=True, text=True, timeout=30
        )
    except subprocess.TimeoutExpired:
        return "Error: command timed out after 30s"
    pieces = [proc.stdout]
    if proc.stderr:
        pieces.append("\nSTDERR:\n" + proc.stderr)
    if proc.returncode != 0:
        pieces.append(f"\n(exit code {proc.returncode})")
    return "".join(pieces)[:4000]  # cap output length
def _execute_tool(name: str, args: dict) -> str:
if name == "read_file":
return _exec_read_file(args["path"])
elif name == "output":
return _exec_output(args["title"], args["content"])
elif name == "write_file":
return _exec_write_file(args["path"], args["content"])
elif name == "run_command":
return _exec_run_command(args["command"])
elif name == "done":
return args.get("summary", "Done.")
else:
return f"Unknown tool: {name}"
# ── Agent loop ───────────────────────────────────────────────────────────
async def execute(
    vlm_payload: dict,
    action_index: int = 0,
    *,
    history: HistoryBuffer | None = None,
    current_screenshot: bytes | None = None,
    api_key: str | None = None,
) -> str | None:
    """Run the agentic executor loop.

    The agent can read files, write files, and run commands to complete
    the user's approved action. It loops until it calls done() or hits
    the step limit.

    Args:
        vlm_payload: Raw VLM analysis JSON; friction["proposed_actions"] holds
            the candidate actions.
        action_index: Which proposed action the user approved.
        history: Optional screenshot history so the agent can read content.
        current_screenshot: Optional most-recent frame (JPEG bytes).
        api_key: Gemini key override; falls back to GEMINI_API_KEY.

    Returns a summary of what was done, or None on failure.
    """
    friction = vlm_payload.get("friction", {})
    actions = friction.get("proposed_actions", [])
    if action_index >= len(actions):
        log.warning("Action index %d out of range", action_index)
        return None
    chosen = actions[action_index]
    key = api_key or GEMINI_API_KEY
    if not key:
        log.warning("No API key for executor")
        return None
    log.info("Agent executing: %s", chosen.get("label", "?")[:80])
    # Build initial message with screenshots + task context
    initial_parts = _build_initial_parts(vlm_payload, chosen, history, current_screenshot)
    # Conversation history for the agent loop
    contents = [{"role": "user", "parts": initial_parts}]
    for step in range(MAX_AGENT_STEPS):
        log.debug("Agent step %d/%d", step + 1, MAX_AGENT_STEPS)
        payload = {
            "contents": contents,
            "tools": TOOLS,
            "generationConfig": {"temperature": 0.2, "maxOutputTokens": 8192},
        }
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                # Retry up to 3 times on 429 rate limits with exponential backoff.
                for attempt in range(3):
                    resp = await client.post(f"{EXECUTOR_URL}?key={key}", json=payload)
                    if resp.status_code == 429:
                        wait = 2 ** attempt
                        log.warning("Executor 429, retrying in %ds...", wait)
                        await asyncio.sleep(wait)
                        continue
                    resp.raise_for_status()
                    break
                else:
                    # All retries returned 429 — raise so the except below logs it.
                    resp.raise_for_status()
        except Exception:
            log.exception("Agent API call failed at step %d", step + 1)
            return None
        body = resp.json()
        candidate = body["candidates"][0]
        response_parts = candidate["content"]["parts"]
        # Add assistant response to conversation
        contents.append({"role": "model", "parts": response_parts})
        # Check for function calls
        function_calls = [p for p in response_parts if "functionCall" in p]
        if not function_calls:
            # No tool calls — agent returned text, we're done
            text = "".join(p.get("text", "") for p in response_parts)
            log.info("Agent finished with text response (step %d)", step + 1)
            return text.strip() if text.strip() else "Done."
        # Execute each tool call
        tool_results = []
        done_summary = None
        for fc_part in function_calls:
            fc = fc_part["functionCall"]
            name = fc["name"]
            args = fc.get("args", {})
            print(f" 🔧 {name}({_summarize_args(args)})")
            result = _execute_tool(name, args)
            tool_results.append({
                "functionResponse": {
                    "name": name,
                    "response": {"result": result},
                }
            })
            if name == "done":
                done_summary = result
        # Add tool results to conversation; even after done() the results are
        # appended so the transcript stays well-formed.
        contents.append({"role": "user", "parts": tool_results})
        if done_summary:
            log.info("Agent called done() at step %d: %s", step + 1, done_summary[:80])
            return done_summary
    log.warning("Agent hit step limit (%d)", MAX_AGENT_STEPS)
    return "Agent reached maximum steps without completing."
def _build_initial_parts(
    vlm_payload: dict,
    action: dict,
    history: HistoryBuffer | None,
    current_screenshot: bytes | None,
) -> list[dict]:
    """Build the initial message parts: screenshots + task prompt.

    Part order: labeled history screenshots (oldest first), the current
    screenshot when provided, then the instruction prompt text.
    """
    parts: list[dict] = []
    # Include screenshots so agent can read source content
    if history:
        entries = history.get_entries()
        for i, entry in enumerate(entries):
            b64 = base64.b64encode(entry.jpeg).decode()
            parts.append({"text": f"[Screenshot {i + 1}/{len(entries)}]"})
            parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    if current_screenshot:
        b64 = base64.b64encode(current_screenshot).decode()
        parts.append({"text": "[Current screenshot]"})
        parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    friction = vlm_payload.get("friction", {})
    prompt = f"""\
The user approved this action. Complete it using the tools available to you.
ACTION: {action.get('label', '')}
DETAILS: {action.get('details', '')}
Context:
User's task: {vlm_payload.get('inferred_task', '')}
Problem: {friction.get('description', '')}
Current state: {vlm_payload.get('checkpoint_note_update', '')}
Application: {vlm_payload.get('app_name', '')}
Source: {friction.get('source_context', '')}
Target: {friction.get('target_context', '')}
INSTRUCTIONS:
1. For BINARY files (PDFs, images, etc.): use your VISION. Read content directly
from the screenshots — this is your most reliable source for non-text files.
2. For TEXT files (code, markdown, configs, txt): use read_file to get exact content.
3. If you need a file but only know the filename (not the path), FIND IT FIRST:
- run_command("mdfind -name 'filename'") — fast macOS Spotlight search
- run_command("lsof -c AppName | grep filename") — find what file an app has open
Do NOT guess paths. Search first.
4. Choose the right output method:
- Binary format targets (docx, ppt, website forms, PDFs): use output() — user will copy/paste.
- Existing plain text files (code, markdown, config): use write_file() — modify directly.
- write_file() only works on files that ALREADY EXIST. Confirm the path with read_file first.
5. Use run_command to compile, test, or search for files. Never to write files.
6. Do NOT hallucinate content. If you can't read something, say so.
7. Call done() with a summary when the action is complete.
Working directory: {os.getcwd()}"""
    parts.append({"text": prompt})
    return parts
def _summarize_args(args: dict) -> str:
"""Short summary of tool args for terminal display."""
parts = []
for k, v in args.items():
sv = str(v)
if len(sv) > 50:
sv = sv[:47] + "..."
parts.append(f"{k}={sv}")
return ", ".join(parts)

415
argus/loop.py Normal file
View File

@@ -0,0 +1,415 @@
"""Main orchestrator loop — capture → analyze → notify → execute.
Three concurrent tasks:
1. VLM loop: capture screenshots, analyze, push results
2. Notification handler: show cards when friction detected, wait for response
3. Input reader: read user's terminal input (1=accept, 0=dismiss)
Swift portability notes:
- The VLM loop becomes a Timer or DispatchSourceTimer
- The notification handler becomes SwiftUI state updates
- The input reader becomes button tap handlers
- The three tasks map to Swift's structured concurrency (async let / TaskGroup)
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
from argus.backend import send_analysis
from argus.buffer import HistoryBuffer
from argus.capture import capture_screenshot
from argus.config import BUFFER_MAX_LEN, CAPTURE_INTERVAL_S, VLM_CALL_EVERY_N
from argus.executor import execute
from argus.notification import NotificationManager
from argus.session import SessionManager
from argus.vlm import TaskContext, VLMResult, analyze_screenshot
log = logging.getLogger(__name__)
# Track session actions already shown to avoid repeating
_handled_session_actions: set[str] = set()
def _session_action_key(sa) -> str:
return f"{sa.type}:{sa.session_id}"
def _show_session_card(title: str, body: str, options: list[str]) -> None:
"""Display a session-related card (not friction). Swift: native notification."""
print()
print("" + "" * 58 + "")
print(f"{title:<56}")
print("" + " " * 58 + "")
for line in body.split("\n"):
print(f"{line:<56}")
print("" + " " * 58 + "")
for i, opt in enumerate(options):
print(f"│ [{i + 1}] {opt:<53}")
print(f"│ [0] Not now{' ' * 44}")
print("" + "" * 58 + "")
print()
async def _wait_for_input() -> int:
"""Wait for a single integer input from stdin. Returns the number or 0."""
loop = asyncio.get_event_loop()
line = await loop.run_in_executor(None, sys.stdin.readline)
try:
return int(line.strip())
except (ValueError, EOFError):
return 0
async def _handle_session_action(sa, result, sessions: SessionManager, notifier: NotificationManager) -> None:
    """Handle VLM session_action output — show card and act on approval.

    Supported action types: "resume", "start_new", "complete", "switch".
    Each shows a card, blocks on terminal input (1 = accept, 0 = dismiss),
    then applies the action. In mock mode (sessions._mock) backend calls are
    skipped and only local session state is mutated.

    NOTE(review): `notifier` is unused in this function — confirm intended.

    Swift portability: becomes SwiftUI state changes + button handlers.
    """
    # Skip if we already showed this exact action
    key = _session_action_key(sa)
    if key in _handled_session_actions:
        return
    _handled_session_actions.add(key)
    if sa.type == "resume":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        checkpoint = session.checkpoint_note if session else ""
        task_title = session.task_title if session else sa.reason
        _show_session_card(
            f"📂 Resume: {task_title}",
            f"You left off: {checkpoint}" if checkpoint else sa.reason,
            ["Resume session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                # Real backend: fetch and print the rich resume card fields.
                resume_card = await sessions.get_resume_card(sa.session_id)
                if resume_card:
                    rc = resume_card.get("resume_card", {})
                    print(f" 💡 {rc.get('welcome_back', '')}")
                    print(f" {rc.get('you_were_doing', '')}")
                    print(f" {rc.get('next_step', '')}")
                    print(f" {rc.get('motivation', '')}")
            else:
                # Mock mode: mark the local session active.
                print(f" ✓ Resumed \"{task_title}\"")
                if checkpoint:
                    print(f" Last checkpoint: {checkpoint}")
                if session:
                    session.status = "active"
                    sessions._active_session = session
        else:
            print(" dismissed.")
    elif sa.type == "start_new":
        task = result.inferred_task
        _show_session_card(
            "🆕 New focus session",
            f"You're working on: {task}",
            ["Start focus session"],
        )
        choice = await _wait_for_input()
        if choice == 1:
            if not sessions._mock:
                resp = await sessions.start_session(task)
                if resp:
                    print(f" ✓ Session started: {resp.get('id', '?')}")
            else:
                print(f" ✓ (mock) Session started for \"{task}\"")
        else:
            print(" dismissed.")
    elif sa.type == "complete":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "session"
        _show_session_card(
            f"✅ Complete: {task_title}",
            sa.reason,
            ["Complete session"],
        )
        choice = await _wait_for_input()
        # Only act when approved AND the VLM supplied a session id.
        if choice == 1 and sa.session_id:
            if not sessions._mock:
                ok = await sessions.end_session(sa.session_id)
                if ok:
                    print(f" ✓ Session completed")
            else:
                print(f" ✓ (mock) Session \"{task_title}\" completed")
            # Drop the completed session locally and clear it if it was active.
            sessions._sessions = [s for s in sessions._sessions if s.session_id != sa.session_id]
            if sessions._active_session and sessions._active_session.session_id == sa.session_id:
                sessions._active_session = None
        else:
            print(" dismissed.")
    elif sa.type == "switch":
        session = next((s for s in sessions.sessions if s.session_id == sa.session_id), None)
        task_title = session.task_title if session else "another task"
        _show_session_card(
            f"🔄 Switch to: {task_title}",
            sa.reason,
            [f"Switch to \"{task_title}\""],
        )
        choice = await _wait_for_input()
        if choice == 1:
            # Display-only acknowledgement; no backend call is made here.
            print(f" ✓ Switched to \"{task_title}\"")
        else:
            print(" dismissed.")
def _is_valid_session_id(session_id: str | None, sessions: SessionManager) -> bool:
"""Check if a session_id from VLM output actually exists in our sessions."""
if not session_id:
return False
return any(s.session_id == session_id for s in sessions.sessions)
async def run_loop(
    ctx: TaskContext,
    *,
    api_key: str | None = None,
    vlm_backend: str | None = None,
    jwt: str | None = None,
    base_url: str | None = None,
    dry_run: bool = False,
    max_iterations: int | None = None,
    on_result: callable | None = None,
    auto_execute: bool = False,
    mock_sessions: list | None = None,
) -> None:
    """Run the Argus VLM loop with notification and execution support.

    Captures a screenshot every CAPTURE_INTERVAL_S seconds, batches
    VLM_CALL_EVERY_N frames per VLM call, posts each analysis to the backend
    (unless dry_run), and — when auto_execute is on — drives session
    lifecycle prompts and the friction-card / executor flow in the terminal.

    Args:
        ctx: Task/session context.
        api_key: Gemini API key override.
        vlm_backend: "ollama" or "gemini".
        jwt: Backend JWT override.
        base_url: Backend base URL override.
        dry_run: If True, skip sending to backend (payloads are printed).
        max_iterations: Stop after N VLM iterations (None = forever).
        on_result: Optional callback(VLMResult) per analysis.
        auto_execute: If True, enable notification + executor flow.
        mock_sessions: If given, seed the SessionManager with these
            SessionInfo objects and disable backend refresh.
    """
    history = HistoryBuffer(image_maxlen=BUFFER_MAX_LEN)
    notifier = NotificationManager()
    sessions = SessionManager(jwt=jwt, base_url=base_url)
    iteration = 0
    log.info(
        "Argus loop starting — interval=%ds, session=%s, task=%s, executor=%s",
        CAPTURE_INTERVAL_S,
        ctx.session_id,
        ctx.task_title,
        "on" if auto_execute else "off",
    )
    # Load sessions — mock or from backend
    use_mock = mock_sessions is not None
    if use_mock:
        sessions._sessions = mock_sessions
        sessions._active_session = next((s for s in mock_sessions if s.status == "active"), None)
        sessions._mock = True  # prevent refresh from overwriting
        log.info("Loaded %d mock sessions", len(mock_sessions))
    else:
        sessions._mock = False
        try:
            await sessions.fetch_open_sessions()
        except Exception:
            # Non-fatal: the loop still works without session context.
            log.exception("Startup: failed to fetch sessions — continuing without session context")
    # Log attachment status so it's visible in the console
    if sessions.active:
        s = sessions.active
        log.info(
            "ATTACHED to session %s — task=%r last_app=%s checkpoint=%r",
            s.session_id[:8], s.task_title, s.last_app or "(unknown)", s.checkpoint_note or "(none)",
        )
    else:
        log.info("No active session found — running in monitoring-only mode (dry-run=%s)", dry_run)
    # Pending frames collected between VLM calls
    pending_frames: list[bytes] = []
    capture_count = 0
    try:
        while max_iterations is None or iteration < max_iterations:
            t0 = time.monotonic()
            try:
                # 0. Refresh sessions periodically
                await sessions.maybe_refresh()
                # 1. Capture screenshot every interval
                screenshot = capture_screenshot()
                pending_frames.append(screenshot)
                capture_count += 1
                log.debug("Captured frame %d (%d pending)", capture_count, len(pending_frames))
                # 2. Only call VLM every N captures; otherwise sleep out the
                #    remainder of the interval and capture again.
                if len(pending_frames) < VLM_CALL_EVERY_N:
                    elapsed = time.monotonic() - t0
                    sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
                    if sleep_for > 0:
                        await asyncio.sleep(sleep_for)
                    continue
                # ── VLM call with all pending frames ──
                iteration += 1
                # Push all pending frames into buffer (buffer keeps last BUFFER_MAX_LEN)
                for frame in pending_frames:
                    history.push(frame, "")  # summaries filled after VLM call
                pending_frames.clear()
                t_vlm = time.monotonic()
                result: VLMResult = await analyze_screenshot(
                    screenshot, ctx, history,
                    api_key=api_key,
                    backend=vlm_backend,
                    session_context=sessions.format_for_prompt(),
                )
                t_vlm_done = time.monotonic()
                payload = result.to_backend_payload(ctx.session_id)
                log.info(
                    "[%d] vlm=%.2fs frames=%d friction=%s summary=%s",
                    iteration,
                    t_vlm_done - t_vlm,
                    VLM_CALL_EVERY_N,
                    result.friction.type,
                    result.vlm_summary,
                )
                # Update last entry's summary now that we have it
                history.set_last_output(payload)
                # 4. Print / send to backend
                if dry_run:
                    print(json.dumps(payload, indent=2))
                else:
                    resp = await send_analysis(
                        result, ctx.session_id, jwt=jwt, base_url=base_url
                    )
                    log.debug("Backend response: %s", resp)
                if auto_execute:
                    sa = result.session_action
                    # Validate session_action — check that VLM output relates to the session.
                    # Match against filename, file stem (no extension), or task title.
                    if sa.type in ("resume", "switch") and sa.session_id:
                        session = next(
                            (s for s in sessions.sessions if s.session_id == sa.session_id), None
                        )
                        if session:
                            context = (result.vlm_summary + " " + result.inferred_task + " " + sa.reason).lower()
                            matches = []
                            if session.last_file:
                                matches.append(session.last_file.lower())
                                # Also check stem without extension (e.g. "receipt" from "receipt.pdf")
                                stem = session.last_file.rsplit(".", 1)[0].lower()
                                if stem:
                                    matches.append(stem)
                            if session.task_title:
                                # Check key words from task title
                                for word in session.task_title.lower().split():
                                    if len(word) > 3:  # skip short words
                                        matches.append(word)
                            if matches and not any(m in context for m in matches):
                                log.debug(
                                    "Suppressing session_action=%s — none of %s found in context",
                                    sa.type, matches,
                                )
                                sa.type = "none"
                    # Drop resume/switch actions whose session_id the VLM invented.
                    if not _is_valid_session_id(sa.session_id, sessions) and sa.type in ("resume", "switch"):
                        sa.type = "none"
                    # 5. Session actions take priority — but only if not already handled
                    session_handled = False
                    if sa.type != "none":
                        key = _session_action_key(sa)
                        if key not in _handled_session_actions:
                            await _handle_session_action(sa, result, sessions, notifier)
                            session_handled = True
                    # 6. Friction notification + executor
                    if not session_handled and notifier.should_notify(payload):
                        card = notifier.create_card(payload)
                        notifier.show_card_terminal(card)
                        choice = await _wait_for_input()
                        action_idx = choice - 1
                        if choice > 0:
                            print(f"\n⚡ Executing action {action_idx + 1}...")
                            summary = await execute(
                                payload, action_idx,
                                history=history,
                                current_screenshot=screenshot,
                                api_key=api_key,
                            )
                            # Emit a parseable JSON block so Swift can surface the result
                            # in the HUD and clear the executing spinner.
                            print(json.dumps({"exec_summary": summary or ""}, indent=2), flush=True)
                            if summary:
                                history.set_last_execution(summary)
                        else:
                            print("dismissed.")
                    # 7. Nudge — VLM decided a nudge is appropriate (only if nothing else fired)
                    elif not session_handled and result.gentle_nudge:
                        print(f"\n💛 {result.gentle_nudge}")
                    # 8. Fallback: suggest new session if VLM didn't
                    elif not session_handled and (
                        sa.type == "none"
                        and result.on_task
                        and not sessions.active
                        and sessions.should_suggest_new_session(result.inferred_task)
                    ):
                        task = result.inferred_task
                        print(f"\n🆕 You've been working on \"{task}\" — start a focus session?")
                        print(f" [1] Start [0] Not now")
                        card = notifier.create_card({
                            "friction": {
                                "type": "none",
                                "confidence": 1.0,
                                "description": f"New task detected: {task}",
                                "proposed_actions": [{"label": "Start focus session", "action_type": "other", "details": task}],
                            }
                        })
                        notifier.show_card_terminal(card)
                        accepted, _ = await notifier.wait_for_response(timeout=60.0)
                        if accepted:
                            resp = await sessions.start_session(task)
                            if resp:
                                print(f" ✓ Session started: {resp.get('id', '?')}")
                # Clear execution context after 3 iterations
                if history.get_last_execution() and iteration % 3 == 0:
                    history.clear_last_execution()
                # 9. Callback
                if on_result:
                    on_result(result)
            except Exception:
                # One bad iteration must not kill the monitoring loop.
                log.exception("Error in Argus loop iteration %d", iteration)
            # Sleep for remainder of capture interval
            elapsed = time.monotonic() - t0
            sleep_for = max(0.0, CAPTURE_INTERVAL_S - elapsed)
            if sleep_for > 0:
                await asyncio.sleep(sleep_for)
    finally:
        pass

172
argus/notification.py Normal file
View File

@@ -0,0 +1,172 @@
"""Notification manager — decides when to show action cards to the user.
Only pushes a new notification when the proposed action meaningfully changes.
In production (Swift), this becomes a native macOS notification / floating card.
Here it's a terminal prompt for testing.
Swift portability notes:
- NotificationManager becomes a class with @Published properties
- show_card() triggers a SwiftUI overlay or UNUserNotificationCenter
- user_response() is wired to button taps instead of stdin
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass, field
log = logging.getLogger(__name__)
@dataclass
class ActionCard:
    """A proposed action shown to the user.

    Bundles everything the UI needs to render a friction card, plus the raw
    VLM payload so the executor can act on it if the user approves.
    All fields are populated by NotificationManager.create_card().
    """

    friction_type: str   # VLM friction "type" (e.g. "stalled", "repetitive_loop")
    description: str     # human-readable friction description from the VLM
    actions: list[dict]  # proposed_actions dicts: {"label", "action_type", "details"}
    source: str          # friction source_context (filename or app name)
    target: str          # friction target_context (filename or app name)
    vlm_payload: dict  # full VLM result for the executor
class NotificationManager:
    """Deduplicates notifications and manages the pending action card.

    Only shows a new card when the friction type or proposed action labels
    change from the previous notification.

    Swift portability: becomes a class with @Published properties; the
    terminal prompt/stdin flow becomes native UI + button tap handlers.
    """

    # Action types that the executor can actually perform
    ACTIONABLE_TYPES = {"auto_extract", "brain_dump", "auto_fill", "summarize"}

    def __init__(self):
        self._last_fingerprint: str = ""
        self._pending_card: ActionCard | None = None
        self._response_event: asyncio.Event = asyncio.Event()
        self._user_accepted: bool = False
        # Fix: wait_for_response() reads this attribute, so it must exist
        # even before the first submit_response() call.
        self._selected_index: int = -1

    def _fingerprint(self, vlm_payload: dict) -> str:
        """Generate a dedup key from friction type + action labels."""
        friction = vlm_payload.get("friction", {})
        ftype = friction.get("type", "none")
        labels = tuple(
            a.get("label", "") for a in friction.get("proposed_actions", [])
        )
        return f"{ftype}:{labels}"

    def should_notify(self, vlm_payload: dict) -> bool:
        """Check if this VLM result warrants a friction card (executor action).

        Only fires when proposed_actions contain something the executor can
        act on. Generic suggestions (action_type: "other") are nudges, not
        executor actions. A card identical to the last one shown (same
        friction type + labels) is suppressed.
        """
        friction = vlm_payload.get("friction", {})
        # No friction or no proposed actions → no notification
        if friction.get("type") == "none":
            return False
        actions = friction.get("proposed_actions", [])
        if not actions:
            return False
        # Only notify if at least one action is executor-actionable
        has_actionable = any(
            a.get("action_type") in self.ACTIONABLE_TYPES for a in actions
        )
        if not has_actionable:
            return False
        # Same as last notification → skip
        fp = self._fingerprint(vlm_payload)
        if fp == self._last_fingerprint:
            return False
        return True

    def create_card(self, vlm_payload: dict) -> ActionCard:
        """Create an action card from VLM output and mark it as pending.

        Also records the dedup fingerprint and resets the response state so
        a fresh wait_for_response() starts clean.
        """
        friction = vlm_payload.get("friction", {})
        card = ActionCard(
            friction_type=friction.get("type", ""),
            description=friction.get("description", ""),
            actions=friction.get("proposed_actions", []),
            source=friction.get("source_context", ""),
            target=friction.get("target_context", ""),
            vlm_payload=vlm_payload,
        )
        self._pending_card = card
        self._last_fingerprint = self._fingerprint(vlm_payload)
        self._response_event.clear()
        self._user_accepted = False
        return card

    def show_card_terminal(self, card: ActionCard) -> None:
        """Print the action card to terminal. (Swift: show native UI.)

        NOTE(review): several border prints below concatenate empty string
        literals — the box-drawing glyphs (┌ ─ ┐ └ ┘) appear to have been
        lost in transit (│ survives in the action rows). Confirm against the
        original source before restoring them; left byte-identical here.
        """
        print()
        print("" + "" * 58 + "")
        print(f"{'🔧 Friction detected':^58}")
        print("" + " " * 58 + "")
        desc_lines = _wrap(card.description, 56)
        for line in desc_lines:
            print(f"{line:<56}")
        print("" + " " * 58 + "")
        for i, action in enumerate(card.actions):
            label = action.get("label", "?")
            print(f"│ [{i + 1}] {label:<53}")
        print(f"│ [0] Not now{' ' * 44}")
        print("" + "" * 58 + "")
        print()

    async def wait_for_response(self, timeout: float = 30.0) -> tuple[bool, int]:
        """Wait for user response. Returns (accepted, action_index).

        On timeout the pending card is dropped and (False, -1) is returned.
        Swift portability: this becomes a button tap callback.
        """
        try:
            await asyncio.wait_for(self._response_event.wait(), timeout=timeout)
            return self._user_accepted, self._selected_index
        except asyncio.TimeoutError:
            log.debug("Notification timed out, dismissing")
            self._pending_card = None
            return False, -1

    def submit_response(self, choice: int) -> None:
        """Submit user's choice. Called from input reader task.

        choice is 1-based; 0 (or no pending card) means dismiss.
        Swift portability: called from button tap handler.
        """
        if choice > 0 and self._pending_card:
            self._user_accepted = True
            self._selected_index = choice - 1
        else:
            self._user_accepted = False
            self._selected_index = -1
        self._response_event.set()

    @property
    def pending(self) -> ActionCard | None:
        """The card currently awaiting a user decision, if any."""
        return self._pending_card

    def dismiss(self) -> None:
        """Dismiss current card without action and reset dedup state."""
        self._pending_card = None
        self._last_fingerprint = ""
def _wrap(text: str, width: int) -> list[str]:
"""Simple word wrap."""
words = text.split()
lines: list[str] = []
current = ""
for word in words:
if len(current) + len(word) + 1 <= width:
current = f"{current} {word}" if current else word
else:
if current:
lines.append(current)
current = word
if current:
lines.append(current)
return lines or [""]

3
argus/requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
httpx>=0.27
Pillow>=10.0
python-dotenv>=1.0

293
argus/session.py Normal file
View File

@@ -0,0 +1,293 @@
"""Session manager — fetches and caches session state from the backend.
Provides session context to the VLM prompt and handles session lifecycle
actions (start, resume, switch, complete).
Swift portability notes:
- SessionManager becomes an ObservableObject with @Published properties
- Backend calls use URLSession instead of httpx
- match_session() logic is identical
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass, field
import httpx
from argus.config import BACKEND_BASE_URL, BACKEND_JWT
log = logging.getLogger(__name__)
@dataclass
class SessionInfo:
    """Snapshot of one backend session, parsed from /sessions responses.

    Populated by SessionManager._parse_session() from the session JSON and
    its nested "checkpoint" object.
    """

    session_id: str
    task_id: str | None
    task_title: str      # from checkpoint["goal"]
    task_goal: str       # mirrors task_title (see _parse_session)
    status: str  # active | interrupted
    last_app: str        # checkpoint["active_app"]
    last_file: str       # checkpoint["active_file"]
    checkpoint_note: str  # checkpoint last_action_summary / last_vlm_summary
    started_at: str
    ended_at: str | None
    minutes_ago: int | None = None  # minutes since ended_at; None if still open
class SessionManager:
    """Caches open sessions and provides matching + lifecycle operations.

    Swift portability: becomes an ObservableObject with @Published
    properties; httpx calls become URLSession; matching logic is identical.
    """

    # Words too generic to count toward task-title overlap scoring.
    # Shared by match_session() and should_suggest_new_session() (was
    # previously duplicated inline in both methods).
    _STOPWORDS = frozenset(
        {"the", "a", "an", "in", "on", "to", "and", "or", "is", "for", "of", "with"}
    )

    def __init__(self, jwt: str | None = None, base_url: str | None = None):
        self._jwt = jwt or BACKEND_JWT
        self._base_url = base_url or BACKEND_BASE_URL
        self._sessions: list[SessionInfo] = []
        self._active_session: SessionInfo | None = None
        self._last_fetch: float = 0
        self._fetch_interval = 30.0  # re-fetch every 30s
        self._mock: bool = False
        # Track inferred_task stability for new session suggestion
        self._inferred_task_history: list[str] = []
        self._stable_threshold = 3  # consecutive matching inferred_tasks

    @property
    def active(self) -> SessionInfo | None:
        """The currently active session, or None."""
        return self._active_session

    @property
    def sessions(self) -> list[SessionInfo]:
        """All cached sessions (at most one with the /sessions/active API)."""
        return self._sessions

    def has_sessions(self) -> bool:
        """True when at least one session is cached."""
        return len(self._sessions) > 0

    # ── Backend communication ────────────────────────────────────────

    async def fetch_open_sessions(self) -> list[SessionInfo]:
        """Fetch the active session from backend (GET /sessions/active → single object or 404).

        Called on startup and periodically. On HTTP error or unreachable
        backend the previous cache is kept and returned.
        """
        url = f"{self._base_url}/sessions/active"
        headers = {}
        if self._jwt:
            headers["Authorization"] = f"Bearer {self._jwt}"
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.get(url, headers=headers)
                if resp.status_code == 404:
                    # No active session — clear cache
                    self._sessions = []
                    self._active_session = None
                    self._last_fetch = time.monotonic()
                    log.info("No active session on backend")
                    return []
                resp.raise_for_status()
                data = resp.json()
                session = self._parse_session(data)
                self._sessions = [session]
                self._active_session = session
                self._last_fetch = time.monotonic()
                log.info("Fetched active session: %s (%s)", session.session_id, session.task_title)
                return self._sessions
        except httpx.HTTPStatusError as e:
            log.warning("Failed to fetch sessions: %s", e.response.status_code)
            return self._sessions
        except httpx.RequestError as e:
            log.debug("Backend unreachable for sessions: %s", e)
            return self._sessions

    async def maybe_refresh(self) -> None:
        """Re-fetch if stale (older than _fetch_interval). Skips if using mock data."""
        if self._mock:
            return
        if time.monotonic() - self._last_fetch > self._fetch_interval:
            await self.fetch_open_sessions()

    async def start_session(self, task_title: str, task_goal: str = "") -> dict | None:
        """Start a new focus session. Returns session data or None.

        NOTE(review): task_title/task_goal are currently not sent — the
        payload is only {"platform": "mac"}; confirm whether the backend
        should receive them or create the task separately.
        """
        url = f"{self._base_url}/sessions/start"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        # If we have a task_title but no task_id, the backend should
        # create an ad-hoc session (or we create the task first).
        payload = {"platform": "mac"}
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json=payload, headers=headers)
                resp.raise_for_status()
                result = resp.json()
                await self.fetch_open_sessions()  # refresh
                log.info("Started session: %s", result.get("id"))
                return result
        except Exception:
            log.exception("Failed to start session")
            return None

    async def end_session(self, session_id: str, status: str = "completed") -> bool:
        """End a session. Returns True on success."""
        url = f"{self._base_url}/sessions/{session_id}/end"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json={"status": status}, headers=headers)
                resp.raise_for_status()
                await self.fetch_open_sessions()  # refresh
                log.info("Ended session %s (%s)", session_id, status)
                return True
        except Exception:
            log.exception("Failed to end session %s", session_id)
            return False

    async def get_resume_card(self, session_id: str) -> dict | None:
        """Fetch AI-generated resume card for a session, or None on failure."""
        url = f"{self._base_url}/sessions/{session_id}/resume"
        headers = {"Authorization": f"Bearer {self._jwt}"} if self._jwt else {}
        try:
            async with httpx.AsyncClient(timeout=15.0) as client:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                return resp.json()
        except Exception:
            log.exception("Failed to get resume card for %s", session_id)
            return None

    # ── Session matching ─────────────────────────────────────────────

    def match_session(self, inferred_task: str, app_name: str) -> SessionInfo | None:
        """Find an open session that matches the current screen state.

        Scoring: +2 per non-stopword word shared between inferred_task and
        the session title, +3 for an app match, +5 when the session's
        last_file appears in inferred_task. Returns the best match only if
        its score is at least 4, else None.
        """
        if not inferred_task or not self._sessions:
            return None
        inferred_lower = inferred_task.lower()
        best_match: SessionInfo | None = None
        best_score = 0
        for session in self._sessions:
            score = 0
            title_lower = session.task_title.lower()
            # Check for keyword overlap between inferred_task and session task_title
            inferred_words = set(inferred_lower.split())
            title_words = set(title_lower.split())
            overlap = inferred_words & title_words
            # Filter out common words
            overlap -= self._STOPWORDS
            score += len(overlap) * 2
            # App match
            if app_name and session.last_app and app_name.lower() in session.last_app.lower():
                score += 3
            # File match — check if session's last_file appears in inferred_task
            if session.last_file and session.last_file.lower() in inferred_lower:
                score += 5
            if score > best_score:
                best_score = score
                best_match = session
        # Require minimum score to avoid false matches
        if best_score >= 4:
            return best_match
        return None

    def should_suggest_new_session(self, inferred_task: str) -> bool:
        """Check if we should suggest starting a new session.

        Returns True if inferred_task has been stable for N iterations
        and doesn't match any existing session. An empty inferred_task
        resets the stability history.
        """
        if not inferred_task:
            self._inferred_task_history.clear()
            return False
        self._inferred_task_history.append(inferred_task)
        # Keep only recent history
        if len(self._inferred_task_history) > self._stable_threshold + 2:
            self._inferred_task_history = self._inferred_task_history[-self._stable_threshold - 2:]
        if len(self._inferred_task_history) < self._stable_threshold:
            return False
        # Check if the last N inferred tasks are "similar" (share key words)
        recent = self._inferred_task_history[-self._stable_threshold:]
        first_words = set(recent[0].lower().split()) - self._STOPWORDS
        all_similar = all(
            len(first_words & set(t.lower().split())) >= len(first_words) * 0.5
            for t in recent[1:]
        )
        if not all_similar:
            return False
        # Make sure it doesn't match an existing session
        matched = self.match_session(inferred_task, "")
        return matched is None

    # ── Prompt formatting ────────────────────────────────────────────

    def format_for_prompt(self) -> str:
        """Format open sessions for injection into the VLM prompt.

        Includes session_id so VLM can reference the exact ID in session_action.
        """
        if not self._sessions:
            return "(no open sessions)"
        lines: list[str] = []
        for s in self._sessions:
            status_tag = f"[{s.status}]"
            ago = f" (paused {s.minutes_ago}m ago)" if s.minutes_ago else ""
            line = f" session_id=\"{s.session_id}\" {status_tag} \"{s.task_title}\" — last in {s.last_app}"
            if s.last_file:
                line += f"/{s.last_file}"
            if s.checkpoint_note:
                line += f", \"{s.checkpoint_note}\""
            line += ago
            lines.append(line)
        return "\n".join(lines)

    # ── Internal ─────────────────────────────────────────────────────

    def _parse_session(self, data: dict) -> SessionInfo:
        """Build a SessionInfo from the backend session JSON (+ its checkpoint)."""
        checkpoint = data.get("checkpoint", {}) or {}
        ended = data.get("ended_at")
        minutes_ago = None
        if ended:
            import datetime
            try:
                ended_dt = datetime.datetime.fromisoformat(ended.replace("Z", "+00:00"))
                now = datetime.datetime.now(datetime.timezone.utc)
                minutes_ago = int((now - ended_dt).total_seconds() / 60)
            except (ValueError, TypeError):
                pass
        # SessionOut has no nested "task" object — task title is stored in checkpoint["goal"]
        # by POST /sessions/start when a task_id is provided.
        task_title = checkpoint.get("goal", "")
        checkpoint_note = checkpoint.get("last_action_summary", checkpoint.get("last_vlm_summary", ""))
        return SessionInfo(
            session_id=str(data.get("id", "")),
            task_id=data.get("task_id"),
            task_title=task_title,
            task_goal=task_title,
            status=data.get("status", ""),
            last_app=checkpoint.get("active_app", ""),
            last_file=checkpoint.get("active_file", ""),
            checkpoint_note=checkpoint_note,
            started_at=data.get("started_at", ""),
            ended_at=ended,
            minutes_ago=minutes_ago,
        )

442
argus/vlm.py Normal file
View File

@@ -0,0 +1,442 @@
"""VLM client — supports Ollama (local, default) and Gemini (cloud fallback).
Sends the current screenshot plus text summaries of recent analyses.
Parses the structured JSON response.
"""
from __future__ import annotations
import asyncio
import base64
import json
import logging
import re
from dataclasses import dataclass, field
import httpx
from argus.buffer import HistoryBuffer
from argus.config import (
GEMINI_API_KEY,
GEMINI_URL,
OLLAMA_BASE_URL,
OLLAMA_MODEL,
VLM_BACKEND,
)
log = logging.getLogger(__name__)
# ── Task context passed in from the session ──────────────────────────────
@dataclass
class StepInfo:
    """One step of the user's task plan, rendered into the VLM prompt."""

    id: str              # step UUID, echoed back by the VLM in steps_completed
    sort_order: int      # number shown when the step list is rendered
    title: str
    status: str  # pending | in_progress | done | skipped
    checkpoint_note: str | None = None  # latest progress note for this step
@dataclass
class TaskContext:
    """Context about the user's current task/session, fed into the VLM prompt."""

    task_title: str = ""
    task_goal: str = ""
    steps: list[StepInfo] = field(default_factory=list)
    window_title: str = ""  # frontmost window title as reported by the OS
    session_id: str = ""    # backend session id used in the analysis payload
# ── VLM response schema ─────────────────────────────────────────────────
@dataclass
class FrictionInfo:
    """The "friction" sub-object of a VLM analysis response."""

    type: str = "none"  # repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none
    confidence: float = 0.0  # VLM-reported confidence, 0.0-1.0
    description: str | None = None  # what the user is struggling with
    proposed_actions: list[dict] = field(default_factory=list)  # {"label", "action_type", "details"} dicts
    source_context: str | None = None  # filename (or app name) the friction reads from
    target_context: str | None = None  # filename (or app name) the friction writes to
@dataclass
class SessionAction:
    """Session lifecycle action suggested by the VLM."""

    type: str = "none"  # resume | switch | complete | start_new | none
    session_id: str | None = None  # exact backend session id, or None for start_new/none
    reason: str = ""  # VLM's justification for the suggested action
@dataclass
class VLMResult:
    """Parsed result of one VLM analysis call.

    Field meanings follow the JSON schema emitted by build_system_prompt().
    """

    on_task: bool = True
    current_step_id: str | None = None
    inferred_task: str = ""  # what the user is actually working on
    checkpoint_note_update: str | None = None  # progress made across the frames
    steps_completed: list[str] = field(default_factory=list)  # step UUIDs
    friction: FrictionInfo = field(default_factory=FrictionInfo)
    session_action: SessionAction = field(default_factory=SessionAction)
    intent: str | None = None  # skimming | engaged | unclear
    distraction_type: str | None = None  # app_switch | browsing | idle
    app_name: str = ""  # primary visible application
    confidence: float = 0.0
    gentle_nudge: str | None = None  # nudge text if distracted, else None
    vlm_summary: str = ""  # 1-sentence summary of what changed across frames

    def to_backend_payload(self, session_id: str) -> dict:
        """Serialize to the JSON shape expected by POST /distractions/analyze-result."""
        return {
            "session_id": session_id,
            "on_task": self.on_task,
            "current_step_id": self.current_step_id,
            "inferred_task": self.inferred_task,
            "checkpoint_note_update": self.checkpoint_note_update,
            "steps_completed": self.steps_completed,
            "friction": {
                "type": self.friction.type,
                "confidence": self.friction.confidence,
                "description": self.friction.description,
                "proposed_actions": self.friction.proposed_actions,
                "source_context": self.friction.source_context,
                "target_context": self.friction.target_context,
            },
            "session_action": {
                "type": self.session_action.type,
                "session_id": self.session_action.session_id,
                "reason": self.session_action.reason,
            },
            "intent": self.intent,
            "distraction_type": self.distraction_type,
            "app_name": self.app_name,
            "confidence": self.confidence,
            "gentle_nudge": self.gentle_nudge,
            "vlm_summary": self.vlm_summary,
        }
# ── Prompt construction ──────────────────────────────────────────────────
def _format_steps(steps: list[StepInfo]) -> str:
lines: list[str] = []
for s in steps:
marker = {"pending": "", "in_progress": "", "done": "", "skipped": ""}.get(
s.status, "?"
)
line = f" {marker} [{s.status}] (id={s.id}) {s.sort_order}. {s.title}"
if s.checkpoint_note:
line += f" — checkpoint: {s.checkpoint_note}"
lines.append(line)
return "\n".join(lines) if lines else " (no steps)"
def build_system_prompt(ctx: TaskContext, history: HistoryBuffer, session_context: str = "") -> str:
    """Assemble the full system prompt for one VLM analysis call.

    Combines the step list, screenshot-history summaries, the previous
    analysis (for self-correction), the last executor action (to avoid
    re-flagging solved friction), and the open-session context. The returned
    string instructs the model to reply with a single JSON object.
    """
    history_text = history.format_for_prompt()
    steps_text = _format_steps(ctx.steps)
    prev_output = history.format_last_output()
    # Optional section: the model's own previous JSON, for refinement.
    prev_section = ""
    if prev_output:
        prev_section = f"""
Your previous analysis (refine or correct this based on new evidence):
{prev_output}
If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it with new observations.
"""
    # Optional section: a just-completed executor action suppresses re-flagging.
    execution = history.get_last_execution()
    exec_section = ""
    if execution:
        exec_section = f"""
IMPORTANT — An AI agent just completed an action for the user:
{execution}
This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
"""
    return f"""\
You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots.
## How to read the screenshots
You receive screenshots in chronological order (oldest first, newest last).
You receive ~5 frames spanning ~20 seconds (one frame every 4 seconds). This means:
- 2 unchanged frames = 8+ seconds idle. That's significant.
- 3+ unchanged frames = 12-20 seconds idle. The user is stuck or distracted.
- If ALL frames are identical, the user has been idle for 20 seconds — definitely flag it.
- If the user wrote code and then 2+ frames show no changes, they are STUCK NOW.
Do NOT wait for many frames to flag problems. React fast.
Your PRIMARY signal is the DIFFERENCES between consecutive frames.
Where the screen CHANGED = where the user's ATTENTION is.
Where the screen is STATIC = background noise. Ignore it unless the user interacts with it.
Diff signals and what they mean:
- New text appearing / cursor advancing → user is actively typing (THIS is their task)
- Window or tab switch → context change, could be reference lookup or distraction
- Same content, no pixel changes → stalled, idle, or reading
- Repeated switching between same 2-3 apps → repetitive loop (manual data transfer)
- Scroll position change → reading or browsing
- Error message that APPEARED between frames → user just triggered it, relevant
- Error message that was ALREADY THERE in all frames → stale, ignore it
## Task inference
Infer the user's current task from what they are ACTIVELY DOING across the frames.
Do NOT assume static content (old terminal output, background panels, stale errors)
is the task. The region of the screen where pixels are changing IS the task.
CRITICAL — looking at something ≠ working on something:
- User switches to Preview/browser/another app and just LOOKS → this is NOT a new task.
It could be a distraction, a quick reference, or idle browsing.
- User switches to another app AND starts TYPING/EDITING → this might be a new task.
- If the user has an active session and switches away WITHOUT typing in the new app,
they are DISTRACTED from their session, not starting a new task.
- Only infer a new task when there is clear evidence of productive work (typing, editing,
cursor movement between frames) in the new context.
- A single app switch is NEVER enough to infer a new task. Wait for active work.
If an explicit task is provided below, use it. Otherwise, infer from the screenshots.
Task: {ctx.task_title}
Goal: {ctx.task_goal}
Steps:
{steps_text}
Window title reported by OS: {ctx.window_title}
## Open sessions from backend (use EXACT session_id values below)
{session_context if session_context else "(no open sessions — suggest start_new if user is working on something)"}
Session matching rules — be STRICT:
- A session matches ONLY if the user is actively editing the session's last_file.
Being in the same app (e.g. VS Code) is NOT enough. The user must be typing/editing
in the specific file listed in the session (e.g. solution.cpp).
- Chatting in a sidebar, reading logs, or browsing in the same app ≠ working on the session.
- If the user is in VS Code but editing a DIFFERENT file or chatting in a panel,
they are NOT on-task for the session. That's a distraction.
If the session's file IS being actively edited, output session_action: resume with the EXACT session_id.
If the user moved to a different open session's file, output session_action: switch with the EXACT session_id.
IMPORTANT: If you propose a friction action (e.g. auto_extract) that relates to an existing
session's task, ALSO output session_action: resume with that session's ID. The friction action
and session resume should go together — don't propose work related to a session without
linking it to the session.
If the user finished and closed the session's file, output session_action: complete with the EXACT session_id.
If no sessions exist but the user is actively working, output session_action: start_new (session_id: null).
NEVER invent session IDs. Use only the IDs listed above or null.
{prev_section}{exec_section}
Screenshot timeline:
{history_text}
## What to analyze
1. INFERRED TASK: What is the user actually working on right now? Base this on where
the screen is changing, not on static content.
2. CHECKPOINT: What specific progress did the user make across these frames?
Describe what changed (e.g., "typed 3 new lines of C++ code", "scrolled to next section").
3. FRICTION DETECTION: Is the user stuck in any of these patterns?
- REPETITIVE_LOOP: Switching between same 2-3 windows (copying data manually)
- STALLED: No meaningful pixel changes across 2+ frames, OR user wrote code then
deleted/undid it (write-then-delete = struggle, NOT "refining")
- TEDIOUS_MANUAL: Doing automatable work (filling forms, transcribing, copying by hand)
- CONTEXT_OVERHEAD: Many windows open, visibly searching across them
- TASK_RESUMPTION: User just returned to a task from earlier (check text history)
IMPORTANT signals to catch IMMEDIATELY (do NOT wait many frames):
- User wrote code/text then deleted it → STUCK, not refining. Flag stalled.
- User idle for 2+ frames after deletion → definitely stuck. Flag stalled.
- User switching between source doc and target file repeatedly → TEDIOUS_MANUAL.
This is NOT "fluent workflow." If the user is copying data from one place to
another by switching windows, flag it on the SECOND switch. Don't wait.
- Code is incomplete/wrong and user stopped typing → need help.
4. INTENT: If viewing informational content, is the user skimming, engaged, or unclear?
5. PROPOSED ACTION: If friction detected, suggest what the AI could DO.
Be SPECIFIC: "Extract full text from writeup.pdf into transcript.md" not "Summarize text".
The label should be a concrete verb phrase the user can approve with one tap.
CRITICAL: The "details" field is the executor agent's instruction manual. Write it as
a natural language spec — the executor has vision too, so tell it where to look and
what to do, not the raw data:
Bad: "Extract data from the document"
Bad: "Help with the code"
Good: "User is writing a report in report.md and has a PDF open with source data. They are manually copying table values from the PDF into markdown. Extract the table from the PDF (visible in screenshots), format as a markdown table matching the style in report.md, and append to the file."
Good: "User is implementing quicksort in solution.cpp but has been idle after writing the function signature. They appear stuck on the partition logic. Provide a working C++ quicksort implementation that fits their existing code structure."
Respond ONLY with JSON:
{{
"on_task": true,
"current_step_id": "step UUID or null",
"inferred_task": "what the user is actually working on, based on screen diffs",
"checkpoint_note_update": "what changed across these frames specifically",
"steps_completed": ["UUIDs"],
"friction": {{
"type": "repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none",
"confidence": 0.0-1.0,
"description": "what the user is struggling with, based on diff evidence",
"proposed_actions": [
{{"label": "specific verb phrase: what to do, from where, to where", "action_type": "auto_extract | brain_dump | auto_fill | summarize | other", "details": "Natural language spec for executor. Include: (1) what the user wants done, (2) where to look in the screenshots, (3) EXACT format to use — quote what the user already wrote so the executor matches it (e.g. if user wrote '3 tacos with steak' in plain text, say 'format as plain text lines like the user already started, NOT JSON'), (4) target file. The executor has vision to read screenshots — tell it WHERE to look, not the raw data."}}
],
"source_context": "just the filename if visible (e.g. writeup.pdf), or app name if no file",
"target_context": "just the filename if visible (e.g. transcript.md), or app name if no file"
}},
"session_action": {{
"type": "resume | switch | complete | start_new | none",
"session_id": "uuid of matching session, or null for start_new/none",
"reason": "why this session action is suggested"
}},
"intent": "skimming | engaged | unclear | null",
"distraction_type": "app_switch | browsing | idle | null",
"app_name": "primary visible application",
"confidence": 0.0-1.0,
"gentle_nudge": "nudge text if distracted, null otherwise",
"vlm_summary": "1-sentence description of what CHANGED across the frames (not what's static)"
}}"""
# ── Gemini API call ──────────────────────────────────────────────────────
def _extract_json(text: str) -> dict:
"""Extract JSON from Gemini response, handling markdown code fences."""
text = text.strip()
# Strip ```json ... ``` wrappers
m = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
if m:
text = m.group(1).strip()
return json.loads(text)
def _parse_vlm_response(raw: dict) -> VLMResult:
    """Convert the raw JSON dict from the VLM into a typed VLMResult.

    Every field falls back to a safe default when absent, so a partially
    well-formed response still produces a usable result.
    """
    f = raw.get("friction", {})
    sa = raw.get("session_action", {})
    return VLMResult(
        on_task=raw.get("on_task", True),
        current_step_id=raw.get("current_step_id"),
        inferred_task=raw.get("inferred_task", ""),
        checkpoint_note_update=raw.get("checkpoint_note_update"),
        steps_completed=raw.get("steps_completed", []),
        friction=FrictionInfo(
            type=f.get("type", "none"),
            confidence=f.get("confidence", 0.0),
            description=f.get("description"),
            proposed_actions=f.get("proposed_actions", []),
            source_context=f.get("source_context"),
            target_context=f.get("target_context"),
        ),
        session_action=SessionAction(
            type=sa.get("type", "none"),
            session_id=sa.get("session_id"),
            reason=sa.get("reason", ""),
        ),
        intent=raw.get("intent"),
        distraction_type=raw.get("distraction_type"),
        app_name=raw.get("app_name", ""),
        confidence=raw.get("confidence", 0.0),
        gentle_nudge=raw.get("gentle_nudge"),
        vlm_summary=raw.get("vlm_summary", ""),
    )
async def _call_ollama(system_prompt: str, b64_images: list[str]) -> str:
    """Send the prompt and screenshot sequence to the local Ollama VLM.

    Returns the raw text content of the model's reply message.
    """
    request_body = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": "/no_think\nAnalyze this screenshot sequence now.",
                "images": b64_images,
            },
        ],
        "stream": False,
        # keep_alive=-1 keeps the model resident in memory between calls
        "keep_alive": -1,
        "options": {"temperature": 0.2},
    }
    # Local inference can be slow, hence the generous 300s timeout.
    async with httpx.AsyncClient(timeout=300.0) as client:
        response = await client.post(f"{OLLAMA_BASE_URL}/api/chat", json=request_body)
        response.raise_for_status()
        return response.json()["message"]["content"]
async def _call_gemini(system_prompt: str, b64_images: list[str], api_key: str) -> str:
    """Call Gemini Vision API with multiple images and return raw text.

    Retries up to 3 times with exponential backoff (1s, 2s, 4s) when the
    API responds 429 (rate limited).

    Raises:
        httpx.HTTPStatusError: on a non-retryable HTTP error, or when all
            retry attempts are exhausted.
    """
    # Build parts: interleave images with labels, newest last
    parts: list[dict] = []
    total = len(b64_images)
    for i, b64 in enumerate(b64_images):
        parts.append({"text": f"[Screenshot {i + 1}/{total}]"})
        parts.append({"inlineData": {"mimeType": "image/jpeg", "data": b64}})
    parts.append({"text": "Analyze this screenshot sequence now."})
    payload = {
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "contents": [{"parts": parts}],
        "generationConfig": {
            "temperature": 0.2,
            "maxOutputTokens": 4096,
        },
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(3):
            resp = await client.post(f"{GEMINI_URL}?key={api_key}", json=payload)
            if resp.status_code == 429:
                wait = 2 ** attempt
                log.warning("Gemini 429 rate limited, retrying in %ds...", wait)
                await asyncio.sleep(wait)
                continue
            resp.raise_for_status()
            break
        else:
            # All attempts were rate limited; surface the last 429.
            resp.raise_for_status()
    body = resp.json()
    # Bug fix: concatenate ALL text parts. Previously each iteration
    # overwrote `text`, so only the last text part of a multi-part
    # response survived. (Renamed from `parts` to avoid shadowing the
    # request parts list above.)
    reply_parts = body["candidates"][0]["content"]["parts"]
    return "".join(part["text"] for part in reply_parts if "text" in part)
async def analyze_screenshot(
    screenshot_jpeg: bytes,
    ctx: TaskContext,
    history: HistoryBuffer,
    *,
    api_key: str | None = None,
    backend: str | None = None,
    session_context: str = "",
) -> VLMResult:
    """Analyze a screenshot sequence via Ollama or Gemini.

    Sends all buffered screenshots + the current one as images (oldest first).

    Raises:
        RuntimeError: if the Gemini backend is selected but no API key is set.
        ValueError: if the backend name is not "ollama" or "gemini".
    """
    chosen = backend or VLM_BACKEND
    prompt = build_system_prompt(ctx, history, session_context=session_context)
    # Encode the buffered frames first (oldest first), then the current frame.
    b64_images = [
        base64.b64encode(entry.jpeg).decode() for entry in history.get_entries()
    ]
    b64_images.append(base64.b64encode(screenshot_jpeg).decode())
    log.debug("Sending %d screenshots to %s", len(b64_images), chosen)
    if chosen == "ollama":
        text = await _call_ollama(prompt, b64_images)
    elif chosen == "gemini":
        key = api_key or GEMINI_API_KEY
        if not key:
            raise RuntimeError("GEMINI_API_KEY not set")
        text = await _call_gemini(prompt, b64_images, key)
    else:
        raise ValueError(f"Unknown VLM backend: {chosen}")
    log.debug("VLM raw response: %s", text)
    return _parse_vlm_response(_extract_json(text))