// HistoryBuffer.swift — Two-tier rolling history for VLM temporal context
// Ports Python argus buffer.py: image tier (recent frames) + text tier (older summaries).
// The VLM sees recent images directly AND gets text context for events 30-60s ago.

import Foundation

/// A single buffered screenshot frame with its VLM summary.
struct BufferEntry: Sendable {
    let imageData: Data   // JPEG bytes
    var summary: String   // VLM-generated summary (populated after analysis)
    let timestamp: Date
    var fileUri: String?  // Gemini Files API URI (set async after upload; nil = use inline)
}

/// A text-only summary from an older analysis (images already evicted).
struct TextEntry: Sendable {
    let summary: String
    let timestamp: Date
}

/// Two-tier rolling buffer that provides temporal context to the VLM.
///
/// - **Image tier:** Last N frames (JPEG + summary + timestamp). Sent as images.
/// - **Text tier:** Older summaries that rolled off the image buffer. Sent as text.
/// - **Last output:** Previous VLM JSON result for self-refinement.
/// - **Last execution:** Executor action summary to prevent re-flagging.
///
/// Only accessed from `SessionManager` on the main actor — no concurrent access.
@MainActor
final class HistoryBuffer {
    private let imageMaxLen: Int
    private let textMaxLen: Int
    /// Number of VLM iterations after which `lastExecution` auto-expires.
    private let executionTTL: Int

    /// Recent frames — sent as images to the VLM.
    private(set) var images: [BufferEntry] = []
    /// Older summaries — sent as text context.
    private(set) var textHistory: [TextEntry] = []
    /// Full VLM JSON output from last analysis (for self-refinement).
    private(set) var lastOutput: String = ""
    /// Summary of last executor action (prevents re-flagging same friction).
    private(set) var lastExecution: String = ""
    /// Counter for how many VLM calls since execution was set (cleared after `executionTTL`).
    private var executionAge: Int = 0

    /// Creates a buffer with the given tier capacities.
    /// - Parameters:
    ///   - imageMaxLen: Maximum frames retained in the image tier.
    ///   - textMaxLen: Maximum summaries retained in the text tier.
    ///   - executionTTL: VLM iterations before the execution note expires
    ///     (previously hard-coded to 3; default preserves that behavior).
    init(imageMaxLen: Int = 4, textMaxLen: Int = 12, executionTTL: Int = 3) {
        self.imageMaxLen = imageMaxLen
        self.textMaxLen = textMaxLen
        self.executionTTL = executionTTL
    }

    // MARK: - Push / Update

    /// Add a new frame to the image buffer. If the buffer is full, the oldest
    /// frame's summary is promoted to the text tier before eviction.
    func push(imageData: Data, summary: String = "") {
        let entry = BufferEntry(imageData: imageData, summary: summary, timestamp: Date())
        if images.count >= imageMaxLen {
            // Promote oldest image's summary to text tier (if non-empty)
            let evicted = images.removeFirst()
            if !evicted.summary.isEmpty {
                textHistory.append(TextEntry(summary: evicted.summary, timestamp: evicted.timestamp))
                if textHistory.count > textMaxLen {
                    textHistory.removeFirst()
                }
            }
        }
        images.append(entry)
    }

    /// Update the summary on the most recent image entry (called after VLM returns).
    func updateLastSummary(_ summary: String) {
        guard !images.isEmpty else { return }
        images[images.count - 1].summary = summary
    }

    /// Store the Gemini Files API URI for the frame with the given timestamp.
    /// Called asynchronously after upload completes — the 1s tolerance is safe
    /// because pushes happen at 5s intervals, so at most one frame can match.
    func updateFileUri(_ uri: String, forTimestamp ts: Date) {
        guard let idx = images.firstIndex(where: { abs($0.timestamp.timeIntervalSince(ts)) < 1.0 }) else {
            return
        }
        images[idx].fileUri = uri
    }

    /// Store the full VLM JSON output for self-refinement on the next call.
    func setLastOutput(_ json: String) {
        lastOutput = json
    }

    /// Store executor action summary. Cleared automatically after `executionTTL` VLM iterations.
    func setLastExecution(_ summary: String) {
        lastExecution = summary
        executionAge = 0
    }

    /// Tick execution age — call after each VLM analysis. Clears after `executionTTL` ticks.
    func tickExecutionAge() {
        if !lastExecution.isEmpty {
            executionAge += 1
            if executionAge >= executionTTL {
                lastExecution = ""
                executionAge = 0
            }
        }
    }

    /// Get all buffered JPEG frames (for sending to VLM as images).
    var frameData: [Data] {
        images.map(\.imageData)
    }

    /// File URIs parallel to frameData — nil means fall back to inline base64 for that frame.
    var fileUris: [String?] {
        images.map(\.fileUri)
    }

    /// Get recent summaries as strings (for recentSummaries parameter).
    var recentSummaries: [String] {
        images.compactMap { $0.summary.isEmpty ? nil : $0.summary }
    }

    /// Clear all state (e.g., on session end).
    func clear() {
        images.removeAll()
        textHistory.removeAll()
        lastOutput = ""
        lastExecution = ""
        executionAge = 0
    }

    // MARK: - Prompt Formatting

    /// Build the temporal context section for the VLM prompt.
    /// Returns a formatted string with older text context + image labels.
    func formatForPrompt() -> String {
        var lines: [String] = []
        // Single snapshot so every age in one prompt is relative to the same instant
        // (previously Date() was re-evaluated per entry, letting ages drift mid-prompt).
        let now = Date()

        // Older text-only context (no images — just summaries)
        if !textHistory.isEmpty {
            lines.append("Older context (text only, no images):")
            for entry in textHistory {
                let age = Int(now.timeIntervalSince(entry.timestamp))
                lines.append("  - [\(age)s ago] \(entry.summary)")
            }
            lines.append("")
        }

        // Recent image labels (these accompany the actual images sent to the VLM)
        if !images.isEmpty {
            let total = images.count
            lines.append("Recent screenshots (\(total) frames, newest last):")
            for (i, entry) in images.enumerated() {
                let age = Int(now.timeIntervalSince(entry.timestamp))
                let isCurrent = (i == images.count - 1)
                let label = "  - Screenshot \(i + 1)/\(total): [\(isCurrent ? "now" : "\(age)s ago")]"
                if !entry.summary.isEmpty {
                    lines.append("\(label) \(entry.summary)")
                } else {
                    lines.append(label)
                }
            }
        }

        return lines.isEmpty ? "(no previous context)" : lines.joined(separator: "\n")
    }

    /// Format the last VLM output for self-refinement injection into the prompt.
    func formatLastOutput() -> String {
        guard !lastOutput.isEmpty else { return "" }
        return """
        Your previous analysis (refine or correct this based on new evidence):
        \(lastOutput)
        If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it.
        """
    }

    /// Format execution context for injection into the prompt.
    func formatLastExecution() -> String {
        guard !lastExecution.isEmpty else { return "" }
        return """
        IMPORTANT — An AI agent just completed an action for the user:
        \(lastExecution)
        This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
        """
    }
}