Files
LockInBroMacOS/HistoryBuffer.swift

190 lines
7.0 KiB
Swift
Raw Permalink Normal View History

2026-04-01 16:10:30 -05:00
// HistoryBuffer.swift — Two-tier rolling history for VLM temporal context
// Ports Python argus buffer.py: image tier (recent frames) + text tier (older summaries).
// The VLM sees recent images directly AND gets text context for events 30-60s ago.
import Foundation
/// One screenshot frame held in the image tier, together with the summary
/// the VLM produced for it.
struct BufferEntry: Sendable {
    /// JPEG-encoded bytes of the frame.
    let imageData: Data

    /// VLM-generated summary; filled in once analysis has completed.
    var summary: String

    /// Timestamp recorded for this frame.
    let timestamp: Date

    /// Gemini Files API URI, assigned asynchronously after upload.
    /// `nil` means the frame should be sent inline instead.
    var fileUri: String? = nil
}
/// A summary retained as text only, taken from an older analysis whose
/// image has already been evicted.
struct TextEntry: Sendable {
    /// The summary text carried over from the evicted frame.
    let summary: String

    /// Timestamp of the frame the summary describes.
    let timestamp: Date
}
/// Two-tier rolling buffer that provides temporal context to the VLM.
///
/// - **Image tier:** the most recent frames (JPEG + summary + timestamp). Sent as images.
/// - **Text tier:** summaries of frames that rolled off the image tier. Sent as text.
/// - **Last output:** the previous VLM JSON result, kept for self-refinement.
/// - **Last execution:** an executor action summary, kept to prevent re-flagging.
///
/// The type is isolated to the main actor via `@MainActor`, so all reads and
/// writes are serialized there.
@MainActor
final class HistoryBuffer {
    /// Number of `tickExecutionAge()` calls after which `lastExecution` is cleared.
    private static let executionMaxAge = 3

    /// Maximum distance, in seconds, between a frame's timestamp and the
    /// timestamp passed to `updateFileUri(_:forTimestamp:)` for them to match.
    private static let timestampTolerance: TimeInterval = 1.0

    /// Capacity of the image tier (always at least 1).
    private let imageMaxLen: Int

    /// Capacity of the text tier (never negative).
    private let textMaxLen: Int

    /// Recent frames sent as images to the VLM, oldest first.
    private(set) var images: [BufferEntry] = []

    /// Older summaries sent as text context, oldest first.
    private(set) var textHistory: [TextEntry] = []

    /// Full VLM JSON output from the last analysis (for self-refinement).
    private(set) var lastOutput: String = ""

    /// Summary of the last executor action (prevents re-flagging the same friction).
    private(set) var lastExecution: String = ""

    /// Number of VLM calls since `lastExecution` was set.
    private var executionAge: Int = 0

    /// Creates an empty buffer.
    ///
    /// - Parameters:
    ///   - imageMaxLen: Capacity of the image tier. Values below 1 are raised to 1.
    ///   - textMaxLen: Capacity of the text tier. Negative values are raised to 0.
    init(imageMaxLen: Int = 4, textMaxLen: Int = 12) {
        // A non-positive image capacity would make `push` evict from an empty
        // array, which traps; clamp so the buffer always holds the newest frame.
        self.imageMaxLen = max(1, imageMaxLen)
        self.textMaxLen = max(0, textMaxLen)
    }

    // MARK: - Push / Update

    /// Adds a new frame to the image tier, stamped with the current time.
    ///
    /// When the image tier is full, the oldest frame is evicted first and its
    /// summary (if non-empty) is promoted to the text tier. The text tier is
    /// then trimmed from the front to its capacity.
    ///
    /// - Parameters:
    ///   - imageData: JPEG bytes of the frame.
    ///   - summary: Summary for the frame; may be filled in later via
    ///     `updateLastSummary(_:)`.
    func push(imageData: Data, summary: String = "") {
        let entry = BufferEntry(imageData: imageData, summary: summary, timestamp: Date())

        // `imageMaxLen >= 1`, so this loop never runs on an empty array.
        while images.count >= imageMaxLen {
            let evicted = images.removeFirst()
            guard !evicted.summary.isEmpty else { continue }
            textHistory.append(TextEntry(summary: evicted.summary, timestamp: evicted.timestamp))
        }

        let overflow = textHistory.count - textMaxLen
        if overflow > 0 {
            textHistory.removeFirst(overflow)
        }

        images.append(entry)
    }

    /// Replaces the summary on the most recent image entry.
    /// Does nothing when the image tier is empty.
    func updateLastSummary(_ summary: String) {
        guard let lastIndex = images.indices.last else { return }
        images[lastIndex].summary = summary
    }

    /// Stores the Gemini Files API URI on the frame whose timestamp is closest
    /// to `ts`, provided it lies within one second of it.
    ///
    /// Choosing the closest frame (rather than the first one inside the
    /// tolerance) keeps the URI on the right frame even if two frames were
    /// pushed less than a second apart. Does nothing when no frame matches,
    /// for example because the frame has already been evicted.
    func updateFileUri(_ uri: String, forTimestamp ts: Date) {
        var best: (index: Int, distance: TimeInterval)?
        for (index, entry) in images.enumerated() {
            let distance = abs(entry.timestamp.timeIntervalSince(ts))
            guard distance < Self.timestampTolerance else { continue }
            // Keep the earlier frame on ties.
            if let current = best, current.distance <= distance { continue }
            best = (index: index, distance: distance)
        }
        guard let match = best else { return }
        images[match.index].fileUri = uri
    }

    /// Stores the full VLM JSON output for self-refinement on the next call.
    func setLastOutput(_ json: String) {
        lastOutput = json
    }

    /// Stores an executor action summary and restarts its age counter.
    /// The summary is cleared by `tickExecutionAge()` after three ticks.
    func setLastExecution(_ summary: String) {
        lastExecution = summary
        executionAge = 0
    }

    /// Advances the execution age by one; intended to be called after each VLM analysis.
    /// Once the age reaches three, `lastExecution` is cleared and the age reset.
    /// Does nothing while `lastExecution` is empty.
    func tickExecutionAge() {
        guard !lastExecution.isEmpty else { return }
        executionAge += 1
        if executionAge >= Self.executionMaxAge {
            lastExecution = ""
            executionAge = 0
        }
    }

    /// JPEG bytes of every buffered frame, oldest first (for sending to the VLM as images).
    var frameData: [Data] {
        images.map(\.imageData)
    }

    /// File URIs parallel to `frameData`; `nil` means fall back to inline data for that frame.
    var fileUris: [String?] {
        images.map(\.fileUri)
    }

    /// Non-empty summaries of the buffered frames, oldest first.
    var recentSummaries: [String] {
        images.map(\.summary).filter { !$0.isEmpty }
    }

    /// Clears all state (e.g., on session end).
    func clear() {
        images.removeAll()
        textHistory.removeAll()
        lastOutput = ""
        lastExecution = ""
        executionAge = 0
    }

    // MARK: - Prompt Formatting

    /// Whole seconds elapsed between `timestamp` and `now`, never negative.
    private static func secondsElapsed(since timestamp: Date, now: Date) -> Int {
        // Clamp so a wall-clock adjustment cannot produce "-3s ago".
        max(0, Int(now.timeIntervalSince(timestamp)))
    }

    /// Builds the temporal context section for the VLM prompt.
    ///
    /// - Returns: The older text-only summaries followed by one label per
    ///   buffered image, or `"(no previous context)"` when both tiers are empty.
    func formatForPrompt() -> String {
        // Take "now" once so every age in the prompt shares the same reference.
        let now = Date()
        var lines: [String] = []

        // Older text-only context (no images, just summaries).
        if !textHistory.isEmpty {
            lines.append("Older context (text only, no images):")
            for entry in textHistory {
                let age = Self.secondsElapsed(since: entry.timestamp, now: now)
                lines.append(" - [\(age)s ago] \(entry.summary)")
            }
            lines.append("")
        }

        // Recent image labels (these accompany the actual images sent to the VLM).
        if !images.isEmpty {
            let total = images.count
            lines.append("Recent screenshots (\(total) frames, newest last):")
            for (i, entry) in images.enumerated() {
                let age = Self.secondsElapsed(since: entry.timestamp, now: now)
                // The newest frame is labelled "now" regardless of its age.
                let when = (i == total - 1) ? "now" : "\(age)s ago"
                let label = " - Screenshot \(i + 1)/\(total): [\(when)]"
                lines.append(entry.summary.isEmpty ? label : "\(label) \(entry.summary)")
            }
        }

        return lines.isEmpty ? "(no previous context)" : lines.joined(separator: "\n")
    }

    /// Formats the last VLM output for self-refinement injection into the prompt.
    ///
    /// - Returns: An empty string when no previous output is stored.
    func formatLastOutput() -> String {
        guard !lastOutput.isEmpty else { return "" }
        return """
        Your previous analysis (refine or correct this based on new evidence):
        \(lastOutput)
        If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it.
        """
    }

    /// Formats the execution context for injection into the prompt.
    ///
    /// - Returns: An empty string when no executor action is stored.
    func formatLastExecution() -> String {
        guard !lastExecution.isEmpty else { return "" }
        return """
        IMPORTANT An AI agent just completed an action for the user:
        \(lastExecution)
        This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
        """
    }
}