// HistoryBuffer.swift — Two-tier rolling history for VLM temporal context
// Ports Python argus buffer.py: image tier (recent frames) + text tier (older summaries).
// The VLM sees recent images directly AND gets text context for events 30-60s ago.

import Foundation
/// A single buffered screenshot frame with its VLM summary.
///
/// `summary` and `fileUri` are `var` because both are filled in after the
/// frame is pushed: the summary arrives once the VLM responds, and the file
/// URI arrives once the async upload completes.
struct BufferEntry: Sendable {
    let imageData: Data // JPEG bytes
    var summary: String // VLM-generated summary (populated after analysis)
    let timestamp: Date
    var fileUri: String? // Gemini Files API URI (set async after upload; nil = use inline)
}
/// A text-only summary from an older analysis (images already evicted).
///
/// Created when a `BufferEntry` rolls off the image tier: its JPEG bytes are
/// dropped and only the summary + original capture time are retained.
struct TextEntry: Sendable {
    let summary: String
    let timestamp: Date
}
/// Two-tier rolling buffer that provides temporal context to the VLM.
///
/// - **Image tier:** Last N frames (JPEG + summary + timestamp). Sent as images.
/// - **Text tier:** Older summaries that rolled off the image buffer. Sent as text.
/// - **Last output:** Previous VLM JSON result for self-refinement.
/// - **Last execution:** Executor action summary to prevent re-flagging.
///
/// Only accessed from `SessionManager` on the main actor — no concurrent access.
@MainActor
final class HistoryBuffer {

    private let imageMaxLen: Int
    private let textMaxLen: Int

    /// Recent frames — sent as images to the VLM.
    private(set) var images: [BufferEntry] = []

    /// Older summaries — sent as text context.
    private(set) var textHistory: [TextEntry] = []

    /// Full VLM JSON output from last analysis (for self-refinement).
    private(set) var lastOutput: String = ""

    /// Summary of last executor action (prevents re-flagging same friction).
    private(set) var lastExecution: String = ""

    /// Counter for how many VLM calls since execution was set (clear after 3).
    private var executionAge: Int = 0

    /// - Parameters:
    ///   - imageMaxLen: Maximum frames kept in the image tier before the oldest is evicted.
    ///   - textMaxLen: Maximum summaries kept in the text tier before the oldest is dropped.
    init(imageMaxLen: Int = 4, textMaxLen: Int = 12) {
        self.imageMaxLen = imageMaxLen
        self.textMaxLen = textMaxLen
    }

    // MARK: - Push / Update

    /// Add a new frame to the image buffer. If the buffer is full, the oldest
    /// frame's summary is promoted to the text tier before eviction.
    /// Frames evicted with an empty summary are dropped silently.
    func push(imageData: Data, summary: String = "") {
        let entry = BufferEntry(imageData: imageData, summary: summary, timestamp: Date())

        if images.count >= imageMaxLen {
            // Promote oldest image's summary to text tier (if non-empty)
            let evicted = images.removeFirst()
            if !evicted.summary.isEmpty {
                textHistory.append(TextEntry(summary: evicted.summary, timestamp: evicted.timestamp))
                if textHistory.count > textMaxLen {
                    textHistory.removeFirst()
                }
            }
        }
        images.append(entry)
    }

    /// Update the summary on the most recent image entry (called after VLM returns).
    /// No-op when the buffer is empty.
    func updateLastSummary(_ summary: String) {
        guard !images.isEmpty else { return }
        images[images.count - 1].summary = summary
    }

    /// Store the Gemini Files API URI for the frame with the given timestamp.
    /// Called asynchronously after upload completes — safe because pushes happen at 5s intervals,
    /// so the 1-second matching tolerance cannot select the wrong frame.
    /// No-op when the frame has already been evicted.
    func updateFileUri(_ uri: String, forTimestamp ts: Date) {
        guard let idx = images.firstIndex(where: { abs($0.timestamp.timeIntervalSince(ts)) < 1.0 }) else { return }
        images[idx].fileUri = uri
    }

    /// Store the full VLM JSON output for self-refinement on the next call.
    func setLastOutput(_ json: String) {
        lastOutput = json
    }

    /// Store executor action summary. Cleared automatically after 3 VLM iterations.
    func setLastExecution(_ summary: String) {
        lastExecution = summary
        executionAge = 0
    }

    /// Tick execution age — call after each VLM analysis. Clears after 3.
    func tickExecutionAge() {
        if !lastExecution.isEmpty {
            executionAge += 1
            if executionAge >= 3 {
                lastExecution = ""
                executionAge = 0
            }
        }
    }

    /// Get all buffered JPEG frames (for sending to VLM as images).
    var frameData: [Data] {
        images.map(\.imageData)
    }

    /// File URIs parallel to frameData — nil means fall back to inline base64 for that frame.
    var fileUris: [String?] {
        images.map(\.fileUri)
    }

    /// Get recent summaries as strings (for recentSummaries parameter).
    /// Empty summaries (frames not yet analyzed) are omitted.
    var recentSummaries: [String] {
        images.compactMap { $0.summary.isEmpty ? nil : $0.summary }
    }

    /// Clear all state (e.g., on session end).
    func clear() {
        images.removeAll()
        textHistory.removeAll()
        lastOutput = ""
        lastExecution = ""
        executionAge = 0
    }

    // MARK: - Prompt Formatting

    /// Build the temporal context section for the VLM prompt.
    /// Returns a formatted string with older text context + image labels,
    /// or "(no previous context)" when both tiers are empty.
    func formatForPrompt() -> String {
        var lines: [String] = []
        lines.reserveCapacity(textHistory.count + images.count + 2)
        // Single reference instant so every "[Ns ago]" label in one prompt is
        // measured against the same time. (Previously Date() was re-read per
        // entry, letting ages drift within a single rendered prompt.)
        let now = Date()

        // Older text-only context (no images — just summaries)
        if !textHistory.isEmpty {
            lines.append("Older context (text only, no images):")
            for entry in textHistory {
                let age = Int(now.timeIntervalSince(entry.timestamp))
                lines.append("  - [\(age)s ago] \(entry.summary)")
            }
            lines.append("")
        }

        // Recent image labels (these accompany the actual images sent to the VLM)
        if !images.isEmpty {
            let total = images.count
            lines.append("Recent screenshots (\(total) frames, newest last):")
            for (i, entry) in images.enumerated() {
                let age = Int(now.timeIntervalSince(entry.timestamp))
                let isCurrent = (i == images.count - 1)
                let label = "  - Screenshot \(i + 1)/\(total): [\(isCurrent ? "now" : "\(age)s ago")]"
                if !entry.summary.isEmpty {
                    lines.append("\(label) \(entry.summary)")
                } else {
                    lines.append(label)
                }
            }
        }

        return lines.isEmpty ? "(no previous context)" : lines.joined(separator: "\n")
    }

    /// Format the last VLM output for self-refinement injection into the prompt.
    /// Returns "" when there is no previous output.
    func formatLastOutput() -> String {
        guard !lastOutput.isEmpty else { return "" }
        return """
        Your previous analysis (refine or correct this based on new evidence):
        \(lastOutput)
        If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it.
        """
    }

    /// Format execution context for injection into the prompt.
    /// Returns "" when no execution is pending.
    func formatLastExecution() -> String {
        guard !lastExecution.isEmpty else { return "" }
        return """
        IMPORTANT — An AI agent just completed an action for the user:
        \(lastExecution)
        This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
        """
    }
}