// HistoryBuffer.swift — Two-tier rolling history for VLM temporal context
// Ports Python argus buffer.py: image tier (recent frames) + text tier (older summaries).
// The VLM sees recent images directly AND gets text context for events 30-60s ago.

import Foundation
/// A single buffered screenshot frame with its VLM summary.
///
/// `summary` and `fileUri` are `var` because both are filled in after the
/// frame is pushed: the summary arrives once the VLM responds, and the file
/// URI arrives once the async upload completes.
struct BufferEntry: Sendable {
    let imageData: Data // JPEG bytes
    var summary: String // VLM-generated summary (populated after analysis)
    let timestamp: Date
    var fileUri: String? // Gemini Files API URI (set async after upload; nil = use inline)
}
/// A text-only summary from an older analysis (images already evicted).
///
/// Created when a `BufferEntry` rolls off the image tier: its JPEG bytes are
/// dropped and only the summary + original capture time are retained.
struct TextEntry: Sendable {
    let summary: String
    let timestamp: Date
}
/// Two-tier rolling buffer that provides temporal context to the VLM.
///
/// - **Image tier:** Last N frames (JPEG + summary + timestamp). Sent as images.
/// - **Text tier:** Older summaries that rolled off the image buffer. Sent as text.
/// - **Last output:** Previous VLM JSON result for self-refinement.
/// - **Last execution:** Executor action summary to prevent re-flagging.
///
/// Only accessed from `SessionManager` on the main actor — no concurrent access.
@MainActor
final class HistoryBuffer {

    private let imageMaxLen: Int
    private let textMaxLen: Int

    /// Recent frames — sent as images to the VLM.
    private(set) var images: [BufferEntry] = []

    /// Older summaries — sent as text context.
    private(set) var textHistory: [TextEntry] = []

    /// Full VLM JSON output from last analysis (for self-refinement).
    private(set) var lastOutput: String = ""

    /// Summary of last executor action (prevents re-flagging same friction).
    private(set) var lastExecution: String = ""

    /// Counter for how many VLM calls since execution was set (clear after 3).
    private var executionAge: Int = 0

    /// - Parameters:
    ///   - imageMaxLen: Maximum frames kept in the image tier before the oldest is evicted.
    ///   - textMaxLen: Maximum summaries kept in the text tier before the oldest is dropped.
    init(imageMaxLen: Int = 4, textMaxLen: Int = 12) {
        self.imageMaxLen = imageMaxLen
        self.textMaxLen = textMaxLen
    }

    // MARK: - Push / Update

    /// Add a new frame to the image buffer. If the buffer is full, the oldest
    /// frame's summary is promoted to the text tier before eviction.
    /// Frames evicted with an empty summary are dropped silently.
    func push(imageData: Data, summary: String = "") {
        let entry = BufferEntry(imageData: imageData, summary: summary, timestamp: Date())

        if images.count >= imageMaxLen {
            // Promote oldest image's summary to text tier (if non-empty)
            let evicted = images.removeFirst()
            if !evicted.summary.isEmpty {
                textHistory.append(TextEntry(summary: evicted.summary, timestamp: evicted.timestamp))
                if textHistory.count > textMaxLen {
                    textHistory.removeFirst()
                }
            }
        }
        images.append(entry)
    }

    /// Update the summary on the most recent image entry (called after VLM returns).
    /// No-op when the buffer is empty.
    func updateLastSummary(_ summary: String) {
        guard !images.isEmpty else { return }
        images[images.count - 1].summary = summary
    }

    /// Store the Gemini Files API URI for the frame with the given timestamp.
    /// Called asynchronously after upload completes — safe because pushes happen at 5s intervals,
    /// so the 1-second matching tolerance cannot select the wrong frame.
    /// No-op when the frame has already been evicted.
    func updateFileUri(_ uri: String, forTimestamp ts: Date) {
        guard let idx = images.firstIndex(where: { abs($0.timestamp.timeIntervalSince(ts)) < 1.0 }) else { return }
        images[idx].fileUri = uri
    }

    /// Store the full VLM JSON output for self-refinement on the next call.
    func setLastOutput(_ json: String) {
        lastOutput = json
    }

    /// Store executor action summary. Cleared automatically after 3 VLM iterations.
    func setLastExecution(_ summary: String) {
        lastExecution = summary
        executionAge = 0
    }

    /// Tick execution age — call after each VLM analysis. Clears after 3.
    func tickExecutionAge() {
        if !lastExecution.isEmpty {
            executionAge += 1
            if executionAge >= 3 {
                lastExecution = ""
                executionAge = 0
            }
        }
    }

    /// Get all buffered JPEG frames (for sending to VLM as images).
    var frameData: [Data] {
        images.map(\.imageData)
    }

    /// File URIs parallel to frameData — nil means fall back to inline base64 for that frame.
    var fileUris: [String?] {
        images.map(\.fileUri)
    }

    /// Get recent summaries as strings (for recentSummaries parameter).
    /// Empty summaries (frames not yet analyzed) are omitted.
    var recentSummaries: [String] {
        images.compactMap { $0.summary.isEmpty ? nil : $0.summary }
    }

    /// Clear all state (e.g., on session end).
    func clear() {
        images.removeAll()
        textHistory.removeAll()
        lastOutput = ""
        lastExecution = ""
        executionAge = 0
    }

    // MARK: - Prompt Formatting

    /// Build the temporal context section for the VLM prompt.
    /// Returns a formatted string with older text context + image labels,
    /// or "(no previous context)" when both tiers are empty.
    func formatForPrompt() -> String {
        var lines: [String] = []
        lines.reserveCapacity(textHistory.count + images.count + 2)
        // Single reference instant so every "[Ns ago]" label in one prompt is
        // measured against the same time. (Previously Date() was re-read per
        // entry, letting ages drift within a single rendered prompt.)
        let now = Date()

        // Older text-only context (no images — just summaries)
        if !textHistory.isEmpty {
            lines.append("Older context (text only, no images):")
            for entry in textHistory {
                let age = Int(now.timeIntervalSince(entry.timestamp))
                lines.append("  - [\(age)s ago] \(entry.summary)")
            }
            lines.append("")
        }

        // Recent image labels (these accompany the actual images sent to the VLM)
        if !images.isEmpty {
            let total = images.count
            lines.append("Recent screenshots (\(total) frames, newest last):")
            for (i, entry) in images.enumerated() {
                let age = Int(now.timeIntervalSince(entry.timestamp))
                let isCurrent = (i == images.count - 1)
                let label = "  - Screenshot \(i + 1)/\(total): [\(isCurrent ? "now" : "\(age)s ago")]"
                if !entry.summary.isEmpty {
                    lines.append("\(label) \(entry.summary)")
                } else {
                    lines.append(label)
                }
            }
        }

        return lines.isEmpty ? "(no previous context)" : lines.joined(separator: "\n")
    }

    /// Format the last VLM output for self-refinement injection into the prompt.
    /// Returns "" when there is no previous output.
    func formatLastOutput() -> String {
        guard !lastOutput.isEmpty else { return "" }
        return """
        Your previous analysis (refine or correct this based on new evidence):
        \(lastOutput)
        If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it.
        """
    }

    /// Format execution context for injection into the prompt.
    /// Returns "" when no execution is pending.
    func formatLastExecution() -> String {
        guard !lastExecution.isEmpty else { return "" }
        return """
        IMPORTANT — An AI agent just completed an action for the user:
        \(lastExecution)
        This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
        """
    }
}