HistoryBuffer.swift

// HistoryBuffer.swift — Two-tier rolling history for VLM temporal context
// Ports Python argus buffer.py: image tier (recent frames) + text tier (older summaries).
// The VLM sees recent images directly AND gets text context for events 30-60s ago.

import Foundation

/// A single buffered screenshot frame with its VLM summary.
///
/// `imageData` and `timestamp` are fixed when the entry is created; `summary` and
/// `fileUri` are mutable because `HistoryBuffer` fills them in later
/// (`updateLastSummary(_:)` and `updateFileUri(_:forTimestamp:)`).
struct BufferEntry: Sendable {
    /// Encoded frame contents (JPEG bytes).
    let imageData: Data      // JPEG bytes
    /// Summary text for this frame; may be empty until analysis has produced one.
    var summary: String      // VLM-generated summary (populated after analysis)
    /// Time the entry was created (`HistoryBuffer.push` stamps it with `Date()`).
    let timestamp: Date
    /// Remote file URI for the frame, or `nil` when the frame should be sent inline.
    var fileUri: String?     // Gemini Files API URI (set async after upload; nil = use inline)
}

/// A text-only summary from an older analysis (images already evicted).
///
/// `HistoryBuffer.push` creates one of these from a `BufferEntry` that is being
/// evicted from the image tier, copying its `summary` and `timestamp`.
struct TextEntry: Sendable {
    /// Summary text carried over from the evicted frame.
    let summary: String
    /// Timestamp of the evicted frame this summary described.
    let timestamp: Date
}

/// Two-tier rolling buffer that provides temporal context to the VLM.
///
/// - **Image tier:** Last N frames (JPEG + summary + timestamp). Sent as images.
/// - **Text tier:** Older summaries that rolled off the image buffer. Sent as text.
/// - **Last output:** Previous VLM JSON result for self-refinement.
/// - **Last execution:** Executor action summary to prevent re-flagging.
///
/// Only accessed from `SessionManager` on the main actor — no concurrent access.
@MainActor
final class HistoryBuffer {

    /// Maximum distance, in seconds, between a requested timestamp and a frame's
    /// timestamp for `updateFileUri(_:forTimestamp:)` to treat them as the same frame.
    private static let fileUriMatchTolerance: TimeInterval = 1.0

    /// Number of `tickExecutionAge()` calls after which `lastExecution` is cleared.
    private static let executionMaxAge = 3

    /// Capacity of the image tier (always >= 1, see `init`).
    private let imageMaxLen: Int

    /// Capacity of the text tier (always >= 0, see `init`).
    private let textMaxLen: Int

    /// Recent frames — sent as images to the VLM.
    private(set) var images: [BufferEntry] = []

    /// Older summaries — sent as text context.
    private(set) var textHistory: [TextEntry] = []

    /// Full VLM JSON output from last analysis (for self-refinement).
    private(set) var lastOutput: String = ""

    /// Summary of last executor action (prevents re-flagging same friction).
    private(set) var lastExecution: String = ""

    /// Counter for how many VLM calls since execution was set (clear after 3).
    private var executionAge: Int = 0

    /// Creates an empty buffer.
    ///
    /// - Parameters:
    ///   - imageMaxLen: Maximum number of frames kept in the image tier. Values below 1
    ///     are raised to 1; otherwise the first `push` would try to evict from an empty
    ///     array and trap.
    ///   - textMaxLen: Maximum number of summaries kept in the text tier. Negative values
    ///     are treated as 0 (nothing is retained).
    init(imageMaxLen: Int = 4, textMaxLen: Int = 12) {
        self.imageMaxLen = max(1, imageMaxLen)
        self.textMaxLen = max(0, textMaxLen)
    }

    // MARK: - Push / Update

    /// Add a new frame to the image buffer. If the buffer is full, the oldest
    /// frame's summary is promoted to the text tier before eviction.
    ///
    /// - Parameters:
    ///   - imageData: JPEG bytes of the frame.
    ///   - summary: Summary for the frame, if already known. Defaults to empty.
    func push(imageData: Data, summary: String = "") {
        let entry = BufferEntry(imageData: imageData, summary: summary, timestamp: Date())

        // `imageMaxLen` is clamped to >= 1 in `init`, so a "full" buffer is never
        // empty and `removeFirst()` cannot trap here.
        if images.count >= imageMaxLen {
            let evicted = images.removeFirst()
            // Frames that never received a summary carry no text worth keeping.
            if !evicted.summary.isEmpty {
                textHistory.append(TextEntry(summary: evicted.summary, timestamp: evicted.timestamp))
                if textHistory.count > textMaxLen {
                    textHistory.removeFirst()
                }
            }
        }
        images.append(entry)
    }

    /// Update the summary on the most recent image entry (called after VLM returns).
    /// Does nothing when the image tier is empty.
    func updateLastSummary(_ summary: String) {
        guard let lastIndex = images.indices.last else { return }
        images[lastIndex].summary = summary
    }

    /// Store the Gemini Files API URI for the frame with the given timestamp.
    ///
    /// The frame whose timestamp is closest to `ts` is updated, provided it lies within
    /// `fileUriMatchTolerance` seconds; ties go to the older frame. If no frame is that
    /// close (e.g. it was already evicted) the call is a no-op.
    ///
    /// - Parameters:
    ///   - uri: File URI to attach to the frame.
    ///   - ts: Timestamp identifying the frame the upload belongs to.
    func updateFileUri(_ uri: String, forTimestamp ts: Date) {
        let nearest = images.indices
            .map { (index: $0, distance: abs(images[$0].timestamp.timeIntervalSince(ts))) }
            .filter { $0.distance < Self.fileUriMatchTolerance }
            .min { $0.distance < $1.distance }

        guard let match = nearest else { return }
        images[match.index].fileUri = uri
    }

    /// Store the full VLM JSON output for self-refinement on the next call.
    func setLastOutput(_ json: String) {
        lastOutput = json
    }

    /// Store executor action summary. Cleared automatically after 3 VLM iterations.
    func setLastExecution(_ summary: String) {
        lastExecution = summary
        executionAge = 0
    }

    /// Tick execution age — call after each VLM analysis. Clears after 3.
    /// Has no effect while no execution summary is stored.
    func tickExecutionAge() {
        guard !lastExecution.isEmpty else { return }

        executionAge += 1
        if executionAge >= Self.executionMaxAge {
            lastExecution = ""
            executionAge = 0
        }
    }

    /// Get all buffered JPEG frames (for sending to VLM as images).
    var frameData: [Data] {
        images.map(\.imageData)
    }

    /// File URIs parallel to frameData — nil means fall back to inline base64 for that frame.
    var fileUris: [String?] {
        images.map(\.fileUri)
    }

    /// Get recent summaries as strings (for recentSummaries parameter).
    /// Frames without a summary are skipped.
    var recentSummaries: [String] {
        images.map(\.summary).filter { !$0.isEmpty }
    }

    /// Clear all state (e.g., on session end).
    func clear() {
        images.removeAll()
        textHistory.removeAll()
        lastOutput = ""
        lastExecution = ""
        executionAge = 0
    }

    // MARK: - Prompt Formatting

    /// Build the temporal context section for the VLM prompt.
    ///
    /// - Returns: A formatted string with older text context followed by image labels,
    ///   or `"(no previous context)"` when both tiers are empty.
    func formatForPrompt() -> String {
        // One clock reading for the whole prompt so every age is measured against the
        // same instant.
        let now = Date()
        var lines: [String] = []

        // Older text-only context (no images — just summaries)
        if !textHistory.isEmpty {
            lines.append("Older context (text only, no images):")
            for entry in textHistory {
                let age = Self.ageInSeconds(of: entry.timestamp, relativeTo: now)
                lines.append("  - [\(age)s ago] \(entry.summary)")
            }
            lines.append("")
        }

        // Recent image labels (these accompany the actual images sent to the VLM)
        if !images.isEmpty {
            let total = images.count
            lines.append("Recent screenshots (\(total) frames, newest last):")
            for (i, entry) in images.enumerated() {
                let age = Self.ageInSeconds(of: entry.timestamp, relativeTo: now)
                let isCurrent = (i == total - 1)
                let label = "  - Screenshot \(i + 1)/\(total): [\(isCurrent ? "now" : "\(age)s ago")]"
                if !entry.summary.isEmpty {
                    lines.append("\(label) \(entry.summary)")
                } else {
                    lines.append(label)
                }
            }
        }

        return lines.isEmpty ? "(no previous context)" : lines.joined(separator: "\n")
    }

    /// Format the last VLM output for self-refinement injection into the prompt.
    ///
    /// - Returns: The refinement instructions wrapping `lastOutput`, or an empty string
    ///   when no previous output is stored.
    func formatLastOutput() -> String {
        guard !lastOutput.isEmpty else { return "" }
        return """
        Your previous analysis (refine or correct this based on new evidence):
        \(lastOutput)
        If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it.
        """
    }

    /// Format execution context for injection into the prompt.
    ///
    /// - Returns: The "task is done" notice wrapping `lastExecution`, or an empty string
    ///   when no execution summary is stored.
    func formatLastExecution() -> String {
        guard !lastExecution.isEmpty else { return "" }
        return """
        IMPORTANT — An AI agent just completed an action for the user:
          \(lastExecution)
        This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.
        """
    }

    // MARK: - Helpers

    /// Whole seconds elapsed between `timestamp` and `now`, never negative
    /// (a timestamp that is ahead of `now` reports 0 rather than a negative age).
    private static func ageInSeconds(of timestamp: Date, relativeTo now: Date) -> Int {
        max(0, Int(now.timeIntervalSince(timestamp)))
    }
}
new macOS version 2026-04-01 16:10:30 -05:00			`// HistoryBuffer.swift — Two-tier rolling history for VLM temporal context`
			`// Ports Python argus buffer.py: image tier (recent frames) + text tier (older summaries).`
			`// The VLM sees recent images directly AND gets text context for events 30-60s ago.`

			`import Foundation`

			`/// A single buffered screenshot frame with its VLM summary.`
			`struct BufferEntry: Sendable {`
			`let imageData: Data // JPEG bytes`
			`var summary: String // VLM-generated summary (populated after analysis)`
			`let timestamp: Date`
			`var fileUri: String? // Gemini Files API URI (set async after upload; nil = use inline)`
			`}`

			`/// A text-only summary from an older analysis (images already evicted).`
			`struct TextEntry: Sendable {`
			`let summary: String`
			`let timestamp: Date`
			`}`

			`/// Two-tier rolling buffer that provides temporal context to the VLM.`
			`///`
			`/// - Image tier: Last N frames (JPEG + summary + timestamp). Sent as images.`
			`/// - Text tier: Older summaries that rolled off the image buffer. Sent as text.`
			`/// - Last output: Previous VLM JSON result for self-refinement.`
			`/// - Last execution: Executor action summary to prevent re-flagging.`
			`///`
			``/// Only accessed from `SessionManager` on the main actor — no concurrent access.``
			`@MainActor`
			`final class HistoryBuffer {`

			`private let imageMaxLen: Int`
			`private let textMaxLen: Int`

			`/// Recent frames — sent as images to the VLM.`
			`private(set) var images: [BufferEntry] = []`

			`/// Older summaries — sent as text context.`
			`private(set) var textHistory: [TextEntry] = []`

			`/// Full VLM JSON output from last analysis (for self-refinement).`
			`private(set) var lastOutput: String = ""`

			`/// Summary of last executor action (prevents re-flagging same friction).`
			`private(set) var lastExecution: String = ""`

			`/// Counter for how many VLM calls since execution was set (clear after 3).`
			`private var executionAge: Int = 0`

			`init(imageMaxLen: Int = 4, textMaxLen: Int = 12) {`
			`self.imageMaxLen = imageMaxLen`
			`self.textMaxLen = textMaxLen`
			`}`

			`// MARK: - Push / Update`

			`/// Add a new frame to the image buffer. If the buffer is full, the oldest`
			`/// frame's summary is promoted to the text tier before eviction.`
			`func push(imageData: Data, summary: String = "") {`
			`let entry = BufferEntry(imageData: imageData, summary: summary, timestamp: Date())`

			`if images.count >= imageMaxLen {`
			`// Promote oldest image's summary to text tier (if non-empty)`
			`let evicted = images.removeFirst()`
			`if !evicted.summary.isEmpty {`
			`textHistory.append(TextEntry(summary: evicted.summary, timestamp: evicted.timestamp))`
			`if textHistory.count > textMaxLen {`
			`textHistory.removeFirst()`
			`}`
			`}`
			`}`
			`images.append(entry)`
			`}`

			`/// Update the summary on the most recent image entry (called after VLM returns).`
			`func updateLastSummary(_ summary: String) {`
			`guard !images.isEmpty else { return }`
			`images[images.count - 1].summary = summary`
			`}`

			`/// Store the Gemini Files API URI for the frame with the given timestamp.`
			`/// Called asynchronously after upload completes — safe because pushes happen at 5s intervals.`
			`func updateFileUri(_ uri: String, forTimestamp ts: Date) {`
			`guard let idx = images.firstIndex(where: { abs($0.timestamp.timeIntervalSince(ts)) < 1.0 }) else { return }`
			`images[idx].fileUri = uri`
			`}`

			`/// Store the full VLM JSON output for self-refinement on the next call.`
			`func setLastOutput(_ json: String) {`
			`lastOutput = json`
			`}`

			`/// Store executor action summary. Cleared automatically after 3 VLM iterations.`
			`func setLastExecution(_ summary: String) {`
			`lastExecution = summary`
			`executionAge = 0`
			`}`

			`/// Tick execution age — call after each VLM analysis. Clears after 3.`
			`func tickExecutionAge() {`
			`if !lastExecution.isEmpty {`
			`executionAge += 1`
			`if executionAge >= 3 {`
			`lastExecution = ""`
			`executionAge = 0`
			`}`
			`}`
			`}`

			`/// Get all buffered JPEG frames (for sending to VLM as images).`
			`var frameData: [Data] {`
			`images.map(\.imageData)`
			`}`

			`/// File URIs parallel to frameData — nil means fall back to inline base64 for that frame.`
			`var fileUris: [String?] {`
			`images.map(\.fileUri)`
			`}`

			`/// Get recent summaries as strings (for recentSummaries parameter).`
			`var recentSummaries: [String] {`
			`images.compactMap { $0.summary.isEmpty ? nil : $0.summary }`
			`}`

			`/// Clear all state (e.g., on session end).`
			`func clear() {`
			`images.removeAll()`
			`textHistory.removeAll()`
			`lastOutput = ""`
			`lastExecution = ""`
			`executionAge = 0`
			`}`

			`// MARK: - Prompt Formatting`

			`/// Build the temporal context section for the VLM prompt.`
			`/// Returns a formatted string with older text context + image labels.`
			`func formatForPrompt() -> String {`
			`var lines: [String] = []`

			`// Older text-only context (no images — just summaries)`
			`if !textHistory.isEmpty {`
			`lines.append("Older context (text only, no images):")`
			`for entry in textHistory {`
			`let age = Int(Date().timeIntervalSince(entry.timestamp))`
			`lines.append(" - [\(age)s ago] \(entry.summary)")`
			`}`
			`lines.append("")`
			`}`

			`// Recent image labels (these accompany the actual images sent to the VLM)`
			`if !images.isEmpty {`
			`let total = images.count`
			`lines.append("Recent screenshots (\(total) frames, newest last):")`
			`for (i, entry) in images.enumerated() {`
			`let age = Int(Date().timeIntervalSince(entry.timestamp))`
			`let isCurrent = (i == images.count - 1)`
			`let label = " - Screenshot \(i + 1)/\(total): [\(isCurrent ? "now" : "\(age)s ago")]"`
			`if !entry.summary.isEmpty {`
			`lines.append("\(label) \(entry.summary)")`
			`} else {`
			`lines.append(label)`
			`}`
			`}`
			`}`

			`return lines.isEmpty ? "(no previous context)" : lines.joined(separator: "\n")`
			`}`

			`/// Format the last VLM output for self-refinement injection into the prompt.`
			`func formatLastOutput() -> String {`
			`guard !lastOutput.isEmpty else { return "" }`
			`return """`
			`Your previous analysis (refine or correct this based on new evidence):`
			`\(lastOutput)`
			`If your previous analysis was wrong or incomplete, correct it now. If it was accurate, build on it.`
			`"""`
			`}`

			`/// Format execution context for injection into the prompt.`
			`func formatLastExecution() -> String {`
			`guard !lastExecution.isEmpty else { return "" }`
			`return """`
			`IMPORTANT — An AI agent just completed an action for the user:`
			`\(lastExecution)`
			`This task is DONE. Do not re-flag the same friction. Look for what the user does NEXT.`
			`"""`
			`}`
			`}`