2026-04-01 16:10:30 -05:00
// G e m i n i V L M C l i e n t . s w i f t — N a t i v e S w i f t G e m i n i V i s i o n A P I c l i e n t + A g e n t i c E x e c u t o r
// P o r t s t h e P y t h o n a r g u s V L M a n a l y s i s ( v l m . p y ) a n d e x e c u t o r ( e x e c u t o r . p y ) i n t o S w i f t .
2026-03-29 06:29:18 -04:00
// N o s u b p r o c e s s r e q u i r e d : s c r e e n s h o t s g o s t r a i g h t f r o m S c r e e n C a p t u r e K i t → G e m i n i → U I .
import Foundation
struct GeminiVLMClient {
private static let apiBase = " https://generativelanguage.googleapis.com/v1beta/models "
2026-04-01 16:10:30 -05:00
private static let analysisModel = " gemini-3-flash-preview "
private static let executorModel = " gemini-3-flash-preview "
2026-03-29 06:29:18 -04:00
let apiKey : String
2026-04-01 16:10:30 -05:00
// MARK: - F i l e s A P I U p l o a d
// / U p l o a d a s i n g l e J P E G f r a m e t o t h e G e m i n i F i l e s A P I .
// / R e t u r n s t h e f i l e U R I w h i c h c a n b e r e u s e d i n s u b s e q u e n t V L M r e q u e s t s ,
// / a v o i d i n g r e d u n d a n t b a s e 6 4 r e - e n c o d i n g o f f r a m e s a l r e a d y s e e n b y t h e m o d e l .
func uploadFrame ( _ data : Data ) async throws -> String {
let urlStr = " https://generativelanguage.googleapis.com/upload/v1beta/files?uploadType=multipart&key= \( apiKey ) "
guard let url = URL ( string : urlStr ) else { throw URLError ( . badURL ) }
let boundary = " frameboundary- \( UUID ( ) . uuidString . prefix ( 16 ) ) "
var body = Data ( )
let meta = " { \" file \" :{ \" display_name \" : \" frame \" }} "
body . append ( " -- \( boundary ) \r \n Content-Type: application/json; charset=UTF-8 \r \n \r \n \( meta ) \r \n " . data ( using : . utf8 ) ! )
body . append ( " -- \( boundary ) \r \n Content-Type: image/jpeg \r \n \r \n " . data ( using : . utf8 ) ! )
body . append ( data )
body . append ( " \r \n -- \( boundary ) -- \r \n " . data ( using : . utf8 ) ! )
var request = URLRequest ( url : url )
request . httpMethod = " POST "
request . setValue ( " multipart/related; boundary= \( boundary ) " , forHTTPHeaderField : " Content-Type " )
request . httpBody = body
request . timeoutInterval = 30
let ( responseData , response ) = try await URLSession . shared . data ( for : request )
if let http = response as ? HTTPURLResponse , http . statusCode != 200 {
let msg = String ( data : responseData , encoding : . utf8 ) ? ? " HTTP \( http . statusCode ) "
print ( " [GeminiFiles] Upload failed \( http . statusCode ) : \( msg . prefix ( 200 ) ) " )
throw URLError ( . badServerResponse )
}
guard let json = try JSONSerialization . jsonObject ( with : responseData ) as ? [ String : Any ] ,
let file = json [ " file " ] as ? [ String : Any ] ,
let uri = file [ " uri " ] as ? String
else {
let raw = String ( data : responseData , encoding : . utf8 ) ? ? " "
print ( " [GeminiFiles] Unexpected upload response: \( raw . prefix ( 200 ) ) " )
throw URLError ( . cannotParseResponse )
}
print ( " [GeminiFiles] Uploaded \( data . count / 1024 ) KB → \( uri . suffix ( 20 ) ) " )
return uri
}
// MARK: - V L M A n a l y s i s
2026-03-29 06:29:18 -04:00
// / A n a l y z e a s e q u e n c e o f J P E G f r a m e s a n d r e t u r n a s t r u c t u r e d d i s t r a c t i o n a n a l y s i s .
2026-04-01 16:10:30 -05:00
// / P a s s ` f i l e U r i s ` ( p a r a l l e l t o ` f r a m e s ` ) t o u s e G e m i n i F i l e s A P I U R I s f o r f r a m e s t h a t
// / w e r e a l r e a d y u p l o a d e d — a v o i d s r e - s e n d i n g b a s e 6 4 f o r t h e 3 f r a m e s c a r r i e d o v e r f r o m
// / t h e p r e v i o u s r o l l i n g - w i n d o w c a l l . N i l e n t r i e s f a l l b a c k t o i n l i n e b a s e 6 4 .
2026-03-29 06:29:18 -04:00
func analyze (
frames : [ Data ] ,
2026-04-01 16:10:30 -05:00
fileUris : [ String ? ] = [ ] ,
2026-03-29 06:29:18 -04:00
taskTitle : String ,
taskGoal : String ,
steps : [ Step ] ,
windowTitle : String ,
2026-04-01 16:10:30 -05:00
historyContext : String ,
sessionContext : String ,
lastOutputContext : String ,
executionContext : String
2026-03-29 06:29:18 -04:00
) async throws -> DistractionAnalysisResponse {
let prompt = buildPrompt (
taskTitle : taskTitle ,
taskGoal : taskGoal ,
steps : steps ,
windowTitle : windowTitle ,
2026-04-01 16:10:30 -05:00
historyContext : historyContext ,
sessionContext : sessionContext ,
lastOutputContext : lastOutputContext ,
executionContext : executionContext
2026-03-29 06:29:18 -04:00
)
2026-04-01 16:10:30 -05:00
let raw = try await callGemini ( prompt : prompt , frames : frames , fileUris : fileUris , maxOutputTokens : 1024 )
2026-03-29 06:29:18 -04:00
return try parseResponse ( raw )
}
// MARK: - P r o m p t B u i l d e r ( p o r t e d f r o m v l m . p y b u i l d _ s y s t e m _ p r o m p t )
private func buildPrompt (
taskTitle : String ,
taskGoal : String ,
steps : [ Step ] ,
windowTitle : String ,
2026-04-01 16:10:30 -05:00
historyContext : String ,
sessionContext : String ,
lastOutputContext : String ,
executionContext : String
2026-03-29 06:29:18 -04:00
) -> String {
let stepsText : String
if steps . isEmpty {
2026-04-01 16:10:30 -05:00
stepsText = " (no steps) "
2026-03-29 06:29:18 -04:00
} else {
stepsText = steps . map { s in
let marker : String
switch s . status {
case " pending " : marker = " ○ "
case " in_progress " : marker = " ► "
case " done " : marker = " ✓ "
default : marker = " ? "
}
var line = " \( marker ) [ \( s . status ) ] (id= \( s . id ) ) \( s . sortOrder ) . \( s . title ) "
if let note = s . checkpointNote { line += " — checkpoint: \( note ) " }
return line
} . joined ( separator : " \n " )
}
2026-04-01 16:10:30 -05:00
let sessionSection = sessionContext . isEmpty
? " (no open sessions — suggest start_new if user is actively working on something) "
: sessionContext
let prevSection = lastOutputContext . isEmpty ? " " : " \n \( lastOutputContext ) "
let execSection = executionContext . isEmpty ? " " : " \n \( executionContext ) "
2026-03-29 06:29:18 -04:00
return " " "
You are a proactive focus assistant analyzing a TIME SEQUENCE of screenshots .
# # How to read the screenshots
You receive screenshots in chronological order ( oldest first , newest last ) .
2026-04-01 16:10:30 -05:00
You receive ~ 4 frames spanning ~ 20 seconds ( one frame every 5 seconds ) . This means :
- 2 unchanged frames = 10 + seconds idle . That ' s significant .
- 3 + unchanged frames = 15 - 20 seconds idle . The user is stuck or distracted .
- If ALL frames are identical , the user has been idle for 20 seconds — definitely flag it .
- If the user wrote code / text and then 2 + frames show no changes , they are STUCK NOW .
Do NOT wait for many frames to flag problems . React fast .
2026-03-29 06:29:18 -04:00
Your PRIMARY signal is the DIFFERENCES between consecutive frames .
2026-04-01 16:10:30 -05:00
Where the screen CHANGED = where the user ' s ATTENTION is .
Where the screen is STATIC = background noise . Ignore it .
2026-03-29 06:29:18 -04:00
Diff signals and what they mean :
2026-04-01 16:10:30 -05:00
- New text appearing / cursor advancing → user is actively typing ( THIS is their task )
2026-03-29 06:29:18 -04:00
- Window or tab switch → context change , could be reference or distraction
- Same content , no pixel changes → stalled , idle , or reading
- Repeated switching between same 2 - 3 apps → repetitive loop ( manual data transfer )
2026-04-01 16:10:30 -05:00
- Scroll position change → reading or browsing
2026-03-29 06:29:18 -04:00
- Error message that APPEARED between frames → user just triggered it , relevant
2026-04-01 16:10:30 -05:00
- Error message that was ALREADY THERE in all frames → stale , ignore it
2026-03-29 06:29:18 -04:00
CRITICAL — looking at something ≠ working on something :
- User switches to browser / another app and just LOOKS → distraction or quick reference .
- User switches and starts TYPING / EDITING → might be a new task .
- If the user has an active session and switches away WITHOUT typing in the new app ,
they are DISTRACTED from their session , not starting a new task .
2026-04-01 16:10:30 -05:00
- Only infer a new task when there is clear evidence of productive work ( typing , editing ,
cursor movement between frames ) in the new context .
2026-03-29 06:29:18 -04:00
- A single app switch is NEVER enough to infer a new task . Wait for active work .
2026-04-01 16:10:30 -05:00
# # Current state : \ ( taskTitle . isEmpty ? " MONITORING MODE (no active focus session) " : " FOCUS SESSION on \" \( taskTitle ) \" " )
2026-03-29 06:29:18 -04:00
2026-04-01 16:10:30 -05:00
\ ( taskTitle . isEmpty ? " " : " Task: \( taskTitle ) \n Goal: \( taskGoal . isEmpty ? taskTitle : taskGoal ) \n Steps: \n \( stepsText ) " )
2026-03-29 06:29:18 -04:00
Window title ( OS ) : \ ( windowTitle . isEmpty ? " (unknown) " : windowTitle )
2026-04-01 16:10:30 -05:00
\ ( taskTitle . isEmpty ? " " "
You are in MONITORING MODE — no focus session is active .
Rules for monitoring mode :
- NEVER send notification type " nudge " . Nudges are only for active focus sessions .
- Instead , suggest session_action : start_new or resume if the user is actively working .
- If the user is browsing , idle , or doing casual stuff , set notification type " none " .
- Do NOT nag the user about incomplete tasks . Only suggest sessions when you see ACTIVE WORK .
" " " : " " "
IMPORTANT — Do NOT force - fit everything to the current task :
- The current task is what the user WAS working on . They may have MOVED ON .
- If the screen shows UNRELATED work ( different app , different topic , different file ) ,
the user is NOT on this task . Set on_task : false .
- If the user has been doing unrelated work for multiple frames , suggest
session_action : complete ( they ' re done ) or session_action : start_new ( new work ) .
- Do NOT interpret browsing YouTube , checking email , or working on a different project
as " related to " the current task just because a session is active .
- Your job is to OBSERVE what the user is doing , not to anchor to the current task .
" " " )
# # Open sessions and tasks from backend ( use EXACT IDs below )
\ ( sessionSection )
Session & task matching rules :
- A session matches ONLY if the user is actively EDITING the session ' s last_file .
Being in the same app ( e . g . VS Code ) is NOT enough — must be typing / editing the specific file .
- If the session ' s file IS being actively edited → session_action : resume with EXACT session_id .
- If the user moved to a different open session ' s file → session_action : switch with EXACT session_id .
- If the session ' s task appears DONE → session_action : complete with EXACT session_id .
Completion = the task ' s GOAL is visibly achieved on screen , NOT " all steps checked off. "
Steps are AI - generated approximations . A commit , successful build , or " fixed " message
means the task is done regardless of how many steps are still marked pending .
- If the user is working on something matching an UNSTARTED TASK ( listed above with task_id ) ,
output session_action : start_new with task_id set to that task ' s ID . This starts a session
linked to the existing task instead of creating a new one .
- If the user is working on something that matches NO existing session or task ,
output session_action : start_new with session_id : null AND task_id : null .
- NEVER invent IDs . Use only the IDs listed above or null .
\ ( prevSection ) \ ( execSection )
# # Recent screen history ( temporal context )
\ ( historyContext )
# # What to analyze
1. INFERRED TASK : What is the user working on right now ? Base this on where pixels changed .
2. CHECKPOINT : What specific progress did the user make across these frames ?
3. STEP COMPLETION — be AGGRESSIVE about marking steps done :
- Steps are AI - generated APPROXIMATIONS , not a rigid checklist .
- The user might solve the entire task in fewer steps than listed .
- If the screen shows the task ' s GOAL is achieved ( e . g . , code compiles , commit succeeded ,
file is saved , output looks correct ) , mark ALL remaining steps as done via steps_completed .
- Look for completion signals : " committed " , " fixed " , " done " , " success " , green checkmarks ,
successful build output , " pushed " , merged PR , closed issue .
- A single action ( like an AI agent fixing a bug ) can complete multiple steps at once .
- When in doubt about whether a step is done , CHECK THE SCREEN — if the end result is
visible and correct , the intermediate steps don ' t matter .
4. TASK / SESSION COMPLETION — detect when the WHOLE task is done :
- If you can see the task ' s goal is achieved on screen , output session_action : complete .
- Do NOT wait for all steps to be individually checked off . Steps are suggestions .
- Completion signals : successful commit / push , " fixed " , moving on to unrelated work ,
closing the relevant files , terminal showing success .
- If an AI agent ( like Claude Code ) just solved the problem and committed , the task is DONE .
5. FRICTION DETECTION : Is the user stuck in any of these patterns ?
- REPETITIVE_LOOP : Switching between same 2 - 3 windows ( copying data manually )
- STALLED : No meaningful pixel changes across 2 + frames , OR user wrote then deleted / undid
( write - then - delete = struggle , NOT " refining " )
- TEDIOUS_MANUAL : Doing automatable work ( filling forms , transcribing , copying by hand )
- CONTEXT_OVERHEAD : Many windows open , visibly searching across them
- TASK_RESUMPTION : User just returned to a task from earlier
IMPORTANT signals to catch IMMEDIATELY :
- User wrote code / text then deleted it → STUCK . Flag stalled .
- User switching between source doc and target file repeatedly → TEDIOUS_MANUAL .
Flag it on the SECOND switch . Don ' t wait .
6. NOTIFICATION : Decide what to show the user :
- " none " — user is productively working
- " nudge " — user is idle / distracted , set message to a short reminder
- " friction " — user is stuck and an AI agent can take a concrete action
ONLY use " friction " when proposed_actions has a specific , executable task with a target
7. PROPOSED ACTION ( only when notification . type = " friction " ) :
The " details " field is the executor agent ' s full instruction :
Bad : " Extract data from the document "
Good : " User is copying table values from a PDF into markdown. Extract the table from the PDF
( visible in screenshots ) , format as a markdown table matching the style already in the
file , and append to report . md . The user has been writing plain text tables — match that style . "
Respond ONLY with JSON ( no markdown fences ) :
2026-03-29 06:29:18 -04:00
{
" on_task " : true ,
" current_step_id " : " step UUID or null " ,
2026-04-01 16:10:30 -05:00
" inferred_task " : " what the user is actually working on, based on screen diffs " ,
" checkpoint_note_update " : " what changed across these frames specifically " ,
2026-03-29 06:29:18 -04:00
" steps_completed " : [ ] ,
" friction " : {
" type " : " repetitive_loop | stalled | tedious_manual | context_overhead | task_resumption | none " ,
" confidence " : 0.0 ,
2026-04-01 16:10:30 -05:00
" description " : " what the user is struggling with, based on diff evidence " ,
2026-03-29 06:29:18 -04:00
" proposed_actions " : [
{
2026-04-01 16:10:30 -05:00
" label " : " specific verb phrase the user can approve with one tap " ,
" details " : " Natural language spec: (1) what to do, (2) where to look in screenshots, (3) EXACT format matching what the user already wrote, (4) target file. Concrete enough for an agent to execute without asking questions. "
2026-03-29 06:29:18 -04:00
}
] ,
2026-04-01 16:10:30 -05:00
" source_context " : " filename if visible, or app name " ,
" target_context " : " filename if visible, or app name "
2026-03-29 06:29:18 -04:00
} ,
" session_action " : {
2026-04-01 16:10:30 -05:00
" type " : " resume | switch | complete | start_new | none " ,
" session_id " : " uuid of matching session, or null for start_new/none " ,
" task_id " : " uuid of matching unstarted task (for start_new only), or null " ,
" reason " : " why this session action is suggested "
} ,
" notification " : {
" type " : " none | nudge | friction " ,
" message " : " nudge text if type=nudge, null otherwise "
2026-03-29 06:29:18 -04:00
} ,
" intent " : " skimming | engaged | unclear | null " ,
" distraction_type " : " app_switch | browsing | idle | null " ,
" app_name " : " primary visible application " ,
" confidence " : 0.8 ,
" vlm_summary " : " 1-sentence description of what CHANGED across the frames (not what is static) "
}
" " "
}
2026-04-01 16:10:30 -05:00
// MARK: - A g e n t i c E x e c u t o r ( p o r t e d f r o m e x e c u t o r . p y )
2026-03-29 06:29:18 -04:00
2026-04-01 16:10:30 -05:00
// / E x e c u t e a u s e r - a p p r o v e d p r o a c t i v e a c t i o n u s i n g a m u l t i - s t e p a g e n t l o o p
// / w i t h G e m i n i f u n c t i o n c a l l i n g . R e t u r n s t h e f i n a l o u t p u t / s u m m a r y .
2026-03-29 06:29:18 -04:00
func executeAction (
label : String ,
details : String ,
2026-04-01 16:10:30 -05:00
frames : [ Data ] ,
onToolCall : ( @ Sendable ( String , String ) -> Void ) ? = nil
2026-03-29 06:29:18 -04:00
) async throws -> String {
2026-04-01 16:10:30 -05:00
let systemPrompt = " " "
You are a productivity assistant executing a task the user approved .
Action : " \( label ) "
Spec : \ ( details . isEmpty ? " (none provided) " : details )
INSTRUCTIONS :
1. For BINARY files ( PDFs , images , etc . ) : use your VISION . Read content directly
from the screenshots — this is your most reliable source for non - text files .
2. For TEXT files ( code , markdown , configs , txt ) : use read_file to get exact content .
3. If you need a file but only know the filename ( not the path ) , FIND IT FIRST :
- run_command ( " mdfind -name 'filename' " ) — fast macOS Spotlight search
- run_command ( " lsof -c AppName | grep filename " ) — find what file an app has open
Do NOT guess paths . Search first .
4. Choose the right output method :
- write_file ( ) : For existing text files where the modification is clear and the
file location is known — code files ( cpp , py , js , etc . ) , markdown , configs .
Read the file first , then write the updated version .
NEVER create new files . NEVER write to files you haven ' t read first .
- output ( ) : For everything else — extracted data from PDFs / images , content for
binary targets ( docx , ppt , forms , websites ) , or when you ' re unsure where to
put the result . User will review and copy / paste .
5. Use run_command to compile , test , or search for files . Never to write files .
6. Do NOT hallucinate content . If you can ' t read something , say so .
7. Call done ( ) with a summary when the action is complete .
2026-03-29 06:29:18 -04:00
" " "
2026-04-01 16:10:30 -05:00
// B u i l d i n i t i a l u s e r m e s s a g e w i t h s c r e e n s h o t s
var userParts : [ [ String : Any ] ] = [ ]
for ( i , frame ) in frames . enumerated ( ) {
userParts . append ( [ " text " : " [Screenshot \( i + 1 ) / \( frames . count ) ] " ] )
userParts . append ( [
" inlineData " : [
" mimeType " : " image/jpeg " ,
" data " : frame . base64EncodedString ( )
]
] )
}
userParts . append ( [ " text " : " Execute the action now. Use the tools available to you. " ] )
var messages : [ [ String : Any ] ] = [
[ " role " : " user " , " parts " : userParts ]
]
let maxSteps = 10
var filesRead : Set < String > = [ ]
var outputResult : String ?
var doneSummary : String ?
for step in 0. . < maxSteps {
print ( " [Executor] Step \( step + 1 ) / \( maxSteps ) " )
let responseData = try await callGeminiWithTools (
systemPrompt : systemPrompt ,
messages : messages ,
maxOutputTokens : 4096
)
// P a r s e r e s p o n s e
guard let json = try JSONSerialization . jsonObject ( with : responseData ) as ? [ String : Any ] ,
let candidates = json [ " candidates " ] as ? [ [ String : Any ] ] ,
let first = candidates . first ,
let content = first [ " content " ] as ? [ String : Any ] ,
let parts = content [ " parts " ] as ? [ [ String : Any ] ]
else {
let raw = String ( data : responseData , encoding : . utf8 ) ? ? " "
print ( " [Executor] Unexpected response: \( raw . prefix ( 300 ) ) " )
break
}
// C h e c k f o r t e x t r e s p o n s e ( m o d e l i s d o n e )
if let textPart = parts . first ( where : { $0 [ " text " ] != nil } ) ,
let text = textPart [ " text " ] as ? String ,
parts . allSatisfy ( { $0 [ " functionCall " ] = = nil } ) {
// M o d e l r e s p o n d e d w i t h t e x t , n o f u n c t i o n c a l l s — i t ' s d o n e
return doneSummary ? ? outputResult ? ? text
}
// A p p e n d m o d e l ' s r e s p o n s e t o c o n v e r s a t i o n
messages . append ( [ " role " : " model " , " parts " : parts ] )
// P r o c e s s f u n c t i o n c a l l s
var functionResponses : [ [ String : Any ] ] = [ ]
for part in parts {
guard let funcCall = part [ " functionCall " ] as ? [ String : Any ] ,
let name = funcCall [ " name " ] as ? String ,
let args = funcCall [ " args " ] as ? [ String : Any ]
else { continue }
let result : String
print ( " [Executor] → \( name ) ( \( args ) ) " )
onToolCall ? ( name , " \( args ) " )
switch name {
case " read_file " :
let path = args [ " path " ] as ? String ? ? " "
result = executeReadFile ( path : path )
filesRead . insert ( path )
case " write_file " :
let path = args [ " path " ] as ? String ? ? " "
let fileContent = args [ " content " ] as ? String ? ? " "
if ! filesRead . contains ( path ) {
result = " ERROR: You must read_file(' \( path ) ') before writing to it. "
} else {
result = executeWriteFile ( path : path , content : fileContent )
}
case " run_command " :
let command = args [ " command " ] as ? String ? ? args [ " shell_command " ] as ? String ? ? " "
result = await executeRunCommand ( command : command )
case " output " :
let title = args [ " title " ] as ? String ? ? label
let content = args [ " content " ] as ? String ? ? " "
outputResult = content . isEmpty ? title : content
result = " Displayed to user: \( title ) "
case " done " :
let summary = args [ " summary " ] as ? String ? ? " Action completed. "
doneSummary = summary
// R e t u r n i m m e d i a t e l y — a g e n t i s d o n e
return outputResult ? ? summary
default :
result = " Unknown tool: \( name ) "
}
print ( " [Executor] ← \( result . prefix ( 200 ) ) " )
functionResponses . append ( [
" functionResponse " : [
" name " : name ,
" response " : [ " content " : result ]
]
] )
}
// F e e d t o o l r e s u l t s b a c k t o t h e m o d e l
if ! functionResponses . isEmpty {
messages . append ( [ " role " : " user " , " parts " : functionResponses ] )
}
}
// H i t s t e p l i m i t
return outputResult ? ? doneSummary ? ? " Action completed (reached step limit). "
}
// MARK: - T o o l I m p l e m e n t a t i o n s
nonisolated private func executeReadFile ( path : String ) -> String {
let expandedPath = NSString ( string : path ) . expandingTildeInPath
guard FileManager . default . fileExists ( atPath : expandedPath ) else {
return " ERROR: File not found: \( path ) "
}
guard FileManager . default . isReadableFile ( atPath : expandedPath ) else {
return " ERROR: Cannot read file: \( path ) "
}
do {
let content = try String ( contentsOfFile : expandedPath , encoding : . utf8 )
// T r u n c a t e v e r y l a r g e f i l e s
if content . count > 50_000 {
return String ( content . prefix ( 50_000 ) ) + " \n \n [TRUNCATED — file is \( content . count ) characters] "
}
return content
} catch {
return " ERROR: \( error . localizedDescription ) "
}
}
nonisolated private func executeWriteFile ( path : String , content : String ) -> String {
let expandedPath = NSString ( string : path ) . expandingTildeInPath
guard FileManager . default . fileExists ( atPath : expandedPath ) else {
return " ERROR: File does not exist: \( path ) . Cannot create new files. "
}
do {
try content . write ( toFile : expandedPath , atomically : true , encoding : . utf8 )
return " OK — wrote \( content . count ) characters to \( path ) "
} catch {
return " ERROR: \( error . localizedDescription ) "
}
2026-03-29 06:29:18 -04:00
}
2026-04-01 16:10:30 -05:00
nonisolated private func executeRunCommand ( command : String ) async -> String {
// S a f e t y : b l o c k o b v i o u s l y d e s t r u c t i v e c o m m a n d s
let dangerous = [ " rm -rf / " , " rm -rf ~ " , " mkfs " , " dd if= " , " > /dev/ " ]
for d in dangerous where command . contains ( d ) {
return " ERROR: Blocked dangerous command. "
}
return await withCheckedContinuation { continuation in
let process = Process ( )
process . executableURL = URL ( fileURLWithPath : " /bin/zsh " )
process . arguments = [ " -c " , command ]
let stdout = Pipe ( )
let stderr = Pipe ( )
process . standardOutput = stdout
process . standardError = stderr
var hasResumed = false
// T i m e o u t a f t e r 3 0 s e c o n d s
let timeoutWork = DispatchWorkItem {
guard ! hasResumed else { return }
hasResumed = true
process . terminate ( )
continuation . resume ( returning : " ERROR: Command timed out after 30s. " )
}
DispatchQueue . global ( ) . asyncAfter ( deadline : . now ( ) + 30 , execute : timeoutWork )
2026-03-29 06:29:18 -04:00
2026-04-01 16:10:30 -05:00
process . terminationHandler = { _ in
timeoutWork . cancel ( )
guard ! hasResumed else { return }
hasResumed = true
let outData = stdout . fileHandleForReading . readDataToEndOfFile ( )
let errData = stderr . fileHandleForReading . readDataToEndOfFile ( )
let out = String ( data : outData , encoding : . utf8 ) ? ? " "
let err = String ( data : errData , encoding : . utf8 ) ? ? " "
var result = " "
if ! out . isEmpty { result += out }
if ! err . isEmpty { result += ( result . isEmpty ? " " : " \n " ) + " STDERR: " + err }
if result . isEmpty { result = " (no output) " }
if result . count > 10_000 {
result = String ( result . prefix ( 10_000 ) ) + " \n \n [TRUNCATED] "
}
if process . terminationStatus != 0 {
result += " \n (exit code: \( process . terminationStatus ) ) "
}
continuation . resume ( returning : result )
}
do {
try process . run ( )
} catch {
timeoutWork . cancel ( )
guard ! hasResumed else { return }
hasResumed = true
continuation . resume ( returning : " ERROR: \( error . localizedDescription ) " )
}
}
}
// MARK: - G e m i n i A P I : A n a l y s i s ( n o t o o l s )
private func callGemini (
prompt : String ,
frames : [ Data ] ,
fileUris : [ String ? ] = [ ] ,
finalInstruction : String = " Analyze this screenshot sequence now. Reply with ONLY valid JSON — no markdown, no code fences. " ,
maxOutputTokens : Int = 1024
) async throws -> String {
let urlStr = " \( Self . apiBase ) / \( Self . analysisModel ) :generateContent?key= \( apiKey ) "
2026-03-29 06:29:18 -04:00
guard let url = URL ( string : urlStr ) else { throw URLError ( . badURL ) }
var parts : [ [ String : Any ] ] = [ ]
let total = frames . count
2026-04-01 16:10:30 -05:00
var inlineCount = 0
var uriCount = 0
2026-03-29 06:29:18 -04:00
for ( i , frame ) in frames . enumerated ( ) {
2026-04-01 16:10:30 -05:00
let age = ( total - i ) * 5 // a p p r o x i m a t e s e c o n d s a g o
parts . append ( [ " text " : " [Screenshot \( i + 1 ) / \( total ) — \( age ) s ago] " ] )
let uri = i < fileUris . count ? fileUris [ i ] : nil
if let uri {
// U s e F i l e s A P I U R I — n o r e - u p l o a d o f t h i s f r a m e ' s b y t e s
parts . append ( [ " fileData " : [ " mimeType " : " image/jpeg " , " fileUri " : uri ] ] )
uriCount += 1
} else {
// F a l l b a c k t o i n l i n e b a s e 6 4 ( n e w e s t f r a m e , o r u p l o a d n o t y e t c o m p l e t e )
parts . append ( [ " inlineData " : [ " mimeType " : " image/jpeg " , " data " : frame . base64EncodedString ( ) ] ] )
inlineCount += 1
}
2026-03-29 06:29:18 -04:00
}
2026-04-01 16:10:30 -05:00
print ( " [GeminiVLM] Sending \( uriCount ) URI frames + \( inlineCount ) inline frames " )
parts . append ( [ " text " : finalInstruction ] )
2026-03-29 06:29:18 -04:00
let body : [ String : Any ] = [
" systemInstruction " : [ " parts " : [ [ " text " : prompt ] ] ] ,
" contents " : [ [ " parts " : parts ] ] ,
" generationConfig " : [
" temperature " : 0.2 ,
2026-04-01 16:10:30 -05:00
" maxOutputTokens " : maxOutputTokens
2026-03-29 06:29:18 -04:00
]
]
var request = URLRequest ( url : url )
request . httpMethod = " POST "
request . setValue ( " application/json " , forHTTPHeaderField : " Content-Type " )
request . httpBody = try JSONSerialization . data ( withJSONObject : body )
request . timeoutInterval = 60
let ( data , response ) = try await URLSession . shared . data ( for : request )
if let http = response as ? HTTPURLResponse , http . statusCode != 200 {
let msg = String ( data : data , encoding : . utf8 ) ? ? " HTTP \( http . statusCode ) "
print ( " [GeminiVLM] API error \( http . statusCode ) : \( msg ) " )
throw URLError ( . badServerResponse )
}
guard let json = try JSONSerialization . jsonObject ( with : data ) as ? [ String : Any ] ,
let candidates = json [ " candidates " ] as ? [ [ String : Any ] ] ,
let first = candidates . first ,
let content = first [ " content " ] as ? [ String : Any ] ,
let contentParts = content [ " parts " ] as ? [ [ String : Any ] ] ,
let text = contentParts . first ? [ " text " ] as ? String
else {
let raw = String ( data : data , encoding : . utf8 ) ? ? " "
print ( " [GeminiVLM] Unexpected response shape: \( raw . prefix ( 300 ) ) " )
throw URLError ( . cannotParseResponse )
}
2026-04-01 16:10:30 -05:00
if let reason = first [ " finishReason " ] as ? String , reason != " STOP " {
print ( " [GeminiVLM] finishReason= \( reason ) — response may be truncated " )
}
2026-03-29 06:29:18 -04:00
print ( " [GeminiVLM] Response ( \( text . count ) chars): \( text . prefix ( 200 ) ) " )
return text
}
2026-04-01 16:10:30 -05:00
// MARK: - G e m i n i A P I : E x e c u t o r ( w i t h f u n c t i o n c a l l i n g )
// / G e m i n i f u n c t i o n c a l l i n g t o o l d e c l a r a t i o n s f o r t h e a g e n t i c e x e c u t o r .
private var executorTools : [ [ String : Any ] ] {
[ [
" functionDeclarations " : [
[
" name " : " read_file " ,
" description " : " Read a plain text file. Returns the file contents as a string. " ,
" parameters " : [
" type " : " object " ,
" properties " : [
" path " : [ " type " : " string " , " description " : " Absolute file path to read " ]
] ,
" required " : [ " path " ]
]
] ,
[
" name " : " write_file " ,
" description " : " Write content to an existing plain text file. You MUST call read_file on this path first. Cannot create new files. " ,
" parameters " : [
" type " : " object " ,
" properties " : [
" path " : [ " type " : " string " , " description " : " Absolute file path (must already exist) " ] ,
" content " : [ " type " : " string " , " description " : " Full file content to write " ]
] ,
" required " : [ " path " , " content " ]
]
] ,
[
" name " : " run_command " ,
" description " : " Execute a shell command and return stdout/stderr. Use for compilation, testing, file discovery (mdfind, lsof). Do not use to write files. " ,
" parameters " : [
" type " : " object " ,
" properties " : [
" command " : [ " type " : " string " , " description " : " Shell command to execute " ]
] ,
" required " : [ " command " ]
]
] ,
[
" name " : " output " ,
" description " : " Display content to the user in a sticky note card. Use for extracted data from PDFs/images, content for binary targets, or when unsure where to put results. " ,
" parameters " : [
" type " : " object " ,
" properties " : [
" title " : [ " type " : " string " , " description " : " Card title " ] ,
" content " : [ " type " : " string " , " description " : " Content to display " ]
] ,
" required " : [ " title " , " content " ]
]
] ,
[
" name " : " done " ,
" description " : " Signal that the action is complete. Always call this when finished. " ,
" parameters " : [
" type " : " object " ,
" properties " : [
" summary " : [ " type " : " string " , " description " : " Brief summary of what was done " ]
] ,
" required " : [ " summary " ]
]
]
]
] ]
}
// / C a l l G e m i n i w i t h f u n c t i o n c a l l i n g e n a b l e d . R e t u r n s r a w r e s p o n s e D a t a .
private func callGeminiWithTools (
systemPrompt : String ,
messages : [ [ String : Any ] ] ,
maxOutputTokens : Int = 4096
) async throws -> Data {
let urlStr = " \( Self . apiBase ) / \( Self . executorModel ) :generateContent?key= \( apiKey ) "
guard let url = URL ( string : urlStr ) else { throw URLError ( . badURL ) }
let body : [ String : Any ] = [
" systemInstruction " : [ " parts " : [ [ " text " : systemPrompt ] ] ] ,
" tools " : executorTools ,
" contents " : messages ,
" generationConfig " : [
" temperature " : 0.2 ,
" maxOutputTokens " : maxOutputTokens
]
]
var request = URLRequest ( url : url )
request . httpMethod = " POST "
request . setValue ( " application/json " , forHTTPHeaderField : " Content-Type " )
request . httpBody = try JSONSerialization . data ( withJSONObject : body )
request . timeoutInterval = 120
let ( data , response ) = try await URLSession . shared . data ( for : request )
if let http = response as ? HTTPURLResponse , http . statusCode = = 429 {
// R a t e l i m i t e d — w a i t a n d r e t r y o n c e
print ( " [Executor] Rate limited (429) — retrying in 5s " )
try await Task . sleep ( for : . seconds ( 5 ) )
let ( retryData , retryResponse ) = try await URLSession . shared . data ( for : request )
if let retryHttp = retryResponse as ? HTTPURLResponse , retryHttp . statusCode != 200 {
let msg = String ( data : retryData , encoding : . utf8 ) ? ? " HTTP \( retryHttp . statusCode ) "
print ( " [Executor] Retry failed: \( msg ) " )
throw URLError ( . badServerResponse )
}
return retryData
}
if let http = response as ? HTTPURLResponse , http . statusCode != 200 {
let msg = String ( data : data , encoding : . utf8 ) ? ? " HTTP \( http . statusCode ) "
print ( " [Executor] API error \( http . statusCode ) : \( msg ) " )
throw URLError ( . badServerResponse )
}
return data
}
2026-03-29 06:29:18 -04:00
// MARK: - R e s p o n s e P a r s i n g
private func parseResponse ( _ text : String ) throws -> DistractionAnalysisResponse {
var cleaned = text . trimmingCharacters ( in : . whitespacesAndNewlines )
if cleaned . hasPrefix ( " ``` " ) {
let lines = cleaned . components ( separatedBy : " \n " )
cleaned = lines . dropFirst ( ) . joined ( separator : " \n " )
if let backtickRange = cleaned . range ( of : " ``` " ) {
cleaned = String ( cleaned [ . . < backtickRange . lowerBound ] )
}
cleaned = cleaned . trimmingCharacters ( in : . whitespacesAndNewlines )
}
2026-04-01 16:10:30 -05:00
guard let start = cleaned . firstIndex ( of : " { " ) else {
2026-03-29 06:29:18 -04:00
throw URLError ( . cannotParseResponse )
}
2026-04-01 16:10:30 -05:00
guard let end = cleaned . lastIndex ( of : " } " ) else {
print ( " [GeminiVLM] Truncated JSON — attempting partial field extraction " )
return partialFallback ( from : String ( cleaned [ start . . . ] ) )
}
2026-03-29 06:29:18 -04:00
let jsonStr = String ( cleaned [ start . . . end ] )
guard let jsonData = jsonStr . data ( using : . utf8 ) else {
throw URLError ( . cannotParseResponse )
}
2026-04-01 16:10:30 -05:00
do {
return try JSONDecoder ( ) . decode ( DistractionAnalysisResponse . self , from : jsonData )
} catch {
print ( " [GeminiVLM] Decode error: \( error ) — attempting partial field extraction " )
return partialFallback ( from : jsonStr )
}
}
private func partialFallback ( from jsonText : String ) -> DistractionAnalysisResponse {
let onTask = ! jsonText . contains ( " \" on_task \" : false " ) && ! jsonText . contains ( " \" on_task \" :false " )
let inferredTask = regexExtract ( # " " inferred_task " \ s*: \ s* " ( ( ? : [ ^ " \\ ]| \\ .)*) " " #, from: jsonText)
let vlmSummary = regexExtract ( # " " vlm_summary " \ s*: \ s* " ( ( ? : [ ^ " \\ ]| \\ .)*) " " #, from: jsonText)
let appName = regexExtract ( # " " app_name " \ s*: \ s* " ( ( ? : [ ^ " \\ ]| \\ .)*) " " #, from: jsonText)
print ( " [GeminiVLM] Partial recovery — on_task= \( onTask ) task= \( inferredTask ? ? " nil " ) " )
return DistractionAnalysisResponse (
onTask : onTask ,
currentStepId : nil ,
inferredTask : inferredTask ,
checkpointNoteUpdate : nil ,
stepsCompleted : [ ] ,
friction : nil ,
sessionAction : nil ,
notification : nil ,
intent : nil ,
distractionType : nil ,
appName : appName ,
confidence : 0.0 ,
vlmSummary : vlmSummary
)
}
private func regexExtract ( _ pattern : String , from text : String ) -> String ? {
guard let regex = try ? NSRegularExpression ( pattern : pattern ) ,
let match = regex . firstMatch ( in : text , range : NSRange ( text . startIndex . . . , in : text ) ) ,
let range = Range ( match . range ( at : 1 ) , in : text )
else { return nil }
return String ( text [ range ] )
2026-03-29 06:29:18 -04:00
}
}