@@ -48,6 +48,9 @@ async def get_next_action(model, messages, objective, session_id):
4848 if model == "gpt-4-with-ocr" :
4949 operation = await call_gpt_4o_with_ocr (messages , objective , model )
5050 return operation , None
51+ if model == "o1-with-ocr" :
52+ operation = await call_o1_with_ocr (messages , objective , model )
53+ return operation , None
5154 if model == "agent-1" :
5255 return "coming soon"
5356 if model == "gemini-pro-vision" :
@@ -231,7 +234,7 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
231234 messages .append (vision_message )
232235
233236 response = client .chat .completions .create (
234- model = "gpt-4o " ,
237+ model = "o1 " ,
235238 messages = messages ,
236239 temperature = 0.7 ,
237240 max_tokens = 3000 ,
@@ -307,6 +310,121 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
307310 return gpt_4_fallback (messages , objective , model )
308311
309312
async def call_o1_with_ocr(messages, objective, model):
    """Ask OpenAI's o1 model for the next operation, resolving clicks via OCR.

    Captures a screenshot (with cursor), appends it plus the appropriate user
    prompt to ``messages``, requests a completion from the "o1" model, and
    parses the returned JSON list of operations. For every ``click`` operation
    the target text is located on the screenshot with EasyOCR and replaced by
    concrete ``x``/``y`` coordinates.

    Args:
        messages: Mutable conversation history; the vision message and the
            assistant reply are appended in place.
        objective: The user's overall objective (used for the system prompt).
        model: Model identifier string, passed through to prompt helpers and
            the fallback path.

    Returns:
        The list of processed operation dicts, or the result of
        ``gpt_4_fallback`` if anything goes wrong.
    """
    if config.verbose:
        print("[call_o1_with_ocr]")

    try:
        time.sleep(1)
        client = config.initialize_openai()

        confirm_system_prompt(messages, objective, model)
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Capture the screen with the cursor so the model can see the pointer.
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        # First turn gets the objective-framing prompt; later turns get the
        # follow-up prompt.
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        vision_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }
        messages.append(vision_message)

        # BUG FIX: this function previously requested "gpt-4o", so the
        # "o1-with-ocr" mode never actually used o1. Also, the o1 reasoning
        # models reject the `temperature` and `max_tokens` parameters and
        # require `max_completion_tokens` instead — with the old parameters
        # every call raised and silently fell through to gpt_4_fallback.
        response = client.chat.completions.create(
            model="o1",
            messages=messages,
            max_completion_tokens=3000,
        )

        content = response.choices[0].message.content

        content = clean_json(content)

        # Keep the raw JSON string: it is what gets stored in the message
        # history as the assistant turn (see below).
        content_str = content

        content = json.loads(content)

        processed_content = []

        # EasyOCR model load is expensive; build the reader once, lazily,
        # only if at least one click operation needs OCR.
        reader = None

        for operation in content:
            if operation.get("operation") == "click":
                text_to_click = operation.get("text")
                if config.verbose:
                    print(
                        "[call_o1_with_ocr][click] text_to_click",
                        text_to_click,
                    )
                if reader is None:
                    reader = easyocr.Reader(["en"])

                # OCR the screenshot and map the requested text to screen
                # coordinates.
                result = reader.readtext(screenshot_filename)

                text_element_index = get_text_element(
                    result, text_to_click, screenshot_filename
                )
                coordinates = get_text_coordinates(
                    result, text_element_index, screenshot_filename
                )

                # Attach the resolved coordinates to the operation.
                operation["x"] = coordinates["x"]
                operation["y"] = coordinates["y"]

                if config.verbose:
                    print(
                        "[call_o1_with_ocr][click] text_element_index",
                        text_element_index,
                    )
                    print(
                        "[call_o1_with_ocr][click] coordinates",
                        coordinates,
                    )
                    print(
                        "[call_o1_with_ocr][click] final operation",
                        operation,
                    )
                processed_content.append(operation)

            else:
                processed_content.append(operation)

        # Append the assistant message only after OCR processing succeeds, so
        # a failure above does not leave a dangling turn in the history.
        assistant_message = {"role": "assistant", "content": content_str}
        messages.append(assistant_message)

        return processed_content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
        )
        if config.verbose:
            print("[Self-Operating Computer][Operate] error", e)
            traceback.print_exc()
        return gpt_4_fallback(messages, objective, model)
427+
310428async def call_gpt_4o_labeled (messages , objective , model ):
311429 time .sleep (1 )
312430
0 commit comments