
Commit 74d187a

Merge pull request #135 from OthersideAI/add-agent-1
Add agent 1
2 parents 578eb49 + 01bd850

File tree

15 files changed: +786 additions, −1163 deletions


operate/actions.py

Lines changed: 0 additions & 409 deletions
This file was deleted.

operate/config.py

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+import os
+import sys
+from dotenv import load_dotenv
+from openai import OpenAI
+from prompt_toolkit.shortcuts import input_dialog
+
+
+class Config:
+    """
+    Configuration class for managing settings.
+
+    Attributes:
+        verbose (bool): Flag indicating whether verbose mode is enabled.
+        openai_api_key (str): API key for OpenAI.
+        google_api_key (str): API key for Google.
+    """
+
+    def __init__(self):
+        load_dotenv()
+        self.verbose = False
+        self.openai_api_key = os.getenv("OPENAI_API_KEY")
+        self.google_api_key = os.getenv("GOOGLE_API_KEY")
+
+    def initialize_openai(self):
+        if self.openai_api_key:
+            client = OpenAI()
+            client.api_key = self.openai_api_key
+            client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
+            return client
+        return None
+
+    def validation(self, model, voice_mode):
+        """
+        Validate the input parameters for the dialog operation.
+        """
+        self.require_api_key(
+            "OPENAI_API_KEY", "OpenAI API key", model == "gpt-4" or voice_mode
+        )
+        self.require_api_key(
+            "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision"
+        )
+
+    def require_api_key(self, key_name, key_description, is_required):
+        if is_required and not getattr(self, key_name.lower()):
+            self.prompt_and_save_api_key(key_name, key_description)
+
+    def prompt_and_save_api_key(self, key_name, key_description):
+        key_value = input_dialog(
+            title="API Key Required", text=f"Please enter your {key_description}:"
+        ).run()
+
+        if key_value is None:  # User pressed cancel or closed the dialog
+            sys.exit("Operation cancelled by user.")
+
+        if key_value:
+            self.save_api_key_to_env(key_name, key_value)
+            load_dotenv()  # Reload environment variables
+            # Update the instance attribute with the new key
+            setattr(self, key_name.lower(), key_value)
+
+    @staticmethod
+    def save_api_key_to_env(key_name, key_value):
+        with open(".env", "a") as file:
+            file.write(f"\n{key_name}='{key_value}'")
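
For context, a minimal sketch of how this Config class might be exercised (not part of the commit; it assumes the package is importable as operate.config and that a .env file may or may not exist):

# Sketch: load keys, validate for a given model, then build a client.
from operate.config import Config

config = Config()  # reads OPENAI_API_KEY / GOOGLE_API_KEY from the environment
config.validation(model="gpt-4", voice_mode=False)  # prompts for a missing OpenAI key
client = config.initialize_openai()  # OpenAI client, or None when no key is set
if client is not None:
    print("OpenAI client ready, base URL:", client.base_url)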

operate/main.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 """
 import argparse
 from operate.utils.style import ANSI_BRIGHT_MAGENTA
-from operate.dialog import main
+from operate.operate import main
 
 
 def main_entry():

operate/models/__init__.py

Whitespace-only changes.

operate/models/apis.py

Lines changed: 321 additions & 0 deletions
@@ -0,0 +1,321 @@
+import os
+import time
+import json
+import base64
+import traceback
+import io
+
+
+from PIL import Image
+from ultralytics import YOLO
+import google.generativeai as genai
+from operate.config import Config
+from operate.exceptions import ModelNotRecognizedException
+from operate.utils.screenshot import (
+    capture_screen_with_cursor,
+)
+from operate.models.prompts import (
+    get_user_first_message_prompt,
+    get_user_prompt,
+    get_system_prompt,
+)
+
+
+from operate.utils.label import (
+    add_labels,
+    get_click_position_in_percent,
+    get_label_coordinates,
+)
+from operate.utils.style import (
+    ANSI_GREEN,
+    ANSI_RED,
+    ANSI_RESET,
+)
+
+
+# Load configuration
+VERBOSE = Config().verbose
+
+
+async def get_next_action(model, messages, objective, session_id):
+    if model == "gpt-4":
+        return call_gpt_4_vision_preview(messages), None
+    if model == "gpt-4-with-som":
+        operation = await call_gpt_4_vision_preview_labeled(messages, objective)
+        return operation, None
+    elif model == "agent-1":
+        return "coming soon", None  # placeholder branch for Agent-1
+    elif model == "gemini-pro-vision":
+        return call_gemini_pro_vision(messages, objective), None
+
+    raise ModelNotRecognizedException(model)
+
+
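Since get_next_action is a coroutine, callers need an event loop. A minimal sketch of driving the dispatcher (not part of the commit; the objective string is illustrative, and the system-prompt seeding mirrors how get_system_prompt is used in the Gemini branch below):

# Sketch: request one operation from the gpt-4 branch.
import asyncio

from operate.models.apis import get_next_action
from operate.models.prompts import get_system_prompt

objective = "Open a web browser"  # illustrative objective
messages = [{"role": "system", "content": get_system_prompt(objective)}]

operation, session_id = asyncio.run(
    get_next_action("gpt-4", messages, objective, session_id=None)
)
print(operation)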
+def call_gpt_4_vision_preview(messages):
+    config = Config()
+    client = config.initialize_openai()
+    if VERBOSE:
+        print("[Self Operating Computer][get_next_action][call_gpt_4_v]")
+    time.sleep(1)
+    try:
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if VERBOSE:
+            print(
+                "[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-4-vision-preview",
+            messages=messages,
+            presence_penalty=1,
+            frequency_penalty=1,
+            temperature=0.7,
+            max_tokens=3000,
+        )
+
+        content = response.choices[0].message.content
+
+        if content.startswith("```json"):
+            content = content[len("```json") :]  # Remove starting ```json
+        if content.endswith("```"):
+            content = content[: -len("```")]  # Remove ending
+
+        assistant_message = {"role": "assistant", "content": content}
+        if VERBOSE:
+            print(
+                "[Self Operating Computer][get_next_action][call_gpt_4_v] content",
+                content,
+            )
+        content = json.loads(content)
+
+        messages.append(assistant_message)
+
+        return content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying again {ANSI_RESET}",
+            e,
+        )
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
+            content,
+        )
+        traceback.print_exc()
+        return call_gpt_4_vision_preview(messages)
+
+
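The fence stripping above is the part most worth testing in isolation. A standalone sketch of the same logic (not part of the commit; the helper name is hypothetical):

# Sketch: strip an optional ```json fence before parsing, as done above.
import json

def parse_fenced_json(content):
    if content.startswith("```json"):
        content = content[len("```json"):]
    if content.endswith("```"):
        content = content[: -len("```")]
    return json.loads(content)

assert parse_fenced_json('```json\n[{"operation": "done"}]\n```') == [
    {"operation": "done"}
]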
+def call_gemini_pro_vision(messages, objective):
+    """
+    Get the next action for Self-Operating Computer using Gemini Pro Vision
+    """
+    # sleep for a second
+    time.sleep(1)
+    try:
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+        # sleep for a second
+        time.sleep(1)
+        prompt = get_system_prompt(objective)
+
+        model = genai.GenerativeModel("gemini-pro-vision")
+
+        response = model.generate_content([prompt, Image.open(screenshot_filename)])
+
+        content = response.text[1:]
+
+        content = json.loads(content)
+        if VERBOSE:
+            print(
+                "[Self Operating Computer][get_next_action][call_gemini_pro_vision] content",
+                content,
+            )
+
+        return content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
+            e,
+        )
+        return call_gpt_4_vision_preview(messages)
+
+
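call_gemini_pro_vision assumes the google.generativeai client has already been configured with an API key. A minimal sketch of that wiring (not part of the commit; genai.configure is that library's standard setup call):

# Sketch: configure Gemini with the key managed by Config.
import google.generativeai as genai

from operate.config import Config

config = Config()
config.validation(model="gemini-pro-vision", voice_mode=False)  # prompts if the key is missing
genai.configure(api_key=config.google_api_key)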
+async def call_gpt_4_vision_preview_labeled(messages, objective):
+    config = Config()
+    client = config.initialize_openai()
+    time.sleep(1)
+    try:
+        yolo_model = YOLO("./operate/models/weights/best.pt")  # Load your trained model
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model)
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if VERBOSE:
+            print(
+                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{img_base64_labeled}"
+                    },
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-4-vision-preview",
+            messages=messages,
+            presence_penalty=1,
+            frequency_penalty=1,
+            temperature=0.7,
+            max_tokens=1000,
+        )
+
+        content = response.choices[0].message.content
+
+        if content.startswith("```json"):
+            content = content[len("```json") :]  # Remove starting ```json
+        if content.endswith("```"):
+            content = content[: -len("```")]  # Remove ending
+
+        assistant_message = {"role": "assistant", "content": content}
+        if VERBOSE:
+            print(
+                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content",
+                content,
+            )
+        messages.append(assistant_message)
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                label = operation.get("label")
+                if VERBOSE:
+                    print(
+                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label",
+                        label,
+                    )
+
+                coordinates = get_label_coordinates(label, label_coordinates)
+                if VERBOSE:
+                    print(
+                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates",
+                        coordinates,
+                    )
+                image = Image.open(
+                    io.BytesIO(base64.b64decode(img_base64))
+                )  # Load the image to get its size
+                image_size = image.size  # Get the size of the image (width, height)
+                click_position_percent = get_click_position_in_percent(
+                    coordinates, image_size
+                )
+                if VERBOSE:
+                    print(
+                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent",
+                        click_position_percent,
+                    )
+                if not click_position_percent:
+                    print(
+                        f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}"
+                    )
+                    return call_gpt_4_vision_preview(messages)
+
+                x_percent = f"{click_position_percent[0]:.2f}"
+                y_percent = f"{click_position_percent[1]:.2f}"
+                operation["x"] = x_percent
+                operation["y"] = y_percent
+                if VERBOSE:
+                    print(
+                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+            else:
+                processed_content.append(operation)
+
+        if VERBOSE:
+            print(
+                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content",
+                processed_content,
+            )
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
+            e,
+        )
+        return call_gpt_4_vision_preview(messages)
+
+
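The click post-processing above maps a YOLO label back to a screen position expressed as percentages of the image size. An illustration of that conversion (not part of the commit; it assumes get_label_coordinates yields an (x, y, width, height) box in pixels and that get_click_position_in_percent returns the box center as fractions of the image size, both assumptions since those helpers live in operate/utils/label.py and are not shown here):

# Sketch: assumed center-of-box to percent conversion.
def center_in_percent(box, image_size):
    # box: (x, y, width, height) in pixels; image_size: (width, height)
    x, y, w, h = box
    return ((x + w / 2) / image_size[0], (y + h / 2) / image_size[1])

x_percent, y_percent = center_in_percent((100, 200, 40, 20), (1920, 1080))
print(f"{x_percent:.2f}", f"{y_percent:.2f}")  # 0.06 0.19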
+def get_last_assistant_message(messages):
+    """
+    Retrieve the last message from the assistant in the messages array.
+    If the last assistant message is the first message in the array, return None.
+    """
+    for index in reversed(range(len(messages))):
+        if messages[index]["role"] == "assistant":
+            if index == 0:  # Check if the assistant message is the first in the array
+                return None
+            else:
+                return messages[index]
+    return None  # Return None if no assistant message is found
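
A quick illustration of get_last_assistant_message's first-message rule (not part of the commit; the message contents are illustrative):

# Sketch: the helper returns the latest assistant turn unless it is message 0.
from operate.models.apis import get_last_assistant_message

messages = [
    {"role": "system", "content": "You are an operator."},
    {"role": "user", "content": "click the search bar"},
    {"role": "assistant", "content": '[{"operation": "click", "label": "~34"}]'},
]
assert get_last_assistant_message(messages)["content"].startswith("[")

assert get_last_assistant_message(
    [{"role": "assistant", "content": "first message"}]
) is None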
