Skip to content

Commit 6ae1d7f

Browse files
committed
added support for gpt 5.2 and some bug fixes
1 parent 93ecede commit 6ae1d7f

File tree

5 files changed

+88
-6
lines changed

5 files changed

+88
-6
lines changed

app/interpreter.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
5454
1. time.sleep() - to wait for web pages, applications, and other things to load.
5555
2. pyautogui calls to interact with system's mouse and keyboard.
5656
"""
57+
# Drop a leading 'pyautogui.' prefix so the bare function name can be looked up on the pyautogui module
58+
if function_name.startswith('pyautogui.'):
59+
function_name = function_name.split('.')[-1]
60+
5761
# pyautogui sometimes needs warming up, i.e. its first call is occasionally not executed, so a throwaway key press is issued here first
5862
pyautogui.press("command", interval=0.2)
5963

@@ -64,9 +68,9 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
6468
function_to_call = getattr(pyautogui, function_name)
6569

6670
# Special handling for the 'write' function
67-
if function_name == 'write' and ('string' in parameters or 'text' in parameters):
71+
if function_name == 'write' and ('string' in parameters or 'text' in parameters or 'message' in parameters):
6872
# The text is passed to 'write' positionally because the LLM is inconsistent about which parameter name it uses for it.
69-
string_to_write = parameters.get('string') or parameters.get('text')
73+
string_to_write = parameters.get('string') or parameters.get('text') or parameters.get('message')
7074
interval = parameters.get('interval', 0.1)
7175
function_to_call(string_to_write, interval=interval)
7276
elif function_name == 'press' and ('keys' in parameters or 'key' in parameters):
@@ -77,8 +81,14 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
7781
function_to_call(keys_to_press, presses=presses, interval=interval)
7882
elif function_name == 'hotkey':
7983
# 'hotkey' function expects multiple key arguments, not a list
80-
keys = list(parameters.values())
81-
function_to_call(*keys)
84+
keys_to_press = parameters.get('keys') or parameters.get('key')
85+
if isinstance(keys_to_press, list):
86+
function_to_call(*keys_to_press)
87+
elif isinstance(keys_to_press, str):
88+
function_to_call(keys_to_press)
89+
else:
90+
keys = list(parameters.values())
91+
function_to_call(*keys)
8292
else:
8393
# For other functions, pass the parameters as they are
8494
function_to_call(**parameters)

app/llm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from utils.screen import Screen
77
from utils.settings import Settings
88

9-
DEFAULT_MODEL_NAME = 'gpt-4o'
9+
DEFAULT_MODEL_NAME = 'gpt-5.2'
1010

1111

1212
class LLM:

app/models/factory.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from models.gpt4o import GPT4o
22
from models.gpt4v import GPT4v
3+
from models.gpt5 import GPT5
34
from models.gemini import Gemini
45

56

@@ -9,6 +10,8 @@ def create_model(model_name, *args):
910
try:
1011
if model_name == 'gpt-4o' or model_name == 'gpt-4o-mini':
1112
return GPT4o(model_name, *args)
13+
elif model_name.startswith('gpt-5'):
14+
return GPT5(model_name, *args)
1215
elif model_name == 'gpt-4-vision-preview' or model_name == 'gpt-4-turbo':
1316
return GPT4v(model_name, *args)
1417
elif model_name.startswith("gemini"):

app/models/gpt5.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import json
2+
from typing import Any
3+
4+
from models.model import Model
5+
from utils.screen import Screen
6+
7+
8+
class GPT5(Model):
    """Model adapter that talks to GPT-5 family models via ``client.responses.create``.

    Each step sends the accumulated context, the user's request and a fresh
    screenshot, then parses the JSON object found in the model's reply.
    ``self.client``, ``self.model_name`` and ``self.context`` are read here but
    are not defined in this class (presumably set up by ``Model`` - confirm).
    """

    # Cap passed as ``max_output_tokens`` on every request.
    # NOTE(review): for reasoning models this cap may also cover reasoning
    # tokens, in which case a low value can leave the visible output empty -
    # confirm against the Responses API reference before tuning.
    _MAX_OUTPUT_TOKENS = 800

    def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
        """Ask the LLM for the next instructions for ``original_user_request``.

        Args:
            original_user_request: The objective exactly as the user typed it.
            step_num: Step counter forwarded to the LLM inside the request payload.

        Returns:
            The parsed JSON instructions, or ``{}`` when the reply could not be parsed.
        """
        message = self.format_user_request_for_llm(original_user_request, step_num)
        llm_response = self.send_message_to_llm(message)
        return self.convert_llm_response_to_json_instructions(llm_response)

    def format_user_request_for_llm(self, original_user_request: str, step_num: int) -> list[dict[str, Any]]:
        """Build the single-message input list: context + request JSON, plus a screenshot.

        Returns:
            A one-element list holding a ``user`` message whose content is an
            ``input_text`` block followed by an ``input_image`` block.
        """
        base64_img: str = Screen().get_screenshot_in_base64()
        request_data: str = json.dumps({
            'original_user_request': original_user_request,
            'step_num': step_num
        })

        # GPT-5 uses Responses API content blocks.
        text_block = {
            'type': 'input_text',
            'text': self.context + request_data
        }
        image_block = {
            'type': 'input_image',
            'image_url': f'data:image/jpeg;base64,{base64_img}'
        }
        return [{'role': 'user', 'content': [text_block, image_block]}]

    def send_message_to_llm(self, message: list[dict[str, Any]]) -> Any:
        """Send ``message`` to the configured model and return the raw response object."""
        return self.client.responses.create(
            model=self.model_name,
            input=message,
            max_output_tokens=self._MAX_OUTPUT_TOKENS,
        )

    def convert_llm_response_to_json_instructions(self, llm_response: Any) -> dict[str, Any]:
        """Extract and parse the JSON object embedded in ``llm_response``.

        The span from the first ``{`` to the last ``}`` of the reply text is
        parsed, so any text the model adds around the JSON is ignored.

        Returns:
            The decoded JSON object, or ``{}`` if the reply contains no JSON
            object or the object cannot be decoded (an error is printed).
        """
        llm_response_data = self._collect_response_text(llm_response)

        start_index = llm_response_data.find('{')
        end_index = llm_response_data.rfind('}')

        # find()/rfind() return -1 on a miss; check explicitly instead of letting
        # a negative index produce an accidental slice and a misleading error.
        if start_index == -1 or end_index < start_index:
            print('Error while parsing JSON response - no JSON object found in LLM response')
            return {}

        try:
            return json.loads(llm_response_data[start_index:end_index + 1])
        except (ValueError, RecursionError) as e:
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested output. Anything else is a real bug and
            # should surface instead of being swallowed.
            print(f'Error while parsing JSON response - {e}')
            return {}

    @staticmethod
    def _collect_response_text(llm_response: Any) -> str:
        """Return the reply text of ``llm_response``, stripped of surrounding whitespace."""
        output_text = (getattr(llm_response, 'output_text', '') or '').strip()
        if output_text:
            return output_text

        # Fallback parsing for SDKs/providers that don't populate output_text:
        # concatenate every truthy ``text`` found under ``output[*].content[*]``.
        chunks = []
        for output_item in getattr(llm_response, 'output', []) or []:
            for content_item in getattr(output_item, 'content', []) or []:
                text = getattr(content_item, 'text', None)
                if text:
                    chunks.append(text)
        return ''.join(chunks).strip()

app/ui.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ def create_widgets(self) -> None:
6060
radio_frame.pack(padx=20, pady=10) # Add padding around the frame
6161

6262
models = [
63-
('GPT-4o (Default. Medium-Accurate, Medium-Fast)', 'gpt-4o'),
63+
('GPT-5.2 (Default)', 'gpt-5.2'),
64+
('GPT-4o (Medium-Accurate, Medium-Fast)', 'gpt-4o'),
6465
('GPT-4o-mini (Cheapest, Fastest)', 'gpt-4o-mini'),
6566
('GPT-4v (Deprecated. Most-Accurate, Slowest)', 'gpt-4-vision-preview'),
6667
('GPT-4-Turbo (Least Accurate, Fast)', 'gpt-4-turbo'),

0 commit comments

Comments
 (0)