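Added support for GPT-5.2 (now the default model) and fixed interpreter bugs: normalize `pyautogui.`-prefixed function names, accept `message` as the text parameter for `write`, and accept `keys`/`key` for `hotkey`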

AmberSahdev · AmberSahdev · commit 6ae1d7f7f013 · 2026-02-24T18:48:10.000-08:00
diff --git a/app/interpreter.py b/app/interpreter.py
@@ -54,6 +54,10 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
             1. time.sleep() - to wait for web pages, applications, and other things to load.
             2. pyautogui calls to interact with system's mouse and keyboard.
         """
+        # Strip to bare name to normalize
+        if function_name.startswith('pyautogui.'):
+            function_name = function_name.split('.')[-1]
+
         # Sometimes pyautogui needs warming up i.e. sometimes first call isn't executed hence padding a random call here
         pyautogui.press("command", interval=0.2)
 
@@ -64,9 +68,9 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
             function_to_call = getattr(pyautogui, function_name)
 
             # Special handling for the 'write' function
-            if function_name == 'write' and ('string' in parameters or 'text' in parameters):
+            if function_name == 'write' and ('string' in parameters or 'text' in parameters or 'message' in parameters):
                 # 'write' function expects a string, not a 'text' keyword argument but LLM sometimes gets confused on the parameter name.
-                string_to_write = parameters.get('string') or parameters.get('text')
+                string_to_write = parameters.get('string') or parameters.get('text') or parameters.get('message')
                 interval = parameters.get('interval', 0.1)
                 function_to_call(string_to_write, interval=interval)
             elif function_name == 'press' and ('keys' in parameters or 'key' in parameters):
@@ -77,8 +81,14 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
                 function_to_call(keys_to_press, presses=presses, interval=interval)
             elif function_name == 'hotkey':
                 # 'hotkey' function expects multiple key arguments, not a list
-                keys = list(parameters.values())
-                function_to_call(*keys)
+                keys_to_press = parameters.get('keys') or parameters.get('key')
+                if isinstance(keys_to_press, list):
+                    function_to_call(*keys_to_press)
+                elif isinstance(keys_to_press, str):
+                    function_to_call(keys_to_press)
+                else:
+                    keys = list(parameters.values())
+                    function_to_call(*keys)
             else:
                 # For other functions, pass the parameters as they are
                 function_to_call(**parameters)
diff --git a/app/llm.py b/app/llm.py
@@ -6,7 +6,7 @@
 from utils.screen import Screen
 from utils.settings import Settings
 
-DEFAULT_MODEL_NAME = 'gpt-4o'
+DEFAULT_MODEL_NAME = 'gpt-5.2'
 
 
 class LLM:
diff --git a/app/models/factory.py b/app/models/factory.py
@@ -1,5 +1,6 @@
 from models.gpt4o import GPT4o
 from models.gpt4v import GPT4v
+from models.gpt5 import GPT5
 from models.gemini import Gemini
 
 
@@ -9,6 +10,8 @@ def create_model(model_name, *args):
         try:
             if model_name == 'gpt-4o' or model_name == 'gpt-4o-mini':
                 return GPT4o(model_name, *args)
+            elif model_name.startswith('gpt-5'):
+                return GPT5(model_name, *args)
             elif model_name == 'gpt-4-vision-preview' or model_name == 'gpt-4-turbo':
                 return GPT4v(model_name, *args)
             elif model_name.startswith("gemini"):
diff --git a/app/models/gpt5.py b/app/models/gpt5.py
@@ -0,0 +1,68 @@
+import json
+from typing import Any
+
+from models.model import Model
+from utils.screen import Screen
+
+
+class GPT5(Model):
+    """Model subclass that queries the LLM through ``self.client.responses.create``.
+
+    Each request carries the prompt context, the user's request and the step
+    number as text, plus a screenshot as an image content block. The reply is
+    parsed into a dict of JSON instructions.
+
+    NOTE(review): ``self.context``, ``self.client`` and ``self.model_name`` are
+    read here but never assigned in this class; presumably the ``Model`` base
+    class provides them - confirm.
+    """
+    def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
+        """Build the request, send it to the LLM and return the parsed instructions.
+
+        Args:
+            original_user_request: The user's request text, forwarded verbatim.
+            step_num: Step counter embedded in the request payload (default 0).
+
+        Returns:
+            The dict parsed from the LLM reply, or an empty dict when the reply
+            could not be parsed as JSON.
+        """
+        message = self.format_user_request_for_llm(original_user_request, step_num)
+        llm_response = self.send_message_to_llm(message)
+        json_instructions: dict[str, Any] = self.convert_llm_response_to_json_instructions(llm_response)
+        return json_instructions
+
+    def format_user_request_for_llm(self, original_user_request: str, step_num: int) -> list[dict[str, Any]]:
+        """Return a one-element message list holding the text prompt and a screenshot.
+
+        The text block is ``self.context`` concatenated (no separator) with a JSON
+        object containing ``original_user_request`` and ``step_num``. The image
+        block is a base64 data URL built from a fresh screenshot.
+        """
+        base64_img: str = Screen().get_screenshot_in_base64()
+        request_data: str = json.dumps({
+            'original_user_request': original_user_request,
+            'step_num': step_num
+        })
+
+        # GPT-5 uses Responses API content blocks.
+        return [
+            {
+                'role': 'user',
+                'content': [
+                    {
+                        'type': 'input_text',
+                        'text': self.context + request_data
+                    },
+                    {
+                        'type': 'input_image',
+                        # NOTE(review): the data URL declares image/jpeg; confirm
+                        # that Screen.get_screenshot_in_base64() really returns JPEG.
+                        'image_url': f'data:image/jpeg;base64,{base64_img}'
+                    }
+                ]
+            }
+        ]
+
+    def send_message_to_llm(self, message: list[dict[str, Any]]) -> Any:
+        """Send ``message`` via ``self.client.responses.create`` and return its result unchanged."""
+        return self.client.responses.create(
+            model=self.model_name,
+            input=message,
+            # Output is capped at 800 tokens.
+            # NOTE(review): if the model also spends output tokens on reasoning,
+            # this cap may truncate the JSON reply - confirm against the API docs.
+            max_output_tokens=800,
+        )
+
+    def convert_llm_response_to_json_instructions(self, llm_response: Any) -> dict[str, Any]:
+        """Extract the JSON object embedded in the LLM reply.
+
+        The text is taken from ``llm_response.output_text`` when it is non-empty;
+        otherwise every truthy ``text`` attribute found under
+        ``llm_response.output[*].content[*]`` is concatenated. The span from the
+        first ``{`` to the last ``}`` is then parsed with ``json.loads``.
+
+        Returns:
+            The parsed object, or ``{}`` if parsing raises (the error is printed).
+        """
+        # A missing or None output_text is treated as an empty string.
+        llm_response_data = (getattr(llm_response, 'output_text', '') or '').strip()
+
+        if llm_response_data == '':
+            # Fallback parsing for SDKs/providers that don't populate output_text.
+            chunks = []
+            for output_item in getattr(llm_response, 'output', []) or []:
+                for content_item in getattr(output_item, 'content', []) or []:
+                    text = getattr(content_item, 'text', None)
+                    if text:
+                        chunks.append(text)
+            llm_response_data = ''.join(chunks).strip()
+
+        # Keep only the outermost {...} span so any text the model wrote around
+        # the JSON object is ignored.
+        start_index = llm_response_data.find('{')
+        end_index = llm_response_data.rfind('}')
+
+        try:
+            # If there is no parsable {...} span, json.loads raises and the
+            # except branch below returns an empty dict.
+            json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
+        except Exception as e:
+            print(f'Error while parsing JSON response - {e}')
+            json_response = {}
+
+        return json_response
diff --git a/app/ui.py b/app/ui.py
@@ -60,7 +60,8 @@ def create_widgets(self) -> None:
             radio_frame.pack(padx=20, pady=10)  # Add padding around the frame
 
             models = [
-                ('GPT-4o (Default. Medium-Accurate, Medium-Fast)', 'gpt-4o'),
+                ('GPT-5.2 (Default)', 'gpt-5.2'),
+                ('GPT-4o (Medium-Accurate, Medium-Fast)', 'gpt-4o'),
                 ('GPT-4o-mini (Cheapest, Fastest)', 'gpt-4o-mini'),
                 ('GPT-4v (Deprecated. Most-Accurate, Slowest)', 'gpt-4-vision-preview'),
                 ('GPT-4-Turbo (Least Accurate, Fast)', 'gpt-4-turbo'),