diff --git a/README.md b/README.md index 686db4b..1f56b77 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ The `ai_function` takes the following parameters: - `args`: A list of arguments for the function. - `description`: A string describing the purpose of the function. - `model`: (Optional) A string specifying the GPT model to use. Default is 'gpt-4'. +- `error_correction`: (Optional) A boolean specifying if multiple additional attempts are being made to auto-correct errors. Default is `False`. Example usage: @@ -67,21 +68,18 @@ print(result) # Output: 12 ## Limitations -The table below shows the success rate of the AI functions with different GPT models: +The table below shows the success rate of the AI functions with different GPT models without using error correction: -| Description | GPT-4 Result | GPT-3.5-turbo Result | Reason | +| Description | GPT-4 Result | GPT-3.5-turbo Result | Comment | |---------------------------|--------------|----------------------|--------| -| Generate fake people | PASSED | FAILED | Incorrect response format | +| Generate fake people | PASSED | PASSED | N/A | | Generate Random Password | PASSED | PASSED | N/A | -| Calculate area of triangle| FAILED | FAILED | Incorrect float value (GPT-4), Incorrect response format (GPT-3.5-turbo) | +| Calculate area of triangle| PASSED | PASSED | N/A | | Calculate the nth prime number | PASSED | PASSED | N/A | | Encrypt text | PASSED | PASSED | N/A | | Find missing numbers | PASSED | PASSED | N/A | -It's important to note that AI Functions are not suited for certain tasks, particularly those involving mathematical calculations and precision. As observed in the case of calculating the area of a triangle and finding the nth prime number, GPT models can struggle with providing accurate results. The limitations of GPT models in such cases are mainly due to their inherent inability to perform precise arithmetic and the ambiguity in understanding user inputs. 
- -In conclusion, while AI Functions can be helpful in various scenarios, they may not be the optimal choice for tasks requiring mathematical accuracy or specific domain knowledge. For such use-cases, utilizing traditional algorithms and libraries would yield better results. - +We are looking for more test cases that will push the boundaries of AI Functions. ### test_ai_functions.py diff --git a/ai_functions.py b/ai_functions.py index a0d53a3..a467bc9 100644 --- a/ai_functions.py +++ b/ai_functions.py @@ -1,14 +1,116 @@ import openai +from util import exec_with_return, getSmallTrace -def ai_function(function, args, description, model = "gpt-4"): + +def ai_function( + function: str, + args, + description: str, + model="gpt-4", + error_correction=False +): + VERBOSE=False + MAX_ERROR_CORRECTION_RETRY = 5 # parse args to comma separated string args = ", ".join(args) - messages = [{"role": "system", "content": f"You are now the following python function: ```# {description}\n{function}```\n\nOnly respond with your `return` value. Do not include any other explanatory text in your response."},{"role": "user", "content": args}] + system_prompt = f"""You are now coding the following python function: \n\n# {description}\n{function}\n\n + In the last line of your output, call the function with the given argument. + Only respond with python code. Do not include any other explanatory text in your response and do not format as markdown. + Do not use the print() function anywhere in your code. Only use standard python library in your import statements as dependencies." 
+    """ +  +    messages = [ +        {"role": "system", "content": system_prompt}, +        {"role": "user", "content": args}, +    ] +    code = callOpenAI(messages, model) + +    if VERBOSE: +        print(f"CODE:\n{code}") + +    if not error_correction: +        try: +            result = exec_with_return(code) +        except Exception: +            result = "" +    else: +        for i in range(MAX_ERROR_CORRECTION_RETRY): +            try: +                result = exec_with_return(code) +                break +            except Exception: +                broken_code = code +                code = fix_code(code, messages, VERBOSE, model) +                result = "" + +            # If there are no further improvements, then stop error correcting +            if broken_code == code: +                break + +    if not result: +        return ai_function_without_code(function, args, description, model) + +    return result + + +def ai_function_without_code( +    function: str, +    args, +    description: str, +    model="gpt-4", +): +    messages = [ +        { +            "role": "system", +            "content": f"You are now the following python function: ```# {description}\n{function}```\n\nOnly respond with your `return` value. Do not include any other explanatory text in your response.", +        }, +        {"role": "user", "content": args}, +    ]     response = openai.ChatCompletion.create( -        model=model, -        messages=messages, -        temperature=0 +        model=model, messages=messages, temperature=0     ) +    result = response.choices[0].message["content"] + +    # If result is enclosed in quotes, then strip quotes +    if ( +        isinstance(result, str) +        and result[0] == result[-1] +        and (result[0] == '"' or result[0] == "'") +    ): +        result = result.strip("'\"") +    return result + + +def fix_code(broken_code: str, messages: list[dict], verbose: bool, model): +    error_trace = getSmallTrace() +    if verbose: +        print(f"ERROR:\n{error_trace}") +    error_correction_prompt = f""" +    Unfortunately, when executing the code the following error happens. Can you fix the code? +    Remember, only respond output executable python code. Do not include any other explanatory text in your response. +    Do not use the print() function anywhere in your code. 
+    Only use standard python library in your import statements as dependencies. +    Here is the error: {error_trace} +    """ + +    messages.extend( +        [ +            {"role": "assistant", "content": broken_code}, +            {"role": "user", "content": error_correction_prompt}, +        ] +    ) +    if verbose: +        print(f"ERROR CORRECTION PROMPT:\n{messages}") +    result = callOpenAI(messages, model) +    if verbose: +        print(f"IMPROVED CODE:\n{result}") +    return result + + +def callOpenAI(messages: list[dict], model="gpt-4") -> str: +    response = openai.ChatCompletion.create( +        model=model, messages=messages, temperature=0 +    )     return response.choices[0].message["content"] diff --git a/keys.py b/keys.py index 30ce07f..b15b4f1 100644 --- a/keys.py +++ b/keys.py @@ -1 +1 @@ -OPENAI_API_KEY="" # Get yours from: https://beta.openai.com/account/api-keys \ No newline at end of file +OPENAI_API_KEY = "" # Get yours from: https://beta.openai.com/account/api-keys diff --git a/test_ai_function.py b/test_ai_function.py index bb2e19d..bee06bf 100644 --- a/test_ai_function.py +++ b/test_ai_function.py @@ -9,6 +9,7 @@ # Initialize the OpenAI API client openai.api_key = keys.OPENAI_API_KEY + # Run all tests, print the results, and return the number of failed tests def run_tests(model): test_functions = [test_1, test_2, test_3, test_4, test_5, test_6] @@ -18,13 +19,15 @@ def run_tests(model): "Calculate area of triangle", "Calculate the nth prime number", "Encrypt text", - "Find missing numbers" -] + "Find missing numbers", + ] failed_tests = [] i = 0 for test in test_functions: - print(f"=-=-=- Running test: {test.__name__} - {test_names[i]} with model {model} -=-=-=") + print( + f"=-=-=- Running test: {test.__name__} - {test_names[i]} with model {model} -=-=-=" + ) i += 1 try: test(model) @@ -39,7 +42,10 @@ def run_tests(model): print(f"Total tests: {len(test_functions)}") # Print the number of failed tests - print(f"Success Rate: {len(test_functions) - len(failed_tests)}/{len(test_functions)}") + print( + f"Success Rate: 
{len(test_functions) - len(failed_tests)}/{len(test_functions)}" + ) + # Ai function test 1 def test_1(model): @@ -48,35 +54,48 @@ def test_1(model): description_string = """Generates n examples of fake data representing people, each with a name and an age.""" - result_string = ai_functions.ai_function(function_string, args, description_string, model) + result_string = ai_functions.ai_function( + function_string, args, description_string, model + ) print(f"Output: {result_string}") - # Assert the result can be parsed as is a list of dictionaries - print("Testing if result is a a string...") - assert isinstance(result_string, str) - result = None - try: - print("Testing if result can be parsed as a list of dictionaries...") - # Parse the result as a list of dictionaries - result = json.loads(result_string) - except Exception as e: - # If the result can't be parsed as a list of dictionaries, the test fails - assert False - + + if isinstance(result_string, list): + result = result_string + else: + # Assert the result can be parsed as is a list of dictionaries + print("Testing if result is a a string...") + assert isinstance(result_string, str) + result = None + try: + print("Testing if result can be parsed as a list of dictionaries...") + # Parse the result as a list of dictionaries + result = json.loads(result_string) + except Exception as e: + # If the result can't be parsed as a list of dictionaries, the test fails + assert False + # Assert the length of the result is equal to the number of people requested - print("Testing if the length of the result is equal to the number of people requested...") + print( + "Testing if the length of the result is equal to the number of people requested..." 
+ ) if result: assert len(result) == int(args[0]) else: assert False + # Ai function test 2 def test_2(model): - function_string = "def random_password_generator(length: int, special_chars: bool) -> str:" + function_string = ( + "def random_password_generator(length: int, special_chars: bool) -> str:" + ) args = ["12", "True"] description_string = """Generates a random password of given length with or without special characters.""" - result_string = ai_functions.ai_function(function_string, args, description_string, model) + result_string = ai_functions.ai_function( + function_string, args, description_string, model + ) print(f"Output: {result_string}") @@ -84,13 +103,20 @@ def test_2(model): print("Testing if the length of the result is equal to the length requested...") assert len(result_string) == int(args[0]) + # Ai function test 3 def test_3(model): - function_string = "def calculate_area_of_triangle(base: float, height: float) -> float:" + function_string = ( + "def calculate_area_of_triangle(base: float, height: float) -> float:" + ) args = ["15", "6.5"] - description_string = """Calculates the area of a triangle given its base and height.""" + description_string = ( + """Calculates the area of a triangle given its base and height.""" + ) - result_string = ai_functions.ai_function(function_string, args, description_string, model) + result_string = ai_functions.ai_function( + function_string, args, description_string, model + ) print(f"Output: {result_string}") # Assert the result can be parsed as a float @@ -103,16 +129,22 @@ def test_3(model): # Assert the result is equal to the expected area of the triangle expected_area = (float(args[0]) * float(args[1])) / 2 - print("Testing if the result is equal to the expected area of the triangle, which is: " + str(expected_area)) + print( + "Testing if the result is equal to the expected area of the triangle, which is: " + + str(expected_area) + ) assert float(result_string) == pytest.approx(expected_area) + # Ai 
function test 4 def test_4(model): function_string = "def get_nth_prime_number(n: int) -> int:" args = ["10"] description_string = """Finds and returns the nth prime number.""" - result_string = ai_functions.ai_function(function_string, args, description_string, model) + result_string = ai_functions.ai_function( + function_string, args, description_string, model + ) print(f"Output: {result_string}") @@ -126,46 +158,63 @@ def test_4(model): # Assert the result is equal to the expected nth prime number expected_prime_number = 29 - print("Testing if the result is equal to the expected nth prime number, which is: " + str(expected_prime_number)) + print( + "Testing if the result is equal to the expected nth prime number, which is: " + + str(expected_prime_number) + ) assert int(result_string) == expected_prime_number + # Ai function test 5 def test_5(model): function_string = "def encrypt_text(text: str, key: str) -> str:" - args = ["'Hello, World!'", "'abc123'"] + plain_text = "Hello, World!" 
+ key = "abc123" + args = [f"'{plain_text}'", f"'{key}'"] description_string = """Encrypts the given text using a simple character substitution based on the provided key.""" - result_string = ai_functions.ai_function(function_string, args, description_string, model) + cipher = ai_functions.ai_function(function_string, args, description_string, model) - print(f"Output: {result_string}") + print(f"Output: {cipher}") # Assert the result has the same length as the input text print("Testing if the result has the same length as the input text...") - assert len(result_string) == len(args[0]) + assert len(cipher) == len(plain_text) + # Ai function test 6 def test_6(model): - function_string = "def find_missing_numbers_in_list(numbers: list[int]) -> list[int]:" + function_string = ( + "def find_missing_numbers_in_list(numbers: list[int]) -> list[int]:" + ) args = ["[3, 5, 8, 15, 16]"] - description_string = """Finds and returns a list of missing numbers in a given sorted list.""" + description_string = ( + """Finds and returns a list of missing numbers in a given sorted list.""" + ) - result_string = ai_functions.ai_function(function_string, args, description_string, model) + result_string = ai_functions.ai_function( + function_string, args, description_string, model + ) print(f"Output: {result_string}") - # Assert the result can be parsed as a list - try: - result_list = ast.literal_eval(result_string) - print("Testing if result is a a list...") - assert isinstance(result_list, list) - except Exception as e: - print(e) - assert False + if isinstance(result_string, list): + result_list = result_string + else: + # Assert the result can be parsed as a list + try: + result_list = ast.literal_eval(result_string) + print("Testing if result is a a list...") + assert isinstance(result_list, list) + except Exception as e: + print(e) + assert False # Assert the result list contains the expected missing numbers expected_missing_numbers = [4, 6, 7, 9, 10, 11, 12, 13, 14] 
print("Testing if the result list contains the expected missing numbers...") assert result_list == expected_missing_numbers + run_tests("gpt-4") -run_tests("gpt-3.5-turbo") \ No newline at end of file +run_tests("gpt-3.5-turbo") diff --git a/test_util.py b/test_util.py new file mode 100644 index 0000000..1d580e0 --- /dev/null +++ b/test_util.py @@ -0,0 +1,81 @@ +import unittest +from util import exec_with_return +import util + + +class TestExecWithReturn(unittest.TestCase): + def test_exec_with_return_single_expression(self): + code = "3 + 4" + expected = 7 + result = exec_with_return(code) + self.assertEqual(result, expected) + + def test_exec_with_return_single_statement(self): + code = "x = 5" + expected = None + result = exec_with_return(code) + self.assertEqual(result, expected) + + def test_exec_with_return_mixed_code(self): + code = """ +a = 2 +b = 3 +a * b +""" + expected = 6 + result = exec_with_return(code) + self.assertEqual(result, expected) + + def test_exec_with_return_function_definition(self): + code = """ +def add(x, y): + return x + y + +add(3, 4) +""" + expected = 7 + result = exec_with_return(code) + self.assertEqual(result, expected) + + def test_exec_with_return_complex_code(self): + code = """ +class MyClass: + def __init__(self, x): + self.x = x + + def multiply(self, y): + return self.x * y + +obj = MyClass(5) +obj.multiply(4) +""" + expected = 20 + result = exec_with_return(code) + self.assertEqual(result, expected) + + def test_get_small_trace(self): + code = """ +import string + +def encrypt_text(text: str, key: str) -> str: + alphabet = string.ascii_lowercase + key_map = str.maketrans(alphabet, key[:len(alphabet)]) + return text.translate(key_map) + +encrypt_text('Hello, World!', 'abc123') +""" + + try: + exec_with_return(code) + except Exception: + small_trace = util.getSmallTrace() + self.assertIn("ValueError", small_trace) + self.assertIn("encrypt_text", small_trace) + + # Check if the output has exactly two lines + lines = 
small_trace.splitlines() + self.assertEqual(len(lines), 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/util.py b/util.py new file mode 100644 index 0000000..82fca11 --- /dev/null +++ b/util.py @@ -0,0 +1,34 @@ +import ast +import copy +import traceback + + +def convertExpr2Expression(Expr): + Expr.lineno = 0 + Expr.col_offset = 0 + result = ast.Expression(Expr.value, lineno=0, col_offset=0) + + return result + + +def exec_with_return(code): + code_ast = ast.parse(code) + + init_ast = copy.deepcopy(code_ast) + init_ast.body = code_ast.body[:-1] + + last_ast = copy.deepcopy(code_ast) + last_ast.body = code_ast.body[-1:] + + exec(compile(init_ast, "", "exec"), globals()) + if type(last_ast.body[0]) == ast.Expr: + expression = convertExpr2Expression(last_ast.body[0]) + compliedExpr = compile(expression, "", "eval") + return eval(compliedExpr, globals()) + else: + exec(compile(last_ast, "", "exec"), globals()) + + +def getSmallTrace() -> str: + lines = traceback.format_exc().splitlines() + return "\n".join(lines[-2:])