Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu
{
"name": "Ubuntu",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/base:jammy"

// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "uname -a",

// Configure tool-specific properties.
// "customizations": {},

// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ SimpleAgent is designed with the belief that AI agents don't need to be complex
- **Easy to Extend**: Add new capabilities by creating new command modules
- **Change Summarization**: Automatically summarizes changes made using a cheaper GPT model
- **Modular Architecture**: Core components are separated into their own modules
- **Benchmarking System**: Comprehensive testing framework to verify command functionality

## Project Structure

Expand All @@ -34,8 +35,14 @@ SimpleAgent/
│ │ ├── write_file/
│ │ └── ...
│ └── ... # Other command categories
├── benchmark/ # Benchmark tests
│ ├── __init__.py # Benchmark package initialization
│ ├── test_framework.py # Test discovery and execution framework
│ ├── test_file_ops.py # Tests for file operations
│ └── ... # Tests for other command categories
├── output/ # Generated files and input files directory
├── SimpleAgent.py # Main entry point
├── status.md # Command status report
├── requirements.txt # Dependencies
└── .env # Environment variables (create from .env.example)
```
Expand Down Expand Up @@ -133,6 +140,25 @@ python SimpleAgent.py -a 10 "research and look into https://github.com/PyGithub/
python SimpleAgent.py -a 10 "please research the latest in stock and look at the top 10 stock prices and write them to a file called 'stock_prices.txt'"
```

## Running Benchmarks

SimpleAgent includes a comprehensive benchmark system to verify that all commands are working correctly.

To run the benchmark tests:

```bash
python SimpleAgent.py --benchmark
```

This will test all available commands and generate a status report in `status.md`.

To view the status report without running the tests:

```bash
python SimpleAgent.py --status
```

For more information about benchmarks, see the [benchmark README](SimpleAgent/benchmark/README.md).

## Adding New Commands

Expand Down
56 changes: 54 additions & 2 deletions SimpleAgent/SimpleAgent.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@
from core.agent import SimpleAgent
from core.config import OPENAI_API_KEY, MAX_STEPS

# Import benchmark modules
try:
from benchmark.test_framework import discover_and_run_tests, generate_status_markdown, save_status_file
BENCHMARK_AVAILABLE = True
except ImportError:
BENCHMARK_AVAILABLE = False

# Initialize commands
commands.init()

Expand All @@ -39,11 +46,54 @@ def main():
help='Auto-continue for N steps (default: 10 if no number provided)')
parser.add_argument('-m', '--max-steps', type=int, default=10,
help='Maximum number of steps to run (default: 10)')
parser.add_argument('instruction', nargs='+', help='The instruction for the AI agent')
parser.add_argument('-b', '--benchmark', action='store_true',
help='Run benchmark tests for all commands')
parser.add_argument('-s', '--status', action='store_true',
help='Generate a status report for all commands without running tests')
parser.add_argument('-o', '--output', default=None,
help='Output file path for status.md (default: SimpleAgent/status.md)')
parser.add_argument('instruction', nargs='*', help='The instruction for the AI agent')

# Parse arguments
args = parser.parse_args()

# Run benchmarks if requested
if args.benchmark:
if not BENCHMARK_AVAILABLE:
print("Error: Benchmark module not available. Please install it first.")
return 1

print("Running benchmark tests for all commands...")
results = discover_and_run_tests()
status_md = generate_status_markdown(results)
output_path = save_status_file(status_md, args.output)
print(f"Benchmark tests completed! Status file saved to: {output_path}")
return 0

# Generate status report if requested
if args.status:
if not BENCHMARK_AVAILABLE:
print("Error: Benchmark module not available. Please install it first.")
return 1

status_path = args.output or os.path.join(os.path.dirname(__file__), 'status.md')
if os.path.exists(status_path):
print(f"Status report is available at: {status_path}")
with open(status_path, 'r', encoding='utf-8') as f:
# Print summary section
for line in f:
print(line.strip())
if line.strip() == "## Command Status by Category":
break
else:
print(f"Status report not found. Run with --benchmark to generate it.")
return 0

# Ensure instruction is provided if not running benchmarks or status
if not args.instruction:
parser.print_help()
return 1

# Join the instruction parts back together
instruction = ' '.join(args.instruction)

Expand All @@ -53,7 +103,9 @@ def main():
# Initialize and run the agent
agent = SimpleAgent()
agent.run(instruction, max_steps=max_steps, auto_continue=args.auto)

return 0


if __name__ == "__main__":
main()
sys.exit(main())
87 changes: 87 additions & 0 deletions SimpleAgent/benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# SimpleAgent Benchmark Tests

This directory contains benchmark tests for all SimpleAgent commands to ensure they work correctly. The tests are designed to be run periodically to verify that all functionality is working as expected.

## Running Benchmark Tests

There are several ways to run the benchmark tests:

### 1. Using SimpleAgent.py

The simplest way is to use the `--benchmark` flag with SimpleAgent.py:

```bash
python SimpleAgent.py --benchmark
```

This will run all tests and generate a status.md file with the results.

### 2. Using the Benchmark Runner

You can also use the dedicated benchmark runner script:

```bash
./benchmark/run_all.py
```

### 3. Using Individual Test Modules

You can run individual test modules using Python:

```bash
python -m benchmark.test_file_ops
python -m benchmark.test_web_ops
python -m benchmark.test_data_ops
python -m benchmark.test_github_ops
python -m benchmark.test_agent
```

## Viewing Test Results

After running the tests, you can view the results in the generated status.md file. This file contains a summary of all tests and their status.

You can also display a summary of the results using the `--status` flag:

```bash
python SimpleAgent.py --status
```

## Adding New Tests

To add tests for a new command, create a test function in the appropriate test module. The test function should:

1. Be named `test_commandname`
2. Return a tuple of (success, message)
3. Handle exceptions gracefully

Example:

```python
def test_new_command() -> Tuple[bool, str]:
"""Test the new_command command."""
try:
# Test code here
result = new_command(param1, param2)

# Verify the result
if not result:
return False, "Command failed"

return True, "Command successful"
except Exception as e:
return False, f"Exception: {str(e)}"
```

## Test Environment

Tests run in a dedicated test environment with:

- A clean test directory (`TEST_OUTPUT_DIR`)
- Mocked network calls to avoid actual web requests
- Mocked GitHub API calls to avoid actual GitHub operations

This ensures tests can run without external dependencies or side effects.

## Failed Tests

If a test fails, it will be marked as "Failed" in the status.md file. You can fix the issue and run the tests again to update the status.
6 changes: 6 additions & 0 deletions SimpleAgent/benchmark/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""
Benchmark package for SimpleAgent.

This package contains benchmark tests for all SimpleAgent commands to ensure
they work correctly. It also generates a status.md file for tracking command status.
"""
60 changes: 60 additions & 0 deletions SimpleAgent/benchmark/run_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""
Run all SimpleAgent benchmark tests.

This script is a simple shortcut to run benchmark tests for all SimpleAgent commands
and generate a status.md file with the results.
"""

import os
import sys

# Add parent directory to sys.path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import test framework
from benchmark.test_framework import (
discover_and_run_tests,
generate_status_markdown,
save_status_file
)

if __name__ == "__main__":
    print("Running SimpleAgent benchmark tests...")

    # Run every discovered test; results are keyed as
    # results[category][command] -> {'status': ..., ...}.
    results = discover_and_run_tests()

    # Render the results as markdown and persist them (default: status.md).
    status_md = generate_status_markdown(results)
    output_path = save_status_file(status_md)

    print("\nBenchmark tests completed!")
    print(f"Status file saved to: {output_path}")

    # Flatten the nested mapping into one list of status strings, then
    # tally with list.count instead of a hand-rolled counter loop.
    statuses = [
        result['status']
        for category_results in results.values()
        for result in category_results.values()
    ]
    total = len(statuses)
    working = statuses.count('Working')
    failed = statuses.count('Failed')
    # Anything neither 'Working' nor 'Failed' counts as not tested.
    not_tested = total - working - failed

    print("\nSummary:")
    print(f"- Total Commands: {total}")
    if total > 0:
        # Guarded so an empty command set cannot divide by zero.
        print(f"- Working: {working} ({working/total*100:.1f}%)")
        print(f"- Failed: {failed} ({failed/total*100:.1f}%)")
        print(f"- Not Tested: {not_tested} ({not_tested/total*100:.1f}%)")
    else:
        print("- No commands found to test")
52 changes: 52 additions & 0 deletions SimpleAgent/benchmark/run_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
Run benchmarks for SimpleAgent commands.

This script runs tests for all SimpleAgent commands and
generates a status.md file with the results.
"""

import os
import sys
import argparse

from benchmark.test_framework import (
discover_and_run_tests,
generate_status_markdown,
save_status_file
)

def main():
    """Run benchmark tests for all SimpleAgent commands and write status.md.

    Returns:
        int: 0 on success, 1 if the test run raised an exception
        (suitable for passing to sys.exit).
    """
    parser = argparse.ArgumentParser(description='Run benchmark tests for SimpleAgent commands.')
    parser.add_argument('--output', '-o', default=None,
                        help='Output file path for status.md (default: SimpleAgent/status.md)')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Enable verbose output')

    args = parser.parse_args()

    # NOTE(review): this runs only after the module-level
    # `from benchmark.test_framework import ...` has already executed, so it
    # cannot make that import succeed — confirm whether it is still needed.
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    print("Starting SimpleAgent benchmark tests...")

    try:
        # Run every discovered test; results are keyed as
        # results[category][command] -> {'status': ...}.
        results = discover_and_run_tests()

        if args.verbose:
            # --verbose used to be accepted but ignored; echo each command's
            # status so the flag actually does something.
            for category, commands in results.items():
                print(f"\n{category}:")
                for cmd, result in commands.items():
                    print(f"  {cmd}: {result['status']}")

        # Render the results as markdown and persist them.
        status_md = generate_status_markdown(results)
        output_path = save_status_file(status_md, args.output)

        print("\nBenchmark tests completed!")
        print(f"Status file saved to: {output_path}")

        return 0

    except Exception as e:
        # Broad catch is deliberate: this is the top-level CLI boundary, and
        # any failure should yield a message and exit code 1, not a traceback.
        print(f"Error running benchmark tests: {e}")
        return 1

if __name__ == "__main__":
sys.exit(main())
Loading