ManifoldRG · Locke0 · Oct 2, 2025 · Oct 7, 2025 · Oct 7, 2025 · Oct 9, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -27,22 +27,26 @@ repos:
     hooks:
       - id: check-added-large-files
         args: ['--maxkb=1024']
-        exclude: experiments/perturbation_experiment.ipynb
+        exclude: experiments/perturbation_experiment.ipynb|^osworld-human-main/.*$
       # - id: debug-statements
       - id: check-merge-conflict
       - id: check-case-conflict
       - id: check-yaml
       - id: check-toml
       - id: end-of-file-fixer
+        exclude: ^osworld-human-main/.*$
       - id: trailing-whitespace
+        exclude: ^osworld-human-main/.*$
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.12.12
     hooks:
       - id: ruff-format
+        exclude: ^src/perturbation_engine/tools/autoglm_v/.*$|^osworld-human-main/
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
         files: \.py$
+        exclude: ^src/perturbation_engine/tools/autoglm_v/.*$|^osworld-human-main/
 
   # - repo: https://github.com/PyCQA/isort
   #   rev: 6.0.1
@@ -56,6 +60,7 @@ repos:
       - id: typos
         args: [--force-exclude]
         files: \.py$|.md$
+        exclude: ^src/perturbation_engine/tools/autoglm_v/.*$|^osworld-human-main/.*$
 
 
   # - repo: https://github.com/asottile/pyupgrade
@@ -73,6 +78,7 @@ repos:
         types_or: [markdown, mdx]
         args: [--prose-wrap=preserve]
         files: \.py$|.md$
+        exclude: ^osworld-human-main/.*$
 
 
   ##### Security #####

diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -17,6 +17,19 @@
       "cwd": "${workspaceFolder}",
       "justMyCode": true
     },
+    {
+      "name": "Python Debugger: generate_trajectories.py",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "src/perturbation_engine/pipeline/generate_trajectories.py",
+      "console": "integratedTerminal",
+      "python": "${workspaceFolder}/.venv/bin/python",
+      "env": {
+        "PYTHONPATH": "${workspaceFolder}/src"
+      },
+      "cwd": "${workspaceFolder}",
+      "justMyCode": true
+    },
     {
       "name": "Pytest: Current File",
       "type": "debugpy",

diff --git a/experiments/debug_generated_trajectories.ipynb b/experiments/debug_generated_trajectories.ipynb
@@ -0,0 +1,260 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "5c065f6a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'action': '# Click on the \"Data\" menu to explore options for creating a Pivot '\n",
+      "           'Table\\n'\n",
+      "           'pyautogui.click(458, 75)',\n",
+      " 'action_timestamp': '20251001@000353',\n",
+      " 'app_states': [{'app_name': 'gsd-xsettings',\n",
+      "                 'app_type': 'system_settings',\n",
+      "                 'current_view': 'main_view',\n",
+      "                 'element_count': 1,\n",
+      "                 'key_elements': [],\n",
+      "                 'task_context': 'Application: gsd-xsettings'},\n",
+      "                {'app_name': 'gnome-shell',\n",
+      "                 'app_type': 'terminal',\n",
+      "                 'current_view': 'main_view',\n",
+      "                 'element_count': 1895,\n",
+      "                 'key_elements': [],\n",
+      "                 'task_context': 'Application: gnome-shell'},\n",
+      "                {'app_name': 'Invoices.xlsx - LibreOffice Calc',\n",
+      "                 'app_type': 'libreoffice_calc',\n",
+      "                 'current_view': 'main_view',\n",
+      "                 'element_count': 2637,\n",
+      "                 'key_elements': [],\n",
+      "                 'task_context': 'Application: Invoices.xlsx - LibreOffice '\n",
+      "                                 'Calc'}],\n",
+      " 'done': False,\n",
+      " 'perturbation_applied': True,\n",
+      " 'perturbation_decision': {'api_call': 'execute_python_command',\n",
+      "                           'generated_code': 'import subprocess\\n'\n",
+      "                                             '\\n'\n",
+      "                                             '# Apply High Contrast theme\\n'\n",
+      "                                             \"subprocess.run(['gsettings', \"\n",
+      "                                             \"'set', \"\n",
+      "                                             \"'org.gnome.desktop.interface', \"\n",
+      "                                             \"'gtk-theme', 'HighContrast'], \"\n",
+      "                                             'check=False)\\n'\n",
+      "                                             \"subprocess.run(['gsettings', \"\n",
+      "                                             \"'set', \"\n",
+      "                                             \"'org.gnome.desktop.interface', \"\n",
+      "                                             \"'icon-theme', 'HighContrast'], \"\n",
+      "                                             'check=False)\\n'\n",
+      "                                             \"subprocess.run(['gsettings', \"\n",
+      "                                             \"'set', \"\n",
+      "                                             \"'org.gnome.desktop.interface', \"\n",
+      "                                             \"'font-name', 'Arial 12'], \"\n",
+      "                                             'check=False)\\n'\n",
+      "                                             \"subprocess.run(['gsettings', \"\n",
+      "                                             \"'set', \"\n",
+      "                                             \"'org.gnome.desktop.wm.preferences', \"\n",
+      "                                             \"'titlebar-font', 'Arial Bold \"\n",
+      "                                             \"12'], check=False)\\n\"\n",
+      "                                             \"subprocess.run(['gsettings', \"\n",
+      "                                             \"'set', \"\n",
+      "                                             \"'org.gnome.desktop.interface', \"\n",
+      "                                             \"'document-font-name', 'Arial \"\n",
+      "                                             \"12'], check=False)\\n\"\n",
+      "                                             \"subprocess.run(['gsettings', \"\n",
+      "                                             \"'set', \"\n",
+      "                                             \"'org.gnome.desktop.interface', \"\n",
+      "                                             \"'monospace-font-name', \"\n",
+      "                                             \"'Monospace 12'], check=False)\\n\"\n",
+      "                                             \"subprocess.run(['gsettings', \"\n",
+      "                                             \"'set', \"\n",
+      "                                             \"'org.gnome.desktop.interface', \"\n",
+      "                                             \"'cursor-theme', 'whiteglass'], \"\n",
+      "                                             'check=False)\\n'\n",
+      "                                             \"subprocess.run(['gsettings', \"\n",
+      "                                             \"'set', \"\n",
+      "                                             \"'org.gnome.desktop.interface', \"\n",
+      "                                             \"'enable-animations', 'false'], \"\n",
+      "                                             'check=False)\\n'\n",
+      "                                             '\\n'\n",
+      "                                             '# Send a notification to inform '\n",
+      "                                             'the user\\n'\n",
+      "                                             \"subprocess.run(['notify-send', \"\n",
+      "                                             \"'System', 'High Contrast Theme \"\n",
+      "                                             \"applied'], check=False)\",\n",
+      "                           'parameters': {'target_app': 'system'},\n",
+      "                           'perturbation_type': 'complete_design_system_transformation',\n",
+      "                           'reasoning': 'Applying the High Contrast theme at '\n",
+      "                                        'the beginning of the task as '\n",
+      "                                        'specified in the scenario spec. The '\n",
+      "                                        \"'Data' menu click is the first \"\n",
+      "                                        'action, so perturbing the system here '\n",
+      "                                        'will allow us to see how the agent '\n",
+      "                                        'adapts to the change during task '\n",
+      "                                        'execution. Targeting the desktop '\n",
+      "                                        'theme, icon theme, fonts, and '\n",
+      "                                        'notification to ensure complete '\n",
+      "                                        'coverage and adherence to the High '\n",
+      "                                        'Contrast design system.',\n",
+      "                           'should_apply': True,\n",
+      "                           'target_app': 'system'},\n",
+      " 'response': {'action': '# Click on the \"Data\" menu to explore options for '\n",
+      "                        'creating a Pivot Table\\n'\n",
+      "                        'pyautogui.click(458, 75)',\n",
+      "              'thought': ''},\n",
+      " 'reward': 0,\n",
+      " 'scenario_spec': {'available_perturbation_actions': 'gsettings set '\n",
+      "                                                     'org.gnome.desktop.interface '\n",
+      "                                                     'gtk-theme '\n",
+      "                                                     \"'HighContrast'; \"\n",
+      "                                                     'gsettings set '\n",
+      "                                                     'org.gnome.desktop.interface '\n",
+      "                                                     'icon-theme '\n",
+      "                                                     \"'HighContrast'; \"\n",
+      "                                                     'gsettings set '\n",
+      "                                                     'org.gnome.desktop.interface '\n",
+      "                                                     \"font-name 'Arial 12'; \"\n",
+      "                                                     'gsettings set '\n",
+      "                                                     'org.gnome.desktop.wm.preferences '\n",
+      "                                                     \"titlebar-font 'Arial \"\n",
+      "                                                     \"Bold 12'; gsettings set \"\n",
+      "                                                     'org.gnome.desktop.interface '\n",
+      "                                                     'document-font-name '\n",
+      "                                                     \"'Arial 12'; gsettings \"\n",
+      "                                                     'set '\n",
+      "                                                     'org.gnome.desktop.interface '\n",
+      "                                                     'monospace-font-name '\n",
+      "                                                     \"'Monospace 12'; \"\n",
+      "                                                     'gsettings set '\n",
+      "                                                     'org.gnome.desktop.interface '\n",
+      "                                                     'cursor-theme '\n",
+      "                                                     \"'whiteglass'; gsettings \"\n",
+      "                                                     'set '\n",
+      "                                                     'org.gnome.desktop.interface '\n",
+      "                                                     'enable-animations false; '\n",
+      "                                                     \"notify-send 'System' \"\n",
+      "                                                     \"'High Contrast Theme \"\n",
+      "                                                     \"applied'\",\n",
+      "                   'learning_objectives': 'Adapting to a high-contrast system '\n",
+      "                                          'theme with stark color contrasts '\n",
+      "                                          'and simplified fonts for improved '\n",
+      "                                          'visibility.',\n",
+      "                   'perturbation_trigger': 'at task start',\n",
+      "                   'perturbation_types': ['theme',\n",
+      "                                          'layout',\n",
+      "                                          'content_variation'],\n",
+      "                   'scenario_id': 'scenario_1',\n",
+      "                   'target_app': 'system',\n",
+      "                   'target_components': ['desktop',\n",
+      "                                         'windows',\n",
+      "                                         'notifications',\n",
+      "                                         'menus']},\n",
+      " 'screenshot_file': 'step_1_20251001@000353.png',\n",
+      " 'step_num': 1,\n",
+      " 'task_instruction': 'Create a Pivot Table in a new sheet (Sheet2) to count '\n",
+      "                     'how many times each \"Invoice No.\" appears.'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "\n",
+    "\n",
+    "with open(\"/Users/lockewang/FIG/software-control/trajectories/1954cced-e748-45c4-9c26-9855b97fbc5e_scenario_1/traj.jsonl\", \"r\") as f:\n",
+    "    traj = []\n",
+    "    for line in f:\n",
+    "        traj.append(json.loads(line))\n",
+    "\n",
+    "\n",
+    "from pprint import pprint\n",
+    "pprint(traj[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "fa1ab950",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total runs: 7, Total completed: 3\n",
+      "Completed seed trajectory ids: \n",
+      "['99146c54-4f37-4ab8-9327-5f3291665e1e',\n",
+      " '59155008-fe71-45ec-8a8f-dc35497b6aa8',\n",
+      " 'f5d96daf-83a8-4c86-9686-bada31fc66ab']\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from pprint import pprint\n",
+    "\n",
+    "trajectories_dir = \"/Users/lockewang/FIG/software-control/trajectories\"\n",
+    "folders = [f for f in os.listdir(trajectories_dir) if os.path.isdir(os.path.join(trajectories_dir, f))]\n",
+    "\n",
+    "total_runs = 0\n",
+    "total_completed = 0\n",
+    "completed_seed_trajectory_ids = []\n",
+    "\n",
+    "for folder in folders:\n",
+    "    run_folders = [f for f in os.listdir(os.path.join(trajectories_dir, folder)) if \"run_\" in f]\n",
+    "    newest_run_folder = max(run_folders, key=lambda x: os.path.getmtime(os.path.join(trajectories_dir, folder, x)))\n",
+    "    scenario_folders = [f for f in os.listdir(os.path.join(trajectories_dir, folder, newest_run_folder)) if os.path.isdir(os.path.join(trajectories_dir, folder, newest_run_folder, f))]\n",
+    "    scenario_folder = os.path.join(trajectories_dir, folder, newest_run_folder, scenario_folders[0])\n",
+    "    traj_file = os.path.join(scenario_folder, \"traj.jsonl\")\n",
+    "    try:\n",
+    "        with open(traj_file, \"r\") as f:\n",
+    "            traj = [json.loads(line) for line in f]\n",
+    "    except Exception as e:\n",
+    "        continue\n",
+    "        \n",
+    "    if traj[-1].get('completion_status'):\n",
+    "        # pprint(traj[-1]['completion_status'])\n",
+    "        total_runs += 1\n",
+    "        if traj[-1]['completion_status']['completed_all_steps'] == True:\n",
+    "            total_completed += 1\n",
+    "            completed_seed_trajectory_ids.append(folder)\n",
+    "\n",
+    "print(f\"Total runs: {total_runs}, Total completed: {total_completed}\")\n",
+    "print(\"Completed seed trajectory ids: \")\n",
+    "pprint(completed_seed_trajectory_ids)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67f11700",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/osworld-human-main/.gitignore b/osworld-human-main/.gitignore
@@ -0,0 +1,5 @@
+setup_task.py
+cp.py
+totest/*
+*.env
+*.bat
diff --git a/osworld-human-main/README.md b/osworld-human-main/README.md
@@ -0,0 +1,41 @@
+# OSWorld-Human
+| <a href="https://arxiv.org/abs/2506.16042"><b>Research Paper</b></a> | <a href="https://mlsys.wuklab.io/posts/oshuman"><b>Blog Post</b></a> | 
+
+Computer-use agents (CUAs) are often unusable due to extreme end-to-end latency—taking tens of minutes for tasks humans complete in just a few minutes. We present the first temporal performance study of computer-use agents on OSWorld and find that large model calls for planning and reflection dominate latency, with later steps taking up to 3× longer than earlier ones. To measure efficiency, we introduce OSWorld-Human, a manually annotated version of OSWorld with human reference trajectories. Evaluating 16 agents, we find even top performers take 1.4–2.7× more steps than necessary.
+
+## News
+- <b>July 07, 2025</b>: OSWorld-Human blog post available on [mlsys.wuklab.io](https://mlsys.wuklab.io/posts/oshuman)
+- <b>June 19, 2025</b>: OSWorld-Human [research paper](https://arxiv.org/abs/2506.16042) available on arXiv.
+- <b>June 09, 2025</b>: 🎉 Our paper has been accepted to the **[Workshop on Computer-Use Agents](https://www.icml-computeruseagents.com/) at ICML 2025!** See you in Vancouver!
+
+## 🏆 Leaderboard (Updated June 30, 2025)
+| Agent (Max Steps)              | Original OSWorld (%) | Single-Action WES+ (%) | Grouped-Action WES+ (%) | WES-   |
+|----------------------------------|--------------|------------------|------------------|--------|
+| UI-TARS-1.5 (100)                | **42.5**     | 23.7             | 14.3             | -0.22  |
+| Agent S2 w/ Gemini 2.5 (50)      | 41.4         | **28.2**         | **17.4**         | -0.26  |
+| InfantAgent (50)                 | 35.3         | 13.3             | 8.2              | -0.22  |
+| Agent S2 w/ Claude 3.7 (50)      | 34.5         | 20.0             | 11.4             | -0.42  |
+| UI-TARS-1.5 7B (100)             | 26.9         | 12.4             | 7.9              | -0.33  |
+| UI-TARS-72B-DPO (50)             | 24.6         | 15.6             | 10.6             | **-0.16** |
+
+
+## Usage
+To compute your agent's score on OSWorld-Human, provide the path to the `result` directory generated by OSWorld and the maximum number of steps your agent is allowed to take.
+```bash
+python score.py --result-path /path/to/results/ --max-steps-scoring 50
+```
+
+If you would like to score the UI-TARS trajectories that have been submitted to OSWorld, add the `--uitars` flag to the command.
+
+## Citation
+```bibtex
+@misc{abhyankar2025osworldhumanbenchmarkingefficiencycomputeruse,
+      title={OSWorld-Human: Benchmarking the Efficiency of Computer-Use Agents}, 
+      author={Reyna Abhyankar and Qi Qi and Yiying Zhang},
+      year={2025},
+      eprint={2506.16042},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2506.16042}, 
+}
+```