Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 22 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,34 @@
- [01/25/2024]: GitHub repo released with tasks and scripts for setting up the VWA environments.

## Install

**Option 1: pip install from GitHub (Recommended)**
```bash
# Python 3.10 or 3.11 (not 3.12, because 3.12 removed distutils, which is needed here)
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
pip install git+https://github.com/web-arena-x/visualwebarena.git
playwright install
pip install -e .
```

You can also run the unit tests to ensure that VisualWebArena is installed correctly:
**Option 2: Install from Source**
```bash
# Python 3.10 or 3.11 (not 3.12, because 3.12 removed distutils, which is needed here)
git clone https://github.com/web-arena-x/visualwebarena.git
cd visualwebarena
pip install .
playwright install
```

**Option 3: Development Installation**
```bash
# Python 3.10 or 3.11 (not 3.12, because 3.12 removed distutils, which is needed here)
git clone https://github.com/web-arena-x/visualwebarena.git
cd visualwebarena
pip install -e .
playwright install
```

You can run the unit tests to ensure that VisualWebArena is installed correctly (pytest is part of the `dev` extras, e.g. `pip install -e ".[dev]"`):
```bash
pytest -x
```

Expand Down
72 changes: 72 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
[build-system]
# Packages needed to build the project, and the PEP 517 backend that performs the build.
# NOTE(review): the setuptools>=64 floor is presumably for editable installs
# (`pip install -e .`) from pyproject.toml alone -- confirm before lowering it.
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"

[project]
# Core distribution metadata and runtime dependencies.
name = "visualwebarena"
# Keep in sync with __version__ in visualwebarena/__init__.py.
version = "0.1.0"
description = "Evaluating Multimodal Agents on Realistic Visual Web Tasks"
readme = "README.md"
license = {text = "MIT"}
authors = [
{name = "VisualWebArena Team"},
]
# The lower bound follows the dependency pins below: numpy>=1.25.2,
# scikit-image>=0.22.0 and matplotlib>=3.8.0 all require Python 3.9+, so the
# package could never be installed on 3.7 or 3.8. The README recommends 3.10 or 3.11.
requires-python = ">=3.9"
classifiers = [
"Development Status :: 3 - Alpha",
# Classifiers must match the registered trove list exactly;
# "Intended Audience :: Researchers" is not a registered value.
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
# Runtime requirements, installed automatically with the package.
dependencies = [
"beartype>=0.12.0",
"beautifulsoup4>=4.12.2",
"gymnasium>=0.29.1",
"numpy>=1.25.2",
"pillow>=10.0.1",
"playwright>=1.37.0",
"pydantic>=2.4.2",
"requests>=2.31.0",
"openai>=1.3.5",
"torch>=2.0.1",
"transformers>=4.34.0",
"nltk>=3.8.1",
"scikit-image>=0.22.0",
"tiktoken>=0.4.0",
"matplotlib>=3.8.0",
"text-generation>=0.6.1",
"aiolimiter>=1.1.0",
"evaluate>=0.4.0",
]

[project.optional-dependencies]
# Extras are not installed by a plain `pip install .`.
# The `dev` extra holds the test, lint and type-check tooling;
# request it explicitly, e.g. `pip install -e ".[dev]"`.
dev = [
"pre-commit==3.0.1",
"pytest==7.1.2",
"mypy==0.991",
"nbmake",
"pytest-asyncio",
"types-requests",
]

[project.urls]
# Project links published with the package metadata.
Homepage = "https://jykoh.com/vwa"
Repository = "https://github.com/web-arena-x/visualwebarena"
Issues = "https://github.com/web-arena-x/visualwebarena/issues"

[tool.setuptools.packages.find]
# Package discovery is limited to `visualwebarena` and its subpackages;
# other top-level directories (e.g. tests/, scripts/) are not packaged.
include = ["visualwebarena*"]

[tool.pytest.ini_options]
# Collect tests only from tests/, in files named test_*.py.
testpaths = ["tests"]
python_files = "test_*.py"

[tool.mypy]
# Run mypy in strict mode.
strict = true
16 changes: 8 additions & 8 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,26 @@
import torch
from PIL import Image

from agent import (
from visualwebarena.agent import (
PromptAgent,
construct_agent,
)
from agent.prompts import *
from browser_env import (
from visualwebarena.agent.prompts import *
from visualwebarena.browser_env import (
Action,
ActionTypes,
ScriptBrowserEnv,
StateInfo,
Trajectory,
create_stop_action,
)
from browser_env.actions import is_equivalent
from browser_env.auto_login import get_site_comb_from_filepath
from browser_env.helper_functions import (
from visualwebarena.browser_env.actions import is_equivalent
from visualwebarena.browser_env.auto_login import get_site_comb_from_filepath
from visualwebarena.browser_env.helper_functions import (
RenderHelper,
get_action_description,
)
from evaluation_harness import evaluator_router, image_utils
from visualwebarena.evaluation_harness import evaluator_router, image_utils

DATASET = os.environ["DATASET"]

Expand Down Expand Up @@ -469,7 +469,7 @@ def test(

def prepare(args: argparse.Namespace) -> None:
# convert prompt python files to json
from agent.prompts import to_json
from visualwebarena.agent.prompts import to_json

to_json.run()

Expand Down
14 changes: 7 additions & 7 deletions run_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,25 @@
from beartype import beartype
from PIL import Image

from agent import (
from visualwebarena.agent import (
PromptAgent,
construct_agent,
)
from agent.prompts import *
from browser_env import (
from visualwebarena.agent.prompts import *
from visualwebarena.browser_env import (
Action,
ActionTypes,
ScriptBrowserEnv,
StateInfo,
Trajectory,
create_stop_action,
)
from browser_env.actions import is_equivalent
from browser_env.helper_functions import (
from visualwebarena.browser_env.actions import is_equivalent
from visualwebarena.browser_env.helper_functions import (
RenderHelper,
get_action_description,
)
from evaluation_harness import image_utils
from visualwebarena.evaluation_harness import image_utils

LOG_FOLDER = "log_files"
Path(LOG_FOLDER).mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -395,7 +395,7 @@ def test(

def prepare(args: argparse.Namespace) -> None:
# convert prompt python files to json
from agent.prompts import to_json
from visualwebarena.agent.prompts import to_json

to_json.run()

Expand Down
4 changes: 2 additions & 2 deletions scripts/collect_obs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
import pytest
from playwright.sync_api import Page, expect
# browser_env is now a subpackage of visualwebarena; import it from there
# while keeping the module bound to the same local name.
from visualwebarena import browser_env
from browser_env import (
from visualwebarena.browser_env import (
ScriptBrowserEnv,
create_id_based_action,
create_key_press_action,
create_playwright_action,
create_scroll_action,
)
from browser_env.env_config import *
from visualwebarena.browser_env.env_config import *

HEADLESS = False

Expand Down
2 changes: 1 addition & 1 deletion scripts/generate_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json
import os

from browser_env.env_config import *
from visualwebarena.browser_env.env_config import *


def main() -> None:
Expand Down
25 changes: 0 additions & 25 deletions setup.cfg

This file was deleted.

4 changes: 0 additions & 4 deletions setup.py

This file was deleted.

2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
import pytest_asyncio

from browser_env import AsyncScriptBrowserEnv, ScriptBrowserEnv
from visualwebarena.browser_env import AsyncScriptBrowserEnv, ScriptBrowserEnv

HEADLESS = True
SLOW_MO = 0
Expand Down
2 changes: 1 addition & 1 deletion tests/test_browser_env/test_action_functionalities.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest
from playwright.sync_api import Page, expect

from browser_env import (
from visualwebarena.browser_env import (
ScriptBrowserEnv,
create_id_based_action,
create_key_press_action,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_browser_env/test_actions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np

from browser_env import *
from visualwebarena.browser_env import *


def test_is_equivalent() -> None:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_browser_env/test_auth_cookie.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import asyncio
import json

from browser_env import *
from visualwebarena.browser_env import *

auth_json = {
"cookies": [
Expand Down
2 changes: 1 addition & 1 deletion tests/test_browser_env/test_playwright_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
from playwright.sync_api import Page

from browser_env import ScriptBrowserEnv, create_playwright_action
from visualwebarena.browser_env import ScriptBrowserEnv, create_playwright_action

HEADLESS = True
SLOW_MO = 0
Expand Down
6 changes: 3 additions & 3 deletions tests/test_browser_env/test_script_browser_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from gymnasium.vector import AsyncVectorEnv
from playwright.sync_api import Page

from browser_env import (
from visualwebarena.browser_env import (
Action,
AsyncScriptBrowserEnv,
DetachedPage,
Expand All @@ -20,8 +20,8 @@
create_playwright_action,
create_scroll_action,
)
from browser_env.actions import create_id_based_action
from browser_env.env_config import ACCOUNTS, REDDIT, SHOPPING
from visualwebarena.browser_env.actions import create_id_based_action
from visualwebarena.browser_env.env_config import ACCOUNTS, REDDIT, SHOPPING

@pytest.mark.skip(reason="The actions are deprecated")
def test_script_browser_env(script_browser_env: ScriptBrowserEnv) -> None:
Expand Down
10 changes: 5 additions & 5 deletions tests/test_evaluation_harness/test_exact_evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,17 @@
from PIL import Image
from py import test

from agent import Agent, TeacherForcingAgent
from browser_env import ActionTypes, ScriptBrowserEnv
from browser_env.env_config import *
from evaluation_harness import (
from visualwebarena.agent import Agent, TeacherForcingAgent
from visualwebarena.browser_env import ActionTypes, ScriptBrowserEnv
from visualwebarena.browser_env.env_config import *
from visualwebarena.evaluation_harness import (
HTMLContentExactEvaluator,
PageImageEvaluator,
StringEvaluator,
URLExactEvaluator,
image_utils,
)
from evaluation_harness.evaluators import EvaluatorComb
from visualwebarena.evaluation_harness.evaluators import EvaluatorComb

IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
HEADLESS = True
Expand Down
6 changes: 3 additions & 3 deletions tests/test_evaluation_harness/test_helper_functions.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import json
import os

from browser_env import ScriptBrowserEnv
from browser_env.env_config import *
from evaluation_harness.helper_functions import (
from visualwebarena.browser_env import ScriptBrowserEnv
from visualwebarena.browser_env.env_config import *
from visualwebarena.evaluation_harness.helper_functions import (
get_query_text,
get_query_text_lowercase,
reddit_get_latest_comment_content_by_username,
Expand Down
13 changes: 13 additions & 0 deletions visualwebarena/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""
VisualWebArena: Evaluating Multimodal Agents on Realistic Visual Web Tasks

This package provides components for evaluating multimodal autonomous language agents
on web-based visual tasks.

``ScriptBrowserEnv`` and ``PromptAgent`` are re-exported here, so they can be
imported directly from ``visualwebarena``.
"""

# Keep in sync with `version` in pyproject.toml.
__version__ = "0.1.0"

# Importing the package also imports the browser_env and agent subpackages.
from .browser_env import ScriptBrowserEnv
from .agent import PromptAgent

# Names exported by `from visualwebarena import *`.
__all__ = ["ScriptBrowserEnv", "PromptAgent"]
File renamed without changes.
Loading