Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,4 +146,5 @@ config_files/wa/test_webarena.json
config_files/wa/test_webarena/*

cache/*
agents/prompts/jsons/*
agents/prompts/jsons/*
log.txt
8 changes: 0 additions & 8 deletions agent/__init__.py

This file was deleted.

117 changes: 72 additions & 45 deletions agent/agent.py → agent/agents.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
import os
import sys
parent_dir = os.path.dirname(os.path.abspath(__file__))
up_dir = parent_dir
for i in range(3):
sys.path.append(up_dir)
up_dir = os.path.dirname(up_dir)
from kutils import DEBUG, INFO, WARN, ERROR
import utils as u
import argparse
import json
import importlib
from typing import Any, Optional

import tiktoken
from beartype import beartype
from PIL import Image

Expand All @@ -14,36 +22,23 @@
create_id_based_action,
create_none_action,
create_playwright_action,
create_vision_action,
create_mas_action,
)
from browser_env.utils import Observation, StateInfo
from llms import (
call_llm,
generate_from_huggingface_completion,
generate_from_openai_chat_completion,
generate_from_openai_completion,
# call_llm,
# generate_from_huggingface_completion,
# generate_from_openai_chat_completion,
# generate_from_openai_completion,
lm_config,
)
from llms.tokenizers import Tokenizer


class Agent:
"""Base class for the agent"""

def __init__(self, *args: Any) -> None:
pass

def next_action(
self, trajectory: Trajectory, intent: str, meta_data: Any
) -> Action:
"""Predict the next action given the observation"""
raise NotImplementedError

def reset(
self,
test_config_file: str,
) -> None:
raise NotImplementedError

from prompts.prompt_constructor import MultimodalCoTPromptConstructor, MultimodalCoTPromptConstructor
from llms.ais_requestor import get_lm_requestor
from base_agent import Agent
from multi_agent import MultiAgent
from vision_agent import VisionAgent

class TeacherForcingAgent(Agent):
"""Agent that follows a pre-defined action sequence"""
Expand Down Expand Up @@ -106,27 +101,27 @@ def __init__(
self,
action_set_tag: str,
lm_config: lm_config.LMConfig,
prompt_constructor: PromptConstructor,
prompt_constructor: PromptConstructor | MultimodalCoTPromptConstructor | MultimodalCoTPromptConstructor,
captioning_fn = None,
) -> None:
super().__init__()
self.lm_config = lm_config
self.prompt_constructor = prompt_constructor
self.action_set_tag = action_set_tag
self.captioning_fn = captioning_fn

# Check if the model is multimodal.
if ("gemini" in lm_config.model or "gpt-4" in lm_config.model and "vision" in lm_config.model) and type(prompt_constructor) == MultimodalCoTPromptConstructor:
self.multimodal_inputs = True
else:
self.multimodal_inputs = False
self.multimodal_inputs = True
self.kq = get_lm_requestor(lm_config.model)

def set_action_set_tag(self, tag: str) -> None:
self.action_set_tag = tag

@beartype
def next_action(
self, trajectory: Trajectory, intent: str, meta_data: dict[str, Any], images: Optional[list[Image.Image]] = None,
self,
trajectory: Trajectory,
intent: str,
meta_data: dict[str, Any],
images: Optional[list[Image.Image]] = None,
output_response: bool = False
) -> Action:
# Create page screenshot image for multimodal models.
Expand Down Expand Up @@ -155,17 +150,29 @@ def next_action(
)

if self.multimodal_inputs:
prompt = self.prompt_constructor.construct(
messages = self.prompt_constructor.construct(
trajectory, intent, page_screenshot_img, images, meta_data
)
else:
prompt = self.prompt_constructor.construct(
messages = self.prompt_constructor.construct(
trajectory, intent, meta_data
)
lm_config = self.lm_config
n = 0
while True:
response = call_llm(lm_config, prompt)
# response = call_llm(lm_config, prompt)
# u.write_json(f'{u.get_time()}.json', messages)

try:
model = lm_config.model
if 'qwen' in model:
response = self.kq.infer_messages(messages)
else:
raise ValueError(lm_config)
except Exception as e:
ERROR(e)
response = 'stop []'

force_prefix = self.prompt_constructor.instruction[
"meta_data"
].get("force_prefix", "")
Expand Down Expand Up @@ -204,14 +211,13 @@ def reset(self, test_config_file: str) -> None:
def construct_agent(args: argparse.Namespace, captioning_fn=None) -> Agent:
llm_config = lm_config.construct_llm_config(args)

default_provider = 'openai'
default_model = 'gpt-3.5-turbo-1106'
tokenizer = Tokenizer(default_provider, default_model)

agent: Agent
if args.agent_type == "teacher_forcing":
agent = TeacherForcingAgent()
elif args.agent_type == "prompt":
with open(args.instruction_path) as f:
constructor_type = json.load(f)["meta_data"]["prompt_constructor"]
tokenizer = Tokenizer(args.provider, args.model)
prompt_constructor = eval(constructor_type)(
if args.mode == "som":
prompt_constructor = MultimodalCoTPromptConstructor(
args.instruction_path, lm_config=llm_config, tokenizer=tokenizer
)
agent = PromptAgent(
Expand All @@ -220,8 +226,29 @@ def construct_agent(args: argparse.Namespace, captioning_fn=None) -> Agent:
prompt_constructor=prompt_constructor,
captioning_fn=captioning_fn
)
elif args.mode == "mas":
prompt_constructor = MultimodalCoTPromptConstructor(
args.instruction_path, lm_config=llm_config, tokenizer=tokenizer
)
agent = MultiAgent(
action_set_tag=args.action_set_tag,
lm_config=llm_config,
prompt_constructor=prompt_constructor,
captioning_fn=captioning_fn
)
elif args.mode == 'url_mas':
from url_infer.url_multi_agent import URLMultiAgent
prompt_constructor = MultimodalCoTPromptConstructor(
args.instruction_path, lm_config=llm_config, tokenizer=tokenizer
)
agent = URLMultiAgent(
action_set_tag=args.action_set_tag,
lm_config=llm_config,
prompt_constructor=prompt_constructor,
captioning_fn=captioning_fn
)
else:
raise NotImplementedError(
f"agent type {args.agent_type} not implemented"
f"agent type {args.mode} not implemented"
)
return agent
22 changes: 22 additions & 0 deletions agent/base_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from typing import Any, Optional
from browser_env import Trajectory
from browser_env.actions import Action

class Agent:
    """Abstract interface shared by all concrete agents.

    Subclasses are expected to override :meth:`next_action` and
    :meth:`reset`; the base implementations only raise.
    """

    def __init__(self, *args: Any) -> None:
        # The base class holds no state; subclasses perform their own setup.
        pass

    def next_action(
        self, trajectory: Trajectory, intent: str, meta_data: Any
    ) -> Action:
        """Predict the next action given the observation"""
        raise NotImplementedError

    def reset(
        self,
        test_config_file: str,
    ) -> None:
        # Prepare the agent for a new episode described by the config file.
        raise NotImplementedError

130 changes: 130 additions & 0 deletions agent/multi_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import os
import sys
parent_dir = os.path.dirname(os.path.abspath(__file__))
up_dir = parent_dir
for i in range(3):
sys.path.append(up_dir)
up_dir = os.path.dirname(up_dir)
from kutils import DEBUG, INFO, WARN, ERROR
import utils as u
from typing import Any, Optional
from beartype import beartype
from PIL import Image
from agent.prompts import *
from browser_env import Trajectory
from browser_env.actions import (
Action,
ActionParsingError,
create_id_based_action,
create_none_action,
create_playwright_action,
create_vision_action,
create_mas_action,
)
from browser_env.utils import Observation, StateInfo
from llms import lm_config
from llms.tokenizers import Tokenizer
from prompts.prompt_constructor import MultimodalCoTPromptConstructor, MultimodalCoTPromptConstructor
from base_agent import Agent
from AWorld.examples.visualwebarena.action_team import ActionTeam

class MultiAgent(Agent):
    """Multi-agent wrapper that delegates next-action selection to an
    ActionTeam of cooperating sub-agents, feeding it the current observation,
    screenshots and action history, then parses the team's reply into an
    Action.
    """

    @beartype
    def __init__(
        self,
        action_set_tag: str,
        lm_config: lm_config.LMConfig,
        # NOTE: the union previously listed MultimodalCoTPromptConstructor twice;
        # deduplicated — the accepted types are unchanged.
        prompt_constructor: PromptConstructor | MultimodalCoTPromptConstructor,
        captioning_fn = None,
    ) -> None:
        """
        Args:
            action_set_tag: tag selecting how parsed actions are interpreted.
            lm_config: language-model configuration (model name, domain, ...).
            prompt_constructor: supplies instruction meta data and action parsing.
            captioning_fn: optional callable that captions input images.
        """
        super().__init__()
        self.lm_config = lm_config
        self.prompt_constructor = prompt_constructor
        self.action_set_tag = action_set_tag
        self.captioning_fn = captioning_fn
        # This agent always consumes screenshots, so multimodal input is fixed on.
        self.multimodal_inputs = True
        # The sub-agent team that actually chooses the next browser action.
        self.bu_agent = ActionTeam(self.lm_config.model)

    def set_action_set_tag(self, tag: str) -> None:
        self.action_set_tag = tag

    @beartype
    def next_action(
        self,
        trajectory: Trajectory,
        intent: str,
        meta_data: dict[str, Any],
        images: Optional[list[Image.Image]] = None,
        output_response: bool = False
    ) -> Action:
        """Ask the action team for the next action given the latest state.

        Args:
            trajectory: alternating state/action history; the last entry is
                assumed to be the current state.
            intent: the user's objective.
            meta_data: must contain "page", "action_history" and "tabs".
            images: optional task input images (e.g. reference pictures).
            output_response: forwarded to the team to control verbose output.

        Returns:
            The parsed Action (a none-action when parsing fails), annotated
            with the raw response, the team's action_info and the domain.
        """
        # Current-step screenshots: original page and SoM-annotated version.
        ori_page_screenshot_arr = trajectory[-1]["observation"]["ori_image"]
        ori_page_screenshot_img = Image.fromarray(ori_page_screenshot_arr)
        som_page_screenshot_arr = trajectory[-1]["observation"]["image"]
        som_page_screenshot_img = Image.fromarray(som_page_screenshot_arr)

        # Most recent screenshots from a *previous* state, if any.
        # FIX: the stop bound is -len(trajectory) - 1 so the very first
        # trajectory entry is inspected too (range stops are exclusive; the
        # previous bound of -len(trajectory) skipped the first entry).
        last_som_img = None
        last_ori_img = None
        for i in range(-2, -len(trajectory) - 1, -1):
            if 'observation' in trajectory[i].keys():
                last_som_img = trajectory[i]['observation']['image']
                last_som_img = Image.fromarray(last_som_img)
                last_ori_img = trajectory[i]['observation']['ori_image']
                last_ori_img = Image.fromarray(last_ori_img)
                break

        # Caption the input images, if provided; captioning failures are
        # logged but never abort action selection.
        image_input_caption = ''
        input_img = None
        try:
            if images is not None and len(images) > 0:
                if self.captioning_fn is not None:
                    for image_i, image in enumerate(images):
                        if image_i == 0:
                            image_input_caption += f'Input image {image_i+1}: "{self.captioning_fn([image])[0]}"'
                        else:
                            image_input_caption += f'input image {image_i+1}: "{self.captioning_fn([image])[0]}"'
                        if len(images) > 1:
                            image_input_caption += ", "
                    # Update intent to include captions of input images.
                    # intent = f"{image_input_caption}\nTask: {intent}"
                elif not self.multimodal_inputs:
                    print("WARNING: Input image provided but no image captioner available.")
                # FIX: only dereference images once the list is known non-empty;
                # previously this ran unconditionally and raised (then logged a
                # spurious ERROR) whenever images was None.
                input_img = images[0]
        except Exception as e:
            ERROR(f'caption function {self.lm_config.caption_model} {e}')

        # Flatten the accessibility-tree text and drop non-interactable rows.
        # NOTE(review): the filtered page_text is currently unused below (obs
        # is taken from the raw observation); kept for parity with the
        # original — confirm whether it should be passed to the team.
        page_text = trajectory[-1]['observation']['text']
        page_texts = page_text.split('\n')
        page_texts = [a.replace(' ', '') for a in page_texts if not a.startswith('[]')]
        page_text = '\n'.join(page_texts)

        state_info: StateInfo = trajectory[-1]  # type: ignore[assignment]
        obs = state_info["observation"]['text']
        page = meta_data["page"]
        url = page.url
        action_history = meta_data["action_history"]
        tabs = meta_data['tabs']
        site_name = self.lm_config.domain

        # Delegate the actual decision to the sub-agent team.
        action_info, response = self.bu_agent.next_action(output_response,
            intent, action_history, site_name, url, obs, tabs, input_img, som_page_screenshot_img, ori_page_screenshot_img, page)
        force_prefix = self.prompt_constructor.instruction["meta_data"].get("force_prefix", "")
        response = f"{force_prefix}{response}"

        try:
            parsed_response = self.prompt_constructor.extract_action(response)
            action = create_mas_action(parsed_response, obs)
            action["raw_prediction"] = response
        except Exception as e:
            # Fall back to a no-op action but keep the raw response for logging.
            action = create_none_action()
            action["raw_prediction"] = response

        action['action_info'] = action_info
        action['domain'] = self.lm_config.domain
        return action

    def reset(self, test_config_file: str) -> None:
        # Clear per-episode bookkeeping before starting a new test config.
        self.called_closing_agents = []
34 changes: 34 additions & 0 deletions agent/prompts/jsons/mas.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page screenshot: This is a screenshot of the webpage, with each interactable element assigned a unique numerical id. Each bounding box and its respective id shares the same color.\nThe observation, which lists the IDs of all interactable elements on the current web page with their text content if any, in the format [id] [tagType] [text content]. tagType is the type of the element, such as button, link, or textbox. text content is the text content of the element. For example, [1234] [button] ['Add to Cart'] means that there is a button with id 1234 and text content 'Add to Cart' on the current web page. [] [StaticText] [text] means that the element is of some text that is not interactable.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n```click [id]```: This action clicks on an element with a specific id on the webpage.\n```type [id] [content]```: Use this to type the content into the field with id. 
By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.\n```hover [id]```: Hover over an element with id.\n```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.\n\nTab Management Actions:\n```new_tab```: Open a new, empty browser tab.\n```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.\n```close_tab```: Close the currently active tab.\n\nURL Navigation Actions:\n```goto [url]```: Navigate to a specific URL.\n```go_back```: Navigate to the previously viewed page.\n```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n```stop [answer]```: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. Generate the action in the correct format. Start with a \"In summary, the next action I will perform is\" phrase, followed by action inside ``````. For example, \"In summary, the next action I will perform is ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.",
"examples": [
[
"OBSERVATION:\n[31] [IMG] [Image, description: hp fx-7010dn fax machine, url: http://ec2-3-13-232-171.us-east-2.compute.amazonaws.com:7770/media/catalog/product/cache/89ff578b9cd87e0600daac45c9e1ea98/B/0/B08GKZ3ZKD.0.jpg]\n[32] [A] [HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)]\n[] [StaticText] [$279.49]\n[33] [BUTTON] [Add to Cart]\n[34] [A] [Add to Wish List]\n[35] [A] [Add to Compare]\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine?\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```",
"agent/prompts/som_examples/som_example1.png"
],
[
"OBSERVATION:\n[] [StaticText] [/f/food]\n[] [StaticText] [[homemade] Obligatory Halloween Pumpkin Loaf!\tSubmitted by\tkneechalice\tt3_yid9lu\t1 year ago]\n[9] [IMG] []\n[] [StaticText] [Submitted by\tkneechalice\tt3_yid9lu\t1 year ago]\n[10] [A] [kneechalice]\n[11] [A] [45 comments]\n[] [StaticText] [[I ate] Maple Pecan Croissant\tSubmitted by\tAccordingtoJP\tt3_y3hrpn\t1 year ago]\n[14] [IMG] []\n[] [StaticText] [Submitted by\tAccordingtoJP\tt3_y3hrpn\t1 year ago]\n[15] [A] [AccordingtoJP]\n[16] [A] [204 comments]\nURL: http://reddit.com\nOBJECTIVE: Tell me what the top comment on the croissant post says.\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page has a post titled '[I ate] Maple Pecan Croissant', which is the post mentioned in the objective. In order to find the top comment, I will navigate into the comments section of the post. In summary, the next action I will perform is ```click [11]```",
"agent/prompts/som_examples/som_example2.png"
],
[
"OBSERVATION:\n[] [StaticText] [What are you looking for today?]\n[5] [INPUT] []\n[6] [SELECT] [Select a category]\n[7] [BUTTON] [Search]\n[] [StaticText] [Latest Listings]\n[] [StaticText] [Atlas Powered Audio System w/ Tripod\t150.00 $\tMusic instruments\tBorough of Red Lion (Pennsylvania)\t2023/11/16]\n[8] [IMG] [Atlas Powered Audio System w/ Tripod]\n[9] [A] [Atlas Powered Audio System w/ Tripod]\n[] [StaticText] [150.00 $]\n[] [StaticText] [Neptune Gaming Console\t350.00 $\tVideo gaming\tPennwyn (Pennsylvania)\t2023/11/16]\n[10] [IMG] [Neptune Gaming Console]\n[11] [A] [Neptune Gaming Console]\n[] [StaticText] [350.00 $]\nURL: http://classifieds.com\nOBJECTIVE: Help me find the cheapest dark colored guitar.\nPREVIOUS ACTION: None",
"Let's think step-by-step. The objective is to find the cheapest dark colored guitar on the site. The site has a search box whose ID is [5]. I can search for guitars by entering \"guitar\". I can submit this by pressing the Enter afterwards. In summary, the next action I will perform is ```type [5] [guitar] [1]```",
"agent/prompts/som_examples/som_example3.png"
]
],
"template": "OBSERVATION: {observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}",
"meta_data": {
"observation": "image_som",
"action_type": "som",
"keywords": [
"url",
"objective",
"observation",
"previous_action"
],
"prompt_constructor": "MultimodalCoTPromptConstructor",
"answer_phrase": "In summary, the next action I will perform is",
"action_splitter": "```"
}
}
Loading