diff --git a/examples/experimental/group_discussion_agents/group_discussion_agents.py b/examples/experimental/group_discussion_agents/group_discussion_agents.py index e4b3c0c2c..8ef55e5c5 100644 --- a/examples/experimental/group_discussion_agents/group_discussion_agents.py +++ b/examples/experimental/group_discussion_agents/group_discussion_agents.py @@ -2,7 +2,7 @@ from aact import Message, NodeFactory from aact.messages import Text, Tick, DataModel, DataModelFactory from sotopia.agents.llm_agent import ainput -from sotopia.experimental.agents import BaseAgent +from sotopia.experimental.agents.base_agent import BaseAgent from sotopia.generation_utils import agenerate from sotopia.generation_utils.generate import StrOutputParser diff --git a/examples/experimental/interview_openhands/interview_openhands.toml b/examples/experimental/interview_openhands/interview_openhands.toml new file mode 100644 index 000000000..805d4112e --- /dev/null +++ b/examples/experimental/interview_openhands/interview_openhands.toml @@ -0,0 +1,45 @@ +redis_url = "redis://localhost:6379/0" +extra_modules = ["examples.experimental.interview_openhands.llm_agent"] + + +[[nodes]] +node_name = "Jack" +node_class = "llm_agent" + +[nodes.node_args] +query_interval = 5 +output_channel = "Jack:Jane" +input_text_channels = ["Jane:Jack"] +input_env_channels = ["Runtime:Agent"] +input_tick_channel = "tick/secs/1" +goal = "Your goal is to effectively test Jane's technical ability and finally decide if she has passed the interview. Make sure to also evaluate her communication skills, problem-solving approach, and enthusiasm." 
class ActionType(Enum):
    """Closed set of action kinds an agent may emit.

    Members compare equal both to other ``ActionType`` members with the
    same value and to the plain string holding that value, so
    ``ActionType.SPEAK == "speak"`` is true.
    """

    NONE = "none"
    SPEAK = "speak"
    NON_VERBAL = "non-verbal"
    LEAVE = "leave"
    THOUGHT = "thought"
    BROWSE = "browse"
    BROWSE_ACTION = "browse_action"
    READ = "read"
    WRITE = "write"
    RUN = "run"

    def __str__(self) -> str:
        """Return the raw string value (e.g. ``"speak"``)."""
        return self.value

    def __eq__(self, other: object) -> bool:
        """Compare by value against another member or a plain string.

        Returns ``NotImplemented`` for any other type so Python can fall
        back to its default comparison.
        """
        if isinstance(other, ActionType):
            return self.value == other.value
        elif isinstance(other, str):
            return self.value == other
        else:
            return NotImplemented

    def __hash__(self) -> int:
        """Hash by value.

        Defining ``__eq__`` without ``__hash__`` makes Python set
        ``__hash__`` to ``None``, which would make members unusable as
        dict keys or set elements. Hashing the value restores hashability
        and keeps the eq/hash contract with the equal plain string.
        """
        return hash(self.value)
input_env_channels], + [(output_channel, AgentAction)], + redis_url, + ) + self.output_channel = output_channel + self.query_interval = query_interval + self.count_ticks = 0 + self.message_history: list[tuple[str, str, str]] = [] + self.name = agent_name + self.model_name = model_name + self.goal = goal + + async def send(self, message: AgentAction) -> None: + if message.action_type == "speak": + await self.r.publish( + self.output_channel, + Message[AgentAction](data=message).model_dump_json(), + ) + + elif message.action_type in ("browse", "browse_action", "write", "read", "run"): + await self.r.publish( + "Agent:Runtime", + Message[AgentAction](data=message).model_dump_json(), + ) + + def _format_message_history( + self, message_history: list[tuple[str, str, str]] + ) -> str: + ## TODO: akhatua Fix the mapping of action to be gramatically correct + return "\n".join( + (f"{speaker} {action} {message}") + for speaker, action, message in message_history + ) + + def get_action_template(self, selected_actions: list[ActionType]) -> str: + """ + Returns the action template string with selected actions. + + Args: + selected_actions (list[ActionType]): List of ActionType enum members to include in the template. + + Returns: + str: The action template with the selected actions. + """ + base_template = """ You are talking to another agent. + You are {agent_name}.\n + {message_history}\nand you plan to {goal}. + ## Action + What is your next thought or action? Your response must be in JSON format. + + It must be an object, and it must contain two fields: + * `action`, which is one of the actions below + * `args`, which is a map of key-value pairs, specifying the arguments for that action + """ + + action_descriptions = { + str( + ActionType.SPEAK + ): """`speak` - you can talk to the other agents to share information or ask them something. 
Arguments: + * `content` - the message to send to the other agents (should be short)""", + str( + ActionType.THOUGHT + ): """`thought` - only use this rarely to make a plan, set a goal, record your thoughts. Arguments: + * `content` - the message you send yourself to organize your thoughts (should be short). You cannot think more than 2 turns.""", + str( + ActionType.NONE + ): """`none` - you can choose not to take an action if you are waiting for some data""", + str( + ActionType.NON_VERBAL + ): """`non-verbal` - you can choose to do a non verbal action + * `content` - the non veral action you want to send to other agents. eg: smile, shrug, thumbs up""", + str(ActionType.BROWSE): """`browse` - opens a web page. Arguments: + * `url` - the URL to open, when you browse the web you must use `none` action until you get some information back. When you get the information back you must summarize the article and explain the article to the other agents.""", + str( + ActionType.BROWSE_ACTION + ): """`browse_action` - actions you can take on a web browser + * `command` - the command to run. You have 15 available commands. These commands must be a single string value of command + Options for `command`: + `command` = goto(url: str) + Description: Navigate to a url. + Examples: + goto('http://www.example.com') + + `command` = go_back() + Description: Navigate to the previous page in history. + Examples: + go_back() + + `command` = go_forward() + Description: Navigate to the next page in history. + Examples: + go_forward() + + `command` = noop(wait_ms: float = 1000) + Description: Do nothing, and optionally wait for the given time (in milliseconds). + You can use this to get the current page content and/or wait for the page to load. + Examples: + noop() + noop(500) + + `command` = scroll(delta_x: float, delta_y: float) + Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. 
Dispatches a wheel event. + Examples: + scroll(0, 200) + scroll(-50.2, -100.5) + + `command` = fill(bid, value) + Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for ,