-
Notifications
You must be signed in to change notification settings - Fork 156
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Provisional implementation of computer tool.
- Loading branch information
Eric Patey
committed
Jan 9, 2025
1 parent
abc3cf6
commit 38414f5
Showing
35 changed files
with
1,472 additions
and
55 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
services: | ||
default: | ||
# Temporary internal image until the official one is available | ||
image: inspect-computer-tool | ||
init: true | ||
|
||
# If you only launch a single container, you can vnc into that container by using | ||
# the following port mapping | ||
# ports: | ||
# - "5900:5900" | ||
|
||
# If you launch multiple containers, you can vnc into each container by using the | ||
# following port mapping, but you will need to determine the host port number for | ||
# the specific container by using `docker inspect <container_id_or_name>` | ||
# ports: | ||
# - "5900" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from inspect_ai import Task, task | ||
from inspect_ai.dataset import Sample | ||
from inspect_ai.scorer import includes | ||
from inspect_ai.solver import system_message | ||
from inspect_ai.solver._basic_agent import basic_agent | ||
from inspect_ai.tool import computer | ||
|
||
# System prompt installed by `computer_task` through `system_message()`.
# It asks the model to state its intent before each tool action and to check
# the resulting screenshot before moving on to the next step.
SYSTEM_MESSAGE = """
Before each step, please clearly explain your intent for performing a tool action: "I expect tool command X to ...".
After each step, carefully evaluate the resulting screenshot to see if the command achieved the right outcome.
Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. Only when
you confirm a step was executed correctly should you move on to the next one.
Note that launching applications from the bottom task bar requires a single left click.
"""
|
||
|
||
@task
def computer_task() -> Task:
    """Build the example evaluation task that exercises the computer tool.

    The dataset holds five samples (read a file, edit a spreadsheet, type
    into a terminal, use a calculator, draw in a paint program). Each sample
    is solved by `basic_agent` configured with `SYSTEM_MESSAGE`, the
    `computer()` tool and a 100-message cap, scored with `includes()`, and
    run in the "docker" sandbox.

    Returns:
        The configured `Task`.
    """
    return Task(
        dataset=[
            Sample(
                input="Use the computer tool to find the contents of the file '/tmp/flag.txt'. When you've found it, report its contents.",
                target="Frunobulax",
                files={"/tmp/flag.txt": "flag.txt"},
            ),
            Sample(
                # The path is now closed with a matching single quote; the
                # original prompt ended it with a stray backtick.
                input="Use the computer tool and open the spreadsheet '/tmp/moonWeight.ods'. Enter your weight as 156 lbs. Report the result from cell B2. Important: If the 'Tip of the Day' dialog appears, you'll need to close it before proceeding. Important: You may need to install a spreadsheet using a command like 'sudo apt-get install -y libreoffice'.",
                target="26",
                files={"/tmp/moonWeight.ods": "moonWeight.ods"},
            ),
            Sample(
                input="Use the computer tool to launch a terminal. Type 'Trudging across the tundra. Mile after Mile.' into the terminal. Important: Make sure that the terminal window is active before typing. When you are done, please use the submit tool to record the result of hitting enter in the terminal after entering that text.",
                target="bash: Trudging: command not found",
            ),
            Sample(
                input="Use the computer tool to launch a calculator. Calculate 123 x 456. Report the result.",
                target="56088",
            ),
            Sample(
                input='Draw a smiley face with a paint program. When you are done, submit("Art is in the eye of the beholder"). Important: In XPaint, you need to create a new canvas before you can draw.',
                target="Art is in the eye of the beholder",
            ),
            # Disabled variant of the drawing sample with a more detailed prompt:
            # Sample(
            #     input='Draw a smiley face with a paint program. The face should be a solid yellow circle on a light gray background with solid black circles for eyes. The mouth should be a curved black line. When you are done, submit("Art is in the eye of the beholder"). Important: In XPaint, you need to create a new canvas before you can draw. Also, in XPaint, the toolbar contains the following tools on each row. 1. Pencil, Dynamic Pencil, Dot Pencil 2. Brush, Spray, Smear 3. Segment, Polygonal Line, Arc 4. Arrowhead, Text, Erase 5. Box, Filled Box, Box Region 6. Oval, Filled Oval, Oval Region 7. Freehand Shape, Filled Freehand Shape, Freehand Shape Region 8. Polygon, Filled Polygon, Polygon Region 9. Spline Curve, Filled Spline Curve, Spline Curve Region 10. Fill, Gradient Fill, Fractal Fill',
            #     target='Art is in the eye of the beholder',
            # ),
        ],
        solver=basic_agent(
            init=system_message(SYSTEM_MESSAGE),
            tools=[computer()],
            max_messages=100,
        ),
        scorer=includes(),
        sandbox="docker",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Frunobulax |
Binary file not shown.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
services: | ||
default: | ||
# Temporary internal image until the official one is available | ||
image: inspect-computer-tool | ||
init: true | ||
ports: | ||
- "5900:5900" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from ._computer import computer | ||
|
||
__all__ = ["computer"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
import json | ||
import logging | ||
from typing import Literal | ||
|
||
from pydantic import BaseModel, Field | ||
|
||
from inspect_ai._util.content import ContentText | ||
from inspect_ai.model import ContentImage | ||
from inspect_ai.tool import ToolError, ToolResult | ||
from inspect_ai.util import sandbox | ||
|
||
# Closed set of action names. Each value matches the action name that the
# wrapper functions in this module pass to `_send_cmd`, which places it
# right after the `--action` flag of the computer tool command.
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
]

# Module-level logger used by `_send_cmd` to trace commands and results.
log = logging.getLogger(__name__)
# log = MockLogger()
# NOTE(review): setting DEBUG at import time fixes this logger's level
# regardless of application logging configuration — confirm this is meant to
# outlive the provisional implementation.
log.setLevel(logging.DEBUG)
|
||
|
||
class ToolExecResult(BaseModel):
    """Result payload parsed from the JSON that the tool command writes to stdout.

    All fields are optional and default to None.
    """

    # Text output; `_send_cmd` returns it as text when it is non-empty.
    output: str | None = Field(default=None)
    # Error message; when truthy, `_send_cmd` raises `ToolError` with it.
    error: str | None = Field(default=None)
    # Image payload; `_send_cmd` embeds it in a `data:image/png;base64,` URL.
    base64_image: str | None = Field(default=None)
|
||
|
||
async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResult:
    """Run one computer-tool action in the sandbox and convert its reply.

    The command executed is
    `python3 -m computer_tool.computer_tool --action <cmdTail...>`; its
    stdout is parsed as JSON into a `ToolExecResult`.

    Args:
        cmdTail: Action name followed by any action-specific arguments.
        timeout: Passed through to `sandbox().exec`.

    Returns:
        `[ContentText, ContentImage]` when the tool produced both text and
        an image, the text alone when only text was produced, `[ContentImage]`
        when only an image was produced, and the string "OK" otherwise.

    Raises:
        ToolError: The tool reported an error in its JSON payload.
        RuntimeError: The sandbox command did not succeed.
        Exception: Any other failure (for example malformed JSON on stdout)
            is logged and re-raised unchanged.
    """
    from inspect_ai.log._samples import sample_active

    sample = sample_active()
    assert sample
    sample_id = sample.sample.id
    # Compare against None explicitly: a falsy id such as 0 is still an id.
    assert sample_id is not None

    cmd = ["python3", "-m", "computer_tool.computer_tool", "--action"] + cmdTail
    log.info(f"(sample={sample_id}) Executing command: {cmd}")

    try:
        raw_exec_result = await sandbox().exec(cmd, timeout=timeout)

        if not raw_exec_result.success:
            raise RuntimeError(
                f"Failure executing command: {cmd} {raw_exec_result.stderr}"
            )

        result = ToolExecResult(**json.loads(raw_exec_result.stdout))

        if result.error:
            log.warning(
                f"(sample={sample_id}) Tool returned an error. Raising ToolError('{result.error}')"
            )
            raise ToolError(result.error)

        image = (
            ContentImage(image=f"data:image/png;base64,{result.base64_image}")
            if result.base64_image
            else None
        )
        # An empty string is treated the same as missing output.
        text = result.output or None

        if text is not None and image is not None:
            log.info(
                f"(sample={sample_id}) ToolResult([ContentText('{text}'), ContentImage])"
            )
            return [ContentText(text=text), image]

        if text is not None:
            log.info(f"(sample={sample_id}) ToolResult('{text}')")
            return text

        if image is not None:
            log.info(f"(sample={sample_id}) ToolResult([ContentImage])")
            return [image]

        log.warning(
            f"(sample={sample_id}) Tool returned neither output nor image - returning ToolResult('OK')"
        )
        return "OK"
    except ToolError:
        # Tool-reported errors were already logged above; pass them through.
        raise
    except Exception as e:
        log.error(f"(sample={sample_id}) Sandbox.exec threw for {cmd}...re-raising {e}")
        # Bare raise keeps the original traceback intact.
        raise
|
||
|
||
async def cursor_position(timeout: int | None = None) -> ToolResult:
    """Send the `cursor_position` action to the computer tool.

    Args:
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["cursor_position"]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def screenshot(timeout: int | None = None) -> ToolResult:
    """Send the `screenshot` action to the computer tool.

    Args:
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["screenshot"]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def mouse_move(x: int, y: int, timeout: int | None = None) -> ToolResult:
    """Send the `mouse_move` action with `x` and `y` as its `--coordinate`.

    Args:
        x: First coordinate value, passed as text.
        y: Second coordinate value, passed as text.
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["mouse_move", "--coordinate", f"{x}", f"{y}"]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def left_click(timeout: int | None = None) -> ToolResult:
    """Send the `left_click` action to the computer tool.

    Args:
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["left_click"]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def left_click_drag(x: int, y: int, timeout: int | None = None) -> ToolResult:
    """Send the `left_click_drag` action with `x` and `y` as its `--coordinate`.

    Args:
        x: First coordinate value, passed as text.
        y: Second coordinate value, passed as text.
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["left_click_drag", "--coordinate", f"{x}", f"{y}"]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def right_click(timeout: int | None = None) -> ToolResult:
    """Send the `right_click` action to the computer tool.

    Args:
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["right_click"]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def middle_click(timeout: int | None = None) -> ToolResult:
    """Send the `middle_click` action to the computer tool.

    Args:
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["middle_click"]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def double_click(timeout: int | None = None) -> ToolResult:
    """Send the `double_click` action to the computer tool.

    Args:
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["double_click"]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def press_key(key: str, timeout: int | None = None) -> ToolResult:
    """Send the `key` action with `key` as its `--text` argument.

    Args:
        key: Value passed verbatim after `--text`.
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["key", "--text", key]
    return await _send_cmd(action_args, timeout=timeout)
|
||
|
||
async def type(text: str, timeout: int | None = None) -> ToolResult:
    """Send the `type` action with `text` as its `--text` argument.

    NOTE: this function's name shadows the builtin `type` within this module.

    Args:
        text: Value passed verbatim after `--text`.
        timeout: Forwarded unchanged to `_send_cmd`.

    Returns:
        The result produced by `_send_cmd` for this action.
    """
    action_args = ["type", "--text", text]
    return await _send_cmd(action_args, timeout=timeout)
Oops, something went wrong.