Merge branch 'main' of github.com:OpenDevin/OpenDevin into enyst/memories-condenser

This commit is contained in:
Engel Nyst
2024-06-28 05:29:49 +02:00
18 changed files with 139 additions and 81 deletions

View File

@@ -19,7 +19,7 @@ where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting

View File

@@ -32,7 +32,7 @@ where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting

View File

@@ -42,7 +42,7 @@ temperature = 0.0
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
## Examples

View File

@@ -22,7 +22,7 @@ where `model_config` is mandatory, while `git-version`, `agent`, `eval_limit` an
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
LLM settings, as defined in your `config.toml`, defaulting to `gpt-3.5-turbo`
- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting

View File

@@ -23,7 +23,7 @@ where `model_config` is mandatory, while all other arguments are optional.
`model_config`, e.g. `llm`, is the config group name for your
LLM settings, as defined in your `config.toml`.
`git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting

View File

@@ -60,7 +60,7 @@ From the root of the OpenDevin repo, run the following command:
You can replace `model_config_name` with any model you set up in `config.toml`.
- `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.
- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
- `num_samples_eval`: Number of samples to evaluate (useful for testing and debugging).
- `data_split`: The data split to evaluate on. Must be one of `gpqa_main`, `gpqa_diamond`, `gpqa_experts`, `gpqa_extended`. Defaults to `gpqa_diamond` as done in the paper.

View File

@@ -20,7 +20,7 @@ where `model_config` is mandatory, while others are optional.
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
- `subset`, e.g. `math`, is the subset of the MINT benchmark to evaluate on, defaulting to `math`. It can be either: `math`, `gsm8k`, `mmlu`, `theoremqa`, `mbpp`,`humaneval`.

View File

@@ -82,7 +82,7 @@ If you see an error, please make sure your `config.toml` contains all
```bash
./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview head CodeActAgent 300
# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 300
```
where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
@@ -90,7 +90,7 @@ where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
`model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
LLM settings, as defined in your `config.toml`.
`git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
@@ -104,7 +104,7 @@ Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview` and Code
then your command would be:
```bash
./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview head CodeActAgent 10
./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 10
```
If you would like to specify a list of tasks you'd like to benchmark on, you could

View File

@@ -23,7 +23,7 @@ where `model_config` is mandatory, while all other arguments are optional.
`model_config`, e.g. `llm`, is the config group name for your
LLM settings, as defined in your `config.toml`.
`git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting

View File

@@ -21,7 +21,7 @@
"i18next": "^23.11.5",
"i18next-browser-languagedetector": "^8.0.0",
"i18next-http-backend": "^2.5.2",
"jose": "^5.5.0",
"jose": "^5.6.1",
"monaco-editor": "^0.50.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
@@ -10552,9 +10552,9 @@
}
},
"node_modules/jose": {
"version": "5.5.0",
"resolved": "https://registry.npmjs.org/jose/-/jose-5.5.0.tgz",
"integrity": "sha512-DUPr/1kYXbuqYpkCj9r66+B4SGCKXCLQ5ZbKCgmn4sJveJqcwNqWtAR56u4KPmpXjrmBO2uNuLdEAEiqIhFNBg==",
"version": "5.6.1",
"resolved": "https://registry.npmjs.org/jose/-/jose-5.6.1.tgz",
"integrity": "sha512-KyxsIFAtR0BH72iRCOwe+PRnsGltiXDRtemlOkp2xz7FrakAKd4lvYYJgFOJm2fryOMMUk0+f0E/uuTDoHDiTA==",
"funding": {
"url": "https://github.com/sponsors/panva"
}

View File

@@ -20,7 +20,7 @@
"i18next": "^23.11.5",
"i18next-browser-languagedetector": "^8.0.0",
"i18next-http-backend": "^2.5.2",
"jose": "^5.5.0",
"jose": "^5.6.1",
"monaco-editor": "^0.50.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",

View File

@@ -126,9 +126,9 @@ class AgentController:
- the string message should be user-friendly, it will be shown in the UI
- an ErrorObservation can be sent to the LLM by the agent, with the exception message, so it can self-correct next time
"""
if exception:
message += f': {exception}'
self.state.error = message
if exception:
self.state.error += f': {str(exception)}'
self.event_stream.add_event(ErrorObservation(message), EventSource.AGENT)
async def _start_step_loop(self):
@@ -140,6 +140,7 @@ class AgentController:
logger.info('AgentController task was cancelled')
break
except Exception as e:
traceback.print_exc()
logger.error(f'Error while running the agent: {e}')
logger.error(traceback.format_exc())
await self.report_error(

View File

@@ -58,47 +58,52 @@ class ServerRuntime(Runtime):
('cat /tmp/opendevin_jupyter_temp.py | execute_cli'), background=False
)
output = obs.content
if 'pip install' in action.code and 'Successfully installed' in output:
if 'pip install' in action.code:
print(output)
restart_kernel = 'import IPython\nIPython.Application.instance().kernel.do_shutdown(True)'
if (
'Note: you may need to restart the kernel to use updated packages.'
in output
):
obs = self._run_command(
(
"cat > /tmp/opendevin_jupyter_temp.py <<'EOL'\n"
f'{restart_kernel}\n'
'EOL'
),
background=False,
)
obs = self._run_command(
('cat /tmp/opendevin_jupyter_temp.py | execute_cli'),
background=False,
)
output = '[Package installed successfully]'
if "{'status': 'ok', 'restart': True}" != obs.content.strip():
print(obs.content)
output += '\n[But failed to restart the kernel to load the package]'
else:
output += '\n[Kernel restarted successfully to load the package]'
package_names = action.code.split(' ', 2)[-1]
is_single_package = ' ' not in package_names
# re-init the kernel after restart
if action.kernel_init_code:
obs = self._run_command(
if 'Successfully installed' in output:
restart_kernel = 'import IPython\nIPython.Application.instance().kernel.do_shutdown(True)'
if (
'Note: you may need to restart the kernel to use updated packages.'
in output
):
self._run_command(
(
f"cat > /tmp/opendevin_jupyter_init.py <<'EOL'\n"
f'{action.kernel_init_code}\n'
"cat > /tmp/opendevin_jupyter_temp.py <<'EOL'\n"
f'{restart_kernel}\n'
'EOL'
),
background=False,
)
obs = self._run_command(
'cat /tmp/opendevin_jupyter_init.py | execute_cli',
('cat /tmp/opendevin_jupyter_temp.py | execute_cli'),
background=False,
)
output = '[Package installed successfully]'
if "{'status': 'ok', 'restart': True}" != obs.content.strip():
print(obs.content)
output += '\n[But failed to restart the kernel to load the package]'
else:
output += '\n[Kernel restarted successfully to load the package]'
# re-init the kernel after restart
if action.kernel_init_code:
obs = self._run_command(
(
f"cat > /tmp/opendevin_jupyter_init.py <<'EOL'\n"
f'{action.kernel_init_code}\n'
'EOL'
),
background=False,
)
obs = self._run_command(
'cat /tmp/opendevin_jupyter_init.py | execute_cli',
background=False,
)
elif is_single_package and f'Requirement already satisfied: {package_names}' in output:
output = '[Package already installed]'
return IPythonRunCellObservation(content=output, code=action.code)
async def read(self, action: FileReadAction) -> Observation:
@@ -131,9 +136,14 @@ class ServerRuntime(Runtime):
def _run_immediately(self, command: str) -> Observation:
try:
exit_code, output = self.sandbox.execute(command)
if 'pip install' in command and 'Successfully installed' in output:
if 'pip install' in command:
package_names = command.split(' ', 2)[-1]
is_single_package = ' ' not in package_names
print(output)
output = 'Package installed successfully'
if 'Successfully installed' in output:
output = '[Package installed successfully]'
elif is_single_package and f'Requirement already satisfied: {package_names}' in output:
output = '[Package already installed]'
return CmdOutputObservation(
command_id=-1, content=str(output), command=command, exit_code=exit_code
)

38
poetry.lock generated
View File

@@ -416,17 +416,17 @@ files = [
[[package]]
name = "boto3"
version = "1.34.133"
version = "1.34.134"
description = "The AWS SDK for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "boto3-1.34.133-py3-none-any.whl", hash = "sha256:da7e78c03270be872ad78301892396ffea56647efcb2c3a8621ef46a905541ab"},
{file = "boto3-1.34.133.tar.gz", hash = "sha256:7071f8ce1f09113ca5630860fd590464e6325a4df55faae83c956225941016fc"},
{file = "boto3-1.34.134-py3-none-any.whl", hash = "sha256:342782c02ff077aae118c9c61179eed95c585831fba666baacc5588ff04aa6e1"},
{file = "boto3-1.34.134.tar.gz", hash = "sha256:f6d6e5b0c9ab022a75373fa16c01f0cd54bc1bb64ef3b6ac64ac7cedd56cbe9c"},
]
[package.dependencies]
botocore = ">=1.34.133,<1.35.0"
botocore = ">=1.34.134,<1.35.0"
jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.10.0,<0.11.0"
@@ -435,13 +435,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
[[package]]
name = "botocore"
version = "1.34.133"
version = "1.34.134"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">=3.8"
files = [
{file = "botocore-1.34.133-py3-none-any.whl", hash = "sha256:f269dad8e17432d2527b97ed9f1fd30ec8dc705f8b818957170d1af484680ef2"},
{file = "botocore-1.34.133.tar.gz", hash = "sha256:5ea609aa4831a6589e32eef052a359ad8d7311733b4d86a9d35dab4bd3ec80ff"},
{file = "botocore-1.34.134-py3-none-any.whl", hash = "sha256:45219e00639755f92569b29f8f279d5dde721494791412c1f7026a3779e8d9f4"},
{file = "botocore-1.34.134.tar.gz", hash = "sha256:e29c299599426ed16dd2d4c1e20eef784f96b15e1850ebbc59a3250959285b95"},
]
[package.dependencies]
@@ -1829,13 +1829,13 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit",
[[package]]
name = "google-ai-generativelanguage"
version = "0.6.5"
version = "0.6.6"
description = "Google Ai Generativelanguage API client library"
optional = false
python-versions = ">=3.7"
files = [
{file = "google-ai-generativelanguage-0.6.5.tar.gz", hash = "sha256:c4089c277fa4e26722f76ab03ee3039f28be8bf1c9be282948b9583a154c6d79"},
{file = "google_ai_generativelanguage-0.6.5-py3-none-any.whl", hash = "sha256:236875bb4a6d6ebdba2f12bd9d5e776100fd913402157a47b5e9fb80a13f25a7"},
{file = "google-ai-generativelanguage-0.6.6.tar.gz", hash = "sha256:1739f035caeeeca5c28f887405eec8690f3372daf79fecf26454a97a4f1733a8"},
{file = "google_ai_generativelanguage-0.6.6-py3-none-any.whl", hash = "sha256:59297737931f073d55ce1268dcc6d95111ee62850349d2b6cde942b16a4fca5c"},
]
[package.dependencies]
@@ -1927,16 +1927,16 @@ httplib2 = ">=0.19.0"
[[package]]
name = "google-generativeai"
version = "0.7.0"
version = "0.7.1"
description = "Google Generative AI High level API client library and tools."
optional = false
python-versions = ">=3.9"
files = [
{file = "google_generativeai-0.7.0-py3-none-any.whl", hash = "sha256:7be4b634afeb8b6bebde1af7271e94d2af84d2d28b5988c7ed9921733c40fe63"},
{file = "google_generativeai-0.7.1-py3-none-any.whl", hash = "sha256:25017b1278c873b6db65ba41c70566ee9fce8f265498e8b2f6aac036528ba1c7"},
]
[package.dependencies]
google-ai-generativelanguage = "0.6.5"
google-ai-generativelanguage = "0.6.6"
google-api-core = "*"
google-api-python-client = "*"
google-auth = ">=2.15.0"
@@ -2771,13 +2771,13 @@ types-tqdm = "*"
[[package]]
name = "litellm"
version = "1.40.27"
version = "1.40.28"
description = "Library to easily interface with LLM API providers"
optional = false
python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
files = [
{file = "litellm-1.40.27-py3-none-any.whl", hash = "sha256:f6906e5260d784e7e31d579f5b28545e87517268cb96dd0dcaf31e4c5d34073f"},
{file = "litellm-1.40.27.tar.gz", hash = "sha256:a13a04168be5a8e52d43c34c2e657ca2521da61039ac39a17abc233a1875923f"},
{file = "litellm-1.40.28-py3-none-any.whl", hash = "sha256:aa6d59390f24d1b1168a202b966249f9f5f93d08deba38ed9528654544065e96"},
{file = "litellm-1.40.28.tar.gz", hash = "sha256:08fdfcb01715006f9dadb8d05b94143f782e08d1944e5691d9faf20300e62739"},
]
[package.dependencies]
@@ -4190,13 +4190,13 @@ sympy = "*"
[[package]]
name = "openai"
version = "1.35.4"
version = "1.35.6"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.7.1"
files = [
{file = "openai-1.35.4-py3-none-any.whl", hash = "sha256:894b79c485fae2df3a6b68ceb570730e5a480c08bccc32a412cf3be2d4eb1384"},
{file = "openai-1.35.4.tar.gz", hash = "sha256:b58a0d6257c5c86e85b9b2f43e6eed04ada616560df9da0dca6697d06845e7c8"},
{file = "openai-1.35.6-py3-none-any.whl", hash = "sha256:c2bfa599445a2d6010adc7954476c2dc64e1aa8dad02ef29e0f31b9a887c1d02"},
{file = "openai-1.35.6.tar.gz", hash = "sha256:c5958617048a2d777d2b96050fd69ae6721bdffbf59967698694223cc092abd9"},
]
[package.dependencies]

View File

@@ -550,7 +550,7 @@ pip install pymsgbox==1.0.9
----------
OBSERVATION:
Package installed successfully
[Package installed successfully]
[Command -1 finished with exit code 0]
ENVIRONMENT REMINDER: You have 8 turns left to complete the task.

View File

@@ -550,7 +550,7 @@ pip install pymsgbox==1.0.9
----------
OBSERVATION:
Package installed successfully
[Package installed successfully]
[Command -1 finished with exit code 0]
----------

View File

@@ -550,7 +550,7 @@ pip install pymsgbox==1.0.9
----------
OBSERVATION:
Package installed successfully
[Package installed successfully]
[Command -1 finished with exit code 0]
----------

View File

@@ -301,26 +301,69 @@ def _test_sandbox_jupyter_agentskills_fileop_pwd_impl(box):
print(output)
assert exit_code == 0, 'The exit code should be 0 for ' + box.__class__.__name__
exit_code, output = box.execute('echo "create_file(\'a.txt\')" | execute_cli')
exit_code, output = box.execute('echo "create_file(\'hello.py\')" | execute_cli')
print(output)
assert exit_code == 0, 'The exit code should be 0 for ' + box.__class__.__name__
assert output.strip().split('\r\n') == (
'[File: /workspace/a.txt (1 lines total)]\r\n' '1|\r\n' '[File a.txt created.]'
'[File: /workspace/hello.py (1 lines total)]\r\n'
'1|\r\n'
'[File hello.py created.]'
).strip().split('\r\n')
exit_code, output = box.execute('cd test')
print(output)
assert exit_code == 0, 'The exit code should be 0 for ' + box.__class__.__name__
exit_code, output = box.execute('echo "create_file(\'a.txt\')" | execute_cli')
exit_code, output = box.execute('echo "create_file(\'hello.py\')" | execute_cli')
print(output)
assert exit_code == 0, 'The exit code should be 0 for ' + box.__class__.__name__
assert output.strip().split('\r\n') == (
'[File: /workspace/test/a.txt (1 lines total)]\r\n'
'[File: /workspace/test/hello.py (1 lines total)]\r\n'
'1|\r\n'
'[File a.txt created.]'
'[File hello.py created.]'
).strip().split('\r\n')
if config.enable_auto_lint:
# edit file, but make a mistake in indentation
exit_code, output = box.execute(
'echo "edit_file(\'hello.py\', 1, 1, \' print(\\"hello world\\")\')" | execute_cli'
)
print(output)
assert exit_code == 0, 'The exit code should be 0 for ' + box.__class__.__name__
assert output.strip().split('\r\n') == (
"""
[Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]
ERRORS:
hello.py:1:3: E999 IndentationError: unexpected indent
[This is how your edit would have looked if applied]
-------------------------------------------------
1| print("hello world")
-------------------------------------------------
[This is the original code before your edit]
-------------------------------------------------
1|
-------------------------------------------------
Your changes have NOT been applied. Please fix your edit command and try again.
You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.
DO NOT re-run the same failed edit command. Running it again will lead to the same error.
"""
).strip().split('\n')
# edit file with correct indentation
exit_code, output = box.execute(
'echo "edit_file(\'hello.py\', 1, 1, \'print(\\"hello world\\")\')" | execute_cli'
)
print(output)
assert exit_code == 0, 'The exit code should be 0 for ' + box.__class__.__name__
assert output.strip().split('\r\n') == (
"""
[File: /workspace/test/hello.py (1 lines total after edit)]
1|print("hello world")
[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
"""
).strip().split('\n')
exit_code, output = box.execute('rm -rf /workspace/*')
assert exit_code == 0, 'The exit code should be 0 for ' + box.__class__.__name__
box.close()
@@ -330,9 +373,10 @@ def test_sandbox_jupyter_agentskills_fileop_pwd(temp_dir):
# get a temporary directory
with patch.object(config, 'workspace_base', new=temp_dir), patch.object(
config, 'workspace_mount_path', new=temp_dir
), patch.object(config, 'run_as_devin', new='true'), patch.object(
), patch.object(config, 'run_as_devin', new=True), patch.object(
config, 'sandbox_type', new='ssh'
):
), patch.object(config, 'enable_auto_lint', new=True):
assert config.enable_auto_lint
for box in [DockerSSHBox()]:
_test_sandbox_jupyter_agentskills_fileop_pwd_impl(box)
@@ -346,8 +390,11 @@ def test_agnostic_sandbox_jupyter_agentskills_fileop_pwd(temp_dir):
# get a temporary directory
with patch.object(config, 'workspace_base', new=temp_dir), patch.object(
config, 'workspace_mount_path', new=temp_dir
), patch.object(config, 'run_as_devin', new='true'), patch.object(
), patch.object(config, 'run_as_devin', new=True), patch.object(
config, 'sandbox_type', new='ssh'
), patch.object(config, 'sandbox_container_image', new=base_sandbox_image):
), patch.object(
config, 'sandbox_container_image', new=base_sandbox_image
), patch.object(config, 'enable_auto_lint', new=False):
assert not config.enable_auto_lint
for box in [DockerSSHBox()]:
_test_sandbox_jupyter_agentskills_fileop_pwd_impl(box)