[Arch] Use hash to avoid repeat building EventStreamRuntime image (#3243)

* update the behavior of put source code to put files instead of tarball

* add dirhash to dependencies

* fix dockerfile copy

* use dirhash to avoid repeated builds when updating the source

* fix runtime_build testcase

* add dir_hash to docker build pipeline

* add additional tests for source directory

* add comment

* clarify the assertions by explicitly checking existing files

* also assert od is a dir
This commit is contained in:
Xingyao Wang 2024-08-05 11:13:32 +08:00 committed by GitHub
parent abec52abfe
commit a69120d399
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 286 additions and 95 deletions

View File

@ -53,6 +53,11 @@ fi
if [[ -n "$DOCKER_IMAGE_TAG" ]]; then
tags+=("$DOCKER_IMAGE_TAG")
fi
# If $DOCKER_IMAGE_HASH_TAG is set, add it to the tags
if [[ -n "$DOCKER_IMAGE_HASH_TAG" ]]; then
tags+=("$DOCKER_IMAGE_HASH_TAG")
fi
DOCKER_REPOSITORY="$DOCKER_REGISTRY/$DOCKER_ORG/$DOCKER_IMAGE"
DOCKER_REPOSITORY=${DOCKER_REPOSITORY,,} # lowercase

View File

@ -4,5 +4,3 @@ DOCKER_BASE_DIR="./containers/runtime"
# These two variables will be appended by the runtime_build.py script
# DOCKER_IMAGE=
# DOCKER_IMAGE_TAG=
DOCKER_IMAGE=od_runtime
DOCKER_IMAGE_TAG=od_v0.8.1_image_ubuntu_tag_22.04

View File

@ -6,6 +6,7 @@ import tempfile
import docker
import toml
from dirhash import dirhash
from jinja2 import Environment, FileSystemLoader
import opendevin
@ -47,7 +48,8 @@ def _create_project_source_dist():
return tarball_path
def _put_source_code_to_dir(temp_dir: str) -> str:
def _put_source_code_to_dir(temp_dir: str):
"""Put the source code of OpenDevin to the temp_dir/code."""
tarball_path = _create_project_source_dist()
filename = os.path.basename(tarball_path)
filename = filename.removesuffix('.tar.gz')
@ -59,12 +61,18 @@ def _put_source_code_to_dir(temp_dir: str) -> str:
logger.info(
f'Source distribution moved to {os.path.join(temp_dir, "project.tar.gz")}'
)
return filename
# unzip the tarball
shutil.unpack_archive(os.path.join(temp_dir, 'project.tar.gz'), temp_dir)
# remove the tarball
os.remove(os.path.join(temp_dir, 'project.tar.gz'))
# rename the directory to the 'code'
os.rename(os.path.join(temp_dir, filename), os.path.join(temp_dir, 'code'))
logger.info(f'Unpacked source code directory: {os.path.join(temp_dir, "code")}')
def _generate_dockerfile(
base_image: str,
source_code_dirname: str,
skip_init: bool = False,
extra_deps: str | None = None,
) -> str:
@ -77,7 +85,6 @@ def _generate_dockerfile(
template = env.get_template('Dockerfile.j2')
dockerfile_content = template.render(
base_image=base_image,
source_code_dirname=source_code_dirname,
skip_init=skip_init,
extra_deps=extra_deps if extra_deps is not None else '',
)
@ -89,12 +96,14 @@ def prep_docker_build_folder(
base_image: str,
skip_init: bool = False,
extra_deps: str | None = None,
):
"""Prepares the docker build folder by copying the source code and generating the Dockerfile."""
source_code_dirname = _put_source_code_to_dir(dir_path)
) -> str:
"""Prepares the docker build folder by copying the source code and generating the Dockerfile.
Return the MD5 hash of the directory.
"""
_put_source_code_to_dir(dir_path)
dockerfile_content = _generate_dockerfile(
base_image,
source_code_dirname,
skip_init=skip_init,
extra_deps=extra_deps,
)
@ -108,6 +117,15 @@ def prep_docker_build_folder(
with open(os.path.join(dir_path, 'Dockerfile'), 'w') as file:
file.write(dockerfile_content)
hash = dirhash(dir_path, 'md5')
logger.info(
f'Input base image: {base_image}\n'
f'Skip init: {skip_init}\n'
f'Extra deps: {extra_deps}\n'
f'Hash for docker build directory [{dir_path}] (contents: {os.listdir(dir_path)}): {hash}\n'
)
return hash
def _build_sandbox_image(
base_image: str,
@ -115,7 +133,12 @@ def _build_sandbox_image(
docker_client: docker.DockerClient,
skip_init: bool = False,
extra_deps: str | None = None,
):
) -> str:
"""Build the sandbox image and return the *hash* docker image name.
The hash is calculated based on the contents of the docker build folder (source code and Dockerfile). This is useful to help prevent rebuilding the image when the source code and Dockerfile are unchanged.
"""
target_repo, target_image_tag = target_image_name.split(':')
try:
with tempfile.TemporaryDirectory() as temp_dir:
if skip_init:
@ -124,36 +147,62 @@ def _build_sandbox_image(
)
else:
logger.info(f'Building agnostic sandbox image: {target_image_name}')
prep_docker_build_folder(
dir_hash = prep_docker_build_folder(
temp_dir, base_image, skip_init=skip_init, extra_deps=extra_deps
)
api_client = docker_client.api
build_logs = api_client.build(
path=temp_dir,
tag=target_image_name,
rm=True,
decode=True,
# do not use cache when skip_init is True (i.e., when we want to update the source code in the existing image)
nocache=skip_init,
)
# Use dir_hash as an alternative tag for the image
# This is useful to help prevent rebuilding the image when the source code/Dockerfile is the same
target_image_hash_name = f'{target_repo}:{dir_hash}'
if skip_init:
# Check if the hash image exists
if _check_image_exists(target_image_hash_name, docker_client):
logger.info(f'Image {target_image_hash_name} exists, skipping build.')
else:
logger.info(
f'Rebuilding existing od_sandbox image [{target_image_name}] to update the source code.'
f'Image {target_image_name} does not exist, neither does its hash {target_image_hash_name}.\n'
'Building the image...'
)
api_client = docker_client.api
build_logs = api_client.build(
path=temp_dir,
tag=target_image_hash_name,
rm=True,
decode=True,
# do not use cache when skip_init is True (i.e., when we want to update the source code in the existing image)
nocache=skip_init,
)
if skip_init:
logger.info(
f'Rebuilding existing od_sandbox image [{target_image_name}] to update the source code.'
)
for log in build_logs:
if 'stream' in log:
print(log['stream'].strip())
elif 'error' in log:
logger.error(log['error'].strip())
else:
logger.info(str(log))
logger.info(f'Image {target_image_hash_name} build finished.')
image = docker_client.images.get(target_image_hash_name)
image.tag(target_repo, target_image_tag)
logger.info(
f'Tagged image {target_image_hash_name} --> {target_image_name}'
)
for log in build_logs:
if 'stream' in log:
print(log['stream'].strip())
elif 'error' in log:
logger.error(log['error'].strip())
else:
logger.info(str(log))
# check if the image is built successfully
image = docker_client.images.get(target_image_name)
image = docker_client.images.get(target_image_hash_name)
if image is None:
raise RuntimeError(f'Build failed: Image {target_image_name} not found')
logger.info(f'Image {target_image_name} built successfully')
raise RuntimeError(
f'Build failed: Image {target_image_hash_name} / {target_image_name} not found'
)
logger.info(
f'Image {target_image_name} (hash: {target_image_hash_name}) built successfully'
)
return target_image_hash_name
except docker.errors.BuildError as e:
logger.error(f'Sandbox image build failed: {e}')
raise e
@ -183,6 +232,16 @@ def get_new_image_name(base_image: str, dev_mode: bool = False) -> str:
def _check_image_exists(image_name: str, docker_client: docker.DockerClient) -> bool:
"""Check if the image exists in the registry (try to pull it first) AND in the local store.
image_name is f'{repo}:{tag}'
"""
# Try to pull the new image from the registry
try:
docker_client.images.pull(image_name)
except Exception:
logger.info(f'Cannot pull image {image_name} directly')
images = docker_client.images.list()
if images:
for image in images:
@ -217,12 +276,6 @@ def build_runtime_image(
f'Invalid image name: {new_image_name}. Expected format "repository:tag".'
)
# Try to pull the new image from the registry
try:
docker_client.images.pull(new_image_name)
except Exception:
logger.info(f'Cannot pull image {new_image_name} directly')
# Detect if the sandbox image is built
image_exists = _check_image_exists(new_image_name, docker_client)
if image_exists:
@ -235,6 +288,7 @@ def build_runtime_image(
# If (1) Image exists & we are not updating the source code, we can reuse the existing production image
logger.info('No image build done (not updating source code)')
return new_image_name
elif image_exists and update_source_code:
# If (2) Image exists & we plan to update the source code (in dev mode), we need to rebuild the image
# and give it a special name
@ -244,6 +298,7 @@ def build_runtime_image(
new_image_name = get_new_image_name(base_image, dev_mode=True)
skip_init = True # since we only need to update the source code
else:
# If (3) Image does not exist, we need to build it from scratch
# e.g., ubuntu:latest -> od_runtime:ubuntu_tag_latest
@ -260,7 +315,7 @@ def build_runtime_image(
if not skip_init:
logger.info(f'Building image [{new_image_name}] from scratch')
_build_sandbox_image(
new_image_name = _build_sandbox_image(
base_image,
new_image_name,
docker_client,
@ -299,15 +354,17 @@ if __name__ == '__main__':
f'Will prepare a build folder by copying the source code and generating the Dockerfile: {build_folder}'
)
new_image_path = get_new_image_name(args.base_image)
prep_docker_build_folder(
dir_hash = prep_docker_build_folder(
build_folder, args.base_image, skip_init=args.update_source_code
)
new_image_name, new_image_tag = new_image_path.split(':')
with open(os.path.join(build_folder, 'config.sh'), 'a') as file:
file.write(
(
f'\n'
f'DOCKER_IMAGE={new_image_name}\n'
f'DOCKER_IMAGE_TAG={new_image_tag}\n'
f'DOCKER_IMAGE_HASH_TAG={dir_hash}\n'
)
)
logger.info(

View File

@ -44,10 +44,8 @@ RUN /opendevin/miniforge3/bin/mamba install conda-forge::poetry python=3.11 -y
# ================================================================
# START: Copy Project and Install/Update Dependencies
# ================================================================
COPY project.tar.gz /opendevin
RUN if [ -d /opendevin/code ]; then rm -rf /opendevin/code; fi
RUN cd /opendevin && tar -xzvf project.tar.gz && rm project.tar.gz
RUN mv /opendevin/{{ source_code_dirname }} /opendevin/code
COPY ./code /opendevin/code
# Install/Update Dependencies
# 1. Install pyproject.toml via poetry

33
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "aenum"
@ -1378,6 +1378,20 @@ files = [
graph = ["objgraph (>=1.7.2)"]
profile = ["gprof2dot (>=2022.7.29)"]
[[package]]
name = "dirhash"
version = "0.5.0"
description = "Python module and CLI for hashing of file system directories."
optional = false
python-versions = ">=3.8"
files = [
{file = "dirhash-0.5.0-py3-none-any.whl", hash = "sha256:523dfd6b058c64f45b31604376926c6e2bd2ea301d0df23095d4055674e38b09"},
{file = "dirhash-0.5.0.tar.gz", hash = "sha256:e60760f0ab2e935d8cb088923ea2c6492398dca42cec785df778985fd4cd5386"},
]
[package.dependencies]
scantree = ">=0.0.4"
[[package]]
name = "dirtyjson"
version = "1.0.8"
@ -7060,6 +7074,21 @@ tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"]
testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"]
torch = ["safetensors[numpy]", "torch (>=1.10)"]
[[package]]
name = "scantree"
version = "0.0.4"
description = "Flexible recursive directory iterator: scandir meets glob(\"**\", recursive=True)"
optional = false
python-versions = ">=3.8"
files = [
{file = "scantree-0.0.4-py3-none-any.whl", hash = "sha256:7616ab65aa6b7f16fcf8e6fa1d9afaa99a27ab72bba05c61b691853b96763174"},
{file = "scantree-0.0.4.tar.gz", hash = "sha256:15bd5cb24483b04db2c70653604e8ea3522e98087db7e38ab8482f053984c0ac"},
]
[package.dependencies]
attrs = ">=18.0.0"
pathspec = ">=0.10.1"
[[package]]
name = "scikit-learn"
version = "1.5.0"
@ -9078,4 +9107,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "1a27bcc3e0448356d0d224f17e020131e38ec7d8ae31bff8680dcafde3d89bde"
content-hash = "662460a252456de9099b04039975c3858c977599b4441fcb2ba4f81f430dd85a"

View File

@ -41,6 +41,7 @@ grep-ast = "0.3.3"
tree-sitter = "0.21.3"
bashlex = "^0.18"
pyjwt = "^2.9.0"
dirhash = "*"
[tool.poetry.group.llama-index.dependencies]
llama-index = "*"

View File

@ -1,7 +1,7 @@
import os
import tarfile
import tempfile
from importlib.metadata import version
from unittest.mock import MagicMock, patch
from unittest.mock import ANY, MagicMock, patch
import pytest
import toml
@ -13,6 +13,7 @@ from opendevin.runtime.utils.runtime_build import (
_put_source_code_to_dir,
build_runtime_image,
get_new_image_name,
prep_docker_build_folder,
)
OD_VERSION = f'od_v{_get_package_version()}'
@ -24,31 +25,111 @@ def temp_dir(tmp_path_factory: TempPathFactory) -> str:
return str(tmp_path_factory.mktemp('test_runtime_build'))
def test_put_source_code_to_dir(temp_dir):
folder_name = _put_source_code_to_dir(temp_dir)
# assert there is a file called 'project.tar.gz' in the temp_dir
assert os.path.exists(os.path.join(temp_dir, 'project.tar.gz'))
# untar the file
with tarfile.open(os.path.join(temp_dir, 'project.tar.gz'), 'r:gz') as tar:
tar.extractall(path=temp_dir)
def _check_source_code_in_dir(temp_dir):
# assert there is a folder called 'code' in the temp_dir
code_dir = os.path.join(temp_dir, 'code')
assert os.path.exists(code_dir)
assert os.path.isdir(code_dir)
# check the source file is the same as the current code base
assert os.path.exists(os.path.join(temp_dir, folder_name, 'pyproject.toml'))
assert os.path.exists(os.path.join(code_dir, 'pyproject.toml'))
# The source code should only include the `opendevin` folder, but not the other folders
assert set(os.listdir(code_dir)) == {
'opendevin',
'pyproject.toml',
'poetry.lock',
'LICENSE',
'README.md',
'PKG-INFO',
}
assert os.path.exists(os.path.join(code_dir, 'opendevin'))
assert os.path.isdir(os.path.join(code_dir, 'opendevin'))
# make sure the version from the pyproject.toml is the same as the current version
with open(os.path.join(temp_dir, folder_name, 'pyproject.toml'), 'r') as f:
with open(os.path.join(code_dir, 'pyproject.toml'), 'r') as f:
pyproject = toml.load(f)
_pyproject_version = pyproject['tool']['poetry']['version']
assert _pyproject_version == version('opendevin')
def test_put_source_code_to_dir(temp_dir):
_put_source_code_to_dir(temp_dir)
_check_source_code_in_dir(temp_dir)
def test_docker_build_folder(temp_dir):
prep_docker_build_folder(
temp_dir,
base_image='ubuntu:22.04',
skip_init=False,
)
# check the source code is in the folder
_check_source_code_in_dir(temp_dir)
# Now check dockerfile is in the folder
dockerfile_path = os.path.join(temp_dir, 'Dockerfile')
assert os.path.exists(dockerfile_path)
assert os.path.isfile(dockerfile_path)
# check the folder only contains the source code and the Dockerfile
assert set(os.listdir(temp_dir)) == {'code', 'Dockerfile'}
def test_hash_folder_same(temp_dir):
dir_hash_1 = prep_docker_build_folder(
temp_dir,
base_image='ubuntu:22.04',
skip_init=False,
)
with tempfile.TemporaryDirectory() as temp_dir_2:
dir_hash_2 = prep_docker_build_folder(
temp_dir_2,
base_image='ubuntu:22.04',
skip_init=False,
)
assert dir_hash_1 == dir_hash_2
def test_hash_folder_diff_init(temp_dir):
dir_hash_1 = prep_docker_build_folder(
temp_dir,
base_image='ubuntu:22.04',
skip_init=False,
)
with tempfile.TemporaryDirectory() as temp_dir_2:
dir_hash_2 = prep_docker_build_folder(
temp_dir_2,
base_image='ubuntu:22.04',
skip_init=True,
)
assert dir_hash_1 != dir_hash_2
def test_hash_folder_diff_image(temp_dir):
dir_hash_1 = prep_docker_build_folder(
temp_dir,
base_image='ubuntu:22.04',
skip_init=False,
)
with tempfile.TemporaryDirectory() as temp_dir_2:
dir_hash_2 = prep_docker_build_folder(
temp_dir_2,
base_image='debian:11',
skip_init=False,
)
assert dir_hash_1 != dir_hash_2
def test_generate_dockerfile_scratch():
base_image = 'debian:11'
source_code_dirname = 'dummy'
dockerfile_content = _generate_dockerfile(
base_image,
source_code_dirname=source_code_dirname,
skip_init=False,
)
assert base_image in dockerfile_content
@ -60,7 +141,7 @@ def test_generate_dockerfile_scratch():
)
# Check the update command
assert f'mv /opendevin/{source_code_dirname} /opendevin/code' in dockerfile_content
assert 'COPY ./code /opendevin/code' in dockerfile_content
assert (
'/opendevin/miniforge3/bin/mamba run -n base poetry install'
in dockerfile_content
@ -69,10 +150,8 @@ def test_generate_dockerfile_scratch():
def test_generate_dockerfile_skip_init():
base_image = 'debian:11'
source_code_dirname = 'dummy'
dockerfile_content = _generate_dockerfile(
base_image,
source_code_dirname=source_code_dirname,
skip_init=True,
)
@ -84,9 +163,7 @@ def test_generate_dockerfile_skip_init():
)
# These update commands SHOULD still in the dockerfile
assert (
f'RUN mv /opendevin/{source_code_dirname} /opendevin/code' in dockerfile_content
)
assert 'COPY ./code /opendevin/code' in dockerfile_content
assert (
'/opendevin/miniforge3/bin/mamba run -n base poetry install'
in dockerfile_content
@ -147,29 +224,39 @@ def test_get_new_image_name_eventstream_dev_invalid_base_image():
get_new_image_name(base_image, dev_mode=True)
@patch('opendevin.runtime.utils.runtime_build._build_sandbox_image')
@patch('opendevin.runtime.utils.runtime_build.docker.DockerClient')
def test_build_runtime_image_from_scratch(mock_docker_client, mock_build_sandbox_image):
def test_build_runtime_image_from_scratch(mock_docker_client, temp_dir):
base_image = 'debian:11'
mock_docker_client.images.list.return_value = []
# for image.tag(target_repo, target_image_tag)
mock_image = MagicMock()
mock_docker_client.images.get.return_value = mock_image
image_name = build_runtime_image(base_image, mock_docker_client)
assert image_name == f'{RUNTIME_IMAGE_PREFIX}:{OD_VERSION}_image_debian_tag_11'
mock_build_sandbox_image.assert_called_once_with(
dir_hash = prep_docker_build_folder(
temp_dir,
base_image,
f'{RUNTIME_IMAGE_PREFIX}:{OD_VERSION}_image_debian_tag_11',
mock_docker_client,
skip_init=False,
extra_deps=None,
)
image_name = build_runtime_image(base_image, mock_docker_client)
# The build call should be called with the hash tag
mock_docker_client.api.build.assert_called_once_with(
path=ANY,
tag=f'{RUNTIME_IMAGE_PREFIX}:{dir_hash}',
rm=True,
decode=True,
nocache=False,
)
# Then the hash tag should be tagged to the version
mock_image.tag.assert_called_once_with(
f'{RUNTIME_IMAGE_PREFIX}', f'{OD_VERSION}_image_debian_tag_11'
)
assert image_name == f'{RUNTIME_IMAGE_PREFIX}:{dir_hash}'
@patch('opendevin.runtime.utils.runtime_build._build_sandbox_image')
@patch('opendevin.runtime.utils.runtime_build.docker.DockerClient')
def test_build_runtime_image_exist_no_update_source(
mock_docker_client, mock_build_sandbox_image
):
def test_build_runtime_image_exist_no_update_source(mock_docker_client):
base_image = 'debian:11'
mock_docker_client.images.list.return_value = [
MagicMock(tags=[f'{RUNTIME_IMAGE_PREFIX}:{OD_VERSION}_image_debian_tag_11'])
@ -178,28 +265,44 @@ def test_build_runtime_image_exist_no_update_source(
image_name = build_runtime_image(base_image, mock_docker_client)
assert image_name == f'{RUNTIME_IMAGE_PREFIX}:{OD_VERSION}_image_debian_tag_11'
mock_build_sandbox_image.assert_not_called()
mock_docker_client.api.build.assert_not_called()
@patch('opendevin.runtime.utils.runtime_build._build_sandbox_image')
@patch('opendevin.runtime.utils.runtime_build.docker.DockerClient')
def test_build_runtime_image_exist_with_update_source(
mock_docker_client, mock_build_sandbox_image
):
def test_build_runtime_image_exist_with_update_source(mock_docker_client, temp_dir):
base_image = 'debian:11'
mock_docker_client.images.list.return_value = [
MagicMock(tags=[f'{RUNTIME_IMAGE_PREFIX}:{OD_VERSION}_image_debian_tag_11'])
]
expected_new_image_tag = f'{OD_VERSION}_image_debian_tag_11'
od_runtime_base_image = f'{RUNTIME_IMAGE_PREFIX}:{expected_new_image_tag}'
mock_docker_client.images.list.return_value = [
MagicMock(tags=[od_runtime_base_image])
]
# for image.tag(target_repo, target_image_tag)
mock_image = MagicMock()
mock_docker_client.images.get.return_value = mock_image
# call the function to get the dir_hash to calculate the new image name
dir_hash = prep_docker_build_folder(
temp_dir,
od_runtime_base_image,
skip_init=True,
)
# actual call to build the image
image_name = build_runtime_image(
base_image, mock_docker_client, update_source_code=True
)
assert image_name == f'{RUNTIME_IMAGE_PREFIX}_dev:{OD_VERSION}_image_debian_tag_11'
mock_build_sandbox_image.assert_called_once_with(
f'{RUNTIME_IMAGE_PREFIX}:{OD_VERSION}_image_debian_tag_11',
f'{RUNTIME_IMAGE_PREFIX}_dev:{OD_VERSION}_image_debian_tag_11',
mock_docker_client,
skip_init=True,
extra_deps=None,
# check the build call
mock_docker_client.api.build.assert_called_once_with(
path=ANY,
tag=f'{RUNTIME_IMAGE_PREFIX}_dev:{dir_hash}',
rm=True,
decode=True,
nocache=True,
)
# Then check the hash tag should be tagged to expected image tag
mock_image.tag.assert_called_once_with(
f'{RUNTIME_IMAGE_PREFIX}_dev', expected_new_image_tag
)
assert image_name == f'{RUNTIME_IMAGE_PREFIX}_dev:{dir_hash}'