feat(runtime): add kubernetes support (#8814)

Co-authored-by: Corey White <corey.white@ziffdavis.com>
Co-authored-by: luke_schulz <luke.schulz@ziffmedia.com>
brettstewart 2025-06-18 16:25:50 -05:00 committed by GitHub
parent ef582a6335
commit 54af9ff3fe
18 changed files with 1941 additions and 13 deletions

View File

@ -12,6 +12,7 @@ DEFAULT_MODEL = "gpt-4o"
CONFIG_FILE = config.toml
PRE_COMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
PYTHON_VERSION = 3.12
KIND_CLUSTER_NAME = "local-hands"
# ANSI color codes
GREEN=$(shell tput -Txterm setaf 2)
@ -199,6 +200,40 @@ lint:
@$(MAKE) -s lint-frontend
@$(MAKE) -s lint-backend
kind:
@echo "$(YELLOW)Checking if kind is installed...$(RESET)"
@if ! command -v kind > /dev/null; then \
echo "$(RED)kind is not installed. Please install kind with `brew install kind` to continue$(RESET)"; \
exit 1; \
else \
echo "$(BLUE)kind $(shell kind version) is already installed.$(RESET)"; \
fi
@echo "$(YELLOW)Checking if kind cluster '$(KIND_CLUSTER_NAME)' already exists...$(RESET)"
@if kind get clusters | grep -q "^$(KIND_CLUSTER_NAME)$$"; then \
echo "$(BLUE)Kind cluster '$(KIND_CLUSTER_NAME)' already exists.$(RESET)"; \
kubectl config use-context kind-$(KIND_CLUSTER_NAME); \
else \
echo "$(YELLOW)Creating kind cluster '$(KIND_CLUSTER_NAME)'...$(RESET)"; \
kind create cluster --name $(KIND_CLUSTER_NAME) --config kind/cluster.yaml; \
fi
@echo "$(YELLOW)Checking if mirrord is installed...$(RESET)"
@if ! command -v mirrord > /dev/null; then \
echo "$(RED)mirrord is not installed. Please install mirrord with `brew install metalbear-co/mirrord/mirrord` to continue$(RESET)"; \
exit 1; \
else \
echo "$(BLUE)mirrord $(shell mirrord --version) is already installed.$(RESET)"; \
fi
@echo "$(YELLOW)Installing k8s mirrord resources...$(RESET)"
@kubectl apply -f kind/manifests
@echo "$(GREEN)Mirrord resources installed successfully.$(RESET)"
@echo "$(YELLOW)Waiting for Mirrord pod to be ready.$(RESET)"
@sleep 5
@kubectl wait --for=condition=Available deployment/ubuntu-dev
@echo "$(YELLOW)Waiting for Nginx to be ready.$(RESET)"
@kubectl -n ingress-nginx wait --for=condition=Available deployment/ingress-nginx-controller
@echo "$(YELLOW)Running make run inside of mirrord.$(RESET)"
@mirrord exec --target deployment/ubuntu-dev -- make run
test-frontend:
@echo "$(YELLOW)Running tests for frontend...$(RESET)"
@cd frontend && npm run test
@ -333,3 +368,4 @@ help:
# Phony targets
.PHONY: build check-dependencies check-system check-python check-npm check-nodejs check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint-backend lint-frontend lint test-frontend test build-frontend start-backend start-frontend _run_setup run run-wsl setup-config setup-config-prompts setup-config-basic openhands-cloud-run docker-dev docker-run clean help
.PHONY: kind

View File

@ -415,3 +415,47 @@ type = "noop"
# Configuration for the evaluation, please refer to the specific evaluation
# plugin for the available options
##############################################################################
########################### Kubernetes #######################################
# Kubernetes configuration when using the Kubernetes runtime
##############################################################################
[kubernetes]
# The Kubernetes namespace to use for OpenHands resources
#namespace = "default"
# Domain for ingress resources
#ingress_domain = "localhost"
# Size of the persistent volume claim
#pvc_storage_size = "2Gi"
# Storage class for persistent volume claims
#pvc_storage_class = "standard"
# CPU request for runtime pods
#resource_cpu_request = "1"
# Memory request for runtime pods
#resource_memory_request = "1Gi"
# Memory limit for runtime pods
#resource_memory_limit = "2Gi"
# Optional name of image pull secret for private registries
#image_pull_secret = ""
# Optional name of TLS secret for ingress
#ingress_tls_secret = ""
# Optional node selector key for pod scheduling
#node_selector_key = ""
# Optional node selector value for pod scheduling
#node_selector_val = ""
# Optional YAML string defining pod tolerations
#tolerations_yaml = ""
# Run the runtime sandbox container in privileged mode for use with docker-in-docker
#privileged = false
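# Example (illustrative, not shipped defaults): a minimal customized section might look like
#
# [kubernetes]
# namespace = "openhands"
# ingress_domain = "dev.example.com"
# pvc_storage_class = "standard"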

View File

@ -7,6 +7,7 @@ repos:
- id: end-of-file-fixer
exclude: docs/modules/python
- id: check-yaml
args: ["--allow-multiple-documents"]
- id: debug-statements
- repo: https://github.com/tox-dev/pyproject-fmt

kind/cluster.yaml Normal file
View File

@ -0,0 +1,9 @@
---
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
name: local-hands
nodes:
  - role: control-plane
    extraPortMappings:
      - containerPort: 80 # node port on the cluster for nginx.
        hostPort: 80 # local port for nginx http.

View File

@ -0,0 +1,19 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ubuntu-dev
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ubuntu-dev
  template:
    metadata:
      labels:
        app: ubuntu-dev
    spec:
      containers:
        - name: ubuntu
          image: ubuntu:22.04
          command: ["sleep", "infinity"]

kind/manifests/nginx.yaml Normal file
View File

@ -0,0 +1,678 @@
apiVersion: v1
kind: Namespace
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
name: ingress-nginx
---
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx
namespace: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- namespaces
verbs:
- get
- apiGroups:
- ""
resources:
- configmaps
- pods
- secrets
- endpoints
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- services
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses/status
verbs:
- update
- apiGroups:
- networking.k8s.io
resources:
- ingressclasses
verbs:
- get
- list
- watch
- apiGroups:
- coordination.k8s.io
resourceNames:
- ingress-nginx-leader
resources:
- leases
verbs:
- get
- update
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission
namespace: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- create
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- configmaps
- endpoints
- nodes
- pods
- secrets
- namespaces
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- apiGroups:
- ""
resources:
- services
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- networking.k8s.io
resources:
- ingresses/status
verbs:
- update
- apiGroups:
- networking.k8s.io
resources:
- ingressclasses
verbs:
- get
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission
rules:
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
verbs:
- get
- update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx
namespace: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ingress-nginx
subjects:
- kind: ServiceAccount
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission
namespace: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ingress-nginx-admission
subjects:
- kind: ServiceAccount
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ingress-nginx
subjects:
- kind: ServiceAccount
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ingress-nginx-admission
subjects:
- kind: ServiceAccount
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-controller
namespace: ingress-nginx
data:
worker-processes: "2" # Set to a lower number than default
max-worker-connections: "1024"
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-controller
namespace: ingress-nginx
spec:
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- appProtocol: http
name: http
port: 80
protocol: TCP
targetPort: http
- appProtocol: https
name: https
port: 443
protocol: TCP
targetPort: https
selector:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
type: LoadBalancer
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-controller-admission
namespace: ingress-nginx
spec:
ports:
- appProtocol: https
name: https-webhook
port: 443
targetPort: webhook
selector:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-controller
namespace: ingress-nginx
spec:
minReadySeconds: 0
revisionHistoryLimit: 10
selector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
strategy:
rollingUpdate:
maxUnavailable: 1
type: RollingUpdate
template:
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
spec:
containers:
- args:
- /nginx-ingress-controller
- --election-id=ingress-nginx-leader
- --controller-class=k8s.io/ingress-nginx
- --ingress-class=nginx
- --configmap=$(POD_NAMESPACE)/ingress-nginx-controller
- --validating-webhook=:8443
- --validating-webhook-certificate=/usr/local/certificates/cert
- --validating-webhook-key=/usr/local/certificates/key
- --watch-ingress-without-class=true
- --publish-status-address=localhost
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: LD_PRELOAD
value: /usr/local/lib/libmimalloc.so
image: registry.k8s.io/ingress-nginx/controller:v1.12.1@sha256:9724476b928967173d501040631b23ba07f47073999e80e34b120e8db5f234d5
imagePullPolicy: IfNotPresent
lifecycle:
preStop:
exec:
command:
- /wait-shutdown
livenessProbe:
failureThreshold: 5
httpGet:
path: /healthz
port: 10254
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
name: controller
ports:
- containerPort: 80
hostPort: 80
name: http
protocol: TCP
- containerPort: 443
hostPort: 443
name: https
protocol: TCP
- containerPort: 8443
name: webhook
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10254
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
resources:
requests:
cpu: 300m
memory: 256Mi
limits:
memory: 512Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- NET_BIND_SERVICE
drop:
- ALL
readOnlyRootFilesystem: false
runAsGroup: 82
runAsNonRoot: true
runAsUser: 101
seccompProfile:
type: RuntimeDefault
volumeMounts:
- mountPath: /usr/local/certificates/
name: webhook-cert
readOnly: true
dnsPolicy: ClusterFirst
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: ingress-nginx
terminationGracePeriodSeconds: 0
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Equal
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Equal
volumes:
- name: webhook-cert
secret:
secretName: ingress-nginx-admission
---
apiVersion: batch/v1
kind: Job
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission-create
namespace: ingress-nginx
spec:
template:
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission-create
spec:
containers:
- args:
- create
- --host=ingress-nginx-controller-admission,ingress-nginx-controller-admission.$(POD_NAMESPACE).svc
- --namespace=$(POD_NAMESPACE)
- --secret-name=ingress-nginx-admission
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.4.4@sha256:a9f03b34a3cbfbb26d103a14046ab2c5130a80c3d69d526ff8063d2b37b9fd3f
imagePullPolicy: IfNotPresent
name: create
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
seccompProfile:
type: RuntimeDefault
nodeSelector:
kubernetes.io/os: linux
restartPolicy: OnFailure
serviceAccountName: ingress-nginx-admission
---
apiVersion: batch/v1
kind: Job
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission-patch
namespace: ingress-nginx
spec:
template:
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission-patch
spec:
containers:
- args:
- patch
- --webhook-name=ingress-nginx-admission
- --namespace=$(POD_NAMESPACE)
- --patch-mutating=false
- --secret-name=ingress-nginx-admission
- --patch-failure-policy=Fail
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.4.4@sha256:a9f03b34a3cbfbb26d103a14046ab2c5130a80c3d69d526ff8063d2b37b9fd3f
imagePullPolicy: IfNotPresent
name: patch
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
seccompProfile:
type: RuntimeDefault
nodeSelector:
kubernetes.io/os: linux
restartPolicy: OnFailure
serviceAccountName: ingress-nginx-admission
---
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: nginx
spec:
controller: k8s.io/ingress-nginx
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.12.1
name: ingress-nginx-admission
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
service:
name: ingress-nginx-controller-admission
namespace: ingress-nginx
path: /networking/v1/ingresses
port: 443
failurePolicy: Fail
matchPolicy: Equivalent
name: validate.nginx.ingress.kubernetes.io
rules:
- apiGroups:
- networking.k8s.io
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- ingresses
sideEffects: None

kind/manifests/role.yaml Normal file
View File

@ -0,0 +1,14 @@
---
# mirrord-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: mirrord-role
  namespace: default
rules:
  - apiGroups: [""]
    resources: ["pods", "pods/exec", "pods/portforward", "services", "persistentvolumeclaims"]
    verbs: ["get", "list", "create", "delete", "watch", "update"]
  - apiGroups: ["networking.k8s.io"] # Networking API group (for ingress, networkpolicies, etc.)
    resources: ["ingresses", "networkpolicies"]
    verbs: ["get", "list", "create", "delete", "watch", "update"]

View File

@ -0,0 +1,14 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: mirrord-binding
  namespace: default
subjects:
  - kind: ServiceAccount
    name: default
    namespace: default
roleRef:
  kind: Role
  name: mirrord-role
  apiGroup: rbac.authorization.k8s.io

View File

@ -0,0 +1,12 @@
---
apiVersion: v1
kind: Service
metadata:
  name: ubuntu-dev
spec:
  selector:
    app: ubuntu-dev
  ports:
    - protocol: TCP
      port: 8099
      targetPort: 3000
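# Note: targetPort 3000 is the port the OpenHands app listens on when run via mirrord (see the runtime README).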

View File

@ -0,0 +1,86 @@
from pydantic import BaseModel, Field, ValidationError
class KubernetesConfig(BaseModel):
"""Configuration for Kubernetes runtime.
Attributes:
namespace: The Kubernetes namespace to use for OpenHands resources
ingress_domain: Domain for ingress resources
pvc_storage_size: Size of the persistent volume claim (e.g. "2Gi")
pvc_storage_class: Storage class for persistent volume claims
resource_cpu_request: CPU request for runtime pods
resource_memory_request: Memory request for runtime pods
resource_memory_limit: Memory limit for runtime pods
image_pull_secret: Optional name of image pull secret for private registries
ingress_tls_secret: Optional name of TLS secret for ingress
node_selector_key: Optional node selector key for pod scheduling
node_selector_val: Optional node selector value for pod scheduling
tolerations_yaml: Optional YAML string defining pod tolerations
privileged: Whether to run the runtime sandbox container in privileged mode (for docker-in-docker)
"""
namespace: str = Field(
default='default',
description='The Kubernetes namespace to use for OpenHands resources',
)
ingress_domain: str = Field(
default='localhost', description='Domain for ingress resources'
)
pvc_storage_size: str = Field(
default='2Gi', description='Size of the persistent volume claim'
)
pvc_storage_class: str | None = Field(
default=None, description='Storage class for persistent volume claims'
)
resource_cpu_request: str = Field(
default='1', description='CPU request for runtime pods'
)
resource_memory_request: str = Field(
default='1Gi', description='Memory request for runtime pods'
)
resource_memory_limit: str = Field(
default='2Gi', description='Memory limit for runtime pods'
)
image_pull_secret: str | None = Field(
default=None,
description='Optional name of image pull secret for private registries',
)
ingress_tls_secret: str | None = Field(
default=None, description='Optional name of TLS secret for ingress'
)
node_selector_key: str | None = Field(
default=None, description='Optional node selector key for pod scheduling'
)
node_selector_val: str | None = Field(
default=None, description='Optional node selector value for pod scheduling'
)
tolerations_yaml: str | None = Field(
default=None, description='Optional YAML string defining pod tolerations'
)
privileged: bool = Field(
default=False,
description='Run the runtime sandbox container in privileged mode for use with docker-in-docker',
)
model_config = {'extra': 'forbid'}
@classmethod
def from_toml_section(cls, data: dict) -> dict[str, 'KubernetesConfig']:
"""
Create a mapping of KubernetesConfig instances from a toml dictionary representing the [kubernetes] section.
The configuration is built from all keys in data.
Returns:
dict[str, KubernetesConfig]: A mapping where the key "kubernetes" corresponds to the [kubernetes] configuration
"""
# Initialize the result mapping
kubernetes_mapping: dict[str, KubernetesConfig] = {}
# Try to create the configuration instance
try:
kubernetes_mapping['kubernetes'] = cls.model_validate(data)
except ValidationError as e:
raise ValueError(f'Invalid kubernetes configuration: {e}') from e
return kubernetes_mapping
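# Example usage (illustrative, not part of the module):
#   KubernetesConfig.from_toml_section({'namespace': 'dev', 'privileged': True})
#   -> {'kubernetes': KubernetesConfig(namespace='dev', privileged=True, ...)}
# Unknown keys raise ValueError because the model forbids extra fields.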

View File

@ -11,6 +11,7 @@ from openhands.core.config.config_utils import (
model_defaults_to_dict,
)
from openhands.core.config.extended_config import ExtendedConfig
from openhands.core.config.kubernetes_config import KubernetesConfig
from openhands.core.config.llm_config import LLMConfig
from openhands.core.config.mcp_config import MCPConfig
from openhands.core.config.sandbox_config import SandboxConfig
@ -107,6 +108,7 @@ class OpenHandsConfig(BaseModel):
) # Maximum number of concurrent agent loops allowed per user
mcp_host: str = Field(default=f'localhost:{os.getenv("port", 3000)}')
mcp: MCPConfig = Field(default_factory=MCPConfig)
kubernetes: KubernetesConfig = Field(default_factory=KubernetesConfig)
defaults_dict: ClassVar[dict] = {}

View File

@ -25,6 +25,7 @@ from openhands.core.config.config_utils import (
OH_MAX_ITERATIONS,
)
from openhands.core.config.extended_config import ExtendedConfig
from openhands.core.config.kubernetes_config import KubernetesConfig
from openhands.core.config.llm_config import LLMConfig
from openhands.core.config.mcp_config import MCPConfig
from openhands.core.config.openhands_config import OpenHandsConfig
@ -228,6 +229,19 @@ def load_from_toml(cfg: OpenHandsConfig, toml_file: str = 'config.toml') -> None
# Re-raise ValueError from MCPConfig.from_toml_section
raise ValueError('Error in MCP sections in config.toml')
# Process kubernetes section if present
if 'kubernetes' in toml_config:
try:
kubernetes_mapping = KubernetesConfig.from_toml_section(
toml_config['kubernetes']
)
if 'kubernetes' in kubernetes_mapping:
cfg.kubernetes = kubernetes_mapping['kubernetes']
except (TypeError, KeyError, ValidationError) as e:
logger.openhands_logger.warning(
f'Cannot parse [kubernetes] config from toml, values have not been applied.\nError: {e}'
)
# Process condenser section if present
if 'condenser' in toml_config:
try:
@ -286,6 +300,7 @@ def load_from_toml(cfg: OpenHandsConfig, toml_file: str = 'config.toml') -> None
'sandbox',
'condenser',
'mcp',
'kubernetes',
}
for key in toml_config:
if key.lower() not in known_sections:

View File

@ -5,6 +5,7 @@ from openhands.runtime.impl.docker.docker_runtime import (
DockerRuntime,
)
from openhands.runtime.impl.e2b.e2b_runtime import E2BRuntime
from openhands.runtime.impl.kubernetes.kubernetes_runtime import KubernetesRuntime
from openhands.runtime.impl.local.local_runtime import LocalRuntime
from openhands.runtime.impl.modal.modal_runtime import ModalRuntime
from openhands.runtime.impl.remote.remote_runtime import RemoteRuntime
@ -21,6 +22,7 @@ _DEFAULT_RUNTIME_CLASSES: dict[str, type[Runtime]] = {
'runloop': RunloopRuntime,
'local': LocalRuntime,
'daytona': DaytonaRuntime,
'kubernetes': KubernetesRuntime,
'cli': CLIRuntime,
}
@ -50,6 +52,7 @@ __all__ = [
'RunloopRuntime',
'DockerRuntime',
'DaytonaRuntime',
'KubernetesRuntime',
'CLIRuntime',
'get_runtime_cls',
]
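# Usage sketch (illustrative): callers resolve the new runtime by name through the
# registry above, e.g.:
#   from openhands.runtime import KubernetesRuntime, get_runtime_cls
#   assert get_runtime_cls('kubernetes') is KubernetesRuntime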

View File

@ -0,0 +1,141 @@
# OpenHands Kubernetes Runtime
This directory contains the Kubernetes runtime implementation for OpenHands, which allows the software to run on Kubernetes clusters for scalable and isolated execution environments.
## Local Development with KIND
For local development and testing, OpenHands provides a convenient setup using KIND (Kubernetes IN Docker), which creates a local Kubernetes cluster.
### Prerequisites
Before setting up the local Kubernetes environment, ensure you have the following tools installed:
1. **KIND (Kubernetes IN Docker)** - [Installation Guide](https://kind.sigs.k8s.io/docs/user/quick-start/)
2. **kubectl** - [Installation Guide](https://kubernetes.io/docs/tasks/tools/#kubectl)
3. **mirrord** - [Installation Guide](https://metalbear.co/mirrord/docs/overview/quick-start/#installation)
mirrord is used for network mirroring, allowing the locally running process to interact with the Kubernetes cluster as if it were running inside the cluster.
4. **Docker or Podman** - Required for KIND to work
- Docker: Follow the official Docker installation guide for your platform
- Podman: [Installation Guide](https://podman.io/docs/installation)
### Configuration
To use the Kubernetes runtime, configure OpenHands through its TOML configuration file.
#### Required Configuration
Two configuration options are required to use the Kubernetes runtime:
1. **Runtime Type**: Set the runtime to use Kubernetes
```toml
[core]
runtime = "kubernetes"
```
2. **Runtime Container Image**: Specify the container image to use for the runtime environment
```toml
[sandbox]
runtime_container_image = "docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik"
```
#### Additional Kubernetes Options
OpenHands provides extensive configuration options for Kubernetes deployments under the `[kubernetes]` section. These options allow you to customize:
- Kubernetes namespace
- Persistent volume configuration
- Ingress and networking settings
- Runtime Pod Security settings
- Resource limits and requests
For a complete list of available Kubernetes configuration options, refer to the `[kubernetes]` section in the `config.template.toml` file in the repository root.
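To sanity-check that your configuration parses as expected, the following sketch (assuming Python 3.11+ `tomllib` and the `KubernetesConfig` model added in this PR) loads `config.toml` and prints the effective Kubernetes settings:
```python
import tomllib  # Python 3.11+ standard library

from openhands.core.config.kubernetes_config import KubernetesConfig

with open('config.toml', 'rb') as f:
    data = tomllib.load(f)

# Build the model from the [kubernetes] table; defaults apply for unset keys.
k8s = KubernetesConfig.from_toml_section(data.get('kubernetes', {}))['kubernetes']
print(k8s.namespace, k8s.ingress_domain, k8s.pvc_storage_size)
```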
## Local Development Setup
### Quick Start
To set up and run OpenHands with the Kubernetes runtime locally:
First, build the application:
```bash
make build
```
Then create (or reuse) the local cluster and start OpenHands:
```bash
make kind # idempotent: reuses an existing kind cluster or creates a new one if not present.
```
This command will:
1. **Check Dependencies**: Verify that `kind`, `kubectl`, and `mirrord` are installed
2. **Create KIND Cluster**: Create a local Kubernetes cluster named "local-hands" using the configuration in `kind/cluster.yaml`
3. **Deploy Infrastructure**: Apply Kubernetes manifests including:
- Ubuntu development pod for runtime execution
- Nginx ingress controller for HTTP routing
- RBAC configurations for proper permissions
4. **Setup Mirrord**: Install mirrord resources for development workflow
5. **Run Application**: Execute `make run` inside the mirrord environment
### Cluster Configuration
The KIND cluster is configured with:
- **Cluster Name**: `local-hands`
- **Node Configuration**: Single control-plane node
- **Port Mapping**: Host port 80 maps to container port 80 for nginx ingress
- **Base Image**: Ubuntu 22.04 for the development environment
### Infrastructure Components
The local setup includes several Kubernetes resources:
#### Development Environment
- **Deployment**: `ubuntu-dev` - Ubuntu 22.04 container for code execution
- **Service**: Exposes the development environment within the cluster
#### Ingress Controller (Nginx)
- **Namespace**: `ingress-nginx` - Dedicated namespace for ingress resources
- **Deployment**: `ingress-nginx-controller` - Handles HTTP routing and load balancing
- **Service**: LoadBalancer service for external access
- **ConfigMap**: Custom configuration for nginx controller
- **RBAC**: Roles and bindings for proper cluster permissions
#### Development Workflow
- **Mirrord Integration**: Allows running local development server while connecting to cluster resources
- **Port Forwarding**: Direct access to cluster services from localhost
### Usage
Once the environment is set up with `make kind`, the system will:
1. Wait for all deployments to be ready
2. Automatically start the OpenHands application using mirrord
3. Provide access to the application at http://127.0.0.1:3000/
The mirrord integration allows you to develop locally while your application has access to the Kubernetes cluster resources, providing a seamless development experience that mirrors production behavior.
### Troubleshooting
If you encounter issues:
1. **Check cluster status**: `kubectl get nodes`
2. **Verify deployments**: `kubectl get deployments --all-namespaces`
3. **Check ingress**: `kubectl get ingress --all-namespaces`
4. **View logs**: `kubectl logs -l app=ubuntu-dev`
To clean up the environment:
```bash
kind delete cluster --name local-hands
```

View File

@ -0,0 +1,752 @@
from functools import lru_cache
from typing import Callable
from uuid import UUID
import tenacity
import yaml
from kubernetes import client, config
from kubernetes.client.models import (
V1Container,
V1ContainerPort,
V1EnvVar,
V1HTTPIngressPath,
V1HTTPIngressRuleValue,
V1Ingress,
V1IngressBackend,
V1IngressRule,
V1IngressServiceBackend,
V1IngressSpec,
V1IngressTLS,
V1ObjectMeta,
V1PersistentVolumeClaim,
V1PersistentVolumeClaimSpec,
V1PersistentVolumeClaimVolumeSource,
V1Pod,
V1PodSpec,
V1ResourceRequirements,
V1SecurityContext,
V1Service,
V1ServiceBackendPort,
V1ServicePort,
V1ServiceSpec,
V1Toleration,
V1Volume,
V1VolumeMount,
)
from openhands.core.config import OpenHandsConfig
from openhands.core.exceptions import (
AgentRuntimeDisconnectedError,
AgentRuntimeNotFoundError,
)
from openhands.core.logger import DEBUG
from openhands.core.logger import openhands_logger as logger
from openhands.events import EventStream
from openhands.integrations.provider import PROVIDER_TOKEN_TYPE
from openhands.runtime.impl.action_execution.action_execution_client import (
ActionExecutionClient,
)
from openhands.runtime.plugins import PluginRequirement
from openhands.runtime.runtime_status import RuntimeStatus
from openhands.runtime.utils.command import get_action_execution_server_startup_command
from openhands.utils.async_utils import call_sync_from_async
from openhands.utils.shutdown_listener import add_shutdown_listener
from openhands.utils.tenacity_stop import stop_if_should_exit
POD_NAME_PREFIX = 'openhands-runtime-'
POD_LABEL = 'openhands-runtime'
class KubernetesRuntime(ActionExecutionClient):
"""
A Kubernetes runtime for OpenHands that works with Kind.
This runtime creates pods in a Kubernetes cluster to run the agent code.
It uses the Kubernetes Python client to create and manage the pods.
Args:
config (OpenHandsConfig): The application configuration.
event_stream (EventStream): The event stream to subscribe to.
sid (str, optional): The session ID. Defaults to 'default'.
plugins (list[PluginRequirement] | None, optional): List of plugin requirements. Defaults to None.
env_vars (dict[str, str] | None, optional): Environment variables to set. Defaults to None.
status_callback (Callable | None, optional): Callback for status updates. Defaults to None.
attach_to_existing (bool, optional): Whether to attach to an existing pod. Defaults to False.
headless_mode (bool, optional): Whether to run in headless mode. Defaults to True.
"""
_shutdown_listener_id: UUID | None = None
_namespace: str = ''
def __init__(
self,
config: OpenHandsConfig,
event_stream: EventStream,
sid: str = 'default',
plugins: list[PluginRequirement] | None = None,
env_vars: dict[str, str] | None = None,
status_callback: Callable | None = None,
attach_to_existing: bool = False,
headless_mode: bool = True,
user_id: str | None = None,
git_provider_tokens: PROVIDER_TOKEN_TYPE | None = None,
):
if not KubernetesRuntime._shutdown_listener_id:
KubernetesRuntime._shutdown_listener_id = add_shutdown_listener(
lambda: KubernetesRuntime._cleanup_k8s_resources(
namespace=self._k8s_namespace,
remove_pvc=True,
conversation_id=self.sid,
) # Invoked on process shutdown (e.g. Ctrl+C).
)
self.config = config
self._runtime_initialized: bool = False
self.status_callback = status_callback
# Load and validate Kubernetes configuration
if self.config.kubernetes is None:
raise ValueError(
'Kubernetes configuration is required when using KubernetesRuntime. '
'Please add a [kubernetes] section to your configuration.'
)
self._k8s_config = self.config.kubernetes
self._k8s_namespace = self._k8s_config.namespace
KubernetesRuntime._namespace = self._k8s_namespace
# Initialize ports with default values in the required range
self._container_port = 8080 # Default internal container port
self._vscode_port = 8081 # Default VSCode port.
self._app_ports: list[int] = [
30082,
30083,
] # Default app ports in the valid range; the agent prefers these when exposing an application.
self.k8s_client, self.k8s_networking_client = self._init_kubernetes_client()
self.pod_image = self.config.sandbox.runtime_container_image
if not self.pod_image:
# If runtime_container_image isn't set, use the base_container_image as a fallback
self.pod_image = self.config.sandbox.base_container_image
self.pod_name = POD_NAME_PREFIX + sid
# Initialize the API URL with the initial port value
self.k8s_local_url = f'http://{self._get_svc_name(self.pod_name)}.{self._k8s_namespace}.svc.cluster.local'
self.api_url = f'{self.k8s_local_url}:{self._container_port}'
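# For example, with sid 'default' and namespace 'default' this resolves to
# http://openhands-runtime-default-svc.default.svc.cluster.local:8080 (illustrative).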
super().__init__(
config,
event_stream,
sid,
plugins,
env_vars,
status_callback,
attach_to_existing,
headless_mode,
user_id,
git_provider_tokens,
)
@staticmethod
def _get_svc_name(pod_name: str) -> str:
"""Get the service name for the pod."""
return f'{pod_name}-svc'
@staticmethod
def _get_vscode_svc_name(pod_name: str) -> str:
"""Get the VSCode service name for the pod."""
return f'{pod_name}-svc-code'
@staticmethod
def _get_vscode_ingress_name(pod_name: str) -> str:
"""Get the VSCode ingress name for the pod."""
return f'{pod_name}-ingress-code'
@staticmethod
def _get_vscode_tls_secret_name(pod_name: str) -> str:
"""Get the TLS secret name for the VSCode ingress."""
return f'{pod_name}-tls-secret'
@staticmethod
def _get_pvc_name(pod_name: str) -> str:
"""Get the PVC name for the pod."""
return f'{pod_name}-pvc'
@staticmethod
def _get_pod_name(sid: str) -> str:
"""Get the pod name for the session."""
return POD_NAME_PREFIX + sid
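# Naming example (illustrative): for sid 'abc123' these helpers yield pod
# 'openhands-runtime-abc123', services 'openhands-runtime-abc123-svc' and
# 'openhands-runtime-abc123-svc-code', ingress 'openhands-runtime-abc123-ingress-code',
# and PVC 'openhands-runtime-abc123-pvc'.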
@property
def action_execution_server_url(self):
return self.api_url
@property
def node_selector(self) -> dict[str, str] | None:
if (
not self._k8s_config.node_selector_key
or not self._k8s_config.node_selector_val
):
return None
return {self._k8s_config.node_selector_key: self._k8s_config.node_selector_val}
@property
def tolerations(self) -> list[V1Toleration] | None:
if not self._k8s_config.tolerations_yaml:
return None
tolerations_yaml_str = self._k8s_config.tolerations_yaml
tolerations = []
try:
tolerations_data = yaml.safe_load(tolerations_yaml_str)
if isinstance(tolerations_data, list):
for toleration in tolerations_data:
tolerations.append(V1Toleration(**toleration))
else:
logger.error(
f'Invalid tolerations format; expected a YAML list, got: {tolerations_yaml_str}'
)
return None
except yaml.YAMLError as e:
logger.error(
f'Error parsing tolerations YAML: {tolerations_yaml_str}. Error: {e}'
)
return None
return tolerations
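# Illustrative tolerations_yaml value (an assumption, not shipped configuration):
#   "- key: dedicated\n  operator: Equal\n  value: openhands\n  effect: NoSchedule"
# yaml.safe_load parses it into a list of dicts, each unpacked into a V1Toleration above.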
async def connect(self):
"""Connect to the runtime by creating or attaching to a pod."""
self.log('info', f'Connecting to runtime with conversation ID: {self.sid}')
self.log('info', f'self._attach_to_existing: {self.attach_to_existing}')
self.set_runtime_status(RuntimeStatus.STARTING_RUNTIME)
self.log('info', f'Using API URL {self.api_url}')
try:
await call_sync_from_async(self._attach_to_pod)
except client.rest.ApiException as e:
# Pod not found. If we are not attaching to an existing pod, fall through and create the k8s resources below.
if self.attach_to_existing:
self.log(
'error',
f'Pod {self.pod_name} not found or cannot connect to it.',
)
raise AgentRuntimeDisconnectedError from e
self.log('info', f'Starting runtime with image: {self.pod_image}')
try:
await call_sync_from_async(self._init_k8s_resources)
self.log(
'info',
f'Pod started: {self.pod_name}. VSCode URL: {self.vscode_url}',
)
except Exception as init_error:
self.log('error', f'Failed to initialize k8s resources: {init_error}')
raise AgentRuntimeNotFoundError(
f'Failed to initialize kubernetes resources: {init_error}'
) from init_error
if not self.attach_to_existing:
self.log('info', 'Waiting for pod to become ready ...')
self.set_runtime_status(RuntimeStatus.STARTING_RUNTIME)
try:
await call_sync_from_async(self._wait_until_ready)
except Exception as alive_error:
self.log('error', f'Failed to connect to runtime: {alive_error}')
self.send_error_message(
'ERROR$RUNTIME_CONNECTION',
f'Failed to connect to runtime: {alive_error}',
)
raise AgentRuntimeDisconnectedError(
f'Failed to connect to runtime: {alive_error}'
) from alive_error
if not self.attach_to_existing:
self.log('info', 'Runtime is ready.')
if not self.attach_to_existing:
await call_sync_from_async(self.setup_initial_env)
self.log(
'info',
f'Pod initialized with plugins: {[plugin.name for plugin in self.plugins]}. VSCode URL: {self.vscode_url}',
)
if not self.attach_to_existing:
self.set_runtime_status(RuntimeStatus.READY)
self._runtime_initialized = True
def _attach_to_pod(self):
"""Attach to an existing pod."""
try:
pod = self.k8s_client.read_namespaced_pod(
name=self.pod_name, namespace=self._k8s_namespace
)
if pod.status.phase != 'Running':
try:
self._wait_until_ready()
except TimeoutError:
raise AgentRuntimeDisconnectedError(
f'Pod {self.pod_name} exists but failed to become ready.'
)
self.log('info', f'Successfully attached to pod {self.pod_name}')
return True
except client.rest.ApiException as e:
self.log('error', f'Failed to attach to pod: {e}')
raise
@tenacity.retry(
stop=tenacity.stop_after_delay(300) | stop_if_should_exit(),
retry=tenacity.retry_if_exception_type(TimeoutError),
reraise=True,
wait=tenacity.wait_fixed(2),
)
def _wait_until_ready(self):
"""Wait until the runtime server is alive by checking the pod status in Kubernetes."""
self.log('info', f'Checking if pod {self.pod_name} is ready in Kubernetes')
pod = self.k8s_client.read_namespaced_pod(
name=self.pod_name, namespace=self._k8s_namespace
)
if pod.status.phase == 'Running' and pod.status.conditions:
for condition in pod.status.conditions:
if condition.type == 'Ready' and condition.status == 'True':
self.log('info', f'Pod {self.pod_name} is ready!')
return True # Exit the function if the pod is ready
self.log(
'info',
f'Pod {self.pod_name} is not ready yet. Current phase: {pod.status.phase}',
)
raise TimeoutError(f'Pod {self.pod_name} is not in Running state yet.')
@staticmethod
@lru_cache(maxsize=1)
def _init_kubernetes_client() -> tuple[client.CoreV1Api, client.NetworkingV1Api]:
"""Initialize the Kubernetes client."""
try:
config.load_incluster_config() # Even local usage with mirrord technically uses an incluster config.
return client.CoreV1Api(), client.NetworkingV1Api()
except Exception as ex:
logger.error(
'Failed to initialize Kubernetes client. Make sure you have kubectl configured correctly or are running in a Kubernetes cluster.',
)
raise ex
@staticmethod
def _cleanup_k8s_resources(
namespace: str, remove_pvc: bool = False, conversation_id: str = ''
):
"""Clean up Kubernetes resources with our prefix in the namespace.
:param remove_pvc: If True, also remove persistent volume claims (defaults to False).
"""
try:
k8s_api, k8s_networking_api = KubernetesRuntime._init_kubernetes_client()
pod_name = KubernetesRuntime._get_pod_name(conversation_id)
service_name = KubernetesRuntime._get_svc_name(pod_name)
vscode_service_name = KubernetesRuntime._get_vscode_svc_name(pod_name)
ingress_name = KubernetesRuntime._get_vscode_ingress_name(pod_name)
pvc_name = KubernetesRuntime._get_pvc_name(pod_name)
try:
if remove_pvc:
# Delete PVC if requested
k8s_api.delete_namespaced_persistent_volume_claim(
name=pvc_name,
namespace=namespace,
body=client.V1DeleteOptions(),
)
logger.info(f'Deleted PVC {pvc_name}')
k8s_api.delete_namespaced_pod(
name=pod_name,
namespace=namespace,
body=client.V1DeleteOptions(),
)
logger.info(f'Deleted pod {pod_name}')
k8s_api.delete_namespaced_service(
name=service_name,
namespace=namespace,
)
logger.info(f'Deleted service {service_name}')
# Delete the VSCode service
k8s_api.delete_namespaced_service(
name=vscode_service_name, namespace=namespace
)
logger.info(f'Deleted service {vscode_service_name}')
k8s_networking_api.delete_namespaced_ingress(
name=ingress_name, namespace=namespace
)
logger.info(f'Deleted ingress {ingress_name}')
except client.rest.ApiException:
# Resources might not exist; ignore.
pass
logger.info('Cleaned up Kubernetes resources')
except Exception as e:
logger.error(f'Error cleaning up k8s resources: {e}')
def _get_pvc_manifest(self):
"""Create a PVC manifest for the runtime pod."""
# Create PVC
pvc = V1PersistentVolumeClaim(
api_version='v1',
kind='PersistentVolumeClaim',
metadata=V1ObjectMeta(
name=self._get_pvc_name(self.pod_name), namespace=self._k8s_namespace
),
spec=V1PersistentVolumeClaimSpec(
access_modes=['ReadWriteOnce'],
resources=V1ResourceRequirements(
requests={'storage': self._k8s_config.pvc_storage_size}
),
storage_class_name=self._k8s_config.pvc_storage_class,
),
)
return pvc
def _get_vscode_service_manifest(self):
"""Create a service manifest for the VSCode server."""
vscode_service_spec = V1ServiceSpec(
selector={'app': POD_LABEL, 'session': self.sid},
type='ClusterIP',
ports=[
V1ServicePort(
port=self._vscode_port,
target_port='vscode',
name='code',
)
],
)
vscode_service = V1Service(
metadata=V1ObjectMeta(name=self._get_vscode_svc_name(self.pod_name)),
spec=vscode_service_spec,
)
return vscode_service
def _get_runtime_service_manifest(self):
"""Create a service manifest for the runtime pod execution-server."""
service_spec = V1ServiceSpec(
selector={'app': POD_LABEL, 'session': self.sid},
type='ClusterIP',
ports=[
V1ServicePort(
port=self._container_port,
target_port='http',
name='execution-server',
)
],
)
service = V1Service(
metadata=V1ObjectMeta(name=self._get_svc_name(self.pod_name)),
spec=service_spec,
)
return service
def _get_runtime_pod_manifest(self):
"""Create a pod manifest for the runtime sandbox."""
# Prepare environment variables
environment = [
V1EnvVar(name='port', value=str(self._container_port)),
V1EnvVar(name='PYTHONUNBUFFERED', value='1'),
V1EnvVar(name='VSCODE_PORT', value=str(self._vscode_port)),
]
if self.config.debug or DEBUG:
environment.append(V1EnvVar(name='DEBUG', value='true'))
# Add runtime startup env vars
for key, value in self.config.sandbox.runtime_startup_env_vars.items():
environment.append(V1EnvVar(name=key, value=value))
# Prepare volume mounts if workspace is configured
volume_mounts = [
V1VolumeMount(
name='workspace-volume',
mount_path=self.config.workspace_mount_path_in_sandbox,
),
]
volumes = [
V1Volume(
name='workspace-volume',
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
claim_name=self._get_pvc_name(self.pod_name)
),
)
]
# Prepare container ports
container_ports = [
V1ContainerPort(container_port=self._container_port, name='http'),
]
if self.vscode_enabled:
container_ports.append(
V1ContainerPort(container_port=self._vscode_port, name='vscode')
)
for port in self._app_ports:
container_ports.append(V1ContainerPort(container_port=port))
# Define the readiness probe
health_check = client.V1Probe(
http_get=client.V1HTTPGetAction(
path='/alive',
port=self._container_port, # Or the port your application listens on
),
initial_delay_seconds=5, # Adjust as needed
period_seconds=10, # Adjust as needed
timeout_seconds=5, # Adjust as needed
success_threshold=1,
failure_threshold=3,
)
# Prepare command
# Entry point command for generated sandbox runtime pod.
command = get_action_execution_server_startup_command(
server_port=self._container_port,
plugins=self.plugins,
app_config=self.config,
override_user_id=0, # Using the default app_config.run_as_openhands would prevent editing files in VSCode due to file permissions.
override_username='root',
)
# Prepare resource requirements based on config
resources = V1ResourceRequirements(
limits={'memory': self._k8s_config.resource_memory_limit},
requests={
'cpu': self._k8s_config.resource_cpu_request,
'memory': self._k8s_config.resource_memory_request,
},
)
# Set security context for the container
security_context = V1SecurityContext(privileged=self._k8s_config.privileged)
# Create the container definition
container = V1Container(
name='runtime',
image=self.pod_image,
command=command,
env=environment,
ports=container_ports,
volume_mounts=volume_mounts,
working_dir='/openhands/code/',
resources=resources,
readiness_probe=health_check,
security_context=security_context,
)
# Create the pod definition
image_pull_secrets = None
if self._k8s_config.image_pull_secret:
image_pull_secrets = [
client.V1LocalObjectReference(name=self._k8s_config.image_pull_secret)
]
pod = V1Pod(
metadata=V1ObjectMeta(
name=self.pod_name, labels={'app': POD_LABEL, 'session': self.sid}
),
spec=V1PodSpec(
containers=[container],
volumes=volumes,
restart_policy='Never',
image_pull_secrets=image_pull_secrets,
node_selector=self.node_selector,
tolerations=self.tolerations,
),
)
return pod
def _get_vscode_ingress_manifest(self):
"""Create an ingress manifest for the VSCode server."""
tls = []
if self._k8s_config.ingress_tls_secret:
runtime_tls = V1IngressTLS(
hosts=[self.ingress_domain],
secret_name=self._k8s_config.ingress_tls_secret,
)
tls = [runtime_tls]
rules = [
V1IngressRule(
host=self.ingress_domain,
http=V1HTTPIngressRuleValue(
paths=[
V1HTTPIngressPath(
path='/',
path_type='Prefix',
backend=V1IngressBackend(
service=V1IngressServiceBackend(
port=V1ServiceBackendPort(
number=self._vscode_port,
),
name=self._get_vscode_svc_name(self.pod_name),
)
),
)
]
),
)
]
ingress_spec = V1IngressSpec(rules=rules, tls=tls)
ingress = V1Ingress(
api_version='networking.k8s.io/v1',
metadata=V1ObjectMeta(
name=self._get_vscode_ingress_name(self.pod_name),
annotations={
'external-dns.alpha.kubernetes.io/hostname': self.ingress_domain
},
),
spec=ingress_spec,
)
return ingress
def _pvc_exists(self):
"""Check if the PVC already exists."""
try:
pvc = self.k8s_client.read_namespaced_persistent_volume_claim(
name=self._get_pvc_name(self.pod_name), namespace=self._k8s_namespace
)
return pvc is not None
except client.rest.ApiException as e:
if e.status == 404:
return False
self.log('error', f'Error checking PVC existence: {e}')
def _init_k8s_resources(self):
"""Initialize the Kubernetes resources."""
self.log('info', 'Preparing to start pod...')
self.set_runtime_status(RuntimeStatus.STARTING_RUNTIME)
self.log('info', f'Runtime will be accessible at {self.api_url}')
pod = self._get_runtime_pod_manifest()
service = self._get_runtime_service_manifest()
vscode_service = self._get_vscode_service_manifest()
pvc_manifest = self._get_pvc_manifest()
ingress = self._get_vscode_ingress_manifest()
# Create the pod in Kubernetes
try:
if not self._pvc_exists():
# Create PVC if it doesn't exist
self.k8s_client.create_namespaced_persistent_volume_claim(
namespace=self._k8s_namespace, body=pvc_manifest
)
self.log('info', f'Created PVC {self._get_pvc_name(self.pod_name)}')
self.k8s_client.create_namespaced_pod(
namespace=self._k8s_namespace, body=pod
)
self.log('info', f'Created pod {self.pod_name}.')
# Create a service to expose the pod for external access
self.k8s_client.create_namespaced_service(
namespace=self._k8s_namespace, body=service
)
self.log('info', f'Created service {self._get_svc_name(self.pod_name)}')
# Create a second service for the VSCode server.
self.k8s_client.create_namespaced_service(
namespace=self._k8s_namespace, body=vscode_service
)
self.log(
'info', f'Created service {self._get_vscode_svc_name(self.pod_name)}'
)
# create the vscode ingress.
self.k8s_networking_client.create_namespaced_ingress(
namespace=self._k8s_namespace, body=ingress
)
self.log(
'info',
f'Created ingress {self._get_vscode_ingress_name(self.pod_name)}',
)
# Wait for the pod to be running
self._wait_until_ready()
except client.rest.ApiException as e:
self.log('error', f'Failed to create pod and services: {e}')
raise
except RuntimeError as e:
self.log('error', f'Port forwarding failed: {e}')
raise
def close(self):
"""Close the runtime and clean up resources."""
# this is called when a single conversation question is answered or a tab is closed.
self.log(
'info',
f'Closing runtime and cleaning up resources for conversation ID: {self.sid}',
)
# Call parent class close method first
super().close()
# Return early if we should keep the runtime alive or if we're attaching to existing
if self.config.sandbox.keep_runtime_alive or self.attach_to_existing:
self.log(
'info', 'Keeping runtime alive due to configuration or attach mode'
)
return
try:
self._cleanup_k8s_resources(
namespace=self._k8s_namespace,
remove_pvc=False,
conversation_id=self.sid,
)
except Exception as e:
self.log('error', f'Error closing runtime: {e}')
@property
def ingress_domain(self) -> str:
"""Get the ingress domain for the runtime."""
return f'{self.sid}.{self._k8s_config.ingress_domain}'
@property
def vscode_url(self) -> str | None:
"""Get the URL for VSCode server if enabled."""
if not self.vscode_enabled:
return None
token = super().get_vscode_token()
if not token:
return None
protocol = 'https' if self._k8s_config.ingress_tls_secret else 'http'
vscode_url = f'{protocol}://{self.ingress_domain}/?tkn={token}&folder={self.config.workspace_mount_path_in_sandbox}'
self.log('info', f'VSCode URL: {vscode_url}')
return vscode_url
@property
def web_hosts(self) -> dict[str, int]:
"""Get web hosts dict mapping for browser access."""
hosts = {}
for port in self._app_ports:
hosts[f'{self.k8s_local_url}:{port}'] = port
return hosts
@classmethod
async def delete(cls, conversation_id: str):
"""Delete resources associated with a conversation."""
# Triggered when the user deletes the conversation in the UI.
try:
cls._cleanup_k8s_resources(
namespace=cls._namespace,
remove_pvc=True,
conversation_id=conversation_id,
)
except Exception as e:
logger.error(
f'Error deleting resources for conversation {conversation_id}: {e}'
)

poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
[[package]]
name = "aioboto3"
@ -462,7 +462,7 @@ description = "LTS Port of Python audioop"
optional = false
python-versions = ">=3.13"
groups = ["main"]
markers = "python_version == \"3.13\""
markers = "python_version >= \"3.13\""
files = [
{file = "audioop_lts-0.2.1-cp313-abi3-macosx_10_13_universal2.whl", hash = "sha256:fd1345ae99e17e6910f47ce7d52673c6a1a70820d78b67de1b7abb3af29c426a"},
{file = "audioop_lts-0.2.1-cp313-abi3-macosx_10_13_x86_64.whl", hash = "sha256:e175350da05d2087e12cea8e72a70a1a8b14a17e92ed2022952a4419689ede5e"},
@ -2299,6 +2299,18 @@ https = ["urllib3 (>=1.24.1)"]
paramiko = ["paramiko"]
pgp = ["gpg"]
[[package]]
name = "durationpy"
version = "0.10"
description = "Module for converting between datetime.timedelta and Go's Duration strings."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286"},
{file = "durationpy-0.10.tar.gz", hash = "sha256:1fa6893409a6e739c9c72334fc65cca1f355dbdd93405d30f726deb5bde42fba"},
]
[[package]]
name = "e2b"
version = "1.5.1"
@ -3039,8 +3051,8 @@ files = [
google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]}
google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev"
proto-plus = [
{version = ">=1.25.0,<2.0.0dev", markers = "python_version >= \"3.13\""},
{version = ">=1.22.3,<2.0.0dev"},
{version = ">=1.25.0,<2.0.0dev", markers = "python_version >= \"3.13\""},
]
protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev"
@ -3062,8 +3074,8 @@ googleapis-common-protos = ">=1.56.2,<2.0.0"
grpcio = {version = ">=1.49.1,<2.0.0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}
grpcio-status = {version = ">=1.49.1,<2.0.0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}
proto-plus = [
{version = ">=1.25.0,<2.0.0", markers = "python_version >= \"3.13\""},
{version = ">=1.22.3,<2.0.0"},
{version = ">=1.25.0,<2.0.0", markers = "python_version >= \"3.13\""},
]
protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<7.0.0"
requests = ">=2.18.0,<3.0.0"
@ -3281,8 +3293,8 @@ google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0", extras
google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0"
grpc-google-iam-v1 = ">=0.14.0,<1.0.0"
proto-plus = [
{version = ">=1.25.0,<2.0.0", markers = "python_version >= \"3.13\""},
{version = ">=1.22.3,<2.0.0"},
{version = ">=1.25.0,<2.0.0", markers = "python_version >= \"3.13\""},
]
protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<7.0.0"
@ -4788,6 +4800,34 @@ files = [
{file = "kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e"},
]
[[package]]
name = "kubernetes"
version = "33.1.0"
description = "Kubernetes python client"
optional = false
python-versions = ">=3.6"
groups = ["main"]
files = [
{file = "kubernetes-33.1.0-py2.py3-none-any.whl", hash = "sha256:544de42b24b64287f7e0aa9513c93cb503f7f40eea39b20f66810011a86eabc5"},
{file = "kubernetes-33.1.0.tar.gz", hash = "sha256:f64d829843a54c251061a8e7a14523b521f2dc5c896cf6d65ccf348648a88993"},
]
[package.dependencies]
certifi = ">=14.05.14"
durationpy = ">=0.7"
google-auth = ">=1.0.1"
oauthlib = ">=3.2.2"
python-dateutil = ">=2.5.3"
pyyaml = ">=5.4.1"
requests = "*"
requests-oauthlib = "*"
six = ">=1.9.0"
urllib3 = ">=1.24.2"
websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0"
[package.extras]
adal = ["adal (>=1.0.2)"]
[[package]]
name = "lazy-loader"
version = "0.4"
@ -5146,11 +5186,8 @@ files = [
{file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"},
{file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"},
{file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"},
{file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"},
{file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"},
{file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"},
@ -6544,8 +6581,8 @@ files = [
[package.dependencies]
googleapis-common-protos = ">=1.52,<2.0"
grpcio = [
{version = ">=1.66.2,<2.0.0", markers = "python_version >= \"3.13\""},
{version = ">=1.63.2,<2.0.0", markers = "python_version < \"3.13\""},
{version = ">=1.66.2,<2.0.0", markers = "python_version >= \"3.13\""},
]
opentelemetry-api = ">=1.15,<2.0"
opentelemetry-exporter-otlp-proto-common = "1.34.1"
@ -9308,6 +9345,7 @@ files = [
{file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"},
{file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"},
]
markers = {evaluation = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
[package.extras]
check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
@ -9550,7 +9588,7 @@ description = "Standard library aifc redistribution. \"dead battery\"."
optional = false
python-versions = "*"
groups = ["main"]
markers = "python_version == \"3.13\""
markers = "python_version >= \"3.13\""
files = [
{file = "standard_aifc-3.13.0-py3-none-any.whl", hash = "sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66"},
{file = "standard_aifc-3.13.0.tar.gz", hash = "sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43"},
@ -9567,7 +9605,7 @@ description = "Standard library chunk redistribution. \"dead battery\"."
optional = false
python-versions = "*"
groups = ["main"]
markers = "python_version == \"3.13\""
markers = "python_version >= \"3.13\""
files = [
{file = "standard_chunk-3.13.0-py3-none-any.whl", hash = "sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c"},
{file = "standard_chunk-3.13.0.tar.gz", hash = "sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654"},
@ -10899,7 +10937,7 @@ version = "1.8.0"
description = "WebSocket client for Python with low level API options"
optional = false
python-versions = ">=3.8"
groups = ["runtime"]
groups = ["main", "runtime"]
files = [
{file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"},
{file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"},
@ -11729,4 +11767,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12,<3.14"
content-hash = "df8217d9808a5a1f5886e0328cbeb5032b20c28a677154888bd010f7bc945cb2"
content-hash = "cce67d8303f93acbf92f3a3603ad07ff82fea4163fc8c38614b3ecb172c34052"

View File

@ -85,6 +85,8 @@ stripe = ">=11.5,<13.0"
google-cloud-aiplatform = "*"
anthropic = { extras = [ "vertex" ], version = "*" }
boto3 = "*"
kubernetes = "^33.1.0"
pyyaml = "^6.0.2"
[tool.poetry.group.dev]
optional = true

View File

@ -0,0 +1,62 @@
import pytest
from pydantic import ValidationError
from openhands.core.config.kubernetes_config import KubernetesConfig
def test_kubernetes_config_defaults():
"""Test that KubernetesConfig has correct default values."""
config = KubernetesConfig()
assert config.namespace == 'default'
assert config.ingress_domain == 'localhost'
assert config.pvc_storage_size == '2Gi'
assert config.pvc_storage_class is None
assert config.resource_cpu_request == '1'
assert config.resource_memory_request == '1Gi'
assert config.resource_memory_limit == '2Gi'
assert config.image_pull_secret is None
assert config.ingress_tls_secret is None
assert config.node_selector_key is None
assert config.node_selector_val is None
assert config.tolerations_yaml is None
assert config.privileged is False
def test_kubernetes_config_custom_values():
"""Test that KubernetesConfig accepts custom values."""
config = KubernetesConfig(
namespace='test-ns',
ingress_domain='test.example.com',
pvc_storage_size='5Gi',
pvc_storage_class='fast',
resource_cpu_request='2',
resource_memory_request='2Gi',
resource_memory_limit='4Gi',
image_pull_secret='pull-secret',
ingress_tls_secret='tls-secret',
node_selector_key='zone',
node_selector_val='us-east-1',
tolerations_yaml='- key: special\n value: true',
privileged=True,
)
assert config.namespace == 'test-ns'
assert config.ingress_domain == 'test.example.com'
assert config.pvc_storage_size == '5Gi'
assert config.pvc_storage_class == 'fast'
assert config.resource_cpu_request == '2'
assert config.resource_memory_request == '2Gi'
assert config.resource_memory_limit == '4Gi'
assert config.image_pull_secret == 'pull-secret'
assert config.ingress_tls_secret == 'tls-secret'
assert config.node_selector_key == 'zone'
assert config.node_selector_val == 'us-east-1'
assert config.tolerations_yaml == '- key: special\n value: true'
assert config.privileged is True
def test_kubernetes_config_validation():
"""Test that KubernetesConfig validates input correctly."""
# Test that extra fields are not allowed
with pytest.raises(ValidationError):
KubernetesConfig(extra_field='not allowed')
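# A possible companion test (sketch, assuming the from_toml_section API added in this PR):
def test_kubernetes_config_from_toml_section():
    """from_toml_section maps the [kubernetes] table and rejects unknown keys."""
    mapping = KubernetesConfig.from_toml_section({'namespace': 'test-ns'})
    assert mapping['kubernetes'].namespace == 'test-ns'

    # Unknown keys surface as ValueError because the model forbids extra fields.
    with pytest.raises(ValueError):
        KubernetesConfig.from_toml_section({'unknown_field': True})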