diff --git a/enterprise/poetry.lock b/enterprise/poetry.lock index 1b72e15831..39d031384b 100644 --- a/enterprise/poetry.lock +++ b/enterprise/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "aiofiles" @@ -5860,7 +5860,7 @@ wsproto = ">=1.2.0" [[package]] name = "openhands-ai" -version = "0.0.0-post.5803+a8098505c" +version = "1.1.0" description = "OpenHands: Code Less, Make More" optional = false python-versions = "^3.12,<3.14" @@ -6858,22 +6858,6 @@ files = [ [package.extras] twisted = ["twisted"] -[[package]] -name = "prometheus-fastapi-instrumentator" -version = "7.1.0" -description = "Instrument your FastAPI app with Prometheus metrics" -optional = false -python-versions = ">=3.8" -groups = ["main"] -files = [ - {file = "prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl", hash = "sha256:978130f3c0bb7b8ebcc90d35516a6fe13e02d2eb358c8f83887cdef7020c31e9"}, - {file = "prometheus_fastapi_instrumentator-7.1.0.tar.gz", hash = "sha256:be7cd61eeea4e5912aeccb4261c6631b3f227d8924542d79eaf5af3f439cbe5e"}, -] - -[package.dependencies] -prometheus-client = ">=0.8.0,<1.0.0" -starlette = ">=0.30.0,<1.0.0" - [[package]] name = "prompt-toolkit" version = "3.0.52" @@ -14508,4 +14492,4 @@ cffi = ["cffi (>=1.17) ; python_version >= \"3.13\" and platform_python_implemen [metadata] lock-version = "2.1" python-versions = "^3.12,<3.14" -content-hash = "fac67a8991a3e2c840a23702dc90f99e98d381f3537ad50b4c4739cdbde941ca" +content-hash = "ab703edc73639f22f498894d16bf7170fe3ab9c2697761cdd494587caee77973" diff --git a/enterprise/pyproject.toml b/enterprise/pyproject.toml index f18407fea9..b737b7f895 100644 --- a/enterprise/pyproject.toml +++ b/enterprise/pyproject.toml @@ -29,7 +29,6 @@ cloud-sql-python-connector = "^1.16.0" psycopg2-binary = "^2.9.10" pg8000 = "^1.31.2" stripe = "^11.5.0" -prometheus-fastapi-instrumentator = "^7.0.2" python-json-logger = "^3.2.1" python-keycloak = "^5.3.1" asyncpg = "^0.30.0" diff --git a/enterprise/saas_server.py b/enterprise/saas_server.py index ec1480cbda..1748f86463 100644 --- a/enterprise/saas_server.py +++ b/enterprise/saas_server.py @@ -18,7 +18,6 @@ from server.auth.constants import ( # noqa: E402 ) from server.constants import PERMITTED_CORS_ORIGINS # noqa: E402 from server.logger import logger # noqa: E402 -from server.metrics import metrics_app # noqa: E402 from server.middleware import SetAuthCookieMiddleware # noqa: E402 from server.rate_limit import setup_rate_limit_handler # noqa: E402 from server.routes.api_keys import api_router as api_keys_router # noqa: E402 @@ -61,9 +60,6 @@ def is_saas(): return {'saas': True} -# This requires a trailing slash to access, like /api/metrics/ -base_app.mount('/internal/metrics', metrics_app()) - base_app.include_router(readiness_router) # Add routes for readiness checks base_app.include_router(api_router) # Add additional route for github auth base_app.include_router(oauth_router) # Add additional route for oauth callback diff --git a/enterprise/server/metrics.py b/enterprise/server/metrics.py deleted file mode 100644 index 5afed66979..0000000000 --- a/enterprise/server/metrics.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Callable - -from prometheus_client import Gauge, make_asgi_app -from server.clustered_conversation_manager import ClusteredConversationManager - -from openhands.server.shared import ( - conversation_manager, -) - -RUNNING_AGENT_LOOPS_GAUGE = Gauge( - 'saas_running_agent_loops', - 'Count of running agent loops, aggregate by session_id to dedupe', - ['session_id'], -) - - -async def _update_metrics(): - """Update any prometheus metrics that are not updated during normal operation.""" - if isinstance(conversation_manager, ClusteredConversationManager): - running_agent_loops = ( - await conversation_manager.get_running_agent_loops_locally() - ) - # Clear so we don't keep counting old sessions. - # This is theoretically a race condition but this is scraped on a regular interval. - RUNNING_AGENT_LOOPS_GAUGE.clear() - # running_agent_loops shouldn't be None, but can be. - if running_agent_loops is not None: - for sid in running_agent_loops: - RUNNING_AGENT_LOOPS_GAUGE.labels(session_id=sid).set(1) - - -def metrics_app() -> Callable: - metrics_callable = make_asgi_app() - - async def wrapped_handler(scope, receive, send): - """ - Call _update_metrics before serving Prometheus metrics endpoint. - Not wrapped in a `try`, failing would make metrics endpoint unavailable. - """ - await _update_metrics() - await metrics_callable(scope, receive, send) - - return wrapped_handler diff --git a/enterprise/server/saas_monitoring_listener.py b/enterprise/server/saas_monitoring_listener.py index 1b687f04c8..83d3a3657a 100644 --- a/enterprise/server/saas_monitoring_listener.py +++ b/enterprise/server/saas_monitoring_listener.py @@ -1,4 +1,3 @@ -from prometheus_client import Counter, Histogram from server.logger import logger from openhands.core.config.openhands_config import OpenHandsConfig @@ -9,45 +8,27 @@ from openhands.events.observation import ( ) from openhands.server.monitoring import MonitoringListener -AGENT_STATUS_ERROR_COUNT = Counter( - 'saas_agent_status_errors', 'Agent Status change events to status error' -) -CREATE_CONVERSATION_COUNT = Counter( - 'saas_create_conversation', 'Create conversation attempts' -) -AGENT_SESSION_START_HISTOGRAM = Histogram( - 'saas_agent_session_start', - 'AgentSession starts with success and duration', - labelnames=['success'], -) - class SaaSMonitoringListener(MonitoringListener): - """ - Forward app signals to Prometheus. - """ + """Forward app signals to structured logging for GCP native monitoring.""" def on_session_event(self, event: Event) -> None: - """ - Track metrics about events being added to a Session's EventStream. - """ + """Track metrics about events being added to a Session's EventStream.""" if ( isinstance(event, AgentStateChangedObservation) and event.agent_state == AgentState.ERROR ): - AGENT_STATUS_ERROR_COUNT.inc() logger.info( 'Tracking agent status error', extra={'signal': 'saas_agent_status_errors'}, ) def on_agent_session_start(self, success: bool, duration: float) -> None: - """ - Track an agent session start. + """Track an agent session start. + Success is true if startup completed without error. Duration is start time in seconds observed by AgentSession. """ - AGENT_SESSION_START_HISTOGRAM.labels(success=success).observe(duration) logger.info( 'Tracking agent session start', extra={ @@ -58,11 +39,10 @@ class SaaSMonitoringListener(MonitoringListener): ) def on_create_conversation(self) -> None: - """ - Track the beginning of conversation creation. + """Track the beginning of conversation creation. + Does not currently capture whether it succeed. """ - CREATE_CONVERSATION_COUNT.inc() logger.info( 'Tracking create conversation', extra={'signal': 'saas_create_conversation'} )