refactor(enterprise): Remove custom Prometheus metrics app (#12253)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Graham Neubig
2026-01-07 14:49:50 -05:00
committed by GitHub
parent e485c93119
commit 11d1e79506
5 changed files with 9 additions and 93 deletions

22
enterprise/poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
[[package]]
name = "aiofiles"
@@ -5860,7 +5860,7 @@ wsproto = ">=1.2.0"
[[package]]
name = "openhands-ai"
version = "0.0.0-post.5803+a8098505c"
version = "1.1.0"
description = "OpenHands: Code Less, Make More"
optional = false
python-versions = "^3.12,<3.14"
@@ -6858,22 +6858,6 @@ files = [
[package.extras]
twisted = ["twisted"]
[[package]]
name = "prometheus-fastapi-instrumentator"
version = "7.1.0"
description = "Instrument your FastAPI app with Prometheus metrics"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl", hash = "sha256:978130f3c0bb7b8ebcc90d35516a6fe13e02d2eb358c8f83887cdef7020c31e9"},
{file = "prometheus_fastapi_instrumentator-7.1.0.tar.gz", hash = "sha256:be7cd61eeea4e5912aeccb4261c6631b3f227d8924542d79eaf5af3f439cbe5e"},
]
[package.dependencies]
prometheus-client = ">=0.8.0,<1.0.0"
starlette = ">=0.30.0,<1.0.0"
[[package]]
name = "prompt-toolkit"
version = "3.0.52"
@@ -14508,4 +14492,4 @@ cffi = ["cffi (>=1.17) ; python_version >= \"3.13\" and platform_python_implemen
[metadata]
lock-version = "2.1"
python-versions = "^3.12,<3.14"
content-hash = "fac67a8991a3e2c840a23702dc90f99e98d381f3537ad50b4c4739cdbde941ca"
content-hash = "ab703edc73639f22f498894d16bf7170fe3ab9c2697761cdd494587caee77973"

View File

@@ -29,7 +29,6 @@ cloud-sql-python-connector = "^1.16.0"
psycopg2-binary = "^2.9.10"
pg8000 = "^1.31.2"
stripe = "^11.5.0"
prometheus-fastapi-instrumentator = "^7.0.2"
python-json-logger = "^3.2.1"
python-keycloak = "^5.3.1"
asyncpg = "^0.30.0"

View File

@@ -18,7 +18,6 @@ from server.auth.constants import ( # noqa: E402
)
from server.constants import PERMITTED_CORS_ORIGINS # noqa: E402
from server.logger import logger # noqa: E402
from server.metrics import metrics_app # noqa: E402
from server.middleware import SetAuthCookieMiddleware # noqa: E402
from server.rate_limit import setup_rate_limit_handler # noqa: E402
from server.routes.api_keys import api_router as api_keys_router # noqa: E402
@@ -61,9 +60,6 @@ def is_saas():
return {'saas': True}
# This requires a trailing slash to access, like /internal/metrics/
base_app.mount('/internal/metrics', metrics_app())
base_app.include_router(readiness_router) # Add routes for readiness checks
base_app.include_router(api_router) # Add additional route for github auth
base_app.include_router(oauth_router) # Add additional route for oauth callback

View File

@@ -1,43 +0,0 @@
from typing import Callable
from prometheus_client import Gauge, make_asgi_app
from server.clustered_conversation_manager import ClusteredConversationManager
from openhands.server.shared import (
conversation_manager,
)
# Gauge with one labelled series per session_id. Consumers are expected to
# aggregate by session_id to de-duplicate, as the help text below states.
RUNNING_AGENT_LOOPS_GAUGE = Gauge(
    name='saas_running_agent_loops',
    documentation='Count of running agent loops, aggregate by session_id to dedupe',
    labelnames=['session_id'],
)
async def _update_metrics():
    """Refresh Prometheus metrics that normal operation does not keep current.

    Only acts when the shared ``conversation_manager`` is a
    ``ClusteredConversationManager``; for any other manager this is a no-op.
    When it does act, the running-agent-loops gauge is rebuilt from scratch:
    every existing label set is cleared, then each session id reported by the
    manager is set to 1.
    """
    if not isinstance(conversation_manager, ClusteredConversationManager):
        return

    local_session_ids = await conversation_manager.get_running_agent_loops_locally()

    # Wipe the previous label sets so sessions that are no longer reported
    # stop being counted. There is a theoretical race between the clear and
    # the re-population below, tolerated because the endpoint is scraped on
    # a regular interval.
    RUNNING_AGENT_LOOPS_GAUGE.clear()

    # The manager is not supposed to hand back None, but it can.
    if local_session_ids is None:
        return

    for session_id in local_session_ids:
        RUNNING_AGENT_LOOPS_GAUGE.labels(session_id=session_id).set(1)
def metrics_app() -> Callable:
    """Build an ASGI callable that refreshes metrics before serving them.

    Returns:
        An ASGI application ``(scope, receive, send)`` that first awaits
        ``_update_metrics()`` and then delegates to the app created by
        ``make_asgi_app()``.
    """
    prometheus_asgi = make_asgi_app()

    async def wrapped_handler(scope, receive, send):
        """Refresh on-demand metrics, then hand the request to Prometheus.

        The refresh is intentionally not guarded by ``try``/``except``: if it
        raises, the exception propagates and the metrics response is never
        served, so a failure makes the endpoint unavailable.
        """
        await _update_metrics()
        await prometheus_asgi(scope, receive, send)

    return wrapped_handler

View File

@@ -1,4 +1,3 @@
from prometheus_client import Counter, Histogram
from server.logger import logger
from openhands.core.config.openhands_config import OpenHandsConfig
@@ -9,45 +8,27 @@ from openhands.events.observation import (
)
from openhands.server.monitoring import MonitoringListener
AGENT_STATUS_ERROR_COUNT = Counter(
'saas_agent_status_errors', 'Agent Status change events to status error'
)
CREATE_CONVERSATION_COUNT = Counter(
'saas_create_conversation', 'Create conversation attempts'
)
AGENT_SESSION_START_HISTOGRAM = Histogram(
'saas_agent_session_start',
'AgentSession starts with success and duration',
labelnames=['success'],
)
class SaaSMonitoringListener(MonitoringListener):
"""
Forward app signals to Prometheus.
"""
"""Forward app signals to structured logging for GCP native monitoring."""
def on_session_event(self, event: Event) -> None:
"""
Track metrics about events being added to a Session's EventStream.
"""
"""Track metrics about events being added to a Session's EventStream."""
if (
isinstance(event, AgentStateChangedObservation)
and event.agent_state == AgentState.ERROR
):
AGENT_STATUS_ERROR_COUNT.inc()
logger.info(
'Tracking agent status error',
extra={'signal': 'saas_agent_status_errors'},
)
def on_agent_session_start(self, success: bool, duration: float) -> None:
"""
Track an agent session start.
"""Track an agent session start.
Success is true if startup completed without error.
Duration is start time in seconds observed by AgentSession.
"""
AGENT_SESSION_START_HISTOGRAM.labels(success=success).observe(duration)
logger.info(
'Tracking agent session start',
extra={
@@ -58,11 +39,10 @@ class SaaSMonitoringListener(MonitoringListener):
)
def on_create_conversation(self) -> None:
"""
Track the beginning of conversation creation.
"""Track the beginning of conversation creation.
Does not currently capture whether it succeeds.
"""
CREATE_CONVERSATION_COUNT.inc()
logger.info(
'Tracking create conversation', extra={'signal': 'saas_create_conversation'}
)