mirror of
https://github.com/OpenHands/OpenHands.git
synced 2026-03-22 05:37:20 +08:00
refactor(enterprise): Remove custom Prometheus metrics app (#12253)
Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
22
enterprise/poetry.lock
generated
22
enterprise/poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiofiles"
|
||||
@@ -5860,7 +5860,7 @@ wsproto = ">=1.2.0"
|
||||
|
||||
[[package]]
|
||||
name = "openhands-ai"
|
||||
version = "0.0.0-post.5803+a8098505c"
|
||||
version = "1.1.0"
|
||||
description = "OpenHands: Code Less, Make More"
|
||||
optional = false
|
||||
python-versions = "^3.12,<3.14"
|
||||
@@ -6858,22 +6858,6 @@ files = [
|
||||
[package.extras]
|
||||
twisted = ["twisted"]
|
||||
|
||||
[[package]]
|
||||
name = "prometheus-fastapi-instrumentator"
|
||||
version = "7.1.0"
|
||||
description = "Instrument your FastAPI app with Prometheus metrics"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl", hash = "sha256:978130f3c0bb7b8ebcc90d35516a6fe13e02d2eb358c8f83887cdef7020c31e9"},
|
||||
{file = "prometheus_fastapi_instrumentator-7.1.0.tar.gz", hash = "sha256:be7cd61eeea4e5912aeccb4261c6631b3f227d8924542d79eaf5af3f439cbe5e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
prometheus-client = ">=0.8.0,<1.0.0"
|
||||
starlette = ">=0.30.0,<1.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "prompt-toolkit"
|
||||
version = "3.0.52"
|
||||
@@ -14508,4 +14492,4 @@ cffi = ["cffi (>=1.17) ; python_version >= \"3.13\" and platform_python_implemen
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.12,<3.14"
|
||||
content-hash = "fac67a8991a3e2c840a23702dc90f99e98d381f3537ad50b4c4739cdbde941ca"
|
||||
content-hash = "ab703edc73639f22f498894d16bf7170fe3ab9c2697761cdd494587caee77973"
|
||||
|
||||
@@ -29,7 +29,6 @@ cloud-sql-python-connector = "^1.16.0"
|
||||
psycopg2-binary = "^2.9.10"
|
||||
pg8000 = "^1.31.2"
|
||||
stripe = "^11.5.0"
|
||||
prometheus-fastapi-instrumentator = "^7.0.2"
|
||||
python-json-logger = "^3.2.1"
|
||||
python-keycloak = "^5.3.1"
|
||||
asyncpg = "^0.30.0"
|
||||
|
||||
@@ -18,7 +18,6 @@ from server.auth.constants import ( # noqa: E402
|
||||
)
|
||||
from server.constants import PERMITTED_CORS_ORIGINS # noqa: E402
|
||||
from server.logger import logger # noqa: E402
|
||||
from server.metrics import metrics_app # noqa: E402
|
||||
from server.middleware import SetAuthCookieMiddleware # noqa: E402
|
||||
from server.rate_limit import setup_rate_limit_handler # noqa: E402
|
||||
from server.routes.api_keys import api_router as api_keys_router # noqa: E402
|
||||
@@ -61,9 +60,6 @@ def is_saas():
|
||||
return {'saas': True}
|
||||
|
||||
|
||||
# This requires a trailing slash to access, like /internal/metrics/
|
||||
base_app.mount('/internal/metrics', metrics_app())
|
||||
|
||||
base_app.include_router(readiness_router) # Add routes for readiness checks
|
||||
base_app.include_router(api_router) # Add additional route for github auth
|
||||
base_app.include_router(oauth_router) # Add additional route for oauth callback
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
from typing import Callable
|
||||
|
||||
from prometheus_client import Gauge, make_asgi_app
|
||||
from server.clustered_conversation_manager import ClusteredConversationManager
|
||||
|
||||
from openhands.server.shared import (
|
||||
conversation_manager,
|
||||
)
|
||||
|
||||
RUNNING_AGENT_LOOPS_GAUGE = Gauge(
|
||||
'saas_running_agent_loops',
|
||||
'Count of running agent loops, aggregate by session_id to dedupe',
|
||||
['session_id'],
|
||||
)
|
||||
|
||||
|
||||
async def _update_metrics():
|
||||
"""Update any prometheus metrics that are not updated during normal operation."""
|
||||
if isinstance(conversation_manager, ClusteredConversationManager):
|
||||
running_agent_loops = (
|
||||
await conversation_manager.get_running_agent_loops_locally()
|
||||
)
|
||||
# Clear so we don't keep counting old sessions.
|
||||
# This is theoretically a race condition but this is scraped on a regular interval.
|
||||
RUNNING_AGENT_LOOPS_GAUGE.clear()
|
||||
# running_agent_loops shouldn't be None, but can be.
|
||||
if running_agent_loops is not None:
|
||||
for sid in running_agent_loops:
|
||||
RUNNING_AGENT_LOOPS_GAUGE.labels(session_id=sid).set(1)
|
||||
|
||||
|
||||
def metrics_app() -> Callable:
|
||||
metrics_callable = make_asgi_app()
|
||||
|
||||
async def wrapped_handler(scope, receive, send):
|
||||
"""
|
||||
Call _update_metrics before serving Prometheus metrics endpoint.
|
||||
Not wrapped in a `try`, failing would make metrics endpoint unavailable.
|
||||
"""
|
||||
await _update_metrics()
|
||||
await metrics_callable(scope, receive, send)
|
||||
|
||||
return wrapped_handler
|
||||
@@ -1,4 +1,3 @@
|
||||
from prometheus_client import Counter, Histogram
|
||||
from server.logger import logger
|
||||
|
||||
from openhands.core.config.openhands_config import OpenHandsConfig
|
||||
@@ -9,45 +8,27 @@ from openhands.events.observation import (
|
||||
)
|
||||
from openhands.server.monitoring import MonitoringListener
|
||||
|
||||
AGENT_STATUS_ERROR_COUNT = Counter(
|
||||
'saas_agent_status_errors', 'Agent Status change events to status error'
|
||||
)
|
||||
CREATE_CONVERSATION_COUNT = Counter(
|
||||
'saas_create_conversation', 'Create conversation attempts'
|
||||
)
|
||||
AGENT_SESSION_START_HISTOGRAM = Histogram(
|
||||
'saas_agent_session_start',
|
||||
'AgentSession starts with success and duration',
|
||||
labelnames=['success'],
|
||||
)
|
||||
|
||||
|
||||
class SaaSMonitoringListener(MonitoringListener):
|
||||
"""
|
||||
Forward app signals to Prometheus.
|
||||
"""
|
||||
"""Forward app signals to structured logging for GCP native monitoring."""
|
||||
|
||||
def on_session_event(self, event: Event) -> None:
|
||||
"""
|
||||
Track metrics about events being added to a Session's EventStream.
|
||||
"""
|
||||
"""Track metrics about events being added to a Session's EventStream."""
|
||||
if (
|
||||
isinstance(event, AgentStateChangedObservation)
|
||||
and event.agent_state == AgentState.ERROR
|
||||
):
|
||||
AGENT_STATUS_ERROR_COUNT.inc()
|
||||
logger.info(
|
||||
'Tracking agent status error',
|
||||
extra={'signal': 'saas_agent_status_errors'},
|
||||
)
|
||||
|
||||
def on_agent_session_start(self, success: bool, duration: float) -> None:
|
||||
"""
|
||||
Track an agent session start.
|
||||
"""Track an agent session start.
|
||||
|
||||
Success is true if startup completed without error.
|
||||
Duration is start time in seconds observed by AgentSession.
|
||||
"""
|
||||
AGENT_SESSION_START_HISTOGRAM.labels(success=success).observe(duration)
|
||||
logger.info(
|
||||
'Tracking agent session start',
|
||||
extra={
|
||||
@@ -58,11 +39,10 @@ class SaaSMonitoringListener(MonitoringListener):
|
||||
)
|
||||
|
||||
def on_create_conversation(self) -> None:
|
||||
"""
|
||||
Track the beginning of conversation creation.
|
||||
"""Track the beginning of conversation creation.
|
||||
|
||||
Does not currently capture whether it succeeds.
|
||||
"""
|
||||
CREATE_CONVERSATION_COUNT.inc()
|
||||
logger.info(
|
||||
'Tracking create conversation', extra={'signal': 'saas_create_conversation'}
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user