Remove deduplication logic from telemetry implementation

- Remove metrics_hash column and related logic from TelemetryMetrics model
- Update database migration to remove metrics_hash column and index
- Remove hash-related imports (hashlib, json) from telemetry_metrics.py
- Clean up all hash-related test cases from test_telemetry_metrics.py
- Update design document to remove deduplication references
- Simplify model initialization by removing hash calculation logic

This simplifies the telemetry implementation by removing the deduplication
feature while maintaining all core functionality for metrics collection
and storage.

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
openhands 2025-10-21 23:54:20 +00:00
parent d21d56e4bf
commit ad53463fcb
4 changed files with 5 additions and 93 deletions

View File

@ -149,7 +149,6 @@ Stores collected metrics with transmission status tracking:
CREATE TABLE telemetry_metrics (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
collected_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
metrics_hash VARCHAR(64) NOT NULL,
metrics_data JSONB NOT NULL,
uploaded_at TIMESTAMP WITH TIME ZONE NULL,
upload_attempts INTEGER DEFAULT 0,
@ -160,7 +159,6 @@ CREATE TABLE telemetry_metrics (
CREATE INDEX idx_telemetry_metrics_collected_at ON telemetry_metrics(collected_at);
CREATE INDEX idx_telemetry_metrics_uploaded_at ON telemetry_metrics(uploaded_at);
CREATE INDEX idx_telemetry_metrics_hash ON telemetry_metrics(metrics_hash);
```
#### 4.1.2 Telemetry Identity Table
@ -197,14 +195,13 @@ from dataclasses import dataclass
class MetricResult:
key: str
value: Any
hash_key: str
class MetricsCollector(ABC):
"""Base class for metrics collectors."""
@abstractmethod
def collect(self) -> List[MetricResult]:
"""Collect metrics and return results with hash keys."""
"""Collect metrics and return results."""
pass
@property
@ -275,8 +272,7 @@ class SystemMetricsCollector(MetricsCollector):
user_count = session.query(UserSettings).count()
results.append(MetricResult(
key="total_users",
value=user_count,
hash_key=f"users_{user_count}"
value=user_count
))
# Collect conversation count (last 30 days)
@ -287,8 +283,7 @@ class SystemMetricsCollector(MetricsCollector):
results.append(MetricResult(
key="conversations_30d",
value=conversation_count,
hash_key=f"conv30_{conversation_count}"
value=conversation_count
))
return results
@ -327,11 +322,8 @@ class TelemetryCollectionProcessor(MaintenanceTaskProcessor):
collector_results[collector.collector_name] = f"error: {e}"
# Store metrics in database
metrics_hash = self._generate_metrics_hash(all_metrics)
with session_maker() as session:
telemetry_record = TelemetryMetrics(
metrics_hash=metrics_hash,
metrics_data=all_metrics,
collected_at=datetime.now(timezone.utc)
)
@ -344,8 +336,7 @@ class TelemetryCollectionProcessor(MaintenanceTaskProcessor):
return {
"status": "completed",
"metrics_collected": len(all_metrics),
"collectors_run": collector_results,
"metrics_hash": metrics_hash
"collectors_run": collector_results
}
def _should_collect(self) -> bool:

View File

@ -35,11 +35,6 @@ def upgrade() -> None:
nullable=False,
server_default=sa.text('CURRENT_TIMESTAMP'),
),
sa.Column(
'metrics_hash',
sa.String(64),
nullable=False,
),
sa.Column(
'metrics_data',
sa.JSON(),
@ -82,7 +77,6 @@ def upgrade() -> None:
op.create_index(
'ix_telemetry_metrics_uploaded_at', 'telemetry_metrics', ['uploaded_at']
)
op.create_index('ix_telemetry_metrics_hash', 'telemetry_metrics', ['metrics_hash'])
# Create telemetry_identity table (minimal persistent identity data)
op.create_table(
@ -125,7 +119,6 @@ def upgrade() -> None:
def downgrade() -> None:
"""Drop telemetry tables."""
# Drop indexes first
op.drop_index('ix_telemetry_metrics_hash', 'telemetry_metrics')
op.drop_index('ix_telemetry_metrics_uploaded_at', 'telemetry_metrics')
op.drop_index('ix_telemetry_metrics_collected_at', 'telemetry_metrics')

View File

@ -4,8 +4,6 @@ This model stores individual metric collection records with upload tracking
and retry logic for the OpenHands Enterprise Telemetry Service.
"""
import hashlib
import json
import uuid
from datetime import UTC, datetime
from typing import Any, Dict, Optional
@ -18,7 +16,7 @@ class TelemetryMetrics(Base): # type: ignore
"""Stores collected telemetry metrics with upload tracking.
Each record represents a single metrics collection event with associated
metadata for upload status, retry logic, and deduplication.
metadata for upload status and retry logic.
"""
__tablename__ = 'telemetry_metrics'
@ -30,7 +28,6 @@ class TelemetryMetrics(Base): # type: ignore
default=lambda: datetime.now(UTC),
index=True,
)
metrics_hash = Column(String(64), nullable=False, index=True)
metrics_data = Column(JSON, nullable=False)
uploaded_at = Column(DateTime(timezone=True), nullable=True, index=True)
upload_attempts = Column(Integer, nullable=False, default=0)
@ -74,28 +71,11 @@ class TelemetryMetrics(Base): # type: ignore
self.updated_at = now
self.metrics_data = metrics_data
self.metrics_hash = self._calculate_metrics_hash(metrics_data)
if collected_at:
self.collected_at = collected_at
elif not hasattr(self, 'collected_at') or self.collected_at is None:
self.collected_at = now
@staticmethod
def _calculate_metrics_hash(metrics_data: Dict[str, Any]) -> str:
"""Calculate a SHA-256 hash of the metrics data for deduplication.
Args:
metrics_data: Dictionary containing the metrics data
Returns:
Hexadecimal string representation of the SHA-256 hash
"""
# Sort keys to ensure consistent hashing
normalized_json = json.dumps(
metrics_data, sort_keys=True, separators=(',', ':')
)
return hashlib.sha256(normalized_json.encode('utf-8')).hexdigest()
def mark_uploaded(self) -> None:
"""Mark this metrics record as successfully uploaded."""
self.uploaded_at = datetime.now(UTC)

View File

@ -1,7 +1,5 @@
"""Unit tests for TelemetryMetrics model."""
import hashlib
import json
import uuid
from datetime import UTC, datetime
@ -22,8 +20,6 @@ class TestTelemetryMetrics:
metrics = TelemetryMetrics(metrics_data=metrics_data)
assert metrics.metrics_data == metrics_data
assert metrics.metrics_hash is not None
assert len(metrics.metrics_hash) == 64 # SHA-256 hex string
assert metrics.upload_attempts == 0
assert metrics.uploaded_at is None
assert metrics.last_upload_error is None
@ -40,44 +36,6 @@ class TestTelemetryMetrics:
assert metrics.collected_at == custom_time
def test_calculate_metrics_hash(self):
"""Test metrics hash calculation."""
metrics_data = {
'b_key': 'value2',
'a_key': 'value1',
'c_key': 123,
}
# Calculate expected hash
normalized_json = json.dumps(
metrics_data, sort_keys=True, separators=(',', ':')
)
expected_hash = hashlib.sha256(normalized_json.encode('utf-8')).hexdigest()
calculated_hash = TelemetryMetrics._calculate_metrics_hash(metrics_data)
assert calculated_hash == expected_hash
def test_hash_consistency(self):
"""Test that identical data produces identical hashes."""
metrics_data1 = {'key1': 'value1', 'key2': 'value2'}
metrics_data2 = {'key2': 'value2', 'key1': 'value1'} # Different order
hash1 = TelemetryMetrics._calculate_metrics_hash(metrics_data1)
hash2 = TelemetryMetrics._calculate_metrics_hash(metrics_data2)
assert hash1 == hash2
def test_hash_uniqueness(self):
"""Test that different data produces different hashes."""
metrics_data1 = {'key': 'value1'}
metrics_data2 = {'key': 'value2'}
hash1 = TelemetryMetrics._calculate_metrics_hash(metrics_data1)
hash2 = TelemetryMetrics._calculate_metrics_hash(metrics_data2)
assert hash1 != hash2
def test_mark_uploaded(self):
"""Test marking metrics as uploaded."""
metrics = TelemetryMetrics(metrics_data={'test': 'data'})
@ -219,22 +177,12 @@ class TestTelemetryMetrics:
metrics = TelemetryMetrics(metrics_data=complex_data)
assert metrics.metrics_data == complex_data
assert len(metrics.metrics_hash) == 64
# Verify hash is deterministic for complex data
metrics2 = TelemetryMetrics(metrics_data=complex_data)
assert metrics.metrics_hash == metrics2.metrics_hash
def test_empty_metrics_data(self):
"""Test with empty metrics data."""
metrics = TelemetryMetrics(metrics_data={})
assert metrics.metrics_data == {}
assert len(metrics.metrics_hash) == 64
# Empty dict should have consistent hash
metrics2 = TelemetryMetrics(metrics_data={})
assert metrics.metrics_hash == metrics2.metrics_hash
def test_config_class(self):
"""Test that Config class is properly set."""