Fix memory leak in JSON encoder (#6620)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Author: Graham Neubig, 2025-02-05 12:39:04 -05:00 (committed by GitHub)
Parent: 5491ad3318
Commit: 2832dba27a
2 changed files with 77 additions and 13 deletions
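
The change replaces the module-level default= callback, whose fallback branch constructed a fresh json.JSONEncoder() on every unhandled object, with a json.JSONEncoder subclass plus a single reusable instance. As a rough illustration of why per-call encoder construction is wasteful, a minimal sketch (not part of the commit; the names Enc, shared, and data are hypothetical):

import json
import timeit

class Enc(json.JSONEncoder):
    """Trivial encoder subclass used only for the timing comparison."""

    def default(self, obj):
        return super().default(obj)

# One encoder built per call vs. a single shared instance.
shared = Enc()
data = {'values': list(range(100))}

per_call = timeit.timeit(lambda: Enc().encode(data), number=10_000)
reused = timeit.timeit(lambda: shared.encode(data), number=10_000)
print(f'per-call: {per_call:.3f}s  shared: {reused:.3f}s')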

openhands/core/utils/json.py

@@ -11,24 +11,32 @@ from openhands.events.serialization import event_to_dict
 from openhands.llm.metrics import Metrics
 
 
-def my_default_encoder(obj):
-    if isinstance(obj, datetime):
-        return obj.isoformat()
-    if isinstance(obj, Event):
-        return event_to_dict(obj)
-    if isinstance(obj, Metrics):
-        return obj.get()
-    if isinstance(obj, ModelResponse):
-        return obj.model_dump()
-    if isinstance(obj, CmdOutputMetadata):
-        return obj.model_dump()
-    return json.JSONEncoder().default(obj)
+class OpenHandsJSONEncoder(json.JSONEncoder):
+    """Custom JSON encoder that handles datetime and event objects"""
+
+    def default(self, obj):
+        if isinstance(obj, datetime):
+            return obj.isoformat()
+        if isinstance(obj, Event):
+            return event_to_dict(obj)
+        if isinstance(obj, Metrics):
+            return obj.get()
+        if isinstance(obj, ModelResponse):
+            return obj.model_dump()
+        if isinstance(obj, CmdOutputMetadata):
+            return obj.model_dump()
+        return super().default(obj)
+
+
+# Create a single reusable encoder instance
+_json_encoder = OpenHandsJSONEncoder()
 
 
 def dumps(obj, **kwargs):
     """Serialize an object to str format"""
-    return json.dumps(obj, default=my_default_encoder, **kwargs)
+    if not kwargs:
+        return _json_encoder.encode(obj)
+    return json.dumps(obj, cls=OpenHandsJSONEncoder, **kwargs)
 
 
 def loads(json_str, **kwargs):
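
For orientation, a small usage sketch of the patched dumps (not part of the commit): without keyword arguments it reuses the shared _json_encoder; with kwargs it defers to json.dumps with cls=OpenHandsJSONEncoder.

from datetime import datetime

from openhands.core.utils.json import dumps

payload = {'created_at': datetime.now(), 'ok': True}

# Fast path: no kwargs, served by the module-level encoder instance.
print(dumps(payload))

# Slow path: kwargs present, json.dumps constructs an encoder via cls=.
print(dumps(payload, indent=2))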

New file: memory-leak regression test

@@ -0,0 +1,56 @@
+import gc
+from datetime import datetime
+
+import psutil
+
+from openhands.core.utils.json import dumps
+
+
+def get_memory_usage():
+    """Get current memory usage of the process"""
+    process = psutil.Process()
+    return process.memory_info().rss
+
+
+def test_json_encoder_memory_leak():
+    # Force garbage collection before test
+    gc.collect()
+    initial_memory = get_memory_usage()
+
+    # Create a large dataset that will need encoding
+    large_data = {
+        'datetime': datetime.now(),
+        'nested': [{'timestamp': datetime.now()} for _ in range(1000)],
+    }
+
+    # Track memory usage over multiple iterations
+    memory_samples = []
+    for i in range(10):
+        # Perform multiple serializations in each iteration
+        for _ in range(100):
+            dumps(large_data)
+            dumps(large_data, indent=2)  # Test with kwargs too
+
+        # Force garbage collection
+        gc.collect()
+        memory_samples.append(get_memory_usage())
+
+    # Check if memory usage is stable (not continuously growing)
+    # We expect some fluctuation but not a steady increase
+    max_memory = max(memory_samples)
+    min_memory = min(memory_samples)
+    memory_variation = max_memory - min_memory
+
+    # Allow for some memory variation (2MB) due to Python's memory management
+    assert (
+        memory_variation < 2 * 1024 * 1024
+    ), f'Memory usage unstable: {memory_variation} bytes variation'
+
+    # Also check total memory increase from start
+    final_memory = memory_samples[-1]
+    memory_increase = final_memory - initial_memory
+
+    # Allow for some memory increase (2MB) as some objects may be cached
+    assert (
+        memory_increase < 2 * 1024 * 1024
+    ), f'Memory leak detected: {memory_increase} bytes increase'
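
One way to exercise the new regression test (the test file's path is not visible in this diff, so the invocation below selects it by name; assumes pytest and psutil are installed):

# Hypothetical runner: selects the test by name rather than by path.
import pytest

raise SystemExit(pytest.main(['-k', 'test_json_encoder_memory_leak', '-v']))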