Logging security (#1943)

* update .gitignore

* Rename the confusing 'INFO' style to 'DETAIL'

* override str and repr

* feat: api_key desensitize

* feat: add SensitiveDataFilter in file handler

* tweak regex, add tests

* more tweaks, include other attrs

* add env vars, those with equivalent config

* fix tests

* tests are invaluable

---------

Co-authored-by: Shimada666 <649940882@qq.com>
This commit is contained in:
Engel Nyst
2024-05-22 18:27:38 +02:00
committed by GitHub
parent 8e3e51e984
commit 46352e890b
11 changed files with 292 additions and 8 deletions

2
.gitignore vendored
View File

@@ -126,6 +126,7 @@ env/
venv/
ENV/
env.bak/
.env.bak
venv.bak/
*venv/
@@ -202,6 +203,7 @@ cache
# configuration
config.toml
config.toml.bak
evaluation/swe_bench/eval_workspace
evaluation/outputs
evaluation/evaluation_outputs

View File

@@ -155,7 +155,7 @@ def get_prompt(state: State) -> str:
else:
plan_status = "You're not currently working on any tasks. Your next action MUST be to mark a task as in_progress."
hint = get_hint(event_to_memory(latest_action).get('action', ''))
logger.info('HINT:\n' + hint, extra={'msg_type': 'INFO'})
logger.info('HINT:\n' + hint, extra={'msg_type': 'DETAIL'})
task = state.get_current_user_intent()
return prompt % {
'task': task,

View File

@@ -75,6 +75,22 @@ class LLMConfig(metaclass=Singleton):
dict[f.name] = get_field_info(f)
return dict
def __str__(self):
attr_str = []
for f in fields(self):
attr_name = f.name
attr_value = getattr(self, f.name)
if attr_name in ['api_key', 'aws_access_key_id', 'aws_secret_access_key']:
attr_value = '******' if attr_value else None
attr_str.append(f'{attr_name}={repr(attr_value)}')
return f"LLMConfig({', '.join(attr_str)})"
def __repr__(self):
return self.__str__()
@dataclass
class AgentConfig(metaclass=Singleton):
@@ -185,6 +201,22 @@ class AppConfig(metaclass=Singleton):
dict[f.name] = get_field_info(f)
return dict
def __str__(self):
attr_str = []
for f in fields(self):
attr_name = f.name
attr_value = getattr(self, f.name)
if attr_name in ['e2b_api_key', 'github_token']:
attr_value = '******' if attr_value else None
attr_str.append(f'{attr_name}={repr(attr_value)}')
return f"AppConfig({', '.join(attr_str)}"
def __repr__(self):
return self.__str__()
def get_field_info(field):
"""

View File

@@ -1,5 +1,6 @@
import logging
import os
import re
import sys
import traceback
from datetime import datetime
@@ -33,7 +34,7 @@ LOG_COLORS: Mapping[str, ColorType] = {
'BACKGROUND LOG': 'blue',
'ACTION': 'green',
'OBSERVATION': 'yellow',
'INFO': 'cyan',
'DETAIL': 'cyan',
'ERROR': 'red',
'PLAN': 'light_magenta',
}
@@ -48,9 +49,9 @@ class ColoredFormatter(logging.Formatter):
time_str = colored(
self.formatTime(record, self.datefmt), LOG_COLORS[msg_type]
)
name_str = colored(record.name, 'cyan')
level_str = colored(record.levelname, 'yellow')
if msg_type in ['ERROR', 'INFO']:
name_str = colored(record.name, LOG_COLORS[msg_type])
level_str = colored(record.levelname, LOG_COLORS[msg_type])
if msg_type in ['ERROR']:
return f'{time_str} - {name_str}:{level_str}: {record.filename}:{record.lineno}\n{msg_type_color}\n{msg}'
return f'{time_str} - {msg_type_color}\n{msg}'
elif msg_type == 'STEP':
@@ -71,6 +72,38 @@ file_formatter = logging.Formatter(
llm_formatter = logging.Formatter('%(message)s')
class SensitiveDataFilter(logging.Filter):
def filter(self, record):
# start with attributes
sensitive_patterns = [
'api_key',
'aws_access_key_id',
'aws_secret_access_key',
'e2b_api_key',
'github_token',
]
# add env var names
env_vars = [attr.upper() for attr in sensitive_patterns]
sensitive_patterns.extend(env_vars)
# and some special cases
sensitive_patterns.append('LLM_API_KEY')
sensitive_patterns.append('SANDBOX_ENV_GITHUB_TOKEN')
# this also formats the message with % args
msg = record.getMessage()
record.args = ()
for attr in sensitive_patterns:
pattern = rf"{attr}='?([\w-]+)'?"
msg = re.sub(pattern, f"{attr}='******'", msg)
# passed with msg
record.msg = msg
return True
def get_console_handler():
"""
Returns a console handler for logging.
@@ -121,6 +154,7 @@ opendevin_logger = logging.getLogger('opendevin')
opendevin_logger.setLevel(logging.INFO)
opendevin_logger.addHandler(get_file_handler())
opendevin_logger.addHandler(get_console_handler())
opendevin_logger.addFilter(SensitiveDataFilter(opendevin_logger.name))
opendevin_logger.propagate = False
opendevin_logger.debug('Logging initialized')
opendevin_logger.debug(

View File

@@ -282,3 +282,6 @@ class LLM:
elif self.base_url:
return f'LLM(model={self.model_name}, base_url={self.base_url})'
return f'LLM(model={self.model_name})'
def __repr__(self):
return str(self)

View File

@@ -235,4 +235,4 @@ NOW, LET'S START!
Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt. Do not ask me for confirmation at any point.
ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
ENVIRONMENT REMINDER: You have 9 turns left to complete the task.

View File

@@ -254,4 +254,4 @@ Once the installation is complete, we'll import `pymsgbox` and print its version
OBSERVATION:
Package installed successfully
ENVIRONMENT REMINDER: You have 8 turns left to complete the task.
ENVIRONMENT REMINDER: You have 8 turns left to complete the task.

View File

@@ -279,4 +279,4 @@ with open("/workspace/test.txt", "w") as f:
OBSERVATION:
[Code executed successfully with no output]
ENVIRONMENT REMINDER: You have 7 turns left to complete the task.
ENVIRONMENT REMINDER: You have 7 turns left to complete the task.

View File

@@ -381,3 +381,5 @@ Notes:
* whenever an action fails, always send a `message` about why it may have happened before acting again.
What is your next single thought or action? Again, you must reply with JSON, and only with JSON. You must respond with exactly one 'action' object.

View File

@@ -210,3 +210,78 @@ def test_cache_dir_creation(default_config, tmpdir):
default_config.cache_dir = str(tmpdir.join('test_cache'))
finalize_config(default_config)
assert os.path.exists(default_config.cache_dir)
def test_api_keys_repr_str():
# Test LLMConfig
llm_config = LLMConfig(
api_key='my_api_key',
aws_access_key_id='my_access_key',
aws_secret_access_key='my_secret_key',
)
assert "api_key='******'" in repr(llm_config)
assert "aws_access_key_id='******'" in repr(llm_config)
assert "aws_secret_access_key='******'" in repr(llm_config)
assert "api_key='******'" in str(llm_config)
assert "aws_access_key_id='******'" in str(llm_config)
assert "aws_secret_access_key='******'" in str(llm_config)
# Check that no other attrs in LLMConfig have 'key' or 'token' in their name
# This will fail when new attrs are added, and attract attention
known_key_token_attrs_llm = [
'api_key',
'aws_access_key_id',
'aws_secret_access_key',
]
for attr_name in dir(LLMConfig):
if (
not attr_name.startswith('__')
and attr_name not in known_key_token_attrs_llm
):
assert (
'key' not in attr_name.lower()
), f"Unexpected attribute '{attr_name}' contains 'key' in LLMConfig"
assert (
'token' not in attr_name.lower() or 'tokens' in attr_name.lower()
), f"Unexpected attribute '{attr_name}' contains 'token' in LLMConfig"
# Test AgentConfig
# No attrs in AgentConfig have 'key' or 'token' in their name
agent_config = AgentConfig(
name='my_agent', memory_enabled=True, memory_max_threads=4
)
for attr_name in dir(AgentConfig):
if not attr_name.startswith('__'):
assert (
'key' not in attr_name.lower()
), f"Unexpected attribute '{attr_name}' contains 'key' in AgentConfig"
assert (
'token' not in attr_name.lower() or 'tokens' in attr_name.lower()
), f"Unexpected attribute '{attr_name}' contains 'token' in AgentConfig"
# Test AppConfig
app_config = AppConfig(
llm=llm_config,
agent=agent_config,
e2b_api_key='my_e2b_api_key',
github_token='my_github_token',
)
assert "e2b_api_key='******'" in repr(app_config)
assert "github_token='******'" in repr(app_config)
assert "e2b_api_key='******'" in str(app_config)
assert "github_token='******'" in str(app_config)
# Check that no other attrs in AppConfig have 'key' or 'token' in their name
# This will fail when new attrs are added, and attract attention
known_key_token_attrs_app = ['e2b_api_key', 'github_token']
for attr_name in dir(AppConfig):
if (
not attr_name.startswith('__')
and attr_name not in known_key_token_attrs_app
):
assert (
'key' not in attr_name.lower()
), f"Unexpected attribute '{attr_name}' contains 'key' in AppConfig"
assert (
'token' not in attr_name.lower() or 'tokens' in attr_name.lower()
), f"Unexpected attribute '{attr_name}' contains 'token' in AppConfig"

136
tests/unit/test_logging.py Normal file
View File

@@ -0,0 +1,136 @@
import logging
from io import StringIO
import pytest
from opendevin.core.config import AppConfig, LLMConfig
from opendevin.core.logger import opendevin_logger as opendevin_logger
@pytest.fixture
def test_handler():
stream = StringIO()
handler = logging.StreamHandler(stream)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
opendevin_logger.addHandler(handler)
yield opendevin_logger, stream
opendevin_logger.removeHandler(handler)
def test_openai_api_key_masking(test_handler):
logger, stream = test_handler
api_key = 'sk-1234567890abcdef'
message = f"OpenAI API key: api_key='{api_key}'and there's some stuff here"
logger.info(message)
log_output = stream.getvalue()
assert "api_key='******'" in log_output
assert api_key not in log_output
def test_azure_api_key_masking(test_handler):
logger, stream = test_handler
api_key = '1a2b3c4d5e6f7g8h9i0j'
message = f"Azure API key: api_key='{api_key}' and chatty chat with ' and \" and '"
logger.info(message)
log_output = stream.getvalue()
assert "api_key='******'" in log_output
assert api_key not in log_output
def test_google_vertex_api_key_masking(test_handler):
logger, stream = test_handler
api_key = 'AIzaSyA1B2C3D4E5F6G7H8I9J0'
message = f"Google Vertex API key: api_key='{api_key}' or not"
logger.info(message)
log_output = stream.getvalue()
assert "api_key='******'" in log_output
assert api_key not in log_output
def test_anthropic_api_key_masking(test_handler):
logger, stream = test_handler
api_key = 'sk-ant-1234567890abcdef-some-more-stuff-here'
message = f"Anthropic API key: api_key='{api_key}' and there's some 'stuff' here"
logger.info(message)
log_output = stream.getvalue()
assert "api_key='******'" in log_output
assert api_key not in log_output
def test_llm_config_attributes_masking(test_handler):
logger, stream = test_handler
llm_config = LLMConfig(
api_key='sk-abc123',
aws_access_key_id='AKIAIOSFODNN7EXAMPLE',
aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
)
logger.info(f'LLM Config: {llm_config}')
log_output = stream.getvalue()
assert "api_key='******'" in log_output
assert "aws_access_key_id='******'" in log_output
assert "aws_secret_access_key='******'" in log_output
assert 'sk-abc123' not in log_output
assert 'AKIAIOSFODNN7EXAMPLE' not in log_output
assert 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' not in log_output
# reset the LLMConfig
LLMConfig.reset()
def test_app_config_attributes_masking(test_handler):
logger, stream = test_handler
app_config = AppConfig(
e2b_api_key='e2b-xyz789', github_token='ghp_abcdefghijklmnopqrstuvwxyz'
)
logger.info(f'App Config: {app_config}')
log_output = stream.getvalue()
assert "e2b_api_key='******'" in log_output
assert "github_token='******'" in log_output
assert 'e2b-xyz789' not in log_output
assert 'ghp_abcdefghijklmnopqrstuvwxyz' not in log_output
# reset the AppConfig
AppConfig.reset()
def test_sensitive_env_vars_masking(test_handler):
logger, stream = test_handler
sensitive_data = {
'API_KEY': 'API_KEY_VALUE',
'AWS_ACCESS_KEY_ID': 'AWS_ACCESS_KEY_ID_VALUE',
'AWS_SECRET_ACCESS_KEY': 'AWS_SECRET_ACCESS_KEY_VALUE',
'E2B_API_KEY': 'E2B_API_KEY_VALUE',
'GITHUB_TOKEN': 'GITHUB_TOKEN_VALUE',
}
log_message = ' '.join(
f"{attr}='{value}'" for attr, value in sensitive_data.items()
)
logger.info(log_message)
log_output = stream.getvalue()
for attr, value in sensitive_data.items():
assert f"{attr}='******'" in log_output
assert value not in log_output
def test_special_cases_masking(test_handler):
logger, stream = test_handler
sensitive_data = {
'LLM_API_KEY': 'LLM_API_KEY_VALUE',
'SANDBOX_ENV_GITHUB_TOKEN': 'SANDBOX_ENV_GITHUB_TOKEN_VALUE',
}
log_message = ' '.join(
f"{attr}={value} with no single quotes' and something"
for attr, value in sensitive_data.items()
)
logger.info(log_message)
log_output = stream.getvalue()
for attr, value in sensitive_data.items():
assert f"{attr}='******'" in log_output
assert value not in log_output