Skip to content

Commit 77960a2

Browse files
authored
Merge pull request #32 from pandpara/main
Add errors metadata to the agent debug info, with granular SDK client error metrics. Also add metrics that tell us how many ResourceNotFoundException (RNF) errors lead the agent to try to create the profiling group. ErrorsMetadata is part of AgentDebugInfo, which is part of the Profile, so every time we submit a profile we submit these metrics along with it. SdkClientErrors tells us the total number of failures caused by API calls. Individual API call failures are captured under the same names as the APIs, and the ResourceNotFoundException that results in auto-creation of the profiling group (PG) is also captured.
2 parents 53c7695 + 2fda871 commit 77960a2

File tree

14 files changed

+525
-62
lines changed

14 files changed

+525
-62
lines changed
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import logging
2+
import os
3+
4+
from codeguru_profiler_agent.utils.synchronization import synchronized
5+
from codeguru_profiler_agent.utils.time import to_iso
6+
7+
logger = logging.getLogger(__name__)
8+
9+
10+
class ErrorsMetadata:
    """
    Counters for the errors seen while calling the backend APIs.

    The counters are zeroed by reset() and exposed as a dictionary by serialize_to_json().
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """
        Put every counter back to zero.

        API call errors are tracked per API so they can be told apart. ResourceNotFoundException
        errors seen while auto-create is enabled get their own counters, so that we can monitor
        how often the agent fails to create the PG and keeps receiving ResourceNotFoundException.
        """
        # Umbrella totals.
        self.errors_count = self.sdk_client_errors = 0
        # configure_agent failures, and the subset that were RNFE with auto-create enabled.
        self.configure_agent_errors = self.configure_agent_rnfe_auto_create_enabled_errors = 0
        self.create_profiling_group_errors = 0
        # post_agent_profile failures, and the subset that were RNFE with auto-create enabled.
        self.post_agent_profile_errors = self.post_agent_profile_rnfe_auto_create_enabled_errors = 0

    def serialize_to_json(self):
        """
        Return the counters as a dictionary; the keys must stay compliant with the errors count schema.
        """
        counters = {
            "errorsCount": self.errors_count,
            "sdkClientErrors": self.sdk_client_errors,
            "configureAgentErrors": self.configure_agent_errors,
            "configureAgentRnfeAutoCreateEnabledErrors": self.configure_agent_rnfe_auto_create_enabled_errors,
            "createProfilingGroupErrors": self.create_profiling_group_errors,
            "postAgentProfileErrors": self.post_agent_profile_errors,
            "postAgentProfileRnfeAutoCreateEnabledErrors": self.post_agent_profile_rnfe_auto_create_enabled_errors
        }
        return counters

    @synchronized
    def increment_sdk_error(self, error_type):
        """
        Count one SDK client error of the given type.

        ErrorsCount is the umbrella for every kind of error we capture; for now it only contains
        SdkClientErrors, which in turn is made of the API level errors ConfigureAgentErrors,
        PostAgentProfileErrors and CreateProfilingGroupErrors. Both totals are incremented for
        every call, including calls with an error_type that is not listed below.

        :param error_type: The type of API level error that we want to capture.
        """
        self.errors_count += 1
        self.sdk_client_errors += 1

        # The ResourceNotFoundException variants are counted twice on purpose: once in their own
        # counter and once in the counter of the API they belong to. For example a
        # configureAgentRnfeAutoCreateEnabledErrors is also a configureAgentErrors.
        if error_type in ("configureAgentErrors", "configureAgentRnfeAutoCreateEnabledErrors"):
            self.configure_agent_errors += 1
            if error_type == "configureAgentRnfeAutoCreateEnabledErrors":
                self.configure_agent_rnfe_auto_create_enabled_errors += 1
        elif error_type == "createProfilingGroupErrors":
            self.create_profiling_group_errors += 1
        elif error_type in ("postAgentProfileErrors", "postAgentProfileRnfeAutoCreateEnabledErrors"):
            self.post_agent_profile_errors += 1
            if error_type == "postAgentProfileRnfeAutoCreateEnabledErrors":
                self.post_agent_profile_rnfe_auto_create_enabled_errors += 1

    def record_sdk_error(self, error_type):
        """
        Record one SDK client error by delegating to increment_sdk_error.

        :param error_type: The type of API level error that we want to capture.
        """
        self.increment_sdk_error(error_type)
73+
74+
class AgentDebugInfo:
    """
    Debug information about the agent that is serialized together with a profile.

    It gathers the process id, the agent start time, the errors metadata and the timer metrics.
    """

    def __init__(self, errors_metadata=None, agent_start_time=None, timer=None):
        """
        :param errors_metadata: object whose serialize_to_json() provides the error counters; optional.
        :param agent_start_time: start time of the agent, formatted with to_iso when serialized; optional.
        :param timer: object exposing a ``metrics`` mapping of metric name to timing statistics; optional.
        """
        self.process_id = get_process_id()
        self.errors_metadata = errors_metadata
        self.agent_start_time = agent_start_time
        self.timer = timer

    def serialize_to_json(self):
        """
        Build the dictionary form of the debug info; it must stay compliant with the agent debug info schema.

        Entries whose value is not available are left out.
        """
        payload = {}
        contributors = (
            self.add_agent_start_time,
            self.add_process_id,
            self.add_errors_metadata,
            self.add_generic_metrics,
        )
        for contribute in contributors:
            contribute(payload)
        return payload

    def add_agent_start_time(self, json):
        """Store the ISO formatted agent start time under "agentStartTime" when one is set."""
        if self.agent_start_time is None:
            return
        json["agentStartTime"] = to_iso(self.agent_start_time)

    def add_errors_metadata(self, json):
        """Store the serialized errors metadata under "errorsCount" when one is set."""
        if self.errors_metadata is None:
            return
        json["errorsCount"] = self.errors_metadata.serialize_to_json()

    def add_process_id(self, json):
        """Store the process id under "processId" when it is known."""
        if self.process_id is None:
            return
        json["processId"] = self.process_id

    def add_generic_metrics(self, json):
        """
        Store the max and average of every timer metric under "genericMetrics".

        Nothing is added when there is no timer or when the timer holds no metrics.
        """
        if self.timer is None or not self.timer.metrics:
            return

        generic_metrics = {}
        for name, stats in self.timer.metrics.items():
            generic_metrics[name + "_timings_max"] = stats.max
            generic_metrics[name + "_timings_average"] = stats.average()

        if generic_metrics:
            json["genericMetrics"] = generic_metrics
117+
118+
def get_process_id():
    """
    Return the id of the current process, or None when it cannot be obtained.

    A failure is not raised; it is logged at info level together with its traceback.
    """
    try:
        pid = os.getpid()
    except Exception:
        logger.info("Failed to get the process id", exc_info=True)
        pid = None
    return pid
124+

codeguru_profiler_agent/local_aggregator.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import time
33
import datetime
44

5+
from codeguru_profiler_agent.agent_metadata.agent_debug_info import AgentDebugInfo
56
from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration
67
from codeguru_profiler_agent.metrics.with_timer import with_timer
78
from codeguru_profiler_agent.model.profile import Profile
@@ -30,20 +31,23 @@ def __init__(self, reporter, environment=dict()):
3031
:param host_weight: (required inside environment) scale factor used to rescale the profile collected in this
3132
host to make the profile representative of the whole fleet
3233
:param timer: (required inside environment) timer to be used for metrics
34+
:param errors_metadata: (required inside environment) metadata capturing errors in the current profile.
3335
:param profile_factory: (inside environment) the factory to created profiler; default Profile.
3436
:param clock: (inside environment) clock to be used; default is time.time
3537
"""
3638
self.reporter = reporter
3739
self.profiling_group_name = environment["profiling_group_name"]
3840
self.host_weight = environment["host_weight"]
3941
self.timer = environment["timer"]
42+
self.errors_metadata = environment["errors_metadata"]
4043

4144
self.profile_factory = environment.get("profile_factory") or Profile
4245
self.clock = environment.get("clock") or time.time
4346

4447
self.profile = None
4548
self.memory_limit_bytes = environment["memory_limit_bytes"]
4649
self.last_report_attempted = current_milli_time(clock=self.clock)
50+
self.agent_start_time = current_milli_time(clock=self.clock)
4751

4852
self.reset()
4953

@@ -71,14 +75,16 @@ def _check_memory_limit(self):
7175
self.flush(force=True)
7276

7377
def reset(self):
78+
self.errors_metadata.reset()
79+
self.timer.reset()
7480
self.profile = self.profile_factory(
7581
profiling_group_name=self.profiling_group_name,
7682
sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(),
7783
host_weight=self.host_weight,
7884
start=current_milli_time(clock=self.clock),
85+
agent_debug_info=AgentDebugInfo(self.errors_metadata, self.agent_start_time, self.timer),
7986
clock=self.clock
8087
)
81-
self.timer.reset()
8288

8389
@with_timer("flush")
8490
def flush(self, force=False, reset=True):

codeguru_profiler_agent/model/profile.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
class Profile:
14-
def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, clock=time.time):
14+
def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, agent_debug_info, clock=time.time):
1515
"""
1616
A profile holds the root node of the call graph and the metadata related to the profile
1717
"""
@@ -35,6 +35,7 @@ def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight,
3535
self.host_weight = int(host_weight)
3636
self._start_process_time = time.process_time() # provides process time in fractional seconds as float.
3737
self.overhead_ms = 0
38+
self.agent_debug_info = agent_debug_info
3839

3940
@property
4041
def end(self):
@@ -97,6 +98,9 @@ def _insert_stack(self, stack, runnable_count_increase=1):
9798
def get_memory_usage_bytes(self):
9899
return self.memory_counter.get_memory_usage_bytes()
99100

101+
def serialize_agent_debug_info_to_json(self):
102+
return self.agent_debug_info.serialize_to_json()
103+
100104
def pause(self):
101105
if self.last_pause is not None:
102106
# pause gets called when profile is paused

codeguru_profiler_agent/profiler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from datetime import timedelta
88
from random import SystemRandom
99
from types import MappingProxyType as UnmodifiableDict
10+
11+
from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata
1012
from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata
1113
from codeguru_profiler_agent.profiler_disabler import ProfilerDisabler
1214
from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration, AgentConfigurationMerger
@@ -167,6 +169,7 @@ def _setup_final_environment(self, environment, environment_override):
167169
frozenset({environment['profiler_thread_name']}.union(environment['excluded_threads']))
168170
# TODO delay metadata lookup until we need it
169171
environment['agent_metadata'] = environment.get('agent_metadata') or AgentMetadata()
172+
environment['errors_metadata'] = environment.get('errors_metadata') or ErrorsMetadata()
170173
environment['collector'] = environment.get('collector') or self._select_collector(environment)
171174
environment["profiler_disabler"] = environment.get('profiler_disabler') or ProfilerDisabler(environment)
172175
return UnmodifiableDict(environment)

codeguru_profiler_agent/sdk_reporter/profile_encoder.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,15 @@ def encode_content(self):
9999
"start": int(self._profile.start),
100100
"end": int(self._profile.end),
101101
"agentMetadata": self._encode_agent_metadata(),
102-
"callgraph": self._encode_call_graph(self._profile.callgraph)
102+
"callgraph": self._encode_call_graph(self._profile.callgraph),
103+
"debugInfo": self._encode_debug_info()
103104
}
104105

105106
return json.dumps(profile_in_map)
106107

108+
def _encode_debug_info(self):
109+
return self._profile.serialize_agent_debug_info_to_json()
110+
107111
def _encode_agent_metadata(self):
108112
profile_duration_seconds = self._profile.get_active_millis_since_start() / 1000.0
109113
sample_weight = 1.0 if (profile_duration_seconds == 0) else self._profile.total_sample_count / profile_duration_seconds

codeguru_profiler_agent/sdk_reporter/sdk_reporter.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@
1515
logger = logging.getLogger(__name__)
1616
AWS_EXECUTION_ENV_KEY = "AWS_EXECUTION_ENV"
1717

18+
1819
class SdkReporter(Reporter):
1920
"""
2021
Handles communication with the CodeGuru Profiler Service backend.
2122
Encodes profiles using the ProfilerEncoder and reports them using the CodeGuru profiler SDK.
2223
"""
2324
is_create_pg_called_during_submit_profile = False
25+
2426
def __init__(self, environment):
2527
"""
2628
:param environment: dependency container dictionary for the current profiler.
@@ -35,6 +37,7 @@ def __init__(self, environment):
3537
self.timer = environment.get("timer")
3638
self.metadata = environment["agent_metadata"]
3739
self.agent_config_merger = environment["agent_config_merger"]
40+
self.errors_metadata = environment["errors_metadata"]
3841

3942
def _encode_profile(self, profile):
4043
output_profile_stream = io.BytesIO()
@@ -76,18 +79,23 @@ def refresh_configuration(self):
7679
# We handle service exceptions like this in boto3
7780
# see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
7881
if error.response['Error']['Code'] == 'ValidationException':
82+
self.errors_metadata.record_sdk_error("configureAgentErrors")
7983
self.agent_config_merger.disable_profiling()
8084
self._log_request_failed(operation="configure_agent", exception=error)
81-
if error.response['Error']['Code'] == 'ResourceNotFoundException':
85+
elif error.response['Error']['Code'] == 'ResourceNotFoundException':
8286
if self.should_auto_create_profiling_group():
87+
self.errors_metadata.record_sdk_error("configureAgentRnfeAutoCreateEnabledErrors")
8388
logger.info(
8489
"Profiling group not found. Will try to create a profiling group "
8590
"with name = {} and compute platform = {} and retry calling configure agent after 5 minutes. "
8691
"Make sure that Lambda's execution role has AmazonCodeGuruProfilerAgentAccess policy added."
8792
.format(self.profiling_group_name, 'AWSLambda'))
8893
self.create_profiling_group()
8994
else:
95+
self.errors_metadata.record_sdk_error("configureAgentErrors")
9096
self.agent_config_merger.disable_profiling()
97+
else:
98+
self.errors_metadata.record_sdk_error("configureAgentErrors")
9199
except Exception as e:
92100
self._log_request_failed(operation="configure_agent", exception=e)
93101

@@ -117,12 +125,17 @@ def report(self, profile):
117125
if error.response['Error']['Code'] == 'ResourceNotFoundException':
118126
if self.should_auto_create_profiling_group():
119127
self.__class__.is_create_pg_called_during_submit_profile = True
128+
self.errors_metadata.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors")
120129
logger.info(
121130
"Profiling group not found. Will try to create a profiling group "
122131
"with name = {} and compute platform = {} and retry reporting during next invocation. "
123132
"Make sure that Lambda's execution role has AmazonCodeGuruProfilerAgentAccess policy added."
124133
.format(self.profiling_group_name, 'AWSLambda'))
125134
self.create_profiling_group()
135+
else:
136+
self.errors_metadata.record_sdk_error("postAgentProfileErrors")
137+
else:
138+
self.errors_metadata.record_sdk_error("postAgentProfileErrors")
126139
return False
127140
except Exception as e:
128141
self._log_request_failed(operation="post_agent_profile", exception=e)
@@ -143,6 +156,8 @@ def create_profiling_group(self):
143156
if error.response['Error']['Code'] == 'ConflictException':
144157
logger.info("Profiling Group with name {} already exists. Please use a different name."
145158
.format(self.profiling_group_name))
159+
else:
160+
self.errors_metadata.record_sdk_error("createProfilingGroupErrors")
146161
except Exception as e:
147162
self._log_request_failed(operation="create_profiling_group", exception=e)
148163

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import functools
2+
import threading
3+
4+
5+
def synchronized(wrapped):
    """The missing @synchronized decorator

    Returns a wrapper that calls ``wrapped`` while holding a re-entrant lock. The lock is created
    once per decorated function, so every call made through the returned wrapper shares it.

    https://git.io/vydTA"""
    guard = threading.RLock()

    @functools.wraps(wrapped)
    def _locked_call(*args, **kwargs):
        guard.acquire()
        try:
            return wrapped(*args, **kwargs)
        finally:
            # Released on both the normal and the exceptional path.
            guard.release()

    return _locked_call

codeguru_profiler_agent/utils/time.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@
33
import time
44
from datetime import datetime
55

6+
67
def to_iso(epoch_milli):
    """
    Format an epoch timestamp given in milliseconds as an ISO 8601 date-time in UTC.

    :param epoch_milli: number of milliseconds since the Unix epoch.
    :return: a string with millisecond precision and a trailing "Z", for example
        "2020-09-13T12:26:40.500Z"; when the value cannot be converted to a date,
        str(epoch_milli) is returned instead.
    """
    # Imported here so this function does not rely on a change to the module-level imports.
    from datetime import timezone

    try:
        # An aware conversion replaces the deprecated datetime.utcfromtimestamp.
        utc_time = datetime.fromtimestamp(epoch_milli / 1000, tz=timezone.utc)
    except (ValueError, OverflowError, OSError):
        # Out-of-range values are reported with any of these three exceptions depending on
        # their magnitude and on the platform; all of them fall back to the raw value.
        return str(epoch_milli)

    # Drop the tzinfo so isoformat does not append "+00:00"; the explicit "Z" marks UTC.
    return utc_time.replace(tzinfo=None).isoformat(timespec='milliseconds') + "Z"  # ISO 8601 date-time format
1113

14+
1215
def current_milli_time(clock=time.time):
1316
return int(clock() * 1000)

test/integration/test_live_backend_reporting.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from datetime import timedelta
77

8+
from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo
89
from test.help_utils import MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS
910
from test.pytestutils import before
1011

@@ -32,8 +33,10 @@ def before(self):
3233
stacks=[[Frame(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS)]],
3334
attempted_sample_threads_count=1,
3435
seen_threads_count=1)
36+
errors_metadata = ErrorsMetadata()
3537

36-
self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis)
38+
self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis,
39+
AgentDebugInfo(errors_metadata))
3740
# FIXME: Remove adding the end time manually below after feature fully support
3841
self.profile.end = now_millis
3942
self.profile.add(sample)
@@ -47,7 +50,8 @@ def before(self):
4750
"minimum_time_reporting": timedelta(minutes=6),
4851
"max_stack_depth": 2345,
4952
"cpu_limit_percentage": 29,
50-
"agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo())
53+
"agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()),
54+
"errors_metadata": errors_metadata
5155
}
5256
self.environment["codeguru_profiler_builder"] = CodeGuruClientBuilder(self.environment)
5357
self.agent_config = AgentConfiguration(

0 commit comments

Comments
 (0)