Skip to content

Commit 8f30db7

Browse files
committed
Fix error count calculation for Rnfe errors
Added comments for more clarity.
1 parent a659e33 commit 8f30db7

File tree

5 files changed

+217
-164
lines changed

5 files changed

+217
-164
lines changed
Lines changed: 70 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,19 @@
1-
from codeguru_profiler_agent.utils.synchronization import synchronized
1+
import logging
2+
import os
3+
4+
logger = logging.getLogger(__name__)
25

36

47
class ErrorsMetadata:
58
def __init__(self):
    """
    Create the metadata with every error counter at its initial value.

    All counters are initialised by reset(), so construction and reset share a single
    definition of the starting state.
    """
    self.reset()
1310

1411
def reset(self):
12+
"""
13+
We want to differentiate API call errors more granularly. We want to gather ResourceNotFoundException errors
14+
because we are going to get this exception with auto-create feature and want to monitor how many times
15+
the agent is not able to create the PG and resulting in subsequent ResourceNotFoundException.
16+
"""
1517
self.errors_count = 0
1618
self.sdk_client_errors = 0
1719
self.configure_agent_errors = 0
@@ -20,12 +22,10 @@ def reset(self):
2022
self.post_agent_profile_errors = 0
2123
self.post_agent_profile_rnfe_auto_create_enabled_errors = 0
2224

23-
"""
24-
This needs to be compliant with errors count schema.
25-
https://code.amazon.com/packages/SkySailProfileIonSchema/blobs/811cc0e7e406e37a5b878acf31468be3dcd2963d/--/src/main/resources/schema/DebugInfo.isl#L21
26-
"""
27-
2825
def serialize_to_json(self):
26+
"""
27+
This needs to be compliant with errors count schema.
28+
"""
2929
return {
3030
"errorsCount": self.errors_count,
3131
"sdkClientErrors": self.sdk_client_errors,
@@ -36,35 +36,86 @@ def serialize_to_json(self):
3636
"postAgentProfileRnfeAutoCreateEnabledErrors": self.post_agent_profile_rnfe_auto_create_enabled_errors
3737
}
3838

39-
@synchronized
4039
def increment_sdk_error(self, error_type):
    """
    Count one SDK client error and, when recognised, its API level category.

    ErrorsCount is the umbrella of all the kinds of error we want to capture. Currently we
    have only SdkClientErrors in it. SdkClientErrors is comprised of different API level
    errors like ConfigureAgentErrors, PostAgentProfileErrors, CreateProfilingGroupErrors.

    :param error_type: The type of API level error that we want to capture. A value that
        matches none of the known types still increments the two umbrella counters, but
        no API level counter.
    """
    self.errors_count += 1
    self.sdk_client_errors += 1

    # Special handling for ResourceNotFoundException (Rnfe) errors: an Rnfe error is also an
    # error of its parent API, e.g. a configureAgentRnfeAutoCreateEnabledErrors is also a
    # configureAgentErrors, so both counters are incremented.
    if error_type == "configureAgentErrors":
        self.configure_agent_errors += 1
    elif error_type == "configureAgentRnfeAutoCreateEnabledErrors":
        self.configure_agent_errors += 1
        self.configure_agent_rnfe_auto_create_enabled_errors += 1
    elif error_type == "createProfilingGroupErrors":
        self.create_profiling_group_errors += 1
    elif error_type == "postAgentProfileErrors":
        self.post_agent_profile_errors += 1
    elif error_type == "postAgentProfileRnfeAutoCreateEnabledErrors":
        self.post_agent_profile_errors += 1
        self.post_agent_profile_rnfe_auto_create_enabled_errors += 1
5465

5566
def record_sdk_error(self, error_type):
    """
    Record one SDK client error of the given type.

    Thin entry point that hands the error type over to increment_sdk_error, which
    updates the counters.

    :param error_type: The type of API level error that we want to capture.
    """
    self.increment_sdk_error(error_type)
5768

5869

5970
class AgentDebugInfo:
    """
    Debug information about the agent, serializable to a JSON-compatible dict.

    Every piece of information is optional: a value that is None (or a timer without any
    metric) is left out of the serialized output instead of being emitted as null.
    """

    def __init__(self, errors_metadata=None, agent_start_time=None, timer=None):
        """
        :param errors_metadata: object providing serialize_to_json() for the error counters,
            or None to omit them.
        :param agent_start_time: start time of the agent, converted with int() when
            serialized, or None to omit it.
        :param timer: object providing a `metrics` mapping from metric name to an object
            with a `max` attribute and an `average()` method, or None to omit the metrics.
        """
        self.process_id = get_process_id()
        self.errors_metadata = errors_metadata
        self.agent_start_time = agent_start_time
        self.timer = timer

    def serialize_to_json(self):
        """
        This needs to be compliant with agent debug info schema.

        :return: a new dict holding only the entries whose value is available.
        """
        # Not named `json`, to avoid shadowing the standard library module name.
        debug_info = {}

        self.add_agent_start_time(debug_info)
        self.add_process_id(debug_info)
        self.add_errors_metadata(debug_info)
        self.add_generic_metrics(debug_info)

        return debug_info

    def add_agent_start_time(self, json):
        """Set "agentStartTime" in the given dict when the start time is known."""
        if self.agent_start_time is not None:
            json["agentStartTime"] = int(self.agent_start_time)

    def add_errors_metadata(self, json):
        """Set "errorsCount" in the given dict when errors metadata is available."""
        if self.errors_metadata is not None:
            json["errorsCount"] = self.errors_metadata.serialize_to_json()

    def add_process_id(self, json):
        """Set "processId" in the given dict when the process id is known."""
        if self.process_id is not None:
            json["processId"] = self.process_id

    def add_generic_metrics(self, json):
        """
        Set "genericMetrics" in the given dict when the timer holds at least one metric.

        Each metric contributes two entries, "<name>_max" and "<name>_average".
        """
        if self.timer is None or not self.timer.metrics:
            return

        generic_metrics = {}
        for metric, metric_value in self.timer.metrics.items():
            generic_metrics[metric + "_max"] = metric_value.max
            generic_metrics[metric + "_average"] = metric_value.average()

        # The mapping is non-empty here (checked above), so there is always something to add.
        json["genericMetrics"] = generic_metrics
113+
114+
115+
def get_process_id():
    """
    Return the id of the current process, or None if it cannot be obtained.

    Best effort: a failure is logged at info level and is not propagated to the caller.

    :return: the process id as returned by os.getpid(), or None on failure.
    """
    try:
        return os.getpid()
    except Exception as e:
        # Lazy logging arguments: the message is only formatted if the record is emitted.
        # "%r" yields the same text as the previous `+ repr(e)` concatenation.
        logger.info("Failed to get the process id, %r", e)
        return None
121+

codeguru_profiler_agent/local_aggregator.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ def __init__(self, reporter, environment=dict()):
4747
self.profile = None
4848
self.memory_limit_bytes = environment["memory_limit_bytes"]
4949
self.last_report_attempted = current_milli_time(clock=self.clock)
50+
self.agent_start_time = int(current_milli_time(clock=self.clock))
5051

5152
self.reset()
5253

@@ -75,15 +76,15 @@ def _check_memory_limit(self):
7576

7677
def reset(self):
    """
    Clear the error counters and the timer, then replace the current profile with a
    new one.

    The new profile is created by profile_factory and receives an AgentDebugInfo built
    from the errors metadata, the agent start time and the timer. The timer is reset
    before that AgentDebugInfo is created.
    """
    self.errors_metadata.reset()
    self.timer.reset()
    self.profile = self.profile_factory(
        profiling_group_name=self.profiling_group_name,
        sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(),
        host_weight=self.host_weight,
        start=current_milli_time(clock=self.clock),
        agent_debug_info=AgentDebugInfo(
            errors_metadata=self.errors_metadata,
            agent_start_time=self.agent_start_time,
            timer=self.timer,
        ),
        clock=self.clock,
    )
86-
self.timer.reset()
8788

8889
@with_timer("flush")
8990
def flush(self, force=False, reset=True):

codeguru_profiler_agent/utils/synchronization.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

0 commit comments

Comments
 (0)