Skip to content

Commit f49ff7e

Browse files
authored
Merge pull request #20 from aws/issue-19
Improve implementation of ProfilerDisabler Introduce two layer of check: should_stop_sampling and should_stop_profiling should_stop_sampling is checked whenever we sample (usually every second) and it is responsible to make sure the sampling overhead is low. should_stop_profiling is checked whenever a profiling cycle end (whenever we submit profile) and it is responsible to make sure the profiling cycle overhead is low. The major difference between the two check is the way of calculation of overhead. should_stop_sampling used "average cpu usage for sample / sampling interval" while should_stop_profiling used "total cpu usage for full profiling cycle / profile.get_active_millis_since_start".
2 parents 302f219 + 33152e2 commit f49ff7e

File tree

6 files changed

+175
-62
lines changed

6 files changed

+175
-62
lines changed

codeguru_profiler_agent/local_aggregator.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def __init__(self, reporter, environment=dict()):
4444
self.memory_limit_bytes = environment["memory_limit_bytes"]
4545
self.last_report_attempted = current_milli_time(clock=self.clock)
4646

47-
self._reset()
47+
self.reset()
4848

4949
def add(self, sample):
5050
"""
@@ -69,7 +69,7 @@ def _check_memory_limit(self):
6969
"Profiler memory usage limit has been reached")
7070
self.flush(force=True)
7171

72-
def _reset(self):
72+
def reset(self):
7373
self.profile = self.profile_factory(
7474
profiling_group_name=self.profiling_group_name,
7575
sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(),
@@ -80,7 +80,7 @@ def _reset(self):
8080
self.timer.reset()
8181

8282
@with_timer("flush")
83-
def flush(self, force=False):
83+
def flush(self, force=False, reset=True):
8484
now = current_milli_time(clock=self.clock)
8585
reported = False
8686
if not force and not self._is_over_reporting_interval(now):
@@ -92,8 +92,8 @@ def flush(self, force=False):
9292
self._report_profile(now)
9393
reported = True
9494

95-
if force or reported:
96-
self._reset()
95+
if force or (reset and reported):
96+
self.reset()
9797
return reported
9898

9999
def refresh_configuration(self):

codeguru_profiler_agent/profiler_disabler.py

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import time
33
import logging
44
from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration
5-
from codeguru_profiler_agent.utils.time import current_milli_time
65

76
logger = logging.getLogger(__name__)
87
CHECK_KILLSWITCH_FILE_INTERVAL_SECONDS = 60
@@ -20,13 +19,18 @@ def __init__(self, environment, clock=time.time):
2019
self.killswitch = KillSwitch(environment['killswitch_filepath'], clock)
2120
self.memory_limit_bytes = environment['memory_limit_bytes']
2221

22+
def should_stop_sampling(self, profile=None):
23+
return (self.killswitch.is_killswitch_on()
24+
or self.cpu_usage_check.is_sampling_cpu_usage_limit_reached(profile)
25+
or self._is_memory_limit_reached(profile))
26+
2327
def should_stop_profiling(self, profile=None):
2428
return (self.killswitch.is_killswitch_on()
25-
or self.cpu_usage_check.is_cpu_usage_limit_reached(profile)
26-
or profile is not None and self._is_memory_limit_reached(profile))
29+
or self.cpu_usage_check.is_overall_cpu_usage_limit_reached(profile)
30+
or self._is_memory_limit_reached(profile))
2731

2832
def _is_memory_limit_reached(self, profile):
29-
return profile.get_memory_usage_bytes() > self.memory_limit_bytes
33+
return False if profile is None else profile.get_memory_usage_bytes() > self.memory_limit_bytes
3034

3135

3236
class CpuUsageCheck:
@@ -38,19 +42,45 @@ class CpuUsageCheck:
3842
def __init__(self, timer):
3943
self.timer = timer
4044

41-
def is_cpu_usage_limit_reached(self, profile=None):
45+
def is_overall_cpu_usage_limit_reached(self, profile=None):
46+
"""
47+
This function carries out an overall cpu limit check that covers the cpu overhead caused for the full
48+
sampling cycle: refresh config -> (sample -> aggregate) * n -> profile submission. We expect this function to
49+
be called after profile submission.
50+
"""
4251
profiler_metric = self.timer.metrics.get("runProfiler")
43-
if not profiler_metric or profiler_metric.counter < MINIMUM_MEASURES_IN_DURATION_METRICS:
52+
if not profile or not profiler_metric or profiler_metric.counter < MINIMUM_MEASURES_IN_DURATION_METRICS:
53+
return False
54+
55+
used_time_percentage = 100 * profiler_metric.total/(profile.get_active_millis_since_start()/1000)
56+
57+
cpu_limit_percentage = AgentConfiguration.get().cpu_limit_percentage
58+
59+
if used_time_percentage >= cpu_limit_percentage:
60+
logger.debug(self.timer.metrics)
61+
logger.info(
62+
"Profiler overall cpu usage limit reached: {:.2f} % (limit: {:.2f} %), will stop CodeGuru Profiler."
63+
.format(used_time_percentage, cpu_limit_percentage))
64+
return True
65+
else:
66+
return False
67+
68+
def is_sampling_cpu_usage_limit_reached(self, profile=None):
69+
sample_and_aggregate_metric = self.timer.metrics.get("sampleAndAggregate")
70+
if not sample_and_aggregate_metric or \
71+
sample_and_aggregate_metric.counter < MINIMUM_MEASURES_IN_DURATION_METRICS:
4472
return False
4573

4674
sampling_interval_seconds = self._get_average_sampling_interval_seconds(profile)
47-
used_time_percentage = 100 * profiler_metric.average() / sampling_interval_seconds
75+
used_time_percentage = 100 * sample_and_aggregate_metric.average() / sampling_interval_seconds
76+
77+
cpu_limit_percentage = AgentConfiguration.get().cpu_limit_percentage
4878

49-
if used_time_percentage >= AgentConfiguration.get().cpu_limit_percentage:
79+
if used_time_percentage >= cpu_limit_percentage:
5080
logger.debug(self.timer.metrics)
5181
logger.info(
52-
"Profiler cpu usage limit reached: {:.2f} % (limit: {:.2f} %), will stop CodeGuru Profiler.".format(
53-
used_time_percentage, AgentConfiguration.get().cpu_limit_percentage))
82+
"Profiler sampling cpu usage limit reached: {:.2f} % (limit: {:.2f} %), will stop CodeGuru Profiler."
83+
.format(used_time_percentage, cpu_limit_percentage))
5484
return True
5585
else:
5686
return False

codeguru_profiler_agent/profiler_runner.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,27 +71,37 @@ def _profiling_command(self):
7171
if self._first_execution:
7272
self.collector.setup()
7373
self._first_execution = False
74-
return self._run_profiler()
74+
sample_result = self._run_profiler()
75+
if sample_result.success and sample_result.is_end_of_cycle:
76+
if self.profiler_disabler.should_stop_profiling(profile=self.collector.profile):
77+
return False
78+
self.collector.reset()
79+
return True
80+
return sample_result.success
7581
except:
7682
logger.info("An unexpected issue caused the profiling command to terminate.", exc_info=True)
7783
return False
7884

7985
@with_timer("runProfiler")
8086
def _run_profiler(self):
81-
if self.profiler_disabler.should_stop_profiling(self.collector.profile):
82-
return False
87+
if self.profiler_disabler.should_stop_sampling(self.collector.profile):
88+
return RunProfilerStatus(success=False, is_end_of_cycle=False)
8389

8490
if not self.is_profiling_in_progress:
8591
self._refresh_configuration()
8692

8793
# after the refresh we may be working on a profile
8894
if self.is_profiling_in_progress:
89-
if self.collector.flush():
95+
if self.collector.flush(reset=False):
9096
self.is_profiling_in_progress = False
91-
return True
92-
sample = self.sampler.sample()
93-
self.collector.add(sample)
94-
return True
97+
return RunProfilerStatus(success=True, is_end_of_cycle=True)
98+
self._sample_and_aggregate()
99+
return RunProfilerStatus(success=True, is_end_of_cycle=False)
100+
101+
@with_timer("sampleAndAggregate")
102+
def _sample_and_aggregate(self):
103+
sample = self.sampler.sample()
104+
self.collector.add(sample)
95105

96106
def is_running(self):
97107
return self.scheduler.is_running()
@@ -125,3 +135,9 @@ def pause(self, block=False):
125135
"""
126136
self.scheduler.pause(block)
127137
self.collector.profile.pause()
138+
139+
140+
class RunProfilerStatus:
141+
def __init__(self, success, is_end_of_cycle):
142+
self.success = success
143+
self.is_end_of_cycle = is_end_of_cycle

test/acceptance/test_cpu_limit.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,6 @@ def test_profiler_terminates(self):
3131
# With sampling_interval to be 0.01 seconds, having runProfiler as 0.5 seconds should breach
3232
# the cpu limit. We need to sample 20 times before we check the CPU limit
3333
for i in range(20):
34-
self.timer.record('runProfiler', 0.5)
34+
self.timer.record('sampleAndAggregate', 0.5)
3535

3636
assert wait_for(lambda: not self.profiler.is_running(), timeout_seconds=5)

test/unit/test_profiler_disabler.py

Lines changed: 93 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@ def set_agent_config(sampling_interval_seconds=1, cpu_limit_percentage=DEFAULT_C
2828

2929

3030
def assert_config_sampling_interval_used(process_duration_check, profile):
31-
assert process_duration_check.is_cpu_usage_limit_reached(profile)
31+
assert process_duration_check.is_sampling_cpu_usage_limit_reached(profile)
3232

3333
set_agent_config(sampling_interval_seconds=42, cpu_limit_percentage=80)
34-
assert not process_duration_check.is_cpu_usage_limit_reached(profile)
34+
assert not process_duration_check.is_sampling_cpu_usage_limit_reached(profile)
3535

3636

3737
class TestProfilerDisabler:
@@ -59,28 +59,57 @@ def test_it_sets_all_parameters(self):
5959
assert AgentConfiguration.get().cpu_limit_percentage == DEFAULT_CPU_LIMIT_PERCENTAGE
6060

6161

62-
class TestWhenAnyFails(TestProfilerDisabler):
63-
@before
64-
def before(self):
65-
super().before()
66-
self.profiler = Mock()
67-
self.disabler.killswitch = Mock()
68-
self.disabler.cpu_usage_check = Mock()
69-
self.disabler._is_memory_limit_reached = Mock(return_value=False)
70-
self.disabler.killswitch.is_killswitch_on = Mock(return_value=False)
71-
self.disabler.killswitch.is_process_duration_limit_reached = Mock(return_value=False)
62+
class TestShouldStopSampling:
63+
class TestWhenAnyFails(TestProfilerDisabler):
64+
@before
65+
def before(self):
66+
super().before()
67+
self.disabler.killswitch = Mock()
68+
self.disabler.cpu_usage_check = Mock()
69+
self.disabler.cpu_usage_check.is_sampling_cpu_usage_limit_reached = Mock(return_value=False)
70+
self.disabler._is_memory_limit_reached = Mock(return_value=False)
71+
self.disabler.killswitch.is_killswitch_on = Mock(return_value=False)
72+
self.disabler.killswitch.is_process_duration_limit_reached = Mock(return_value=False)
73+
assert not self.disabler.should_stop_sampling()
74+
75+
def test_it_stops_profiling_if_killswitch_is_on(self):
76+
self.disabler.killswitch.is_killswitch_on = Mock(return_value=True)
77+
assert self.disabler.should_stop_sampling()
78+
79+
def test_it_stops_profiling_if_memory_limit_is_reached(self):
80+
self.disabler._is_memory_limit_reached = Mock(return_value=True)
81+
assert self.disabler.should_stop_sampling()
82+
83+
def test_it_stops_profiling_if_process_duration_is_reached(self):
84+
self.disabler.cpu_usage_check.is_sampling_cpu_usage_limit_reached = Mock(return_value=True)
85+
assert self.disabler.should_stop_sampling()
86+
87+
88+
class TestShouldStopProfiling:
89+
class TestWhenAnyFails(TestProfilerDisabler):
90+
@before
91+
def before(self):
92+
super().before()
93+
self.profiler = Mock()
94+
self.disabler.killswitch = Mock()
95+
self.disabler.cpu_usage_check = Mock()
96+
self.disabler.cpu_usage_check.is_overall_cpu_usage_limit_reached = Mock(return_value=False)
97+
self.disabler._is_memory_limit_reached = Mock(return_value=False)
98+
self.disabler.killswitch.is_killswitch_on = Mock(return_value=False)
99+
self.disabler.killswitch.is_process_duration_limit_reached = Mock(return_value=False)
100+
assert not self.disabler.should_stop_profiling()
72101

73-
def test_it_stops_profiling_if_killswitch_is_on(self):
74-
self.disabler.killswitch.is_killswitch_on = Mock(return_value=True)
75-
assert self.disabler.should_stop_profiling(self.profiler)
102+
def test_it_stops_profiling_if_killswitch_is_on(self):
103+
self.disabler.killswitch.is_killswitch_on = Mock(return_value=True)
104+
assert self.disabler.should_stop_profiling()
76105

77-
def test_it_stops_profiling_if_memory_limit_is_reached(self):
78-
self.disabler._is_memory_limit_reached = Mock(return_value=True)
79-
assert self.disabler.should_stop_profiling(self.profiler)
106+
def test_it_stops_profiling_if_memory_limit_is_reached(self):
107+
self.disabler._is_memory_limit_reached = Mock(return_value=True)
108+
assert self.disabler.should_stop_profiling()
80109

81-
def test_it_stops_profiling_if_process_duration_is_reached(self):
82-
self.disabler.cpu_usage_check.is_cpu_usage_limit_reached = Mock(return_value=True)
83-
assert self.disabler.should_stop_profiling(self.profiler)
110+
def test_it_stops_profiling_if_process_duration_is_reached(self):
111+
self.disabler.cpu_usage_check.is_overall_cpu_usage_limit_reached = Mock(return_value=True)
112+
assert self.disabler.should_stop_profiling()
84113

85114

86115
class TestKillSwitch:
@@ -145,17 +174,17 @@ def test_it_returns_false_after_a_minute(self):
145174
assert not self.killswitch.is_killswitch_on()
146175

147176

148-
class TestCpuUsageCheck:
177+
class TestSamplingCpuUsageCheck:
149178
def before(self):
150179
self.timer = Timer()
151180
self.profile = Mock(spec=Profile)
152181
for i in range(20):
153-
self.timer.record('runProfiler', 0.5)
182+
self.timer.record('sampleAndAggregate', 0.5)
154183
set_agent_config(sampling_interval_seconds=1, cpu_limit_percentage=10)
155184
self.process_duration_check = CpuUsageCheck(self.timer)
156185

157186

158-
class TestGetAverageSamplingIntervalSeconds(TestCpuUsageCheck):
187+
class TestGetAverageSamplingIntervalSeconds(TestSamplingCpuUsageCheck):
159188
@before
160189
def before(self):
161190
super().before()
@@ -176,7 +205,7 @@ def test_when_profiler_sample_count_less_than_min_samples_in_profile_it_returns_
176205
assert CpuUsageCheck._get_average_sampling_interval_seconds(self.profile) == 23
177206

178207

179-
class TestIsCpuUsageLimitReached(TestCpuUsageCheck):
208+
class TestIsSamplingCpuUsageLimitReached(TestSamplingCpuUsageCheck):
180209
@before
181210
def before(self):
182211
super().before()
@@ -187,43 +216,70 @@ def before(self):
187216
yield
188217

189218
def test_it_calls_get_average_sampling_interval_with_profile(self):
190-
self.process_duration_check.is_cpu_usage_limit_reached(self.profile)
219+
self.process_duration_check.is_sampling_cpu_usage_limit_reached(self.profile)
191220
self.get_average_sampling_interval_mock.assert_called_once_with(self.profile)
192221

193222
def test_when_average_duration_exceeds_limit_it_returns_true(self):
194223
# timer: (0.5/4) * 100= 12.5%
195-
assert self.process_duration_check.is_cpu_usage_limit_reached()
224+
assert self.process_duration_check.is_sampling_cpu_usage_limit_reached()
196225

197-
def test_when_average_duragtion_is_below_limit_it_returns_false(self):
226+
def test_when_average_duration_is_below_limit_it_returns_false(self):
198227
# timer: (0.5/4) * 100= 12.5%
199228
set_agent_config(cpu_limit_percentage=13)
200-
assert not self.process_duration_check.is_cpu_usage_limit_reached()
229+
assert not self.process_duration_check.is_sampling_cpu_usage_limit_reached()
201230

202231
def test_when_profile_is_none_it_calls_get_average_sampling_interval_without_profile(self):
203-
self.process_duration_check.is_cpu_usage_limit_reached()
232+
self.process_duration_check.is_sampling_cpu_usage_limit_reached()
204233
self.get_average_sampling_interval_mock.assert_called_once_with(None)
205234

206235

207-
class TestWhenTimerDoesNotHaveTheKey(TestCpuUsageCheck):
236+
class TestIsOverallCpuUsageLimitReached():
237+
@before
238+
def before(self):
239+
self.timer = Timer()
240+
self.profile = Mock(spec=Profile)
241+
for i in range(20):
242+
self.timer.record('runProfiler', 0.5)
243+
set_agent_config(cpu_limit_percentage=9)
244+
self.process_duration_check = CpuUsageCheck(self.timer)
245+
self.profile.get_active_millis_since_start = Mock(return_value=100*1000)
246+
247+
def test_when_average_duration_exceeds_limit_it_returns_true(self):
248+
# timer: (0.5*20/100) * 100= 10%
249+
assert self.process_duration_check.is_overall_cpu_usage_limit_reached(self.profile)
250+
251+
def test_when_average_duration_is_below_limit_it_returns_false(self):
252+
# timer: (0.5*20/100) * 100= 10%
253+
set_agent_config(cpu_limit_percentage=11)
254+
assert not self.process_duration_check.is_overall_cpu_usage_limit_reached(self.profile)
255+
256+
def test_when_profile_is_none_it_returns_false(self):
257+
assert not self.process_duration_check.is_overall_cpu_usage_limit_reached()
258+
259+
260+
class TestWhenTimerDoesNotHaveTheKey(TestSamplingCpuUsageCheck):
208261
@before
209262
def before(self):
210263
super().before()
211264

212265
def test_it_returns_false(self):
213266
self.process_duration_check.timer = Timer()
214-
assert not self.process_duration_check.is_cpu_usage_limit_reached()
267+
assert not self.process_duration_check.is_sampling_cpu_usage_limit_reached()
215268

216269

217-
class TestWhenTimerDoesNotHaveEnoughMeasures(TestCpuUsageCheck):
270+
class TestWhenTimerDoesNotHaveEnoughMeasures(TestSamplingCpuUsageCheck):
218271
@before
219272
def before(self):
220273
super().before()
221-
222-
def test_it_returns_false(self):
223274
self.timer.reset()
224275
for i in range(4):
225-
self.timer.record('runProfiler', 0.5)
226-
assert not self.process_duration_check.is_cpu_usage_limit_reached()
276+
self.timer.record('sampleAndAggregate', 0.5)
277+
278+
def test_sampling_cpu_usage_limit_reached_returns_false(self):
279+
assert not self.process_duration_check.is_sampling_cpu_usage_limit_reached()
280+
281+
def test_overall_cpu_usage_limit_reached_returns_false(self):
282+
assert not self.process_duration_check.is_overall_cpu_usage_limit_reached()
227283

228284

229285
class TestMemoryLimitCheck:

0 commit comments

Comments
 (0)