Skip to content

Commit d55e0b9

Browse files
authored
Fixing bug in calculating active time since start (#40)
* Fixing bug in calculating active time since start In case of lambda where there are pause times, it should be considered when calculating active time. Essentially total pause time should be removed from [end-start]. Earlier we were considering the last sample time to be the end of the profile but in some corner cases where there is huge pause after that we would include it in the calculation even though pause is after the "end" of the profile. So updating the way we calculate "end" of the profile. For more detailed information, please have a look at the comments. Also, added some debug logs. * Fixing unit tests * Updating comment and minor change in resume * Fixing unit tests for latest changes
1 parent ccd6348 commit d55e0b9

File tree

3 files changed

+42
-10
lines changed

3 files changed

+42
-10
lines changed

codeguru_profiler_agent/model/profile.py

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import time
2+
import logging
23

34
from random import SystemRandom
45
from codeguru_profiler_agent.model.call_graph_node import CallGraphNode
@@ -9,6 +10,7 @@
910

1011
ROOT_NODE_NAME = "ALL"
1112

13+
logger = logging.getLogger(__name__)
1214

1315
class Profile:
1416
def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, agent_debug_info, clock=time.time):
@@ -56,11 +58,40 @@ def get_active_millis_since_start(self):
5658
This returns the total "active" wall clock time since start. In AWS lambda the process can be frozen, the
5759
time while we are frozen should not be counted in here. In an EC2 or other type of host it is simply the wall
5860
clock time since start.
59-
If the "end" time has been set, we give the active time between start and end otherwise between start and now
61+
62+
Previously self.end was used to calculate active_millis_since_start but self.end is updated when a sample is added
63+
so in rare cases where a sample is collected before the last pause time then we might add additional pause time
64+
which can lead to incorrect calculation of active time. In worst cases, it can even lead to negative values depending
65+
on the pause time.
66+
67+
Below is an example indicating the potential error of considering self.end in the calculation.
68+
------------------------------------------------------------------
69+
| | | | | | | | ||||||
70+
S P R P R SE P R REPORT
71+
s1_________|___p1___|____|____p2___|____e1____|___p3__|__________|
72+
73+
S - Start
74+
P - Pause
75+
R - Resume
76+
SE - self.end (last sample)
77+
REPORT - Calculating active time.
78+
79+
If we consider self.end which is e1 in above case then active time would be e1-s1-(p1+p2+p3). But pause p3 is after e1
80+
so that leads to incorrect calculation of active time.
81+
82+
Ideally we would want to set profile_end to be last sample time and subtract pause times only before that but it requires
83+
additional work in maintaining pause time which isn't worth as it makes the logic complex with very little gain.
84+
85+
So we are setting it to current time and in some corner cases to last_pause time.
6086
"""
61-
end = self.last_pause if self.last_pause is not None else \
62-
self._end if self._end is not None else current_milli_time(clock=self._clock)
63-
return end - self.start - self._paused_ms
87+
end = self.last_pause if self.last_pause is not None else current_milli_time(clock=self._clock)
88+
active_time_millis_since_start = end - self.start - self._paused_ms
89+
logger.debug(
90+
"Active time since start is {activeTime} which is calculated using start: {start}, end: {end}, last_pause: {last_pause}, paused_ms: {paused_ms}, last_resume: {last_resume}"
91+
.format(activeTime = active_time_millis_since_start, start = self.start, end = self._end, last_pause = self.last_pause, paused_ms = self._paused_ms, last_resume = self.last_resume)
92+
)
93+
94+
return active_time_millis_since_start
6495

6596
def add(self, sample):
6697
"""
@@ -113,8 +144,9 @@ def resume(self):
113144
# resume gets called when profile is running
114145
return
115146
self.last_resume = current_milli_time(clock=self._clock)
116-
self._paused_ms += self.last_resume - self.last_pause
147+
prev_last_pause = self.last_pause
117148
self.last_pause = None
149+
self._paused_ms += self.last_resume - prev_last_pause
118150

119151
def is_empty(self):
120152
return self.total_seen_threads_count == 0.0

test/unit/model/test_profile.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -420,10 +420,10 @@ def before(self):
420420
self.subject.resume()
421421
self.turn_clock(seconds=5)
422422
self.subject.end = self.mock_clock() * 1000
423-
self.turn_clock(seconds=1) # that extra second should not be included in the duration
423+
self.turn_clock(seconds=5)
424424

425425
def test_it_returns_duration_up_to_end_time(self):
426-
assert self.subject.get_active_millis_since_start() == 14000
426+
assert self.subject.get_active_millis_since_start() == 19000
427427

428428

429429
class TestSetOverhead(TestProfile):

test/unit/sdk_reporter/test_sdk_profile_encoder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323

2424
def example_profile():
2525
start_time = 1514764800000
26-
end_time = 1514772000000
26+
end_time = 1514771000000
2727
profile = Profile(profiling_group_name="TestProfilingGroupName", sampling_interval_seconds=1.0, host_weight=2,
28-
start=start_time, agent_debug_info=AgentDebugInfo(errors_metadata), clock=lambda: 1514772000000)
28+
start=start_time, agent_debug_info=AgentDebugInfo(errors_metadata), clock=lambda: 1514772000)
2929
profile.add(
3030
Sample(stacks=[[Frame("bottom"), Frame("middle"), Frame("top")],
3131
[Frame("bottom"), Frame("middle"), Frame("different_top")],
@@ -89,7 +89,7 @@ def test_it_includes_the_start_time_from_the_profile_in_epoch_millis(
8989

9090
def test_it_includes_the_end_time_from_the_profile_in_epoch_millis(
9191
self):
92-
assert (self.decoded_json_result()["end"] == 1514772000000)
92+
assert (self.decoded_json_result()["end"] == 1514771000000)
9393

9494
def test_it_includes_the_agent_info_in_the_agent_metadata(self):
9595
assert (self.decoded_json_result()["agentMetadata"]["agentInfo"]["type"] ==

0 commit comments

Comments
 (0)