14
14
from pyth_observer .event import DatadogEvent # Used dynamically
15
15
from pyth_observer .event import LogEvent # Used dynamically
16
16
from pyth_observer .event import TelegramEvent # Used dynamically
17
- from pyth_observer .event import ZendutyEvent # Used dynamically
18
- from pyth_observer .event import Event
17
+ from pyth_observer .event import Context , Event , ZendutyEvent
19
18
from pyth_observer .zenduty import send_zenduty_alert
20
19
21
20
assert DatadogEvent
@@ -46,6 +45,9 @@ def __init__(self, config, publishers):
46
45
if "ZendutyEvent" in self .config ["events" ]:
47
46
self .open_alerts_file = os .environ ["OPEN_ALERTS_FILE" ]
48
47
self .open_alerts = self .load_alerts ()
48
+ # below is used to store events to later send if multiple failures occur
49
+ # events cannot be stored in open_alerts as they are not JSON serializable.
50
+ self .zenduty_events = {}
49
51
50
52
def load_alerts (self ):
51
53
try :
@@ -68,17 +70,14 @@ async def run(self, states: List[State]):
68
70
69
71
# Then, wrap each failed check in events and send them
70
72
sent_events : List [Awaitable ] = []
71
- context = {
72
- "network" : self .config ["network" ]["name" ],
73
- "publishers" : self .publishers ,
74
- }
73
+ context = Context (
74
+ network = self .config ["network" ]["name" ], publishers = self .publishers
75
+ )
75
76
76
77
for check in failed_checks :
77
78
for event_type in self .config ["events" ]:
78
79
event : Event = globals ()[event_type ](check , context )
79
80
80
- sent_events .append (event .send ())
81
-
82
81
if event_type == "ZendutyEvent" :
83
82
# Add failed check to open alerts
84
83
alert_identifier = (
@@ -87,28 +86,45 @@ async def run(self, states: List[State]):
87
86
state = check .state ()
88
87
if isinstance (state , PublisherState ):
89
88
alert_identifier += f"-{ state .publisher_name } "
90
- self .open_alerts [alert_identifier ] = datetime .now ().isoformat ()
89
+ try :
90
+ failures = self .open_alerts [alert_identifier ]["failures" ] + 1
91
+ except KeyError :
92
+ failures = 1
93
+ self .open_alerts [alert_identifier ] = {
94
+ "last_failure" : datetime .now ().isoformat (),
95
+ "failures" : failures ,
96
+ }
97
+ # store the event to send it later if it fails multiple times
98
+ self .zenduty_events [alert_identifier ] = event
99
+ continue # do not immediately send a zenduty alert
100
+
101
+ sent_events .append (event .send ())
91
102
92
103
await asyncio .gather (* sent_events )
93
104
94
- # Check open alerts and resolve those that are older than 2 minutes
105
+ # Check open alerts for zenduty
95
106
if "ZendutyEvent" in self .config ["events" ]:
96
107
97
108
to_remove = []
98
109
current_time = datetime .now ()
99
- for identifier , last_failure in self .open_alerts .items ():
100
- if current_time - datetime .fromisoformat (last_failure ) >= timedelta (
101
- minutes = 2
102
- ):
110
+ for identifier , info in self .open_alerts .items ():
111
+ # Resolve the alert if it last failed at least 2 minutes ago
112
+ if current_time - datetime .fromisoformat (
113
+ info ["last_failure" ]
114
+ ) >= timedelta (minutes = 2 ):
103
115
logger .debug (f"Resolving Zenduty alert { identifier } " )
104
116
response = await send_zenduty_alert (
105
117
alert_identifier = identifier , message = identifier , resolved = True
106
118
)
107
119
if response and 200 <= response .status < 300 :
108
120
to_remove .append (identifier )
121
+ elif info ["failures" ] > 2 :
122
+ # Raise alert if the check has failed more than twice before self-resolving
123
+ await self .zenduty_events [identifier ].send ()
109
124
110
125
for identifier in to_remove :
111
126
del self .open_alerts [identifier ]
127
+ del self .zenduty_events [identifier ]
112
128
113
129
# Write open alerts to file to ensure persistence
114
130
with open (self .open_alerts_file , "w" ) as file :
0 commit comments