Skip to content

Commit d8ca2b6

Browse files
authored
reduce noise of zenduty alerts (#70)
* reduce noise of zenduty alerts * keep resolution threshold at 2 minutes * fix zenduty ratelimited log * ensure event is dropped from memory when resolved * update comment * bump version
1 parent 7088a26 commit d8ca2b6

File tree

3 files changed

+34
-17
lines changed

3 files changed

+34
-17
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ ignore_missing_imports = true
44

55
[tool.poetry]
66
name = "pyth-observer"
7-
version = "0.2.6"
7+
version = "0.2.7"
88
description = "Alerts and stuff"
99
authors = []
1010
readme = "README.md"

pyth_observer/dispatch.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
from pyth_observer.event import DatadogEvent # Used dynamically
1515
from pyth_observer.event import LogEvent # Used dynamically
1616
from pyth_observer.event import TelegramEvent # Used dynamically
17-
from pyth_observer.event import ZendutyEvent # Used dynamically
18-
from pyth_observer.event import Event
17+
from pyth_observer.event import Context, Event, ZendutyEvent
1918
from pyth_observer.zenduty import send_zenduty_alert
2019

2120
assert DatadogEvent
@@ -46,6 +45,9 @@ def __init__(self, config, publishers):
4645
if "ZendutyEvent" in self.config["events"]:
4746
self.open_alerts_file = os.environ["OPEN_ALERTS_FILE"]
4847
self.open_alerts = self.load_alerts()
48+
# below is used to store events to send later if multiple failures occur
49+
# events cannot be stored in open_alerts as they are not JSON serializable.
50+
self.zenduty_events = {}
4951

5052
def load_alerts(self):
5153
try:
@@ -68,17 +70,14 @@ async def run(self, states: List[State]):
6870

6971
# Then, wrap each failed check in events and send them
7072
sent_events: List[Awaitable] = []
71-
context = {
72-
"network": self.config["network"]["name"],
73-
"publishers": self.publishers,
74-
}
73+
context = Context(
74+
network=self.config["network"]["name"], publishers=self.publishers
75+
)
7576

7677
for check in failed_checks:
7778
for event_type in self.config["events"]:
7879
event: Event = globals()[event_type](check, context)
7980

80-
sent_events.append(event.send())
81-
8281
if event_type == "ZendutyEvent":
8382
# Add failed check to open alerts
8483
alert_identifier = (
@@ -87,28 +86,45 @@ async def run(self, states: List[State]):
8786
state = check.state()
8887
if isinstance(state, PublisherState):
8988
alert_identifier += f"-{state.publisher_name}"
90-
self.open_alerts[alert_identifier] = datetime.now().isoformat()
89+
try:
90+
failures = self.open_alerts[alert_identifier]["failures"] + 1
91+
except KeyError:
92+
failures = 1
93+
self.open_alerts[alert_identifier] = {
94+
"last_failure": datetime.now().isoformat(),
95+
"failures": failures,
96+
}
97+
# store the event to send it later if it fails multiple times
98+
self.zenduty_events[alert_identifier] = event
99+
continue # do not immediately send a zenduty alert
100+
101+
sent_events.append(event.send())
91102

92103
await asyncio.gather(*sent_events)
93104

94-
# Check open alerts and resolve those that are older than 2 minutes
105+
# Check open alerts for zenduty
95106
if "ZendutyEvent" in self.config["events"]:
96107

97108
to_remove = []
98109
current_time = datetime.now()
99-
for identifier, last_failure in self.open_alerts.items():
100-
if current_time - datetime.fromisoformat(last_failure) >= timedelta(
101-
minutes=2
102-
):
110+
for identifier, info in self.open_alerts.items():
111+
# Resolve the alert if it last failed at least 2 minutes ago
112+
if current_time - datetime.fromisoformat(
113+
info["last_failure"]
114+
) >= timedelta(minutes=2):
103115
logger.debug(f"Resolving Zenduty alert {identifier}")
104116
response = await send_zenduty_alert(
105117
alert_identifier=identifier, message=identifier, resolved=True
106118
)
107119
if response and 200 <= response.status < 300:
108120
to_remove.append(identifier)
121+
elif info["failures"] > 2:
122+
# Raise alert if the check has failed more than twice before self-resolving
123+
await self.zenduty_events[identifier].send()
109124

110125
for identifier in to_remove:
111126
del self.open_alerts[identifier]
127+
del self.zenduty_events[identifier]
112128

113129
# Write open alerts to file to ensure persistence
114130
with open(self.open_alerts_file, "w") as file:

pyth_observer/zenduty.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,12 @@ async def send_zenduty_alert(alert_identifier, message, resolved=False, summary=
3333
elif response.status == 429:
3434
retries += 1
3535
if retries < max_retries:
36+
sleeptime = min(30, 2**retries)
3637
logger.error(
37-
f"Received 429 Too Many Requests for {alert_identifier}. Retrying in 1 second..."
38+
f"Received 429 Too Many Requests for {alert_identifier}. Retrying in {sleeptime} s..."
3839
)
3940
await asyncio.sleep(
40-
min(30, 2**retries)
41+
sleeptime
4142
) # Backoff before retrying, wait up to 30s
4243
else:
4344
logger.error(

0 commit comments

Comments
 (0)