Skip to content

Commit

Permalink
Instances now get their Status updated correctly, ICMPChecks can't ha…
Browse files Browse the repository at this point in the history
…ve more than one Instance attached anymore (used to error instead). Minor cleanups.
  • Loading branch information
Jonathan Montineri authored and dbuxton committed Aug 1, 2014
1 parent 6cb4bd4 commit 558f18c
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 24 deletions.
88 changes: 67 additions & 21 deletions app/cabotapp/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,23 @@
from django.core.exceptions import ValidationError
from polymorphic import PolymorphicModel
from django.db.models import F
from django.core.urlresolvers import reverse
from django.contrib.admin.models import User

from jenkins import get_job_status
from .alert import send_alert
from .calendar import get_events
from .graphite import parse_metric
from .alert import send_alert
from .tasks import update_service
from .tasks import update_service, update_instance
from datetime import datetime, timedelta
from django.utils import timezone

import json
import re
import time
import os
import subprocess

import requests
from celery.utils.log import get_task_logger
Expand Down Expand Up @@ -132,24 +134,6 @@ class Meta:
# Text representation of the object (Python 2 ``__unicode__`` hook):
# returns the object's ``name`` attribute unchanged.
def __unicode__(self):
return self.name

def update_status(self):
    """Recompute the overall status, store a snapshot and alert if required.

    The status held before the recomputation is kept on
    ``self.old_overall_status`` and the freshly saved
    ``ServiceStatusSnapshot`` on ``self.snapshot``.  ``self.alert()`` is
    invoked unless both the previous and the new status are passing.
    """
    previous_status = self.overall_status
    self.old_overall_status = previous_status

    # Only active checks feed into our calculation
    failed_total = self.all_failing_checks().count()
    self.overall_status = self.most_severe(self.all_failing_checks())

    snapshot_values = dict(
        service=self,
        num_checks_active=self.active_status_checks().count(),
        num_checks_passing=self.active_status_checks().count() - failed_total,
        num_checks_failing=failed_total,
        overall_status=self.overall_status,
        time=timezone.now(),
    )
    self.snapshot = ServiceStatusSnapshot(**snapshot_values)
    self.snapshot.save()
    self.save()

    passing = Service.PASSING_STATUS
    if self.overall_status == passing and self.old_overall_status == passing:
        # Passing before and passing now: nothing to report.
        return
    self.alert()

def most_severe(self, check_list):
failures = [c.importance for c in check_list]
Expand Down Expand Up @@ -233,6 +217,24 @@ def all_failing_checks(self):

class Service(CheckGroupMixin):

def update_status(self):
    """Recompute this service's overall status, snapshot it and alert.

    Side effects, in order:

    * ``self.old_overall_status`` receives the status held before the
      recomputation.
    * ``self.overall_status`` becomes ``most_severe`` of the failing checks.
    * A ``ServiceStatusSnapshot`` is created, saved and kept on
      ``self.snapshot``; then the service itself is saved.
    * ``self.alert()`` is called unless both the previous and the new
      status equal ``Service.PASSING_STATUS``.
    """
    self.old_overall_status = self.overall_status
    # Only active checks feed into our calculation.
    # Build the failing-check collection once and reuse it for the count and
    # for the severity computation instead of constructing it twice.
    failing_checks = self.all_failing_checks()
    status_checks_failed_count = failing_checks.count()
    # Count the active checks once so that num_checks_active and
    # num_checks_passing are derived from the same read (the original issued
    # the count twice, which could disagree if checks changed in between).
    active_checks_count = self.active_status_checks().count()
    self.overall_status = self.most_severe(failing_checks)
    self.snapshot = ServiceStatusSnapshot(
        service=self,
        num_checks_active=active_checks_count,
        num_checks_passing=active_checks_count - status_checks_failed_count,
        num_checks_failing=status_checks_failed_count,
        overall_status=self.overall_status,
        time=timezone.now(),
    )
    self.snapshot.save()
    self.save()
    # Stay silent only when the service was passing and is still passing.
    if not (self.overall_status == Service.PASSING_STATUS and
            self.old_overall_status == Service.PASSING_STATUS):
        self.alert()
instances = models.ManyToManyField(
'Instance',
blank=True,
Expand All @@ -252,6 +254,26 @@ class Meta:
class Instance(CheckGroupMixin):


def update_status(self):
    """Recompute this instance's overall status, snapshot it and alert.

    Side effects, in order:

    * ``self.old_overall_status`` receives the status held before the
      recomputation.
    * ``self.overall_status`` becomes ``most_severe`` of the failing checks.
    * An ``InstanceStatusSnapshot`` is created, saved and kept on
      ``self.snapshot``; then the instance itself is saved.
    * ``self.alert()`` is called unless both the previous and the new
      status equal ``Service.PASSING_STATUS`` (the same constant the
      snapshot model uses as its default status).
    """
    self.old_overall_status = self.overall_status
    # Only active checks feed into our calculation.
    # Build the failing-check collection once and reuse it for the count and
    # for the severity computation instead of constructing it twice.
    failing_checks = self.all_failing_checks()
    status_checks_failed_count = failing_checks.count()
    # Count the active checks once so that num_checks_active and
    # num_checks_passing are derived from the same read (the original issued
    # the count twice, which could disagree if checks changed in between).
    active_checks_count = self.active_status_checks().count()
    self.overall_status = self.most_severe(failing_checks)
    self.snapshot = InstanceStatusSnapshot(
        instance=self,
        num_checks_active=active_checks_count,
        num_checks_passing=active_checks_count - status_checks_failed_count,
        num_checks_failing=status_checks_failed_count,
        overall_status=self.overall_status,
        time=timezone.now(),
    )
    self.snapshot.save()
    self.save()
    # Stay silent only when the instance was passing and is still passing.
    if not (self.overall_status == Service.PASSING_STATUS and
            self.old_overall_status == Service.PASSING_STATUS):
        self.alert()


class Meta:
ordering = ['name']

Expand Down Expand Up @@ -279,6 +301,17 @@ class ServiceStatusSnapshot(models.Model):
def __unicode__(self):
    """Return ``"<service name>: <overall status>"`` as unicode text."""
    service_name = self.service.name
    return u"{0}: {1}".format(service_name, self.overall_status)

class InstanceStatusSnapshot(models.Model):
    """Point-in-time record of an Instance's check counts and overall status."""

    # Instance the snapshot belongs to; the reverse accessor on the instance
    # is named ``snapshots``.
    instance = models.ForeignKey(Instance, related_name='snapshots')
    # Moment the snapshot was taken (indexed column).
    time = models.DateTimeField(db_index=True)
    # Check counters, all defaulting to zero.
    num_checks_active = models.IntegerField(default=0)
    num_checks_passing = models.IntegerField(default=0)
    num_checks_failing = models.IntegerField(default=0)
    # Overall status at snapshot time; defaults to the passing status.
    overall_status = models.TextField(default=Service.PASSING_STATUS)
    # NOTE(review): integer column with a boolean default; presumably used as
    # a flag. Changing the field type would need a schema migration - confirm.
    did_send_alert = models.IntegerField(default=False)

    def __unicode__(self):
        """Return ``"<instance name>: <overall status>"`` as unicode text."""
        instance_name = self.instance.name
        return u"{0}: {1}".format(instance_name, self.overall_status)

class StatusCheck(PolymorphicModel):

Expand Down Expand Up @@ -403,6 +436,11 @@ def run(self):
except Exception as e:
result = StatusCheckResult(check=self)
result.error = u'Error in performing check: %s' % (e,)
if result.error.startswith("Error in performing check: get() returned more than one Instance"):
first_instance = self.instance_set.all().order_by('id')[0]
self.instance_set = [first_instance]
first_instance_link = '<a href="%s">' % reverse('instance', kwargs={'pk': first_instance.pk}) + first_instance.name + "</a>"
result.error = "Error: This type of check can only be attached to one instance. All instances, apart from the oldest one (%s), have been detached from this check. The check will run normally next time." % first_instance_link
result.succeeded = False
finish = timezone.now()
result.time = start
Expand All @@ -427,13 +465,15 @@ def save(self, *args, **kwargs):
ret = super(StatusCheck, self).save(*args, **kwargs)
# Update linked services
self.update_related_services()
self.update_related_instances()
return ret

def update_related_services(self):
    """Submit an ``update_service`` task for every service in ``service_set``.

    Each task is submitted through ``.delay`` with the service's id.
    """
    for linked_service in self.service_set.all():
        update_service.delay(linked_service.id)

def update_related_instances(self):
    """Submit an ``update_instance`` task for every instance in ``instance_set``.

    Each task is submitted through ``.delay`` with the instance's id.
    """
    instances = self.instance_set.all()
    for instance in instances:
        # Must be the instance task: the previous code submitted
        # ``update_service`` with an Instance id, so the Service lookup would
        # hit an unrelated (or missing) row and the instance status was never
        # refreshed.
        update_instance.delay(instance.id)
Expand All @@ -451,12 +491,18 @@ def _run(self):
result = StatusCheckResult(check=self)
instances = self.instance_set.all()
target = self.instance_set.get().address
response = os.system("ping -c 1 " + target)

#We need to read both STDOUT and STDERR because ping can write to both, depending on the kind of error. Thanks a lot, ping.
ping_process = subprocess.Popen("ping -c 1 " + target, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
response = ping_process.wait()

if response == 0:
result.succeeded = True
else:
output = ping_process.stdout.read()
result.succeeded = False
result.error = "Could not connect, host is most likely down"
result.error = output

return result


Expand Down
8 changes: 8 additions & 0 deletions app/cabotapp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ def update_service(service_or_id):
service = service_or_id
service.update_status()

@task(ignore_result=True)
def update_instance(instance_or_id):
    """Refresh the status of one instance.

    ``instance_or_id`` is either an ``Instance`` object or the id of one;
    an id is resolved with ``Instance.objects.get``.  The resolved instance
    then has ``update_status()`` called on it.
    """
    # Function-level import kept from the original code
    # (presumably avoids a circular import with models.py - confirm).
    from .models import Instance
    # The original tested against ``Service``, which is the wrong class here
    # and is not imported in this function; an Instance object would have
    # been treated as an id (or the name lookup would fail outright).
    if isinstance(instance_or_id, Instance):
        instance = instance_or_id
    else:
        instance = Instance.objects.get(id=instance_or_id)
    instance.update_status()

@task(ignore_result=True)
def update_shifts():
Expand Down
2 changes: 1 addition & 1 deletion app/templates/cabotapp/_statuscheck_list.html
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ <h3>
</td>
{% endif %}
<td title="">
{% if check.polymorphic_ctype.model == 'graphitestatuscheck' %}{{ check.metric|truncatechars:70 }} {{ check.check_type }} {{ check.value }}{% if check.expected_num_hosts %} (from {{ check.expected_num_hosts }} hosts){% endif %}{% elif check.polymorphic_ctype.model == 'httpstatuscheck' %}Status code {{ check.status_code }} from {{ check.endpoint }}{% if check.text_match %}; match text /{{ check.text_match }}/{% endif %}{% elif check.polymorphic_ctype.model == 'jenkinsstatuscheck' %}Monitor job {{ check.name }}{% if check.max_queued_build_time %}; check no build waiting for >{{ check.max_queued_build_time }} minutes{% endif %}{% endif %}
{% if check.polymorphic_ctype.model == 'graphitestatuscheck' %}{{ check.metric|truncatechars:70 }} {{ check.check_type }} {{ check.value }}{% if check.expected_num_hosts %} (from {{ check.expected_num_hosts }} hosts){% endif %}{% elif check.polymorphic_ctype.model == 'icmpstatuscheck' %}ICMP Reply from {{ check.instance_set.all.0.address }}{% elif check.polymorphic_ctype.model == 'httpstatuscheck' %}Status code {{ check.status_code }} from {{ check.endpoint }}{% if check.text_match %}; match text /{{ check.text_match }}/{% endif %}{% elif check.polymorphic_ctype.model == 'jenkinsstatuscheck' %}Monitor job {{ check.name }}{% if check.max_queued_build_time %}; check no build waiting for >{{ check.max_queued_build_time }} minutes{% endif %}{% endif %}
</td>
<td>{{ check.get_importance_display }}</td>
<td>
Expand Down
2 changes: 1 addition & 1 deletion app/templates/cabotapp/_statuscheck_list_instance.html
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ <h3>
</td>
{% endif %}
<td title="">
{% if check.polymorphic_ctype.model == 'graphitestatuscheck' %}{{ check.metric|truncatechars:70 }} {{ check.check_type }} {{ check.value }}{% if check.expected_num_hosts %} (from {{ check.expected_num_hosts }} hosts){% endif %}{% elif check.polymorphic_ctype.model == 'httpstatuscheck' %}Status code {{ check.status_code }} from {{ check.endpoint }}{% if check.text_match %}; match text /{{ check.text_match }}/{% endif %}{% elif check.polymorphic_ctype.model == 'jenkinsstatuscheck' %}Monitor job {{ check.name }}{% if check.max_queued_build_time %}; check no build waiting for >{{ check.max_queued_build_time }} minutes{% endif %}{% endif %}
{% if check.polymorphic_ctype.model == 'graphitestatuscheck' %}{{ check.metric|truncatechars:70 }} {{ check.check_type }} {{ check.value }}{% if check.expected_num_hosts %} (from {{ check.expected_num_hosts }} hosts){% endif %}{% elif check.polymorphic_ctype.model == 'icmpstatuscheck' %}ICMP Reply from {{ check.instance_set.all.0.address }}{% elif check.polymorphic_ctype.model == 'httpstatuscheck' %}Status code {{ check.status_code }} from {{ check.endpoint }}{% if check.text_match %}; match text /{{ check.text_match }}/{% endif %}{% elif check.polymorphic_ctype.model == 'jenkinsstatuscheck' %}Monitor job {{ check.name }}{% if check.max_queued_build_time %}; check no build waiting for >{{ check.max_queued_build_time }} minutes{% endif %}{% endif %}
</td>
<td>{{ check.get_importance_display }}</td>
<td>
Expand Down
2 changes: 1 addition & 1 deletion app/templates/cabotapp/statuscheck_detail.html
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
</td>
<td>{{ result.time_complete }}</td>
<td>{{ result.took }}</td>
<td>{{ result.error|default:"" }}</td>
<td>{% autoescape off %}{{ result.error|default:"" }}{% endautoescape %}</td>
</tr>
{% endfor %}
</tbody>
Expand Down

0 comments on commit 558f18c

Please sign in to comment.