diff --git a/sensorsafrica/api/v2/serializers.py b/sensorsafrica/api/v2/serializers.py
index 735a2ab..b86e9a4 100644
--- a/sensorsafrica/api/v2/serializers.py
+++ b/sensorsafrica/api/v2/serializers.py
@@ -2,9 +2,9 @@
 
 
 class SensorDataStatSerializer(serializers.Serializer):
-    average = serializers.FloatField()
-    minimum = serializers.FloatField()
-    maximum = serializers.FloatField()
+    calculated_average = serializers.FloatField()
+    calculated_minimum = serializers.FloatField()
+    calculated_maximum = serializers.FloatField()
     value_type = serializers.CharField(max_length=200)
     start_datetime = serializers.DateTimeField()
     end_datetime = serializers.DateTimeField()
diff --git a/sensorsafrica/api/v2/views.py b/sensorsafrica/api/v2/views.py
index 73b6c18..332f494 100644
--- a/sensorsafrica/api/v2/views.py
+++ b/sensorsafrica/api/v2/views.py
@@ -8,7 +8,7 @@
 from django.utils import timezone
 from dateutil.relativedelta import relativedelta
 from django.db.models import ExpressionWrapper, F, FloatField, Max, Min, Sum, Avg, Q
-from django.db.models.functions import Cast, TruncDate
+from django.db.models.functions import Cast, TruncHour, TruncDay, TruncMonth
 
 from rest_framework import mixins, pagination, viewsets
 from ..models import SensorDataStat, LastActiveNodes, City, Node
@@ -60,6 +60,7 @@ def get_paginated_response(self, data_stats):
         # If filtering from a date
         # We will need to have a list of the value_types e.g. { 'P1': [{}, {}] }
         from_date = self.request.query_params.get("from", None)
+        interval = self.request.query_params.get("interval", None)
 
         results = {}
         for data_stat in data_stats:
@@ -69,19 +70,19 @@
             if city_slug not in results:
                 results[city_slug] = {
                     "city_slug": city_slug,
-                    value_type: [] if from_date else {},
+                    value_type: [] if from_date or interval else {},
                 }
 
             if value_type not in results[city_slug]:
-                results[city_slug][value_type] = [] if from_date else {}
+                results[city_slug][value_type] = [] if from_date or interval else {}
 
             values = results[city_slug][value_type]
-            include_result = getattr(values, "append" if from_date else "update")
+            include_result = getattr(values, "append" if from_date or interval else "update")
             include_result(
                 {
-                    "average": data_stat["average"],
-                    "minimum": data_stat["minimum"],
-                    "maximum": data_stat["maximum"],
+                    "average": data_stat["calculated_average"],
+                    "minimum": data_stat["calculated_minimum"],
+                    "maximum": data_stat["calculated_maximum"],
                     "start_datetime": data_stat["start_datetime"],
                     "end_datetime": data_stat["end_datetime"],
                 }
@@ -112,6 +113,7 @@ def get_queryset(self):
         city_slugs = self.request.query_params.get("city", None)
         from_date = self.request.query_params.get("from", None)
         to_date = self.request.query_params.get("to", None)
+        interval = self.request.query_params.get("interval", None)
 
         if to_date and not from_date:
             raise ValidationError({"from": "Must be provide along with to query"})
@@ -129,43 +131,10 @@
             )
 
         if not from_date and not to_date:
-            return self._retrieve_past_24hrs(city_slugs, filter_value_types)
-
-        return self._retrieve_range(from_date, to_date, city_slugs, filter_value_types)
-
-    @staticmethod
-    def _retrieve_past_24hrs(city_slugs, filter_value_types):
-        to_date = timezone.now().replace(minute=0, second=0, microsecond=0)
-        from_date = to_date - datetime.timedelta(hours=24)
-
-        queryset = SensorDataStat.objects.filter(
-            value_type__in=filter_value_types,
-            timestamp__gte=from_date,
-            timestamp__lte=to_date,
-        )
-
-        if city_slugs:
-            queryset = queryset.filter(city_slug__in=city_slugs.split(","))
-
-        return (
-            queryset.order_by()
-            .values("value_type", "city_slug")
-            .annotate(
-                start_datetime=Min("timestamp"),
-                end_datetime=Max("timestamp"),
-                average=ExpressionWrapper(
-                    Sum(F("average") * F("sample_size")) / Sum("sample_size"),
-                    output_field=FloatField(),
-                ),
-                minimum=Min("minimum"),
-                maximum=Max("maximum"),
-            )
-            .order_by("city_slug")
-        )
-
-    @staticmethod
-    def _retrieve_range(from_date, to_date, city_slugs, filter_value_types):
-        if not to_date:
+            to_date = timezone.now().replace(minute=0, second=0, microsecond=0)
+            from_date = to_date - datetime.timedelta(hours=24)
+            interval = 'day' if not interval else interval
+        elif not to_date:
             from_date = beginning_of_day(from_date)
             # Get data from_date until the end
             # of day yesterday which is the beginning of today
@@ -177,27 +146,47 @@
 
         queryset = SensorDataStat.objects.filter(
             value_type__in=filter_value_types,
             timestamp__gte=from_date,
-            timestamp__lt=to_date,
+            timestamp__lte=to_date,
         )
 
+        if interval == 'month':
+            truncate = TruncMonth("timestamp")
+        elif interval == 'day':
+            truncate = TruncDay("timestamp")
+        else:
+            truncate = TruncHour("timestamp")
+
         if city_slugs:
             queryset = queryset.filter(city_slug__in=city_slugs.split(","))
 
         return (
-            queryset.annotate(date=TruncDate("timestamp"))
-            .values("date", "value_type")
+            queryset
+            .values(
+                "value_type",
+                "city_slug"
+            )
             .annotate(
-                city_slug=F("city_slug"),
+                truncated_timestamp=truncate,
                 start_datetime=Min("timestamp"),
                 end_datetime=Max("timestamp"),
-                average=ExpressionWrapper(
+                calculated_average=ExpressionWrapper(
                    Sum(F("average") * F("sample_size")) / Sum("sample_size"),
                    output_field=FloatField(),
                 ),
-                minimum=Min("minimum"),
-                maximum=Max("maximum"),
+                calculated_minimum=Min("minimum"),
+                calculated_maximum=Max("maximum"),
+            )
+            .values(
+                "value_type",
+                "city_slug",
+                "truncated_timestamp",
+                "start_datetime",
+                "end_datetime",
+                "calculated_average",
+                "calculated_minimum",
+                "calculated_maximum"
            )
-            .order_by("-date")
+            .order_by("city_slug", "-truncated_timestamp")
         )
 
diff --git a/sensorsafrica/management/commands/calculate_data_statistics.py b/sensorsafrica/management/commands/calculate_data_statistics.py
index e49f9cf..f976fcf 100644
--- a/sensorsafrica/management/commands/calculate_data_statistics.py
+++ b/sensorsafrica/management/commands/calculate_data_statistics.py
@@ -5,6 +5,8 @@ from feinstaub.sensors.models import Node, Sensor, SensorDataValue, SensorLocation
 
 from ...api.models import SensorDataStat
+from django.core.paginator import Paginator
+
 
 
 def map_stat(stat, city):
     return SensorDataStat(
@@ -22,6 +24,12 @@
     )
 
 
+def chunked_iterator(queryset, chunk_size=100):
+    paginator = Paginator(queryset, chunk_size)
+    for page in range(1, paginator.num_pages + 1):
+        yield paginator.page(page).object_list
+
+
 class Command(BaseCommand):
     help = "Calculate and store data statistics"
 
@@ -64,32 +72,29 @@ def handle(self, *args, **options):
                 Q(value__regex=r"^\-?\d+(\.?\d+)?$"),
             )
 
-            stats = list(
+            for stats in chunked_iterator(
                 queryset.annotate(timestamp=TruncHour("created"))
-                .values(
-                    "timestamp",
-                    "value_type",
-                    "sensordata__sensor",
-                    "sensordata__location",
-                    "sensordata__sensor__node",
-                )
-                .order_by()
-                .annotate(
-                    last_datetime=Max("created"),
-                    average=Avg(Cast("value", FloatField())),
-                    minimum=Min(Cast("value", FloatField())),
-                    maximum=Max(Cast("value", FloatField())),
-                    sample_size=Count("created", FloatField()),
-                )
-                .filter(
-                    ~Q(average=float("NaN")),
-                    ~Q(minimum=float("NaN")),
-                    ~Q(maximum=float("NaN")),
-                )
-                .order_by("-timestamp")
-            )
-
-            if len(stats):
+                    .values(
+                        "timestamp",
+                        "value_type",
+                        "sensordata__sensor",
+                        "sensordata__location",
+                        "sensordata__sensor__node",
+                    )
+                    .order_by()
+                    .annotate(
+                        last_datetime=Max("created"),
+                        average=Avg(Cast("value", FloatField())),
+                        minimum=Min(Cast("value", FloatField())),
+                        maximum=Max(Cast("value", FloatField())),
+                        sample_size=Count("created", FloatField()),
+                    )
+                    .filter(
+                        ~Q(average=float("NaN")),
+                        ~Q(minimum=float("NaN")),
+                        ~Q(maximum=float("NaN")),
+                    )
+                    .order_by("-timestamp")):
                 SensorDataStat.objects.bulk_create(
                     list(map(lambda stat: map_stat(stat, city), stats))
                 )
diff --git a/sensorsafrica/tests/conftest.py b/sensorsafrica/tests/conftest.py
index d95fd03..cbef41c 100644
--- a/sensorsafrica/tests/conftest.py
+++ b/sensorsafrica/tests/conftest.py
@@ -1,4 +1,6 @@
 import datetime
+import math
+from dateutil.relativedelta import relativedelta
 
 import pytest
 from django.core.management import call_command
@@ -229,6 +231,33 @@ def additional_sensorsdatastats(sensors, locations, sensorsdatastats):
     call_command("calculate_data_statistics")
 
 
+@pytest.fixture
+def large_sensorsdatastats(sensors, locations):
+
+    now = timezone.now()
+    months = 6
+    points = math.floor((now - (now - relativedelta(months=months-1))).days * 24 * 60 / 5)
+    minutes = points * 5 * months
+    for point in range(1, points):
+        created_sd = SensorData.objects.create(sensor=sensors[0], location=locations[0])
+        created_sv = SensorDataValue.objects.create(sensordata=created_sd, value="4", value_type="P2")
+        created_sv.update_modified = False
+        created_sv.created = now - datetime.timedelta(minutes=point * 5)
+        created_sv.save()
+
+        last_date = created_sv.created
+
+    from django.core.management import call_command
+
+    call_command("calculate_data_statistics")
+
+    return {
+        'months': months,
+        'minutes': minutes,
+        'last_date': last_date
+    }
+
+
 @pytest.fixture
 def last_active(sensors, locations, sensorsdatastats):
     timestamps = [
diff --git a/sensorsafrica/tests/test_large_dataset.py b/sensorsafrica/tests/test_large_dataset.py
new file mode 100644
index 0000000..3719232
--- /dev/null
+++ b/sensorsafrica/tests/test_large_dataset.py
@@ -0,0 +1,23 @@
+import datetime
+
+import pytest
+from django.utils import timezone
+
+
+@pytest.mark.django_db
+class TestGettingDataFromLargeDataset:
+
+    def test_getting_air_data_on_large_dataset(self, client, large_sensorsdatastats):
+        response = client.get(
+            "/v2/data/air/?city=dar-es-salaam&interval=month&from=%s" %
+            large_sensorsdatastats["last_date"].date(),
+            format="json",
+        )
+        assert response.status_code == 200
+
+        data = response.json()
+
+        assert data["count"] == 1
+
+        assert type(data["results"][0]["P2"]) == list
+        assert len(data["results"][0]["P2"]) == large_sensorsdatastats["months"]
diff --git a/sensorsafrica/tests/test_sensordatastats_view.py b/sensorsafrica/tests/test_sensordatastats_view.py
index 1712f8f..8e8dced 100644
--- a/sensorsafrica/tests/test_sensordatastats_view.py
+++ b/sensorsafrica/tests/test_sensordatastats_view.py
@@ -163,3 +163,35 @@ def test_getting_air_data_now_with_additional_values(
 
         assert result["P2"]["maximum"] == 8.0
         assert result["P2"]["minimum"] == 0.0
+
+    def test_getting_air_data_by_hour(self, client, sensorsdatastats):
+        response = client.get(
+            "/v2/data/air/?city=dar-es-salaam&interval=hour",
+            format="json",
+        )
+        assert response.status_code == 200
+
+        data = response.json()
+
+        assert data["count"] == 1
+
+        assert type(data["results"][0]["P1"]) == list
+        assert len(data["results"][0]["P1"]) == 1
+        assert type(data["results"][0]["P2"]) == list
+        assert len(data["results"][0]["P2"]) == 4
+
+    def test_getting_air_data_by_month(self, client, sensorsdatastats):
+        response = client.get(
+            "/v2/data/air/?city=dar-es-salaam&interval=month",
+            format="json",
+        )
+        assert response.status_code == 200
+
+        data = response.json()
+
+        assert data["count"] == 1
+
+        assert type(data["results"][0]["P1"]) == list
+        assert len(data["results"][0]["P1"]) == 1
+        assert type(data["results"][0]["P2"]) == list
+        assert len(data["results"][0]["P2"]) == 1
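Note (not part of the patch): the interval handling is spread across several hunks in views.py, so the sketch below consolidates the grouping that get_queryset() ends up doing. The helper name stats_for_interval is invented for illustration only; the queryset body mirrors the diff, and the aggregates carry the same calculated_ names as the renamed serializer fields.

# Illustrative consolidation of the interval-based grouping added in views.py;
# `stats_for_interval` is not a real helper in the codebase.
from django.db.models import ExpressionWrapper, F, FloatField, Max, Min, Sum
from django.db.models.functions import TruncDay, TruncHour, TruncMonth


def stats_for_interval(queryset, interval):
    """Group stored hourly SensorDataStat rows into hour/day/month buckets."""
    if interval == 'month':
        truncate = TruncMonth("timestamp")
    elif interval == 'day':
        truncate = TruncDay("timestamp")
    else:
        truncate = TruncHour("timestamp")

    return (
        queryset
        .values("value_type", "city_slug")
        .annotate(
            truncated_timestamp=truncate,
            start_datetime=Min("timestamp"),
            end_datetime=Max("timestamp"),
            # Weighted mean: each stored hourly average contributes in
            # proportion to its sample_size.
            calculated_average=ExpressionWrapper(
                Sum(F("average") * F("sample_size")) / Sum("sample_size"),
                output_field=FloatField(),
            ),
            calculated_minimum=Min("minimum"),
            calculated_maximum=Max("maximum"),
        )
        # The patch adds a second .values(...) here listing the exact keys
        # to return before ordering.
        .order_by("city_slug", "-truncated_timestamp")
    )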
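Note (not part of the patch): when interval (or from) is supplied, get_paginated_response() appends one entry per time bucket to a list keyed by value type. A response to the month-interval request used in the tests is shaped roughly as below; the numbers and datetimes are placeholders and other pagination keys are omitted.

# Approximate shape of GET /v2/data/air/?city=dar-es-salaam&interval=month
# (placeholder values only).
{
    "count": 1,
    "results": [
        {
            "city_slug": "dar-es-salaam",
            "P2": [
                {
                    "average": 4.0,
                    "minimum": 0.0,
                    "maximum": 8.0,
                    "start_datetime": "2019-06-01T00:00:00Z",
                    "end_datetime": "2019-06-30T23:00:00Z",
                },
                # ... one entry per month bucket, newest bucket first
            ],
        }
    ],
}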
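Note (not part of the patch): chunked_iterator() leans on Django's Paginator so that calculate_data_statistics can bulk_create() the aggregated rows one page at a time instead of materialising the whole stats list first. A minimal standalone sketch of the pattern follows; aggregated_queryset is a stand-in for the annotated queryset built in handle().

# Minimal sketch of the Paginator-based chunking used by the management
# command; chunk_size=100 matches the default in the patch.
from django.core.paginator import Paginator


def chunked_iterator(queryset, chunk_size=100):
    """Yield successive pages of `queryset` rather than one big list."""
    paginator = Paginator(queryset, chunk_size)
    for page in range(1, paginator.num_pages + 1):
        yield paginator.page(page).object_list


# Usage in handle() mirrors the diff:
#
#     for stats in chunked_iterator(aggregated_queryset):
#         SensorDataStat.objects.bulk_create(
#             list(map(lambda stat: map_stat(stat, city), stats))
#         )

Paginator runs a COUNT query up front and slices each page with OFFSET/LIMIT, so the source queryset needs a stable ordering; the trailing .order_by("-timestamp") in the command provides it.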