
Commit b2a13af

Improve queries (#61)

* calculate data stats in chunks
* Improve query + add more granularity
* Add tests
* add large dataset testing

1 parent 2aae828 commit b2a13af

6 files changed (+158, -80 lines)


sensorsafrica/api/v2/serializers.py

Lines changed: 3 additions & 3 deletions

@@ -2,9 +2,9 @@
 
 
 class SensorDataStatSerializer(serializers.Serializer):
-    average = serializers.FloatField()
-    minimum = serializers.FloatField()
-    maximum = serializers.FloatField()
+    calculated_average = serializers.FloatField()
+    calculated_minimum = serializers.FloatField()
+    calculated_maximum = serializers.FloatField()
     value_type = serializers.CharField(max_length=200)
     start_datetime = serializers.DateTimeField()
     end_datetime = serializers.DateTimeField()
sensorsafrica/api/v2/views.py

Lines changed: 41 additions & 52 deletions

@@ -8,7 +8,7 @@
 from django.utils import timezone
 from dateutil.relativedelta import relativedelta
 from django.db.models import ExpressionWrapper, F, FloatField, Max, Min, Sum, Avg, Q
-from django.db.models.functions import Cast, TruncDate
+from django.db.models.functions import Cast, TruncHour, TruncDay, TruncMonth
 from rest_framework import mixins, pagination, viewsets
 
 from ..models import SensorDataStat, LastActiveNodes, City, Node

@@ -60,6 +60,7 @@ def get_paginated_response(self, data_stats):
         # If filtering from a date
         # We will need to have a list of the value_types e.g. { 'P1': [{}, {}] }
         from_date = self.request.query_params.get("from", None)
+        interval = self.request.query_params.get("interval", None)
 
         results = {}
         for data_stat in data_stats:

@@ -69,19 +70,19 @@ def get_paginated_response(self, data_stats):
             if city_slug not in results:
                 results[city_slug] = {
                     "city_slug": city_slug,
-                    value_type: [] if from_date else {},
+                    value_type: [] if from_date or interval else {},
                 }
 
             if value_type not in results[city_slug]:
-                results[city_slug][value_type] = [] if from_date else {}
+                results[city_slug][value_type] = [] if from_date or interval else {}
 
             values = results[city_slug][value_type]
-            include_result = getattr(values, "append" if from_date else "update")
+            include_result = getattr(values, "append" if from_date or interval else "update")
             include_result(
                 {
-                    "average": data_stat["average"],
-                    "minimum": data_stat["minimum"],
-                    "maximum": data_stat["maximum"],
+                    "average": data_stat["calculated_average"],
+                    "minimum": data_stat["calculated_minimum"],
+                    "maximum": data_stat["calculated_maximum"],
                     "start_datetime": data_stat["start_datetime"],
                     "end_datetime": data_stat["end_datetime"],
                 }

@@ -112,6 +113,7 @@ def get_queryset(self):
         city_slugs = self.request.query_params.get("city", None)
         from_date = self.request.query_params.get("from", None)
         to_date = self.request.query_params.get("to", None)
+        interval = self.request.query_params.get("interval", None)
 
         if to_date and not from_date:
             raise ValidationError({"from": "Must be provide along with to query"})

@@ -129,43 +131,10 @@
         )
 
         if not from_date and not to_date:
-            return self._retrieve_past_24hrs(city_slugs, filter_value_types)
-
-        return self._retrieve_range(from_date, to_date, city_slugs, filter_value_types)
-
-    @staticmethod
-    def _retrieve_past_24hrs(city_slugs, filter_value_types):
-        to_date = timezone.now().replace(minute=0, second=0, microsecond=0)
-        from_date = to_date - datetime.timedelta(hours=24)
-
-        queryset = SensorDataStat.objects.filter(
-            value_type__in=filter_value_types,
-            timestamp__gte=from_date,
-            timestamp__lte=to_date,
-        )
-
-        if city_slugs:
-            queryset = queryset.filter(city_slug__in=city_slugs.split(","))
-
-        return (
-            queryset.order_by()
-            .values("value_type", "city_slug")
-            .annotate(
-                start_datetime=Min("timestamp"),
-                end_datetime=Max("timestamp"),
-                average=ExpressionWrapper(
-                    Sum(F("average") * F("sample_size")) / Sum("sample_size"),
-                    output_field=FloatField(),
-                ),
-                minimum=Min("minimum"),
-                maximum=Max("maximum"),
-            )
-            .order_by("city_slug")
-        )
-
-    @staticmethod
-    def _retrieve_range(from_date, to_date, city_slugs, filter_value_types):
-        if not to_date:
+            to_date = timezone.now().replace(minute=0, second=0, microsecond=0)
+            from_date = to_date - datetime.timedelta(hours=24)
+            interval = 'day' if not interval else interval
+        elif not to_date:
             from_date = beginning_of_day(from_date)
             # Get data from_date until the end
             # of day yesterday which is the beginning of today

@@ -177,27 +146,47 @@ def _retrieve_range(from_date, to_date, city_slugs, filter_value_types):
         queryset = SensorDataStat.objects.filter(
             value_type__in=filter_value_types,
             timestamp__gte=from_date,
-            timestamp__lt=to_date,
+            timestamp__lte=to_date,
         )
 
+        if interval == 'month':
+            truncate = TruncMonth("timestamp")
+        elif interval == 'day':
+            truncate = TruncDay("timestamp")
+        else:
+            truncate = TruncHour("timestamp")
+
         if city_slugs:
             queryset = queryset.filter(city_slug__in=city_slugs.split(","))
 
         return (
-            queryset.annotate(date=TruncDate("timestamp"))
-            .values("date", "value_type")
+            queryset
+            .values(
+                "value_type",
+                "city_slug"
+            )
             .annotate(
-                city_slug=F("city_slug"),
+                truncated_timestamp=truncate,
                 start_datetime=Min("timestamp"),
                 end_datetime=Max("timestamp"),
-                average=ExpressionWrapper(
+                calculated_average=ExpressionWrapper(
                    Sum(F("average") * F("sample_size")) / Sum("sample_size"),
                    output_field=FloatField(),
                ),
-                minimum=Min("minimum"),
-                maximum=Max("maximum"),
+                calculated_minimum=Min("minimum"),
+                calculated_maximum=Max("maximum"),
+            )
+            .values(
+                "value_type",
+                "city_slug",
+                "truncated_timestamp",
+                "start_datetime",
+                "end_datetime",
+                "calculated_average",
+                "calculated_minimum",
+                "calculated_maximum"
             )
-            .order_by("-date")
+            .order_by("city_slug", "-truncated_timestamp")
         )
 
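The query keeps the same weighted-average idea as before: per-bucket averages are weighted by their sample sizes rather than averaged naively, and the buckets are now grouped by the requested interval (hour, day, or month). A pure-Python sketch of what Sum(F("average") * F("sample_size")) / Sum("sample_size") computes over one group; the sample rows are invented.

# Hourly stat rows for one (city_slug, value_type) group; the values are invented.
rows = [
    {"average": 10.0, "sample_size": 12, "minimum": 4.0, "maximum": 18.0},
    {"average": 20.0, "sample_size": 3,  "minimum": 9.0, "maximum": 33.0},
    {"average": 15.0, "sample_size": 5,  "minimum": 7.0, "maximum": 21.0},
]

# Weighted average: Sum(average * sample_size) / Sum(sample_size)
calculated_average = sum(r["average"] * r["sample_size"] for r in rows) / sum(
    r["sample_size"] for r in rows
)
calculated_minimum = min(r["minimum"] for r in rows)
calculated_maximum = max(r["maximum"] for r in rows)

print(calculated_average)  # 12.75, not the naive mean of 15.0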

sensorsafrica/management/commands/calculate_data_statistics.py

Lines changed: 30 additions & 25 deletions

@@ -5,6 +5,8 @@
 from feinstaub.sensors.models import Node, Sensor, SensorDataValue, SensorLocation
 from ...api.models import SensorDataStat
 
+from django.core.paginator import Paginator
+
 
 def map_stat(stat, city):
     return SensorDataStat(

@@ -22,6 +24,12 @@ def map_stat(stat, city):
     )
 
 
+def chunked_iterator(queryset, chunk_size=100):
+    paginator = Paginator(queryset, chunk_size)
+    for page in range(1, paginator.num_pages + 1):
+        yield paginator.page(page).object_list
+
+
 class Command(BaseCommand):
     help = "Calculate and store data statistics"

@@ -64,32 +72,29 @@ def handle(self, *args, **options):
                Q(value__regex=r"^\-?\d+(\.?\d+)?$"),
            )
 
-            stats = list(
+            for stats in chunked_iterator(
                queryset.annotate(timestamp=TruncHour("created"))
-                .values(
-                    "timestamp",
-                    "value_type",
-                    "sensordata__sensor",
-                    "sensordata__location",
-                    "sensordata__sensor__node",
-                )
-                .order_by()
-                .annotate(
-                    last_datetime=Max("created"),
-                    average=Avg(Cast("value", FloatField())),
-                    minimum=Min(Cast("value", FloatField())),
-                    maximum=Max(Cast("value", FloatField())),
-                    sample_size=Count("created", FloatField()),
-                )
-                .filter(
-                    ~Q(average=float("NaN")),
-                    ~Q(minimum=float("NaN")),
-                    ~Q(maximum=float("NaN")),
-                )
-                .order_by("-timestamp")
-            )
-
-            if len(stats):
+                .values(
+                    "timestamp",
+                    "value_type",
+                    "sensordata__sensor",
+                    "sensordata__location",
+                    "sensordata__sensor__node",
+                )
+                .order_by()
+                .annotate(
+                    last_datetime=Max("created"),
+                    average=Avg(Cast("value", FloatField())),
+                    minimum=Min(Cast("value", FloatField())),
+                    maximum=Max(Cast("value", FloatField())),
+                    sample_size=Count("created", FloatField()),
+                )
+                .filter(
+                    ~Q(average=float("NaN")),
+                    ~Q(minimum=float("NaN")),
+                    ~Q(maximum=float("NaN")),
+                )
+                .order_by("-timestamp")):
                SensorDataStat.objects.bulk_create(
                    list(map(lambda stat: map_stat(stat, city), stats))
                )
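chunked_iterator uses Django's Paginator purely as a slicing helper, so the stats query no longer has to be materialised as one giant list before bulk_create. A small standalone sketch of the same helper, with a plain list of invented rows standing in for the annotated queryset:

from django.core.paginator import Paginator


def chunked_iterator(queryset, chunk_size=100):
    # Paginator only needs len() and slicing, so a list works as a stand-in here.
    paginator = Paginator(queryset, chunk_size)
    for page in range(1, paginator.num_pages + 1):
        yield paginator.page(page).object_list


fake_stats = [{"id": i} for i in range(1, 11)]  # invented stand-in for the annotated rows

for chunk in chunked_iterator(fake_stats, chunk_size=4):
    # In the command, each chunk would go to SensorDataStat.objects.bulk_create().
    print(list(chunk))
# -> chunks of 4, 4 and 2 rows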

sensorsafrica/tests/conftest.py

Lines changed: 29 additions & 0 deletions

@@ -1,4 +1,6 @@
 import datetime
+import math
+from dateutil.relativedelta import relativedelta
 
 import pytest
 from django.core.management import call_command

@@ -229,6 +231,33 @@ def additional_sensorsdatastats(sensors, locations, sensorsdatastats):
    call_command("calculate_data_statistics")
 
 
+@pytest.fixture
+def large_sensorsdatastats(sensors, locations):
+
+    now = timezone.now()
+    months = 6
+    points = math.floor((now - (now - relativedelta(months=months-1))).days * 24 * 60 / 5)
+    minutes = points * 5 * months
+    for point in range(1, points):
+        created_sd = SensorData.objects.create(sensor=sensors[0], location=locations[0])
+        created_sv = SensorDataValue.objects.create(sensordata=created_sd, value="4", value_type="P2")
+        created_sv.update_modified = False
+        created_sv.created = now - datetime.timedelta(minutes=point * 5)
+        created_sv.save()
+
+    last_date = created_sv.created
+
+    from django.core.management import call_command
+
+    call_command("calculate_data_statistics")
+
+    return {
+        'months': months,
+        'minutes': minutes,
+        'last_date': last_date
+    }
+
+
 @pytest.fixture
 def last_active(sensors, locations, sensorsdatastats):
    timestamps = [
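The fixture sizes itself from the calendar: it walks back from "now" in 5-minute steps across roughly five months of readings, so the point count depends on how many days relativedelta(months=5) spans. A quick standalone check of that arithmetic, assuming python-dateutil is installed as in the project; the fixed date is invented for illustration:

import math
from datetime import datetime

from dateutil.relativedelta import relativedelta

now = datetime(2019, 6, 1)  # any fixed "now" works for illustration
months = 6

# Days covered between (now - 5 months) and now, then one reading every 5 minutes.
days = (now - (now - relativedelta(months=months - 1))).days
points = math.floor(days * 24 * 60 / 5)

print(days, points)  # 151 days -> 43488 five-minute readings for this "now"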
New test file

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+import datetime
+
+import pytest
+from django.utils import timezone
+
+
+@pytest.mark.django_db
+class TestGettingDataFromLargeDataset:
+
+    def test_getting_air_data_on_large_dataset(self, client, large_sensorsdatastats):
+        response = client.get(
+            "/v2/data/air/?city=dar-es-salaam&interval=month&from=%s" %
+            large_sensorsdatastats["last_date"].date(),
+            format="json",
+        )
+        assert response.status_code == 200
+
+        data = response.json()
+
+        assert data["count"] == 1
+
+        assert type(data["results"][0]["P2"]) == list
+        assert len(data["results"][0]["P2"]) == large_sensorsdatastats["months"]

sensorsafrica/tests/test_sensordatastats_view.py

Lines changed: 32 additions & 0 deletions

@@ -163,3 +163,35 @@ def test_getting_air_data_now_with_additional_values(
 
        assert result["P2"]["maximum"] == 8.0
        assert result["P2"]["minimum"] == 0.0
+
+    def test_getting_air_data_by_hour(self, client, sensorsdatastats):
+        response = client.get(
+            "/v2/data/air/?city=dar-es-salaam&interval=hour",
+            format="json",
+        )
+        assert response.status_code == 200
+
+        data = response.json()
+
+        assert data["count"] == 1
+
+        assert type(data["results"][0]["P1"]) == list
+        assert len(data["results"][0]["P1"]) == 1
+        assert type(data["results"][0]["P2"]) == list
+        assert len(data["results"][0]["P2"]) == 4
+
+    def test_getting_air_data_by_month(self, client, sensorsdatastats):
+        response = client.get(
+            "/v2/data/air/?city=dar-es-salaam&interval=month",
+            format="json",
+        )
+        assert response.status_code == 200
+
+        data = response.json()
+
+        assert data["count"] == 1
+
+        assert type(data["results"][0]["P1"]) == list
+        assert len(data["results"][0]["P1"]) == 1
+        assert type(data["results"][0]["P2"]) == list
+        assert len(data["results"][0]["P2"]) == 1
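Together with the view changes, these tests pin down the response shape: when an interval (or a from date) is given, each value type maps to a list of buckets rather than a single dict. A hypothetical response for ?city=dar-es-salaam&interval=month, shaped the way get_paginated_response builds it; the numbers and timestamps below are invented:

expected_shape = {
    "count": 1,
    "results": [
        {
            "city_slug": "dar-es-salaam",
            "P2": [
                {
                    "average": 5.5,
                    "minimum": 0.0,
                    "maximum": 8.0,
                    "start_datetime": "2019-05-01T00:00:00Z",
                    "end_datetime": "2019-05-31T23:00:00Z",
                },
                # ...one entry per month bucket...
            ],
        }
    ],
}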
