Commit b8a408f

fix(uptime): Aggregate timeseries data instead of overwriting (#102589)
The uptime stats endpoint was missing OK checks before and after downtime because Snuba returns separate timeseries for each (check_status, incident_status) combination. When processing these timeseries, the code was overwriting values instead of aggregating them, causing checks with NO_INCIDENT status to be erased by empty buckets from the IN_INCIDENT timeseries. Changed the value assignment to use addition (+=) so that success checks from both incident states are properly aggregated into the final timeline.
1 parent 3cb3cfc commit b8a408f
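
To make the failure mode concrete, here is a minimal, self-contained sketch (not the endpoint's real code; the bucket timestamps and dict shape are invented for illustration): two timeseries report the same status for the same buckets, and plain assignment lets the later, mostly-empty series wipe out the earlier one, while += keeps both.

from collections import defaultdict

# Hypothetical "success" counts per bucket, split by incident_status.
# Buckets with no data come back as 0 (data_present is False).
no_incident_success = {1000: 1, 1060: 1, 1120: 0}  # OK checks outside any incident
in_incident_success = {1000: 0, 1060: 0, 1120: 1}  # OK checks while still IN_INCIDENT

counts: dict[int, int] = defaultdict(int)

# Buggy behaviour: assignment lets the second series overwrite the first,
# erasing the OK checks at buckets 1000 and 1060.
for series in (no_incident_success, in_incident_success):
    for bucket, value in series.items():
        counts[bucket] = value
assert dict(counts) == {1000: 0, 1060: 0, 1120: 1}

# Fixed behaviour: aggregate with +=, so both incident states contribute.
counts.clear()
for series in (no_incident_success, in_incident_success):
    for bucket, value in series.items():
        counts[bucket] += value
assert dict(counts) == {1000: 1, 1060: 1, 1120: 1}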

File tree

2 files changed: +94, -1 lines

src/sentry/uptime/endpoints/organization_uptime_stats.py

Lines changed: 3 additions & 1 deletion
@@ -200,7 +200,9 @@ def _format_response(

         for bucket, data_point in zip(timeseries.buckets, timeseries.data_points):
             value = int(data_point.data) if data_point.data_present else 0
-            formatted_data[subscription_id][bucket.seconds][status] = value
+            # Add to existing value instead of overwriting, since multiple timeseries
+            # may contribute to the same status (e.g., success with different incident_status values)
+            formatted_data[subscription_id][bucket.seconds][status] += value

     final_data: dict[str, list[tuple[int, dict[str, int]]]] = {}
     for subscription_id, timestamps in formatted_data.items():
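
Note that += only stays correct if every (bucket, status) counter starts at 0 before the first timeseries is folded in. The initialization is outside this hunk, so the nested-defaultdict shape below is an assumption, shown only as a sketch of the aggregation requirement:

from collections import defaultdict

# Assumed shape: subscription_id -> bucket_seconds -> status -> count.
# The real endpoint may instead pre-seed every known status with 0;
# either way, the first += for a (bucket, status) pair starts from zero.
formatted_data: dict[str, dict[int, dict[str, int]]] = defaultdict(
    lambda: defaultdict(lambda: defaultdict(int))
)

formatted_data["sub-1"][1000]["success"] += 1  # bucket from the NO_INCIDENT series
formatted_data["sub-1"][1000]["success"] += 0  # empty bucket from the IN_INCIDENT series
assert formatted_data["sub-1"][1000]["success"] == 1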

tests/sentry/uptime/endpoints/test_organization_uptime_stats.py

Lines changed: 91 additions & 0 deletions
@@ -212,3 +212,94 @@ def test_detector_ids_with_eap(self) -> None:
             "success": 1,
             "missed_window": 0,
         }
+
+    def test_missing_ok_checks_around_downtime(self) -> None:
+        """
+        Test that OK checks before and after downtime are included in the timeline.
+
+        Reproduces the bug where OK checks with NO_INCIDENT status were being overwritten
+        by checks with IN_INCIDENT status in the same time buckets.
+
+        Timeline:
+        - 2 OK checks before incident (NO_INCIDENT)
+        - 1 failure (NO_INCIDENT, failure threshold not met)
+        - 1 failure (IN_INCIDENT, failure threshold met, downtime starts)
+        - 2 OK checks during recovery (IN_INCIDENT, recovery threshold not met)
+        - 2 OK checks after recovery (NO_INCIDENT, recovery threshold met)
+        """
+        detector_subscription_id = uuid.uuid4().hex
+        uptime_subscription = self.create_uptime_subscription(
+            url="https://test-downtime.com", subscription_id=detector_subscription_id
+        )
+        detector = self.create_uptime_detector(
+            uptime_subscription=uptime_subscription,
+            downtime_threshold=2,
+            recovery_threshold=2,
+        )
+
+        base_time = datetime(2025, 10, 29, 13, 30, 0, tzinfo=timezone.utc)
+
+        test_scenarios = [
+            # 2 OK checks before incident
+            (base_time, "success", IncidentStatus.NO_INCIDENT),
+            (base_time + timedelta(minutes=1), "success", IncidentStatus.NO_INCIDENT),
+            # First failure (failure threshold = 2, not yet downtime)
+            (base_time + timedelta(minutes=2), "failure", IncidentStatus.NO_INCIDENT),
+            # Second failure (failure threshold met, downtime starts)
+            (base_time + timedelta(minutes=3), "failure", IncidentStatus.IN_INCIDENT),
+            # 2 OK checks during recovery (still IN_INCIDENT)
+            (base_time + timedelta(minutes=4), "success", IncidentStatus.IN_INCIDENT),
+            (base_time + timedelta(minutes=5), "success", IncidentStatus.IN_INCIDENT),
+            # 2 OK checks after recovery
+            (base_time + timedelta(minutes=6), "success", IncidentStatus.NO_INCIDENT),
+            (base_time + timedelta(minutes=7), "success", IncidentStatus.NO_INCIDENT),
+        ]
+
+        uptime_results = [
+            self.create_eap_uptime_result(
+                subscription_id=uuid.UUID(detector_subscription_id).hex,
+                guid=uuid.UUID(detector_subscription_id).hex,
+                request_url="https://test-downtime.com",
+                scheduled_check_time=scheduled_time,
+                check_status=check_status,
+                incident_status=incident_status,
+            )
+            for scheduled_time, check_status, incident_status in test_scenarios
+        ]
+        self.store_uptime_results(uptime_results)
+
+        start_time = base_time
+        end_time = base_time + timedelta(minutes=8)
+
+        with self.feature(self.features):
+            response = self.get_success_response(
+                self.organization.slug,
+                project=[self.project.id],
+                uptimeDetectorId=[str(detector.id)],
+                since=start_time.timestamp(),
+                until=end_time.timestamp(),
+                resolution="1m",
+            )
+        data = json.loads(json.dumps(response.data))
+        timeline = data[str(detector.id)]
+
+        assert len(timeline) == 8, f"Expected 8 buckets, got {len(timeline)}"
+
+        # Buckets 0-1: OK checks before incident
+        assert timeline[0][1]["success"] == 1, "First check should be success"
+        assert timeline[1][1]["success"] == 1, "Second check should be success"
+
+        # Bucket 2: First failure (threshold not met)
+        assert timeline[2][1]["failure"] == 1, "Third check should be failure"
+        assert timeline[2][1]["failure_incident"] == 0
+
+        # Bucket 3: Second failure (threshold met, downtime starts)
+        assert timeline[3][1]["failure_incident"] == 1, "Fourth check should be failure_incident"
+
+        # Buckets 4-5: OK checks during recovery (still IN_INCIDENT)
+        assert timeline[4][1]["success"] == 1, "Fifth check should be success"
+        assert timeline[5][1]["success"] == 1, "Sixth check should be success"
+
+        # Buckets 6-7: OK checks after recovery
+        assert timeline[6][1]["success"] == 1, "Seventh check should be success"
+        assert timeline[7][1]["success"] == 1, "Eighth check should be success"
