Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 189 additions & 0 deletions ureport/stats/migrations/0033_backfill_poll_stats_counters_dedupes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Generated by Django 5.2.8 on 2025-12-01 13:54
import time
from datetime import timedelta

from django.core.cache import cache
from django.db import migrations
from django.db.models import Count
from django.utils import timezone

from ureport.utils import chunk_list


def noop(apps, schema_editor):  # pragma: no cover
    """Reverse operation for the data migrations in this file: intentionally does nothing."""
    return None


def dedupe_poll_stats_by_questions(apps, schema_editor):  # pragma: no cover
    """
    Collapse the PollStats rows of flow results that are shared by several poll questions.

    For every FlowResult linked to two or more PollQuestions this:

    1. deletes the PollStats rows attached to any question other than the first one;
    2. clears the ``question`` reference on the rows kept for the first question.

    Rows whose ``question`` is already NULL are never selected by either step.

    Args:
        apps: migration app registry used to resolve the historical models.
        schema_editor: unused; required by the ``RunPython`` signature.
    """
    PollStats = apps.get_model("stats", "PollStats")
    FlowResult = apps.get_model("flows", "FlowResult")

    results_with_duplicate_questions = (
        FlowResult.objects.all().annotate(qs_count=Count("pollquestion")).filter(qs_count__gte=2).order_by("id")
    )

    for flow_result in results_with_duplicate_questions:
        # first() orders by primary key when the queryset has no ordering of its own, so the
        # question whose stats are kept is chosen deterministically instead of being whatever
        # row the database happens to return first.
        first_question = flow_result.pollquestion_set.first()
        if first_question is None:
            # The questions disappeared after the annotated query ran; nothing to dedupe.
            continue

        # Only rows still tied to a question are candidates for deletion or update.
        linked_stats = PollStats.objects.exclude(question=None).filter(flow_result=flow_result)

        duplicates = linked_stats.exclude(question=first_question)
        duplicate_count = duplicates.count()
        if duplicate_count:
            print(f"Found {duplicate_count} PollStats duplicates for FlowResult ID {flow_result.id}")
            duplicates.delete()
        else:
            print(f"No duplicate PollStats for FlowResult ID {flow_result.id}")

        updated = linked_stats.filter(question=first_question).update(question=None)
        print(f"Deduped PollStats for FlowResult ID {flow_result.id}, updated {updated} entries")
        print("======================")


# Boundary levels are hard-coded because migrations work on historical models and should not
# import constants from application code.
# NOTE(review): presumably these mirror the level constants of the locations app - confirm.
STATE_LEVEL = 1
DISTRICT_LEVEL = 2
WARD_LEVEL = 3

# Only stats dated within this many days get PollEngagementDailyCount rows.
ENGAGEMENT_WINDOW_DAYS = 400


def _location_scopes(location):
    """Return the ward/district/state scope labels for a boundary, most specific first."""
    if location.level == WARD_LEVEL:
        scopes = [f"ward:{location.osm_id.upper()}"]
        district = location.parent
        if district:
            scopes.append(f"district:{district.osm_id.upper()}")
            if district.parent:
                scopes.append(f"state:{district.parent.osm_id.upper()}")
        return scopes

    if location.level == DISTRICT_LEVEL:
        scopes = [f"district:{location.osm_id.upper()}"]
        if location.parent:
            scopes.append(f"state:{location.parent.osm_id.upper()}")
        return scopes

    if location.level == STATE_LEVEL:
        return [f"state:{location.osm_id.upper()}"]

    return []


def backfill_poll_stats_counters(apps, schema_editor):  # pragma: no cover
    """
    Rebuild PollStatsCounter and PollEngagementDailyCount rows from PollStats.

    Only flow results of polls with ``stopped_syncing=True`` are handled. On a first run (no
    checkpoint in the cache) the existing counter and engagement rows for those flow results
    are deleted in chunks of 1000. Then every matching PollStats row is expanded into one
    PollStatsCounter per scope: ``all``, plus ``age:``, ``gender:``, ``scheme:`` and
    ``ward:``/``district:``/``state:`` scopes when the row carries those segments or a location.

    PollEngagementDailyCount rows are created only for stats dated within the last
    ``ENGAGEMENT_WINDOW_DAYS`` days, and never for ``district:`` or ``ward:`` scopes.

    Progress is checkpointed in the cache (no expiry) after each batch of 1000 PollStats ids,
    so an interrupted run resumes after the last completed batch. The checkpoint lives outside
    the database transaction.

    Args:
        apps: migration app registry used to resolve the historical models.
        schema_editor: unused; required by the ``RunPython`` signature.
    """
    from django.db import transaction

    PollStats = apps.get_model("stats", "PollStats")

    GenderSegment = apps.get_model("stats", "GenderSegment")
    AgeSegment = apps.get_model("stats", "AgeSegment")
    SchemeSegment = apps.get_model("stats", "SchemeSegment")

    PollStatsCounter = apps.get_model("stats", "PollStatsCounter")
    PollEngagementDailyCount = apps.get_model("stats", "PollEngagementDailyCount")

    Poll = apps.get_model("polls", "Poll")
    PollQuestion = apps.get_model("polls", "PollQuestion")
    Boundary = apps.get_model("locations", "Boundary")

    stopped_polls = list(Poll.objects.filter(stopped_syncing=True).values_list("id", flat=True))
    flow_result_ids = list(
        PollQuestion.objects.filter(poll_id__in=stopped_polls).values_list("flow_result_id", flat=True)
    )

    last_backfilled_poll_stats_id_key = "deduped_migrations_backfilled_poll_stats_last_id"
    last_id = cache.get(last_backfilled_poll_stats_id_key, 0)

    if last_id == 0:
        # No checkpoint yet: clear previously computed rows so the backfill starts from scratch.
        poll_stats_counters_ids = PollStatsCounter.objects.filter(flow_result_id__in=flow_result_ids)
        poll_stats_counters_ids = list(poll_stats_counters_ids.values_list("pk", flat=True))

        poll_stats_counters_ids_count = len(poll_stats_counters_ids)

        for batch in chunk_list(poll_stats_counters_ids, 1000):
            PollStatsCounter.objects.filter(pk__in=batch).delete()

        poll_engagement_daily_count_ids = PollEngagementDailyCount.objects.filter(flow_result_id__in=flow_result_ids)
        poll_engagement_daily_count_ids = list(poll_engagement_daily_count_ids.values_list("pk", flat=True))

        poll_engagement_daily_count_ids_count = len(poll_engagement_daily_count_ids)

        for batch in chunk_list(poll_engagement_daily_count_ids, 1000):
            PollEngagementDailyCount.objects.filter(pk__in=batch).delete()
        print(
            "Backfill for the first time, Deleted %d PollStatsCounter and %d PollEngagementDailyCount entries"
            % (poll_stats_counters_ids_count, poll_engagement_daily_count_ids_count)
        )

    poll_stat_ids = list(
        PollStats.objects.filter(flow_result_id__in=flow_result_ids, id__gt=last_id)
        .order_by("id")
        .values_list("id", flat=True)
    )
    total = len(poll_stat_ids)
    print(f"Total PollStats to migrate: {total}")
    start_time = time.time()

    # Small lookup tables loaded once so the per-row loop does no extra queries.
    gender_dict = {elt.id: elt.gender.lower() for elt in GenderSegment.objects.all()}
    age_dict = {elt.id: elt.min_age for elt in AgeSegment.objects.all()}
    scheme_dict = {elt.id: elt.scheme.lower() for elt in SchemeSegment.objects.all()}

    boundaries = Boundary.objects.all().select_related("parent__parent")
    location_dict = {elt.id: elt for elt in boundaries}

    # Computed once so every row is compared against the same cutoff, however long the run takes.
    engagement_cutoff = timezone.now() - timedelta(days=ENGAGEMENT_WINDOW_DAYS)

    processed = 0
    for batch in chunk_list(poll_stat_ids, 1000):
        batch_ids = list(batch)
        if not batch_ids:
            # Nothing to insert and no last id to checkpoint.
            continue

        poll_stats_counter_obj_to_insert = []
        poll_engagement_daily_count_obj_to_insert = []

        for stat in PollStats.objects.filter(id__in=batch_ids):
            stat_counter_kwargs = dict(
                org_id=stat.org_id,
                flow_result_id=stat.flow_result_id,
                flow_result_category_id=stat.flow_result_category_id,
                count=stat.count,
            )

            # Left empty when the stat is undated or older than the cutoff; emptiness is what
            # suppresses the engagement rows below.
            engagement_counter_kwargs = {}
            if stat.date is not None and stat.date >= engagement_cutoff:
                engagement_counter_kwargs = dict(
                    org_id=stat.org_id,
                    flow_result_id=stat.flow_result_id,
                    is_responded=bool(stat.flow_result_category_id),
                    day=stat.date.date(),
                    count=stat.count,
                )

            scopes = ["all"]
            if stat.age_segment_id and age_dict.get(stat.age_segment_id) is not None:
                scopes.append(f"age:{age_dict.get(stat.age_segment_id)}")
            if stat.gender_segment_id and gender_dict.get(stat.gender_segment_id) is not None:
                scopes.append(f"gender:{gender_dict.get(stat.gender_segment_id)}")
            if stat.scheme_segment_id and scheme_dict.get(stat.scheme_segment_id) is not None:
                scopes.append(f"scheme:{scheme_dict.get(stat.scheme_segment_id)}")
            if stat.location_id:
                location = location_dict.get(stat.location_id)
                if location:
                    scopes.extend(_location_scopes(location))

            for scope in scopes:
                poll_stats_counter_obj_to_insert.append(PollStatsCounter(**stat_counter_kwargs, scope=scope))
                if engagement_counter_kwargs and "district:" not in scope and "ward:" not in scope:
                    poll_engagement_daily_count_obj_to_insert.append(
                        PollEngagementDailyCount(**engagement_counter_kwargs, scope=scope)
                    )

        # Insert both tables for this batch together so a failure cannot leave one written
        # without the other.
        with transaction.atomic():
            PollStatsCounter.objects.bulk_create(poll_stats_counter_obj_to_insert, batch_size=1000)
            PollEngagementDailyCount.objects.bulk_create(poll_engagement_daily_count_obj_to_insert, batch_size=1000)

        # Checkpoint only after the batch's inserts succeeded; timeout None keeps the key forever.
        cache.set(last_backfilled_poll_stats_id_key, batch_ids[-1], None)

        processed += len(batch_ids)
        elapsed = time.time() - start_time
        print(f"Backfilled {processed} of {total} PollStats in {elapsed:.1f} seconds")


def apply_manual():  # pragma: no cover
    """Run both data steps against the live app registry, outside the migration runner."""
    from django.apps import apps as live_apps

    # Same order as the migration's operations: dedupe first, then backfill.
    for step in (dedupe_poll_stats_by_questions, backfill_poll_stats_counters):
        step(live_apps, None)


class Migration(migrations.Migration):
    """Dedupe PollStats of shared flow results, then backfill counters for stopped polls."""

    # The backfill records its progress in the cache, which is not rolled back with the
    # database. Running the whole migration in a single transaction would let a failure undo
    # the inserts while the checkpoint still says they were done, so later runs would skip
    # those rows. Both steps are written to be safely re-run, so run them non-atomically.
    atomic = False

    dependencies = [
        ("stats", "0032_backfill_poll_stats_counters"),
    ]

    operations = [
        migrations.RunPython(dedupe_poll_stats_by_questions, noop),
        migrations.RunPython(backfill_poll_stats_counters, noop),
    ]