
Commit c54719c
covid_hosp_facility lookup tests passing
1 parent: 14aa8fa

4 files changed (+15, -8 lines)

integrations/acquisition/covid_hosp/facility/test_scenarios.py (+4, -4)
@@ -76,7 +76,7 @@ def test_acquire_dataset(self):
     response = Epidata.covid_hosp_facility(
       '450822', Epidata.range(20200101, 20210101))
     self.assertEqual(response['result'], 1)
-    self.assertEqual(len(response['epidata']), 1)
+    self.assertEqual(len(response['epidata']), 2)
     row = response['epidata'][0]
     for k,v in expected_spotchecks.items():
       self.assertTrue(
@@ -101,9 +101,9 @@ def test_acquire_dataset(self):
     response = Epidata.covid_hosp_facility(
       '450822', Epidata.range(20200101, 20210101))
     self.assertEqual(response['result'], 1)
-    self.assertEqual(len(response['epidata']), 1)
+    self.assertEqual(len(response['epidata']), 2)

-  @freeze_time("2021-03-16")
+  @freeze_time("2021-03-17")
   def test_facility_lookup(self):
     """Lookup facilities using various filters."""

@@ -188,7 +188,7 @@ def test_facility_lookup(self):
     self.test_utils.load_sample_dataset('dataset_update_facility.csv')

     # acquire sample data into local database
-    with self.subTest(name='first acquisition'):
+    with self.subTest(name='second acquisition'):
       acquired = Update.run(network=mock_network)
       self.assertTrue(acquired)
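The freeze_time bump from 2021-03-16 to 2021-03-17 is worth noting: the acquisition path presumably uses the current date as the upper bound (older_than) on which issues to fetch, so moving the frozen clock forward one day lets an issue stamped 2021-03-16 fall inside the window (compare the older_than handling in utils.py below). freezegun pins datetime.now() for everything executed inside the decorated test; a minimal sketch of that effect:

from datetime import datetime
from freezegun import freeze_time

@freeze_time("2021-03-17")
def frozen_today():
  # any code that computes "today" while this runs sees the frozen date
  return datetime.now().date().isoformat()

print(frozen_today())  # prints '2021-03-17'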

src/acquisition/covid_hosp/common/database.py (+5, -3)
@@ -177,7 +177,9 @@ def insert_dataset(self, publication_date, dataframe):
     ak_cols = self.AGGREGATE_KEY_COLS

     # restrict data to just the key columns and remove duplicate rows
-    ak_data = dataframe[ak_cols].drop_duplicates()
+    ak_data = (dataframe[set(ak_cols + self.KEY_COLS)]
+               .sort_values(self.KEY_COLS)[ak_cols]
+               .drop_duplicates())
     # cast types
     dataframe_typemap = {
       name: dtype
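As I read this hunk, selecting the key columns and sorting on self.KEY_COLS before drop_duplicates() makes the order of the surviving aggregate-key rows deterministic (chronological by key), so when the same primary key appears with changed attributes, the most recent values are inserted last and win the upsert assembled further down. A rough standalone sketch of the effect, with hypothetical column names standing in for the real KEY_COLS / AGGREGATE_KEY_COLS:

import pandas as pd

# hypothetical stand-ins for self.KEY_COLS and self.AGGREGATE_KEY_COLS
KEY_COLS = ['collection_week', 'hospital_pk']
AGGREGATE_KEY_COLS = ['hospital_pk', 'hospital_name']

df = pd.DataFrame({
  'collection_week': [20201211, 20201204],  # newest row happens to come first
  'hospital_pk': ['450822', '450822'],
  'hospital_name': ['NEW NAME', 'OLD NAME'],
})

# old behavior: dedup order simply follows input order
old = df[AGGREGATE_KEY_COLS].drop_duplicates()

# new behavior: sort by the key columns first, so the deduplicated rows
# come out in chronological order and the most recent values are the
# last ones handed to the ON DUPLICATE KEY UPDATE statement below
# (the committed code indexes with the set directly; list() used here)
new = (df[list(set(AGGREGATE_KEY_COLS + KEY_COLS))]
       .sort_values(KEY_COLS)[AGGREGATE_KEY_COLS]
       .drop_duplicates())

print(old.to_dict('records'))  # NEW NAME first, OLD NAME last
print(new.to_dict('records'))  # OLD NAME first, NEW NAME last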
@@ -196,13 +198,13 @@ def cast_but_sidestep_nans(i):
     # create string of tick-quoted and comma-seperated column list
     ak_cols_str = ','.join(f'`{col}`' for col in ak_cols)
     # ...and ticked and comma-sep'd "column=column" list for ON UPDATE (to keep only the most recent values for each pk)
-    ak_updates_str = ','.join(f'`{col}`=`{col}`' for col in ak_cols)
+    ak_updates_str = ','.join(f'`{col}`=v.{col}' for col in ak_cols)
     # ...and string of VALUES placeholders
     values_str = ','.join( ['%s'] * len(ak_cols) )
     # use aggregate key table alias
     ak_table = self.table_name + '_key'
     # assemble full SQL statement
-    ak_insert_sql = f'INSERT INTO `{ak_table}` ({ak_cols_str}) VALUES ({values_str}) ON DUPLICATE KEY UPDATE {ak_updates_str}'
+    ak_insert_sql = f'INSERT INTO `{ak_table}` ({ak_cols_str}) VALUES ({values_str}) as v ON DUPLICATE KEY UPDATE {ak_updates_str}'

     # commit the data
     with self.new_cursor() as cur:
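The two SQL changes go together. The old ON DUPLICATE KEY UPDATE `col`=`col` assigns each column to itself, so re-inserting an existing key left the stored values untouched; aliasing the inserted row as v (the MySQL 8.0.19+ row alias that replaces the deprecated VALUES() function) and updating to v.col refreshes the row with the newly supplied values instead, which is what the "keep only the most recent values for each pk" comment intends. A small sketch of the generated statement, using a hypothetical two-column key table:

# Hypothetical table and columns, just to show the shape of the
# generated statement before and after this change.
ak_table = 'covid_hosp_facility_key'
ak_cols = ['hospital_pk', 'hospital_name']

cols_str = ','.join(f'`{col}`' for col in ak_cols)
values_str = ','.join(['%s'] * len(ak_cols))

# before: `col`=`col` assigns each column to itself, so a duplicate key
# kept whatever values were already stored
old_updates = ','.join(f'`{col}`=`{col}`' for col in ak_cols)
old_sql = f'INSERT INTO `{ak_table}` ({cols_str}) VALUES ({values_str}) ON DUPLICATE KEY UPDATE {old_updates}'

# after: the inserted row is aliased as `v`, so a duplicate key is
# refreshed with the newly supplied values
new_updates = ','.join(f'`{col}`=v.{col}' for col in ak_cols)
new_sql = f'INSERT INTO `{ak_table}` ({cols_str}) VALUES ({values_str}) as v ON DUPLICATE KEY UPDATE {new_updates}'

print(old_sql)
print(new_sql)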

src/acquisition/covid_hosp/common/utils.py (+5)
@@ -86,6 +86,7 @@ def issues_to_fetch(metadata, newer_than, older_than):
     for issues after newer_than and before older_than
     """
     daily_issues = {}
+    n_beyond = 0
     for index in sorted(set(metadata.index)):
       day = index.date()
       if day > newer_than and day < older_than:
@@ -95,6 +96,10 @@ def issues_to_fetch(metadata, newer_than, older_than):
           daily_issues[day] = urls_list
         else:
           daily_issues[day] += urls_list
+      elif day >= older_than:
+        n_beyond += 1
+    if n_beyond > 0:
+      print(f"{n_beyond} issues available on {older_than} or newer")
     return daily_issues

   @staticmethod
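The new n_beyond counter does not change what gets fetched: days at or past older_than are skipped exactly as before, but the log line now tells the operator that newer issues exist beyond the requested window. A toy re-implementation of just that branching, with made-up dates rather than the real metadata frame:

from datetime import date

# stand-in for the metadata index: one day per available issue
issue_days = [date(2021, 3, 13), date(2021, 3, 14), date(2021, 3, 16), date(2021, 3, 17)]
newer_than, older_than = date(2021, 3, 13), date(2021, 3, 16)

fetched, n_beyond = [], 0
for day in sorted(set(issue_days)):
  if newer_than < day < older_than:
    fetched.append(day)   # would be added to daily_issues
  elif day >= older_than:
    n_beyond += 1         # available, but beyond the requested window

if n_beyond > 0:
  print(f"{n_beyond} issues available on {older_than} or newer")
# -> "2 issues available on 2021-03-16 or newer"; fetched == [date(2021, 3, 14)]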

testdata/acquisition/covid_hosp/facility/metadata_update_facility.csv (+1, -1)

(Large diffs are not rendered by default.)
