
Commit 7457a2c

Merge branch 'dev' into leonlu2/test_signal_wildcard
2 parents 1ee8e64 + b823f09

23 files changed: +568 −401 lines changed

.bumpversion.cfg

+1 −1

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.1
+current_version = 0.4.2
 commit = False
 tag = False

dev/local/Makefile

+14 −0

@@ -139,6 +139,20 @@ test:
 		--env "FLASK_SECRET=abc" \
 		delphi_web_python python -m pytest --import-mode importlib $(pdb) $(test) | tee test_output_$(NOW).log
 
+.PHONY=bash
+bash:
+	@docker run -it --rm --network delphi-net \
+		--mount type=bind,source=$(CWD)repos/delphi/delphi-epidata,target=/usr/src/app/repos/delphi/delphi-epidata,readonly \
+		--mount type=bind,source=$(CWD)repos/delphi/delphi-epidata/src,target=/usr/src/app/delphi/epidata,readonly \
+		--env "SQLALCHEMY_DATABASE_URI=mysql+mysqldb://user:pass@delphi_database_epidata:3306/epidata" \
+		--env "FLASK_SECRET=abc" \
+		delphi_web_python bash
+
+.PHONY=sql
+sql:
+	@docker run --rm -it --network delphi-net --cap-add=sys_nice \
+		percona mysql --user=user --password=pass --port 3306 --host delphi_database_epidata epidata
+
 .PHONY=clean
 clean:
 	@docker images -f "dangling=true" -q | xargs docker rmi >/dev/null 2>&1
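Both new targets are developer conveniences built from the same pieces as the `test` target above: `make bash` opens an interactive shell inside the `delphi_web_python` container with identical mounts and environment, and `make sql` opens a `mysql` client (via the `percona` image) against the local `delphi_database_epidata` instance.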

integrations/acquisition/covid_hosp/facility/test_scenarios.py

+27 −10

@@ -76,7 +76,7 @@ def test_acquire_dataset(self):
       response = Epidata.covid_hosp_facility(
         '450822', Epidata.range(20200101, 20210101))
       self.assertEqual(response['result'], 1)
-      self.assertEqual(len(response['epidata']), 1)
+      self.assertEqual(len(response['epidata']), 2)
       row = response['epidata'][0]
       for k,v in expected_spotchecks.items():
         self.assertTrue(
@@ -101,9 +101,9 @@ def test_acquire_dataset(self):
       response = Epidata.covid_hosp_facility(
         '450822', Epidata.range(20200101, 20210101))
       self.assertEqual(response['result'], 1)
-      self.assertEqual(len(response['epidata']), 1)
+      self.assertEqual(len(response['epidata']), 2)
 
-  @freeze_time("2021-03-16")
+  @freeze_time("2021-03-17")
   def test_facility_lookup(self):
     """Lookup facilities using various filters."""
 
@@ -120,7 +120,7 @@ def test_facility_lookup(self):
       self.assertTrue(acquired)
 
     # texas ground truth, sorted by `hospital_pk`
-    # see sample data at testdata/acquisition/covid_hosp/facility/dataset_old.csv
+    # see sample data at testdata/acquisition/covid_hosp/facility/dataset.csv
    texas_hospitals = [{
       'hospital_pk': '450771',
       'state': 'TX',
@@ -139,7 +139,7 @@ def test_facility_lookup(self):
       'hospital_name': 'MEDICAL CITY LAS COLINAS',
       'address': '6800 N MACARTHUR BLVD',
       'city': 'IRVING',
-      'zip': '75039',
+      'zip': '77777',  # most-recent collection week should take precedence
       'hospital_subtype': 'Short Term',
       'fips_code': '48113',
       'is_metro_micro': 1,
@@ -150,7 +150,7 @@ def test_facility_lookup(self):
       'hospital_name': 'RANKIN HOSPITAL MEDICAL CLINIC',
       'address': '1611 SPUR 576',
       'city': 'RANKIN',
-      'zip': '79778',
+      'zip': '99999',  # most-recent collection week should take precedence
       'hospital_subtype': 'Critical Access Hospitals',
       'fips_code': '48461',
       'is_metro_micro': 0,
@@ -160,16 +160,16 @@ def test_facility_lookup(self):
       response = Epidata.covid_hosp_facility_lookup(state='tx')
       self.assertEqual(response['epidata'], texas_hospitals)
 
-    with self.subTest(name='by ccn'):
-      response = Epidata.covid_hosp_facility_lookup(ccn='450771')
+    with self.subTest(name='by zip'):
+      response = Epidata.covid_hosp_facility_lookup(zip='75093')
       self.assertEqual(response['epidata'], texas_hospitals[0:1])
 
     with self.subTest(name='by city'):
       response = Epidata.covid_hosp_facility_lookup(city='irving')
       self.assertEqual(response['epidata'], texas_hospitals[1:2])
 
-    with self.subTest(name='by zip'):
-      response = Epidata.covid_hosp_facility_lookup(zip='79778')
+    with self.subTest(name='by ccn'):
+      response = Epidata.covid_hosp_facility_lookup(ccn='451329')
       self.assertEqual(response['epidata'], texas_hospitals[2:3])
 
     with self.subTest(name='by fips_code'):
@@ -179,3 +179,20 @@ def test_facility_lookup(self):
     with self.subTest(name='no results'):
       response = Epidata.covid_hosp_facility_lookup(state='not a state')
       self.assertEqual(response['result'], -2)
+
+    # update facility info
+    mock_network = MagicMock()
+    mock_network.fetch_metadata.return_value = \
+      self.test_utils.load_sample_metadata('metadata_update_facility.csv')
+    mock_network.fetch_dataset.return_value = \
+      self.test_utils.load_sample_dataset('dataset_update_facility.csv')
+
+    # acquire sample data into local database
+    with self.subTest(name='second acquisition'):
+      acquired = Update.run(network=mock_network)
+      self.assertTrue(acquired)
+
+    texas_hospitals[1]['zip'] = '88888'
+    with self.subTest(name='by city after update'):
+      response = Epidata.covid_hosp_facility_lookup(city='irving')
+      self.assertEqual(response['epidata'], texas_hospitals[1:2])
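Taken together, these test changes track the refreshed sample data: facility 450822 now spans two collection weeks (hence the row-count assertions moving from 1 to 2), and the deliberately implausible sentinel ZIPs (77777, 99999, and 88888 after the second acquisition) verify that facility metadata from the most recent collection week takes precedence in the lookup results.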

src/acquisition/covid_hosp/common/database.py

+57 −9

@@ -1,6 +1,7 @@
 """Common database code used by multiple `covid_hosp` scrapers."""
 
 # standard library
+from collections import namedtuple
 from contextlib import contextmanager
 import math
 
@@ -11,13 +12,15 @@
 # first party
 import delphi.operations.secrets as secrets
 
+Columndef = namedtuple("Columndef", "csv_name sql_name dtype")
 
 class Database:
 
   def __init__(self,
                connection,
                table_name=None,
                columns_and_types=None,
+               key_columns=None,
                additional_fields=None):
     """Create a new Database object.
 
@@ -39,7 +42,11 @@ def __init__(self,
     self.table_name = table_name
     self.publication_col_name = "issue" if table_name == 'covid_hosp_state_timeseries' else \
       'publication_date'
-    self.columns_and_types = columns_and_types
+    self.columns_and_types = {
+      c.csv_name: c
+      for c in (columns_and_types if columns_and_types is not None else [])
+    }
+    self.key_columns = key_columns if key_columns is not None else []
     self.additional_fields = additional_fields if additional_fields is not None else []
 
   @classmethod
@@ -151,26 +158,67 @@ def insert_dataset(self, publication_date, dataframe):
       The dataset.
     """
     dataframe_columns_and_types = [
-      x for x in self.columns_and_types if x[0] in dataframe.columns
+      x for x in self.columns_and_types.values() if x.csv_name in dataframe.columns
     ]
+
+    def nan_safe_dtype(dtype, value):
+      if isinstance(value, float) and math.isnan(value):
+        return None
+      return dtype(value)
+
+    # first convert keys and save the results; we'll need them later
+    for csv_name in self.key_columns:
+      dataframe.loc[:, csv_name] = dataframe[csv_name].map(self.columns_and_types[csv_name].dtype)
+
     num_columns = 2 + len(dataframe_columns_and_types) + len(self.additional_fields)
     value_placeholders = ', '.join(['%s'] * num_columns)
-    columns = ', '.join(f'`{i[1]}`' for i in dataframe_columns_and_types + self.additional_fields)
+    columns = ', '.join(f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields)
     sql = f'INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) ' \
       f'VALUES ({value_placeholders})'
     id_and_publication_date = (0, publication_date)
     with self.new_cursor() as cursor:
       for _, row in dataframe.iterrows():
         values = []
-        for name, _, dtype in dataframe_columns_and_types:
-          if isinstance(row[name], float) and math.isnan(row[name]):
-            values.append(None)
-          else:
-            values.append(dtype(row[name]))
+        for c in dataframe_columns_and_types:
+          values.append(nan_safe_dtype(c.dtype, row[c.csv_name]))
         cursor.execute(sql,
                        id_and_publication_date +
                        tuple(values) +
-                       tuple(i[0] for i in self.additional_fields))
+                       tuple(i.csv_name for i in self.additional_fields))
+
+    # deal with non/seldomly updated columns used like a fk table (if this database needs it)
+    if hasattr(self, 'AGGREGATE_KEY_COLS'):
+      ak_cols = self.AGGREGATE_KEY_COLS
+
+      # restrict data to just the key columns and remove duplicate rows;
+      # sort by key columns to ensure that the last ON DUPLICATE KEY overwrite
+      # uses the most-recent aggregate key information
+      ak_data = (dataframe[set(ak_cols + self.key_columns)]
+                 .sort_values(self.key_columns)[ak_cols]
+                 .drop_duplicates())
+      # cast types
+      for col in ak_cols:
+        ak_data[col] = ak_data[col].map(
+          lambda value: nan_safe_dtype(self.columns_and_types[col].dtype, value)
+        )
+      # fix NULLs
+      ak_data = ak_data.to_numpy(na_value=None).tolist()
+
+      # create string of tick-quoted and comma-separated column list
+      ak_cols_str = ','.join(f'`{col}`' for col in ak_cols)
+      # ...and ticked and comma-sep'd "column=column" list for ON UPDATE (to keep only the most recent values for each pk)
+      ak_updates_str = ','.join(f'`{col}`=v.{col}' for col in ak_cols)
+      # ...and string of VALUES placeholders
+      values_str = ','.join(['%s'] * len(ak_cols))
+      # use aggregate key table alias
+      ak_table = self.table_name + '_key'
+      # assemble full SQL statement
+      ak_insert_sql = f'INSERT INTO `{ak_table}` ({ak_cols_str}) VALUES ({values_str}) AS v ON DUPLICATE KEY UPDATE {ak_updates_str}'
+
+      # commit the data
+      with self.new_cursor() as cur:
+        cur.executemany(ak_insert_sql, ak_data)
+
 
   def get_max_issue(self):
     """Fetch the most recent issue.

src/acquisition/covid_hosp/common/utils.py

+8 −2

@@ -35,8 +35,9 @@ def int_from_date(date):
     int
       Date in YYYYMMDD format.
     """
-
-    return int(date[:10].replace('/', '').replace('-', ''))
+    if isinstance(date, str):
+      return int(date[:10].replace('/', '').replace('-', ''))
+    return date
 
   def parse_bool(value):
     """Convert a string to a boolean.
@@ -86,6 +87,10 @@ def issues_to_fetch(metadata, newer_than, older_than):
       for issues after newer_than and before older_than
     """
     daily_issues = {}
+    n_beyond = 0
     for index in sorted(set(metadata.index)):
       day = index.date()
       if day > newer_than and day < older_than:
@@ -95,6 +97,10 @@ def issues_to_fetch(metadata, newer_than, older_than):
           daily_issues[day] = urls_list
         else:
           daily_issues[day] += urls_list
+      elif day >= older_than:
+        n_beyond += 1
+    if n_beyond > 0:
+      print(f"{n_beyond} issues available on {older_than} or newer")
     return daily_issues
 
   @staticmethod
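The `n_beyond` counter is a small observability improvement: issues dated on or after `older_than` were previously skipped silently, whereas now a single summary line reports how many newer revisions exist beyond the requested window.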
