diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 0535dd429..b0b010ef4 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.5 +current_version = 0.4.6 commit = False tag = False diff --git a/dev/local/Makefile b/dev/local/Makefile index 111505790..da910bd86 100644 --- a/dev/local/Makefile +++ b/dev/local/Makefile @@ -36,6 +36,7 @@ # pdb=1 Drops you into debug mode upon test failure, if running tests. # test= Only runs tests in the directories provided here, e.g. # repos/delphi/delphi-epidata/tests/acquisition/covidcast +# sql= Overrides the default SQL connection string. # Set optional argument defaults @@ -49,6 +50,12 @@ ifndef test test=repos/delphi/delphi-epidata/tests repos/delphi/delphi-epidata/integrations endif +ifdef sql + sqlalchemy_uri:=$(sql) +else + sqlalchemy_uri:=mysql+mysqldb://user:pass@delphi_database_epidata:3306/epidata +endif + SHELL:=/bin/sh # Get the Makefile's absolute path: https://stackoverflow.com/a/324782/4784655 @@ -78,7 +85,7 @@ web: @# Run the web server @docker run --rm -p 127.0.0.1:10080:80 \ - --env "SQLALCHEMY_DATABASE_URI=mysql+mysqldb://user:pass@delphi_database_epidata:3306/epidata" \ + --env "SQLALCHEMY_DATABASE_URI=$(sqlalchemy_uri)" \ --env "FLASK_SECRET=abc" --env "FLASK_PREFIX=/epidata" --env "LOG_DEBUG" \ --network delphi-net --name delphi_web_epidata \ delphi_web_epidata >$(LOG_WEB) 2>&1 & @@ -123,7 +130,7 @@ test: @docker run -i --rm --network delphi-net \ --mount type=bind,source=$(CWD)repos/delphi/delphi-epidata,target=/usr/src/app/repos/delphi/delphi-epidata,readonly \ --mount type=bind,source=$(CWD)repos/delphi/delphi-epidata/src,target=/usr/src/app/delphi/epidata,readonly \ - --env "SQLALCHEMY_DATABASE_URI=mysql+mysqldb://user:pass@delphi_database_epidata:3306/epidata" \ + --env "SQLALCHEMY_DATABASE_URI=$(sqlalchemy_uri)" \ --env "FLASK_SECRET=abc" \ delphi_web_python python -m pytest --import-mode importlib $(pdb) $(test) | tee test_output_$(NOW).log @@ -132,7 +139,7 @@ bash: @docker run -it --rm --network delphi-net \ --mount type=bind,source=$(CWD)repos/delphi/delphi-epidata,target=/usr/src/app/repos/delphi/delphi-epidata,readonly \ --mount type=bind,source=$(CWD)repos/delphi/delphi-epidata/src,target=/usr/src/app/delphi/epidata,readonly \ - --env "SQLALCHEMY_DATABASE_URI=mysql+mysqldb://user:pass@delphi_database_epidata:3306/epidata" \ + --env "SQLALCHEMY_DATABASE_URI=$(sqlalchemy_uri)" \ --env "FLASK_SECRET=abc" \ delphi_web_python bash diff --git a/dev/local/setup.cfg b/dev/local/setup.cfg index 005f1d30b..4c80959ea 100644 --- a/dev/local/setup.cfg +++ b/dev/local/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = Delphi Development -version = 0.4.5 +version = 0.4.6 [options] packages = diff --git a/docs/api/covidcast-signals/_source-template.md b/docs/api/covidcast-signals/_source-template.md index f2d373167..137d7f38a 100644 --- a/docs/api/covidcast-signals/_source-template.md +++ b/docs/api/covidcast-signals/_source-template.md @@ -62,6 +62,16 @@ geographic coverage, limits in its interpretation (symptoms in a survey aren't always caused by COVID, our healthcare partner only is part of the market, there may be a demographic bias in respondents, etc.), known inaccuracies, etc. +## Missingness + +Describe *all* situations under which a value may not be reported, and what that +means. If the signal ever reports NA, describe what that means and how it is +different from missingness. 
For example: + +When fewer than 100 survey responses are received in a geographic area on a +specific day, no data is reported for that area on that day; an API query for +all reported geographic areas on that day will not include it. + ## Lag and Backfill If this signal is reported with a consistent lag, describe it here. diff --git a/docs/symptom-survey/end-of-survey.md b/docs/symptom-survey/end-of-survey.md index 9f55fb85e..30d9fc597 100644 --- a/docs/symptom-survey/end-of-survey.md +++ b/docs/symptom-survey/end-of-survey.md @@ -1,7 +1,7 @@ --- title: End of CTIS Data Collection parent: COVID-19 Trends and Impact Survey -nav_order: 10 +nav_order: 11 --- # End of CTIS Data Collection diff --git a/docs/symptom-survey/index.md b/docs/symptom-survey/index.md index 60b7bbea5..ac1ba9ac7 100644 --- a/docs/symptom-survey/index.md +++ b/docs/symptom-survey/index.md @@ -16,12 +16,6 @@ international version is [conducted by the University of Maryland](https://covidmap.umd.edu/). Data collection [ceased on June 25, 2022](end-of-survey.md). -The [survey results dashboard](https://delphi.cmu.edu/covidcast/survey-results/) -provides a high-level summary of survey results. Geographically aggregated data -from this survey is publicly available through the [COVIDcast API](../api/covidcast.md) -as the [`fb-survey` data source](../api/covidcast-signals/fb-survey.md). Demographic breakdowns of survey -data are publicly available as [downloadable contingency tables](contingency-tables.md). - The [CTIS Methodology Report](https://dataforgood.facebook.com/dfg/resources/CTIS-methodology-report) describes the survey design, data collection process, weighting, and aggregation @@ -34,6 +28,16 @@ access to the data, see our page on getting [data access](data-access.md). If you have questions about the survey or getting access to data, contact us at . +## Results + +The [survey results dashboard](https://delphi.cmu.edu/covidcast/survey-results/) +provides a high-level summary of survey results. Geographically aggregated data +from this survey is publicly available through the [COVIDcast API](../api/covidcast.md) +as the [`fb-survey` data source](../api/covidcast-signals/fb-survey.md). Demographic breakdowns of survey +data are publicly available as [downloadable contingency tables](contingency-tables.md). + +CTIS data has been used in [numerous peer-reviewed publications](publications.md). + ## Credits The US COVID-19 Trends and Impact Survey (CTIS) is a project of the [Delphi diff --git a/docs/symptom-survey/publications.md b/docs/symptom-survey/publications.md new file mode 100644 index 000000000..bff92fc4f --- /dev/null +++ b/docs/symptom-survey/publications.md @@ -0,0 +1,224 @@ +--- +title: Publications +parent: COVID-19 Trends and Impact Survey +nav_order: 10 +--- + +# Publications Using CTIS + +Since 2020, academic and nonprofit researchers have used data from the US +COVID-19 Trends and Impact Survey in peer-reviewed research publications. + +Our primary description of the survey and its results over the first year of +operation was published as part of the [special feature "Beyond Cases and +Deaths: The Benefits of Auxiliary Data Streams in Tracking the COVID-19 +Pandemic"](https://www.pnas.org/topic/548) in *PNAS*: + +- J. Salomon, A. Reinhart, A. Bilinski, E. J. Chua, W. La Motte-Kerr, M. M. + Rönn, M. B. Reitsma, K. A. Morris, S. LaRocca, T. H. Farag, F. Kreuter, R. + Rosenfeld, and R. J. Tibshirani (2021). 
[The US COVID-19 Trends and Impact + Survey: Continuous real-time measurement of COVID-19 symptoms, risks, + protective behaviors, testing, and + vaccination](https://doi.org/10.1073/pnas.2111454118). *Proceedings of the + National Academy of Sciences* 118 (51) e2111454118. + +## Peer Reviewed Papers + +Research publications using the survey data include: + +- Rebecca L. Weintraub et al (2023). [Identifying COVID-19 Vaccine Deserts and + Ways to Reduce Them: A Digital Tool to Support Public Health + Decision-Making](https://doi.org/10.2105/AJPH.2022.307198). *American Journal + of Public Health*. +- Anzalone AJ, Sun J, Vinson AJ, Beasley WH, Hillegass WB, Murray K, et al. + (2023). [Community risks for SARS-CoV-2 infection among fully vaccinated US + adults by rurality: A retrospective cohort study from the National COVID + Cohort Collaborative](https://doi.org/10.1371/journal.pone.0279968). *PLoS + ONE* 18(1): e0279968. +- Rufino, J., Baquero, C., Frey, D. et al (2023). [Using survey data to estimate + the impact of the omicron variant on vaccine efficacy against COVID-19 + infection](https://doi.org/10.1038/s41598-023-27951-3). *Scientific Reports* + 13, 900 (2023). +- Rader, B., Astley, C.M., Sewalk, K. et al (2022). [Spatial modeling of vaccine + deserts as barriers to controlling + SARS-CoV-2](https://doi.org/10.1038/s43856-022-00183-8). *Communications + Medicine* 2, 141. +- C. Lupton-Smith, E. Badillo Goicoechea, M. Collins, J. Lessler, M. K. + Grabowski & E. A. Stuart (2022). [Consistency between Household and County + Measures of Onsite Schooling during the COVID-19 + Pandemic](https://doi.org/10.1080/19345747.2022.2131660). *Journal of Research + on Educational Effectiveness*. +- Nguyen, Q.C., Yardi, I., Gutierrez, F.X.M. et al. (2022). [Leveraging 13 + million responses to the U.S. COVID-19 Trends and Impact Survey to examine + vaccine hesitancy, vaccination, and mask wearing, January 2021-February + 2022](https://doi.org/10.1186/s12889-022-14286-3). *BMC Public Health* 22, + 1911. +- J. G. Lu (2022). [Two large-scale global studies on COVID-19 vaccine hesitancy + over time: Culture, uncertainty avoidance, and vaccine side-effect + concerns](https://doi.org/10.1037/pspa0000320). *Journal of Personality and + Social Psychology*. +- J. M. Cox-Ganser, P. K. Henneberger, D. N. Weissman, G. Guthrie, and C. P. + Groth (2022). [COVID-19 test positivity by occupation using the Delphi US + COVID-19 Trends and Impact Survey, September–November + 2020](https://doi.org/10.1002/ajim.23410). *American Journal of Industrial + Medicine* 65 (9), 721-730. +- M. Jahja, A. Chin, and R.J. Tibshirani (2022). [Real-Time Estimation of + COVID-19 Infections: Deconvolution and Sensor + Fusion](https://doi.org/10.1214/22-STS856). *Statistical Science* 37 (2), + 207-228. +- Henneberger, PK, Cox-Ganser, JM, Guthrie, GM, Groth, CP (2022). [Estimates of + COVID-19 vaccine uptake in major occupational groups and detailed occupational + categories in the United States, April–May + 2021](https://doi.org/10.1002/ajim.23370). *American Journal of Industrial + Medicine* 65 (7), 525-536. +- K. E. Wiens, C. P. Smith, E. Badillo-Goicoechea, K. H. Grantz, M. K. + Grabowski, A. S. Azman, E. A. Stuart, and J. Lessler (2022). [In-person + schooling and associated COVID-19 risk in the United States over spring + semester 2021](https://doi.org/10.1126/sciadv.abm9128). *Science Advances* 8, + eabm9128. +- F. Petersen, A. Errore, and P. Karaca-Mandic (2022). 
[Lifting statewide mask + mandates and COVID-19 cases: A synthetic control + study](https://doi.org/10.1097/MLR.0000000000001725). *Medical Care* 60 (7), + 538-544. +- F. Pierri, B. L. Perry, M. R. DeVerna, et al. (2022). [Online misinformation + is linked to early COVID-19 vaccination hesitancy and + refusal](https://doi.org/10.1038/s41598-022-10070-w). *Scientific Reports* 12, + 5966. +- A. Pilehvari, J. Ton, M. R. Mohan, A. Marathe, and A. Vullikanti (2022). + [Drivers and Predictors of COVID-19 Vaccine Hesitancy in + Virginia](https://doi.org/10.1007/978-3-030-96188-6_8). In: Yang, Z., von + Briesen, E. (eds), *Proceedings of the 2021 Conference of The Computational + Social Science Society of the Americas*. CSSSA 2021. +- L. S. Flor, J. Friedman, C. N. Spencer, et al. (2022). [Quantifying the + effects of the COVID-19 pandemic on gender equality on health, social, and + economic indicators: a comprehensive review of data from March, 2020, to + September, 2021](https://doi.org/10.1016/S0140-6736(22)00008-3). *The Lancet* + 399 (10344), 2381-2397. +- D. P. Do and R. Frank (2022). [Prior COVID-19 infection: an underappreciated + factor in vaccine hesitancy in the + USA](https://doi.org/10.1093/pubmed/fdab404). *Journal of Public Health* 44 + (2), 471-474. +- W. C. King, M. Rubinstein, A. Reinhart, and R. J. Mejia (2021). [Time trends, + factors associated with, and reasons for COVID-19 vaccine hesitancy: A massive + online survey of US adults from January-May + 2021](https://doi.org/10.1371/journal.pone.0260731). *PLoS ONE* 16 (12), + e0260731. +- C. Lupton-Smith, E. Badillo-Goicochea, T.-H. Chang, H. Maniates, K. E. Riehm, + I. Schmid, and E. A. Stuart (2021). [Factors associated with county-level + mental health during the COVID-19 + pandemic](https://doi.org/10.1002/jcop.22785). *Journal of Community + Psychology* 50 (5), 2431-2442. +- D. P. Do and R. Frank (2021). [U.S. frontline workers and COVID-19 + inequities](https://doi.org/10.1016/j.ypmed.2021.106833). *Preventive + Medicine* 153, 106833. +- W. C. King, M. Rubinstein, A. Reinhart, and R. J. Mejia (2021). [COVID-19 + vaccine hesitancy January-May 2021 among 18–64 year old US adults by + employment and occupation](https://doi.org/10.1016/j.pmedr.2021.101569). + *Preventive Medicine Reports* 24, 101569. +- C. H. Sudre, A. Keshet, M. S. Graham, A. D. Joshi, S. Shilo, H. Rossman, B. + Murray, E. Molteni, K. Klaser, L. D. Canas, M. Antonelli, L. H. Nguyen, D. A. + Drew, M. Modat, J. Capdevila Pujol, S. Ganesh, J. Wolf, T. Meir, A. T. Chan, + C. J. Steves, T. D. Spector, J. S. Brownstein, E. Segal, S. Ourselin, and C. + M. Astley (2021). [Anosmia, ageusia, and other COVID-19-like symptoms in + association with a positive SARS-CoV-2 test, across six national digital + surveillance platforms: an observational + study](https://doi.org/10.1016/S2589-7500(21)00115-1). *The Lancet Digital + Health* 3 (9), e577-e586. +- R. Sukumaran, P. Patwa, S. T V, S. Shankar, R. Kanaparti, J. Bae, Y. Mathur, + A. Singh, A. Chopra, M. Kang, P. Ramaswamy, and R. Raskar (2021). [COVID-19 + outbreak prediction and analysis using self reported + symptoms](https://doi.org/10.35566/jbds/v1n1/p8). *Journal of Behavioral Data + Science* 1 (1), 154–169. +- D. Adjodah, K. Dinakar, M. Chinazzi, S. P. Fraiberger, A. Pentland, S. Bates, + K. Staller, A. Vespignani, and D. L. Bhatt (2021). [Association between + COVID-19 outcomes and mask mandates, adherence, and + attitudes](https://doi.org/10.1371/journal.pone.0252315). *PLoS ONE* 16 (6), + e0252315. +- C. M. 
Zipfel, V. Colizza, and S. Bansal (2021). [The missing season: The + impacts of the COVID-19 pandemic on + influenza](https://doi.org/10.1016/j.vaccine.2021.05.049). *Vaccine* 39 (28), + 3645-3648. +- J. Lessler, M. K. Grabowski, K. H. Grantz, E. Badillo-Goicoechea, C. J. E. + Metcalf, C. Lupton-Smith, A. S. Azman, and E. A. Stuart (2021). [Household + COVID-19 risk and in-person + schooling](https://doi.org/10.1126/science.abh2939). *Science* 372 (6545), + 1092-1097. +- Doerr, A.J. (2021). [Locked (Down) and Loaded (Language): Effect of Policy and + Speech on COVID-19 Outcomes](https://doi.org/10.1177/15480518211012404). + *Journal of Leadership & Organizational Studies* 28 (3), 340-348. +- Fischer CB, Adrien N, Silguero JJ, Hopper JJ, Chowdhury AI, Werler MM (2021). + [Mask adherence and rate of COVID-19 across the United + States](https://doi.org/10.1371/journal.pone.0249891). *PLoS ONE* 16 (4), + e0249891. +- Bilinski, A., Emanuel, E., Salomon, J.A. and Venkataramani, A. (2021). [Better + Late Than Never: Trends in COVID-19 Infection Rates, Risk Perceptions, and + Behavioral Responses in the USA](https://doi.org/10.1007/s11606-021-06633-8). + *Journal of General Internal Medicine* 36, 1825-1828. +- Molteni, E., Astley, C.M., Ma, W. et al. (2021). [Symptoms and syndromes + associated with SARS-CoV-2 infection and severity in pregnant women from two + community cohorts](https://doi.org/10.1038/s41598-021-86452-3). *Scientific + Reports* 11, 6928. +- Rebeiro, P., Aronoff, D, and Smith, M.K. (2021). [The Impact of State + Mask-Wearing Requirements on the Growth of COVID-19 Cases, Hospitalizations, + and Deaths in the United States](https://doi.org/10.1093/cid/ciab101). + *Clinical Infectious Diseases* 73 (9), 1703–1706. +- Rader, White, Burns, Chen, Brilliant, Cohen, Shaman, Brilliant, Kraemer, + Moritz, Hawkins, Scarpino, Astley, and Brownstein (2021). [Mask-wearing and + control of SARS-CoV-2 transmission in the USA: a cross-sectional + study](https://doi.org/10.1016/S2589-7500(20)30293-4). *Lancet Digital Health* + 3 (3), e148-e157. +- Flaxman AD, Henning DJ and Duber HC (2020). [The relative incidence of + COVID-19 in healthcare workers versus non-healthcare workers: evidence from a + web-based survey of Facebook users in the United + States](https://doi.org/10.12688/gatesopenres.13202.2). *Gates Open Research*, + 4:174. +- Kreuter, F., Barkay, N., Bilinski, A., Bradford, A., Chiu, S., Eliat, R., Fan, + J., Galili, T., Haimovich, D., Kim, B., LaRocca, S., Li, Y., Morris, K., + Presser, S., Sarig, T., Salomon, J. A., Stewart, K., Stuart, E. A., & + Tibshirani, R. J. (2020). [Partnering with a global platform to inform + research and public policy + making](https://doi.org/10.18148/srm/2020.v14i2.7761). *Survey Research + Methods*, 14 (2), 159-163. + +## Blog Posts and Reports + +* Surgo Ventures, Resolve to Save Lives (2021). [COVID-19 Vaccine Precision + Response Toolkit: An End-to-End Vaccination Improvement Framework to Improve + COVID-19 Vaccine + Uptake](https://surgoventures.org/resource-library/increasing-covid-19vaccine-uptake-a-four-step-framework-to-promote-access-acceptance-and-equity). +* Alejandra Arrieta, Emmanuela Gakidou, Heidi Larson, Erin Mullany, and + Christopher Troeger, April 1, 2021. [Through Understanding and Empathy, We Can + Convince Women to Get the COVID-19 + Vaccine](https://www.thinkglobalhealth.org/article/through-understanding-and-empathy-we-can-convince-women-get-covid-19-vaccine). + Think Global Health. 
+* Joseph Friedman, Silvia Montoya and Emmanuela Gakidou, March 8, 2021. [Gender + Equality in the Global Return to + School](https://www.thinkglobalhealth.org/article/gender-equality-global-return-school). + Think Global Health. +* Leanna Morinishi, Kris Barkume, Esther Kim, and Alex Reinhart, February + 2, 2021. [Home for the Holidays? The Impact of US Holidays on Social Behaviors + and Preventative + Measures](https://delphi.cmu.edu/blog/2021/02/02/home-for-the-holidays-the-impact-of-us-holidays-on-social-behaviors-and-preventative-measures/). + Delphi blog. +* Alex Reinhart, Esther Kim, Andy Garcia, and Sarah LaRocca, January 28, 2021. + [Using the COVID-19 Symptom Survey to Track Vaccination Uptake and Sentiment + in the United + States](https://delphi.cmu.edu/blog/2021/01/28/using-the-covid-19-symptom-survey-to-track-vaccination-uptake-and-sentiment-in-the-united-states/). + Delphi blog. +* Alex Reinhart, October 12, 2020. [New and Improved COVID Symptom Survey Tracks + Testing and + Mask-Wearing](https://delphi.cmu.edu/blog/2020/10/12/new-and-improved-covid-symptom-survey-tracks-testing-and-mask-wearing/). + Delphi blog. +* Ryan Tibshirani, September 21, 2020. [Can Symptom Surveys Improve COVID-19 + Forecasts?](https://delphi.cmu.edu/blog/2020/09/21/can-symptoms-surveys-improve-covid-19-forecasts/) + Delphi blog. +* Alex Reinhart and Ryan Tibshirani, August 26, 2020. [COVID-19 Symptom Surveys + through + Facebook](https://delphi.cmu.edu/blog/2020/08/26/covid-19-symptom-surveys-through-facebook/). + Delphi blog. + +## Send Your Publications + +If you have used the survey data, or the aggregate data available in the +COVIDcast API, to publish research results, please contact us at + so we can include your work here. diff --git a/integrations/acquisition/covidcast/delete_batch.csv b/integrations/acquisition/covidcast/delete_batch.csv index e0e1eb82c..5c1602218 100644 --- a/integrations/acquisition/covidcast/delete_batch.csv +++ b/integrations/acquisition/covidcast/delete_batch.csv @@ -1,4 +1,4 @@ geo_id,value,stderr,sample_size,issue,time_value,geo_type,signal,source -d_nonlatest,0,0,0,1,0,geo,sig,src -d_latest, 0,0,0,3,0,geo,sig,src -d_justone, 0,0,0,1,0,geo,sig,src \ No newline at end of file +d_nonlatest,0,0,0,1,0,county,sig,src +d_latest, 0,0,0,3,0,county,sig,src +d_justone, 0,0,0,1,0,county,sig,src \ No newline at end of file diff --git a/integrations/acquisition/covidcast/test_csv_uploading.py b/integrations/acquisition/covidcast/test_csv_uploading.py index de3eb5f13..f975ecfa0 100644 --- a/integrations/acquisition/covidcast/test_csv_uploading.py +++ b/integrations/acquisition/covidcast/test_csv_uploading.py @@ -213,8 +213,8 @@ def test_uploading(self): "time_value": [20200419], "signal": [signal_name], "direction": [None]})], axis=1).rename(columns=uploader_column_rename) - expected_values_df["missing_value"].iloc[0] = Nans.OTHER - expected_values_df["missing_sample_size"].iloc[0] = Nans.NOT_MISSING + expected_values_df.loc[0, "missing_value"] = Nans.OTHER + expected_values_df.loc[0, "missing_sample_size"] = Nans.NOT_MISSING expected_values = expected_values_df.to_dict(orient="records") expected_response = {'result': 1, 'epidata': self.apply_lag(expected_values), 'message': 'success'} diff --git a/integrations/acquisition/covidcast/test_db.py b/integrations/acquisition/covidcast/test_db.py index 3cd7e91a7..7b9d80770 100644 --- a/integrations/acquisition/covidcast/test_db.py +++ b/integrations/acquisition/covidcast/test_db.py @@ -1,9 +1,8 @@ -import unittest - from delphi_utils 
import Nans -from delphi.epidata.acquisition.covidcast.database import Database, CovidcastRow, DBLoadStateException -from delphi.epidata.acquisition.covidcast.test_utils import CovidcastBase -import delphi.operations.secrets as secrets + +from delphi.epidata.acquisition.covidcast.database import DBLoadStateException +from delphi.epidata.acquisition.covidcast.test_utils import CovidcastBase, CovidcastTestRow + # all the Nans we use here are just one value, so this is a shortcut to it: nmv = Nans.NOT_MISSING.value @@ -11,7 +10,7 @@ class TestTest(CovidcastBase): def _find_matches_for_row(self, row): - # finds (if existing) row from both history and latest views that matches long-key of provided CovidcastRow + # finds (if existing) row from both history and latest views that matches long-key of provided CovidcastTestRow cols = "source signal time_type time_value geo_type geo_value issue".split() results = {} cur = self._db._cursor @@ -31,8 +30,8 @@ def _find_matches_for_row(self, row): def test_insert_or_update_with_nonempty_load_table(self): # make rows - a_row = self._make_placeholder_row()[0] - another_row = self._make_placeholder_row(time_value=self.DEFAULT_TIME_VALUE+1, issue=self.DEFAULT_ISSUE+1)[0] + a_row = CovidcastTestRow.make_default_row(time_value=2020_02_02) + another_row = CovidcastTestRow.make_default_row(time_value=2020_02_03, issue=2020_02_03) # insert one self._db.insert_or_update_bulk([a_row]) # put something into the load table @@ -61,7 +60,7 @@ def test_id_sync(self): latest_view = 'epimetric_latest_v' # add a data point - base_row, _ = self._make_placeholder_row() + base_row = CovidcastTestRow.make_default_row() self._insert_rows([base_row]) # ensure the primary keys match in the latest and history tables matches = self._find_matches_for_row(base_row) @@ -71,7 +70,7 @@ def test_id_sync(self): old_pk_id = matches[latest_view][pk_column] # add a reissue for said data point - next_row, _ = self._make_placeholder_row() + next_row = CovidcastTestRow.make_default_row() next_row.issue += 1 self._insert_rows([next_row]) # ensure the new keys also match diff --git a/integrations/acquisition/covidcast/test_delete_batch.py b/integrations/acquisition/covidcast/test_delete_batch.py index 915c9341b..4624df27c 100644 --- a/integrations/acquisition/covidcast/test_delete_batch.py +++ b/integrations/acquisition/covidcast/test_delete_batch.py @@ -5,13 +5,10 @@ import unittest from os import path -# third party -import mysql.connector - # first party -from delphi_utils import Nans -from delphi.epidata.acquisition.covidcast.database import Database, CovidcastRow import delphi.operations.secrets as secrets +from delphi.epidata.acquisition.covidcast.database import Database +from delphi.epidata.acquisition.covidcast.test_utils import covidcast_rows_from_args # py3tester coverage target (equivalent to `import *`) __test_target__ = 'delphi.epidata.acquisition.covidcast.database' @@ -56,17 +53,13 @@ def test_delete_from_tuples(self): def _test_delete_batch(self, cc_deletions): # load sample data - rows = [] - for time_value in [0, 1]: - rows += [ - # varying numeric column here (2nd to last) is `issue` - CovidcastRow('src', 'sig', 'day', 'geo', time_value, "d_nonlatest", 0,0,0,0,0,0, 1, 0), - CovidcastRow('src', 'sig', 'day', 'geo', time_value, "d_nonlatest", 0,0,0,0,0,0, 2, 0), - CovidcastRow('src', 'sig', 'day', 'geo', time_value, "d_latest", 0,0,0,0,0,0, 1, 0), - CovidcastRow('src', 'sig', 'day', 'geo', time_value, "d_latest", 0,0,0,0,0,0, 2, 0), - CovidcastRow('src', 'sig', 'day', 'geo', 
time_value, "d_latest", 0,0,0,0,0,0, 3, 0) - ] - rows.append(CovidcastRow('src', 'sig', 'day', 'geo', 0, "d_justone", 0,0,0,0,0,0, 1, 0)) + rows = covidcast_rows_from_args( + time_value = [0] * 5 + [1] * 5 + [0], + geo_value = ["d_nonlatest"] * 2 + ["d_latest"] * 3 + ["d_nonlatest"] * 2 + ["d_latest"] * 3 + ["d_justone"], + issue = [1, 2] + [1, 2, 3] + [1, 2] + [1, 2, 3] + [1], + sanitize_fields = True + ) + self._db.insert_or_update_bulk(rows) # delete entries diff --git a/integrations/client/test_delphi_epidata.py b/integrations/client/test_delphi_epidata.py index 625d2859d..82c1452ec 100644 --- a/integrations/client/test_delphi_epidata.py +++ b/integrations/client/test_delphi_epidata.py @@ -1,26 +1,26 @@ """Integration tests for delphi_epidata.py.""" # standard library -import unittest import time -from unittest.mock import patch, MagicMock from json import JSONDecodeError +from unittest.mock import MagicMock, patch -# third party -from aiohttp.client_exceptions import ClientResponseError -import mysql.connector +# first party import pytest +from aiohttp.client_exceptions import ClientResponseError -# first party -from delphi_utils import Nans -from delphi.epidata.client.delphi_epidata import Epidata -from delphi.epidata.acquisition.covidcast.database import Database, CovidcastRow -from delphi.epidata.acquisition.covidcast.covidcast_meta_cache_updater import main as update_covidcast_meta_cache -from delphi.epidata.acquisition.covidcast.test_utils import CovidcastBase +# third party import delphi.operations.secrets as secrets +from delphi.epidata.acquisition.covidcast.covidcast_meta_cache_updater import main as update_covidcast_meta_cache +from delphi.epidata.acquisition.covidcast.test_utils import CovidcastBase, CovidcastTestRow +from delphi.epidata.client.delphi_epidata import Epidata +from delphi_utils import Nans + # py3tester coverage target __test_target__ = 'delphi.epidata.client.delphi_epidata' +# all the Nans we use here are just one value, so this is a shortcut to it: +nmv = Nans.NOT_MISSING.value def fake_epidata_endpoint(func): """This can be used as a decorator to enable a bogus Epidata endpoint to return 404 responses.""" @@ -30,9 +30,6 @@ def wrapper(*args): Epidata.BASE_URL = 'http://delphi_web_epidata/epidata/api.php' return wrapper -# all the Nans we use here are just one value, so this is a shortcut to it: -nmv = Nans.NOT_MISSING.value - class DelphiEpidataPythonClientTests(CovidcastBase): """Tests the Python client.""" @@ -54,13 +51,11 @@ def test_covidcast(self): # insert placeholder data: three issues of one signal, one issue of another rows = [ - self._make_placeholder_row(issue=self.DEFAULT_ISSUE + i, value=i, lag=i)[0] + CovidcastTestRow.make_default_row(issue=2020_02_02 + i, value=i, lag=i) for i in range(3) ] row_latest_issue = rows[-1] - rows.append( - self._make_placeholder_row(signal="sig2")[0] - ) + rows.append(CovidcastTestRow.make_default_row(signal="sig2")) self._insert_rows(rows) with self.subTest(name='request two signals'): @@ -70,10 +65,11 @@ def test_covidcast(self): ) expected = [ - self.expected_from_row(row_latest_issue), - self.expected_from_row(rows[-1]) + row_latest_issue.as_api_compatibility_row_dict(), + rows[-1].as_api_compatibility_row_dict() ] + self.assertEqual(response['epidata'], expected) # check result self.assertEqual(response, { 'result': 1, @@ -89,10 +85,10 @@ def test_covidcast(self): expected = [{ rows[0].signal: [ - self.expected_from_row(row_latest_issue, self.DEFAULT_MINUS + ['signal']), + 
row_latest_issue.as_api_compatibility_row_dict(ignore_fields=['signal']), ], rows[-1].signal: [ - self.expected_from_row(rows[-1], self.DEFAULT_MINUS + ['signal']), + rows[-1].as_api_compatibility_row_dict(ignore_fields=['signal']), ], }] @@ -109,12 +105,12 @@ def test_covidcast(self): **self.params_from_row(rows[0]) ) - expected = self.expected_from_row(row_latest_issue) + expected = [row_latest_issue.as_api_compatibility_row_dict()] # check result self.assertEqual(response_1, { 'result': 1, - 'epidata': [expected], + 'epidata': expected, 'message': 'success', }) @@ -124,13 +120,13 @@ def test_covidcast(self): **self.params_from_row(rows[0], as_of=rows[1].issue) ) - expected = self.expected_from_row(rows[1]) + expected = [rows[1].as_api_compatibility_row_dict()] # check result self.maxDiff=None self.assertEqual(response_1a, { 'result': 1, - 'epidata': [expected], + 'epidata': expected, 'message': 'success', }) @@ -141,8 +137,8 @@ def test_covidcast(self): ) expected = [ - self.expected_from_row(rows[0]), - self.expected_from_row(rows[1]) + rows[0].as_api_compatibility_row_dict(), + rows[1].as_api_compatibility_row_dict() ] # check result @@ -158,12 +154,12 @@ def test_covidcast(self): **self.params_from_row(rows[0], lag=2) ) - expected = self.expected_from_row(row_latest_issue) + expected = [row_latest_issue.as_api_compatibility_row_dict()] # check result self.assertDictEqual(response_3, { 'result': 1, - 'epidata': [expected], + 'epidata': expected, 'message': 'success', }) with self.subTest(name='long request'): @@ -223,16 +219,16 @@ def test_geo_value(self): # insert placeholder data: three counties, three MSAs N = 3 rows = [ - self._make_placeholder_row(geo_type="county", geo_value=str(i)*5, value=i)[0] + CovidcastTestRow.make_default_row(geo_type="county", geo_value=str(i)*5, value=i) for i in range(N) ] + [ - self._make_placeholder_row(geo_type="msa", geo_value=str(i)*5, value=i*10)[0] + CovidcastTestRow.make_default_row(geo_type="msa", geo_value=str(i)*5, value=i*10) for i in range(N) ] self._insert_rows(rows) counties = [ - self.expected_from_row(rows[i]) for i in range(N) + rows[i].as_api_compatibility_row_dict() for i in range(N) ] def fetch(geo): @@ -241,41 +237,48 @@ def fetch(geo): ) # test fetch all - r = fetch('*') - self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], counties) + request = fetch('*') + self.assertEqual(request['message'], 'success') + self.assertEqual(request['epidata'], counties) # test fetch a specific region - r = fetch('11111') - self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], [counties[1]]) + request = fetch('11111') + self.assertEqual(request['message'], 'success') + self.assertEqual(request['epidata'], [counties[1]]) # test fetch a specific yet not existing region - r = fetch('55555') - self.assertEqual(r['message'], 'no results') + request = fetch('55555') + self.assertEqual(request['message'], 'no results') # test fetch a multiple regions - r = fetch(['11111', '22222']) - self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], [counties[1], counties[2]]) + request = fetch(['11111', '22222']) + self.assertEqual(request['message'], 'success') + self.assertEqual(request['epidata'], [counties[1], counties[2]]) # test fetch a multiple regions in another variant - r = fetch(['00000', '22222']) - self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], [counties[0], counties[2]]) + request = fetch(['00000', '22222']) + self.assertEqual(request['message'], 'success') + 
self.assertEqual(request['epidata'], [counties[0], counties[2]]) # test fetch a multiple regions but one is not existing - r = fetch(['11111', '55555']) - self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], [counties[1]]) + request = fetch(['11111', '55555']) + self.assertEqual(request['message'], 'success') + self.assertEqual(request['epidata'], [counties[1]]) # test fetch a multiple regions but specify no region - r = fetch([]) - self.assertEqual(r['message'], 'no results') + request = fetch([]) + self.assertEqual(request['message'], 'no results') def test_covidcast_meta(self): """Test that the covidcast_meta endpoint returns expected data.""" + DEFAULT_TIME_VALUE = 2020_02_02 + DEFAULT_ISSUE = 2020_02_02 + # insert placeholder data: three dates, three issues. values are: # 1st issue: 0 10 20 # 2nd issue: 1 11 21 # 3rd issue: 2 12 22 rows = [ - self._make_placeholder_row(time_value=self.DEFAULT_TIME_VALUE + t, issue=self.DEFAULT_ISSUE + i, value=t*10 + i)[0] + CovidcastTestRow.make_default_row( + time_value=DEFAULT_TIME_VALUE + t, + issue=DEFAULT_ISSUE + i, + value=t*10 + i + ) for i in range(3) for t in range(3) ] self._insert_rows(rows) @@ -299,14 +302,14 @@ def test_covidcast_meta(self): signal=rows[0].signal, time_type=rows[0].time_type, geo_type=rows[0].geo_type, - min_time=self.DEFAULT_TIME_VALUE, - max_time=self.DEFAULT_TIME_VALUE + 2, + min_time=DEFAULT_TIME_VALUE, + max_time=DEFAULT_TIME_VALUE + 2, num_locations=1, min_value=2., mean_value=12., max_value=22., stdev_value=8.1649658, # population stdev, not sample, which is 10. - max_issue=self.DEFAULT_ISSUE + 2, + max_issue=DEFAULT_ISSUE + 2, min_lag=0, max_lag=0, # we didn't set lag when inputting data ) @@ -322,10 +325,10 @@ def test_async_epidata(self): # insert placeholder data: three counties, three MSAs N = 3 rows = [ - self._make_placeholder_row(geo_type="county", geo_value=str(i)*5, value=i)[0] + CovidcastTestRow.make_default_row(geo_type="county", geo_value=str(i)*5, value=i) for i in range(N) ] + [ - self._make_placeholder_row(geo_type="msa", geo_value=str(i)*5, value=i*10)[0] + CovidcastTestRow.make_default_row(geo_type="msa", geo_value=str(i)*5, value=i*10) for i in range(N) ] self._insert_rows(rows) diff --git a/integrations/server/test_covidcast.py b/integrations/server/test_covidcast.py index bcca3b199..c3b50206d 100644 --- a/integrations/server/test_covidcast.py +++ b/integrations/server/test_covidcast.py @@ -1,7 +1,7 @@ """Integration tests for the `covidcast` endpoint.""" # standard library -import json +from typing import Callable import unittest # third party @@ -10,13 +10,11 @@ # first party from delphi_utils import Nans -from delphi.epidata.acquisition.covidcast.test_utils import CovidcastBase +from delphi.epidata.acquisition.covidcast.test_utils import CovidcastBase, CovidcastTestRow # use the local instance of the Epidata API BASE_URL = 'http://delphi_web_epidata/epidata/api.php' - - class CovidcastTests(CovidcastBase): """Tests the `covidcast` endpoint.""" @@ -24,28 +22,26 @@ def localSetUp(self): """Perform per-test setup.""" self._db._cursor.execute('update covidcast_meta_cache set timestamp = 0, epidata = "[]"') - def request_based_on_row(self, row, extract_response=lambda x: x.json(), **kwargs): + def request_based_on_row(self, row: CovidcastTestRow, extract_response: Callable = lambda x: x.json(), **kwargs): params = self.params_from_row(row, endpoint='covidcast', **kwargs) response = requests.get(BASE_URL, params=params) response.raise_for_status() response = 
extract_response(response) - expected = self.expected_from_row(row) - - return response, expected + return response def _insert_placeholder_set_one(self): - row, settings = self._make_placeholder_row() + row = CovidcastTestRow.make_default_row() self._insert_rows([row]) return row def _insert_placeholder_set_two(self): rows = [ - self._make_placeholder_row(geo_type='county', geo_value=str(i)*5, value=i*1., stderr=i*10., sample_size=i*100.)[0] + CovidcastTestRow.make_default_row(geo_type='county', geo_value=str(i)*5, value=i*1., stderr=i*10., sample_size=i*100.) for i in [1, 2, 3] ] + [ # geo value intended to overlap with counties above - self._make_placeholder_row(geo_type='msa', geo_value=str(i-3)*5, value=i*1., stderr=i*10., sample_size=i*100.)[0] + CovidcastTestRow.make_default_row(geo_type='msa', geo_value=str(i-3)*5, value=i*1., stderr=i*10., sample_size=i*100.) for i in [4, 5, 6] ] self._insert_rows(rows) @@ -53,11 +49,11 @@ def _insert_placeholder_set_two(self): def _insert_placeholder_set_three(self): rows = [ - self._make_placeholder_row(geo_type='county', geo_value='11111', time_value=2000_01_01+i, value=i*1., stderr=i*10., sample_size=i*100., issue=2000_01_03, lag=2-i)[0] + CovidcastTestRow.make_default_row(geo_type='county', geo_value='11111', time_value=2000_01_01+i, value=i*1., stderr=i*10., sample_size=i*100., issue=2000_01_03, lag=2-i) for i in [1, 2, 3] ] + [ # time value intended to overlap with 11111 above, with disjoint geo values - self._make_placeholder_row(geo_type='county', geo_value=str(i)*5, time_value=2000_01_01+i-3, value=i*1., stderr=i*10., sample_size=i*100., issue=2000_01_03, lag=5-i)[0] + CovidcastTestRow.make_default_row(geo_type='county', geo_value=str(i)*5, time_value=2000_01_01+i-3, value=i*1., stderr=i*10., sample_size=i*100., issue=2000_01_03, lag=5-i) for i in [4, 5, 6] ] self._insert_rows(rows) @@ -65,11 +61,11 @@ def _insert_placeholder_set_three(self): def _insert_placeholder_set_four(self): rows = [ - self._make_placeholder_row(source='src1', signal=str(i)*5, value=i*1., stderr=i*10., sample_size=i*100.)[0] + CovidcastTestRow.make_default_row(source='src1', signal=str(i)*5, value=i*1., stderr=i*10., sample_size=i*100.) for i in [1, 2, 3] ] + [ # signal intended to overlap with the signal above - self._make_placeholder_row(source='src2', signal=str(i-3)*5, value=i*1., stderr=i*10., sample_size=i*100.)[0] + CovidcastTestRow.make_default_row(source='src2', signal=str(i-3)*5, value=i*1., stderr=i*10., sample_size=i*100.) 
for i in [4, 5, 6] ] self._insert_rows(rows) @@ -77,11 +73,11 @@ def _insert_placeholder_set_four(self): def _insert_placeholder_set_five(self): rows = [ - self._make_placeholder_row(time_value=2000_01_01, value=i*1., stderr=i*10., sample_size=i*100., issue=2000_01_03+i)[0] + CovidcastTestRow.make_default_row(time_value=2000_01_01, value=i*1., stderr=i*10., sample_size=i*100., issue=2000_01_03+i) for i in [1, 2, 3] ] + [ # different time_values, same issues - self._make_placeholder_row(time_value=2000_01_01+i-3, value=i*1., stderr=i*10., sample_size=i*100., issue=2000_01_03+i-3)[0] + CovidcastTestRow.make_default_row(time_value=2000_01_01+i-3, value=i*1., stderr=i*10., sample_size=i*100., issue=2000_01_03+i-3) for i in [4, 5, 6] ] self._insert_rows(rows) @@ -94,10 +90,13 @@ def test_round_trip(self): row = self._insert_placeholder_set_one() # make the request - response, expected = self.request_based_on_row(row) + response = self.request_based_on_row(row) + + expected = [row.as_api_compatibility_row_dict()] + self.assertEqual(response, { 'result': 1, - 'epidata': [expected], + 'epidata': expected, 'message': 'success', }) @@ -154,32 +153,25 @@ def test_csv_format(self): # make the request # NB 'format' is a Python reserved word - response, _ = self.request_based_on_row( + response = self.request_based_on_row( row, extract_response=lambda resp: resp.text, **{'format':'csv'} ) - expected_response = ( - "geo_value,signal,time_value,direction,issue,lag,missing_value," + - "missing_stderr,missing_sample_size,value,stderr,sample_size\n" + - ",".join("" if x is None else str(x) for x in [ - row.geo_value, - row.signal, - row.time_value, - row.direction, - row.issue, - row.lag, - row.missing_value, - row.missing_stderr, - row.missing_sample_size, - row.value, - row.stderr, - row.sample_size - ]) + "\n" + + # This is a hardcoded mess because of api.php. 
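+        # (The legacy api.php CSV format fixes both the column order and the presence
+        # of the unused `direction` column, hence the explicit column list and the
+        # `direction = None` assignment below.)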
+ column_order = [ + "geo_value", "signal", "time_value", "direction", "issue", "lag", "missing_value", + "missing_stderr", "missing_sample_size", "value", "stderr", "sample_size" + ] + expected = ( + row.as_api_compatibility_row_df() + .assign(direction = None) + .to_csv(columns=column_order, index=False) ) # assert that the right data came back - self.assertEqual(response, expected_response) + self.assertEqual(response, expected) def test_raw_json_format(self): """Test generate raw json data.""" @@ -188,10 +180,12 @@ def test_raw_json_format(self): row = self._insert_placeholder_set_one() # make the request - response, expected = self.request_based_on_row(row, **{'format':'json'}) + response = self.request_based_on_row(row, **{'format':'json'}) + + expected = [row.as_api_compatibility_row_dict()] # assert that the right data came back - self.assertEqual(response, [expected]) + self.assertEqual(response, expected) def test_fields(self): """Test fields parameter""" @@ -200,7 +194,9 @@ def test_fields(self): row = self._insert_placeholder_set_one() # limit fields - response, expected = self.request_based_on_row(row, fields='time_value,geo_value') + response = self.request_based_on_row(row, fields='time_value,geo_value') + + expected = row.as_api_compatibility_row_dict() expected_all = { 'result': 1, 'epidata': [{ @@ -213,15 +209,14 @@ def test_fields(self): self.assertEqual(response, expected_all) # limit using invalid fields - response, _ = self.request_based_on_row(row, fields='time_value,geo_value,doesnt_exist') + response = self.request_based_on_row(row, fields='time_value,geo_value,doesnt_exist') # assert that the right data came back (only valid fields) self.assertEqual(response, expected_all) # limit exclude fields: exclude all except time_value and geo_value - - response, _ = self.request_based_on_row(row, fields=( + response = self.request_based_on_row(row, fields=( '-value,-stderr,-sample_size,-direction,-issue,-lag,-signal,' + '-missing_value,-missing_stderr,-missing_sample_size' )) @@ -234,18 +229,15 @@ def test_location_wildcard(self): # insert placeholder data rows = self._insert_placeholder_set_two() - expected_counties = [ - self.expected_from_row(r) for r in rows[:3] - ] - + expected = [row.as_api_compatibility_row_dict() for row in rows[:3]] # make the request - response, _ = self.request_based_on_row(rows[0], geo_value="*") + response = self.request_based_on_row(rows[0], geo_value="*") self.maxDiff = None # assert that the right data came back self.assertEqual(response, { 'result': 1, - 'epidata': expected_counties, + 'epidata': expected, 'message': 'success', }) @@ -254,18 +246,16 @@ def test_time_values_wildcard(self): # insert placeholder data rows = self._insert_placeholder_set_three() - expected_time_values = [ - self.expected_from_row(r) for r in rows[:3] - ] + expected = [row.as_api_compatibility_row_dict() for row in rows[:3]] # make the request - response, _ = self.request_based_on_row(rows[0], time_values="*") + response = self.request_based_on_row(rows[0], time_values="*") self.maxDiff = None # assert that the right data came back self.assertEqual(response, { 'result': 1, - 'epidata': expected_time_values, + 'epidata': expected, 'message': 'success', }) @@ -274,18 +264,16 @@ def test_issues_wildcard(self): # insert placeholder data rows = self._insert_placeholder_set_five() - expected_issues = [ - self.expected_from_row(r) for r in rows[:3] - ] + expected = [row.as_api_compatibility_row_dict() for row in rows[:3]] # make the request - response, _ = 
self.request_based_on_row(rows[0], issues="*") + response = self.request_based_on_row(rows[0], issues="*") self.maxDiff = None # assert that the right data came back self.assertEqual(response, { 'result': 1, - 'epidata': expected_issues, + 'epidata': expected, 'message': 'success', }) @@ -294,12 +282,10 @@ def test_signal_wildcard(self): # insert placeholder data rows = self._insert_placeholder_set_four() - expected_signals = [ - self.expected_from_row(r) for r in rows[:3] - ] + expected_signals = [row.as_api_compatibility_row_dict() for row in rows[:3]] # make the request - response, _ = self.request_based_on_row(rows[0], signals="*") + response = self.request_based_on_row(rows[0], signals="*") self.maxDiff = None # assert that the right data came back @@ -314,35 +300,33 @@ def test_geo_value(self): # insert placeholder data rows = self._insert_placeholder_set_two() - expected_counties = [ - self.expected_from_row(r) for r in rows[:3] - ] + expected = [row.as_api_compatibility_row_dict() for row in rows[:3]] def fetch(geo_value): # make the request - response, _ = self.request_based_on_row(rows[0], geo_value=geo_value) + response = self.request_based_on_row(rows[0], geo_value=geo_value) return response # test fetch a specific region r = fetch('11111') self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], [expected_counties[0]]) + self.assertEqual(r['epidata'], expected[0:1]) # test fetch a specific yet not existing region r = fetch('55555') self.assertEqual(r['message'], 'no results') # test fetch multiple regions r = fetch('11111,22222') self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], [expected_counties[0], expected_counties[1]]) + self.assertEqual(r['epidata'], expected[0:2]) # test fetch multiple noncontiguous regions r = fetch('11111,33333') self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], [expected_counties[0], expected_counties[2]]) + self.assertEqual(r['epidata'], [expected[0], expected[2]]) # test fetch multiple regions but one is not existing r = fetch('11111,55555') self.assertEqual(r['message'], 'success') - self.assertEqual(r['epidata'], [expected_counties[0]]) + self.assertEqual(r['epidata'], expected[0:1]) # test fetch empty region r = fetch('') self.assertEqual(r['message'], 'no results') @@ -352,12 +336,10 @@ def test_location_timeline(self): # insert placeholder data rows = self._insert_placeholder_set_three() - expected_timeseries = [ - self.expected_from_row(r) for r in rows[:3] - ] + expected_timeseries = [row.as_api_compatibility_row_dict() for row in rows[:3]] # make the request - response, _ = self.request_based_on_row(rows[0], time_values='20000101-20000105') + response = self.request_based_on_row(rows[0], time_values='20000101-20000105') # assert that the right data came back self.assertEqual(response, { @@ -383,15 +365,15 @@ def test_unique_key_constraint(self): def test_nullable_columns(self): """Missing values should be surfaced as null.""" - row, _ = self._make_placeholder_row( + row = CovidcastTestRow.make_default_row( stderr=None, sample_size=None, missing_stderr=Nans.OTHER.value, missing_sample_size=Nans.OTHER.value ) self._insert_rows([row]) # make the request - response, expected = self.request_based_on_row(row) - expected.update(stderr=None, sample_size=None) + response = self.request_based_on_row(row) + expected = row.as_api_compatibility_row_dict() # assert that the right data came back self.assertEqual(response, { @@ -405,18 +387,19 @@ def test_temporal_partitioning(self): # 
insert placeholder data rows = [ - self._make_placeholder_row(time_type=tt)[0] + CovidcastTestRow.make_default_row(time_type=tt) for tt in "hour day week month year".split() ] self._insert_rows(rows) # make the request - response, expected = self.request_based_on_row(rows[1], time_values="0-99999999") + response = self.request_based_on_row(rows[1], time_values="*") + expected = [rows[1].as_api_compatibility_row_dict()] # assert that the right data came back self.assertEqual(response, { 'result': 1, - 'epidata': [expected], + 'epidata': expected, 'message': 'success', }) @@ -427,37 +410,37 @@ def test_date_formats(self): rows = self._insert_placeholder_set_three() # make the request - response, expected = self.request_based_on_row(rows[0], time_values="20000102", geo_value="*") + response = self.request_based_on_row(rows[0], time_values="20000102", geo_value="*") # assert that the right data came back self.assertEqual(len(response['epidata']), 2) # make the request - response, expected = self.request_based_on_row(rows[0], time_values="2000-01-02", geo_value="*") + response = self.request_based_on_row(rows[0], time_values="2000-01-02", geo_value="*") # assert that the right data came back self.assertEqual(len(response['epidata']), 2) # make the request - response, expected = self.request_based_on_row(rows[0], time_values="20000102,20000103", geo_value="*") + response = self.request_based_on_row(rows[0], time_values="20000102,20000103", geo_value="*") # assert that the right data came back - self.assertEqual(len(response['epidata']), 4) + self.assertEqual(len(response['epidata']), 2 * 2) # make the request - response, expected = self.request_based_on_row(rows[0], time_values="2000-01-02,2000-01-03", geo_value="*") + response = self.request_based_on_row(rows[0], time_values="2000-01-02,2000-01-03", geo_value="*") # assert that the right data came back - self.assertEqual(len(response['epidata']), 4) + self.assertEqual(len(response['epidata']), 2 * 2) # make the request - response, expected = self.request_based_on_row(rows[0], time_values="20000102-20000104", geo_value="*") + response = self.request_based_on_row(rows[0], time_values="20000102-20000104", geo_value="*") # assert that the right data came back - self.assertEqual(len(response['epidata']), 6) + self.assertEqual(len(response['epidata']), 2 * 3) # make the request - response, expected = self.request_based_on_row(rows[0], time_values="2000-01-02:2000-01-04", geo_value="*") + response = self.request_based_on_row(rows[0], time_values="2000-01-02:2000-01-04", geo_value="*") # assert that the right data came back - self.assertEqual(len(response['epidata']), 6) + self.assertEqual(len(response['epidata']), 2 * 3) diff --git a/integrations/server/test_covidcast_endpoints.py b/integrations/server/test_covidcast_endpoints.py index 54974a874..41d942456 100644 --- a/integrations/server/test_covidcast_endpoints.py +++ b/integrations/server/test_covidcast_endpoints.py @@ -1,30 +1,23 @@ """Integration tests for the custom `covidcast/*` endpoints.""" # standard library -from typing import Iterable, Dict, Any -import unittest from io import StringIO - -# from typing import Optional -from dataclasses import dataclass +from typing import Sequence # third party -import mysql.connector +from more_itertools import windowed import requests import pandas as pd -from delphi_utils import Nans from delphi.epidata.acquisition.covidcast.covidcast_meta_cache_updater import main as update_cache - -from delphi.epidata.acquisition.covidcast.database import Database 
-from delphi.epidata.acquisition.covidcast.test_utils import CovidcastBase +from delphi.epidata.acquisition.covidcast.test_utils import CovidcastBase, CovidcastTestRow # use the local instance of the Epidata API BASE_URL = "http://delphi_web_epidata/epidata/covidcast" +BASE_URL_OLD = "http://delphi_web_epidata/epidata/api.php" class CovidcastEndpointTests(CovidcastBase): - """Tests the `covidcast/*` endpoint.""" def localSetUp(self): @@ -32,19 +25,36 @@ def localSetUp(self): # reset the `covidcast_meta_cache` table (it should always have one row) self._db._cursor.execute('update covidcast_meta_cache set timestamp = 0, epidata = "[]"') - def _fetch(self, endpoint="/", **params): + def _fetch(self, endpoint="/", is_compatibility=False, **params): # make the request - response = requests.get( - f"{BASE_URL}{endpoint}", - params=params, - ) + if is_compatibility: + url = BASE_URL_OLD + # only set endpoint if it's not already set + # only set endpoint if it's not already set + params.setdefault("endpoint", "covidcast") + if params.get("source"): + params.setdefault("data_source", params.get("source")) + else: + url = f"{BASE_URL}{endpoint}" + response = requests.get(url, params=params) response.raise_for_status() return response.json() + def _diff_rows(self, rows: Sequence[float]): + return [ + float(x - y) if x is not None and y is not None else None + for x, y in zip(rows[1:], rows[:-1]) + ] + + def _smooth_rows(self, rows: Sequence[float]): + return [ + sum(e)/len(e) if None not in e else None + for e in windowed(rows, 7) + ] + def test_basic(self): """Request a signal from the / endpoint.""" - - rows = [self._make_placeholder_row(time_value=20200401 + i, value=i)[0] for i in range(10)] + rows = [CovidcastTestRow.make_default_row(time_value=2020_04_01 + i, value=i) for i in range(10)] first = rows[0] self._insert_rows(rows) @@ -56,11 +66,25 @@ def test_basic(self): out = self._fetch("/", signal=first.signal_pair(), geo=first.geo_pair(), time="day:*") self.assertEqual(len(out["epidata"]), len(rows)) + def test_compatibility(self): + """Request at the /api.php endpoint.""" + rows = [CovidcastTestRow.make_default_row(source="src", signal="sig", time_value=2020_04_01 + i, value=i) for i in range(10)] + first = rows[0] + self._insert_rows(rows) + + with self.subTest("validation"): + out = self._fetch("/", is_compatibility=True) + self.assertEqual(out["result"], -1) + + with self.subTest("simple"): + out = self._fetch("/", signal=first.signal_pair(), geo=first.geo_pair(), time="day:*", is_compatibility=True) + self.assertEqual(len(out["epidata"]), len(rows)) + def test_trend(self): """Request a signal from the /trend endpoint.""" num_rows = 30 - rows = [self._make_placeholder_row(time_value=20200401 + i, value=i)[0] for i in range(num_rows)] + rows = [CovidcastTestRow.make_default_row(time_value=2020_04_01 + i, value=i) for i in range(num_rows)] first = rows[0] last = rows[-1] ref = rows[num_rows // 2] @@ -68,6 +92,7 @@ def test_trend(self): out = self._fetch("/trend", signal=first.signal_pair(), geo=first.geo_pair(), date=last.time_value, window="20200401-20201212", basis=ref.time_value) + self.assertEqual(out["result"], 1) self.assertEqual(len(out["epidata"]), 1) trend = out["epidata"][0] @@ -90,11 +115,12 @@ def test_trend(self): self.assertEqual(trend["max_value"], last.value) self.assertEqual(trend["max_trend"], "steady") + def test_trendseries(self): """Request a signal from the /trendseries endpoint.""" num_rows = 3 - rows = [self._make_placeholder_row(time_value=20200401 + i, 
value=num_rows - i)[0] for i in range(num_rows)] + rows = [CovidcastTestRow.make_default_row(time_value=2020_04_01 + i, value=num_rows - i) for i in range(num_rows)] first = rows[0] last = rows[-1] self._insert_rows(rows) @@ -127,6 +153,7 @@ def match_row(trend, row): self.assertEqual(trend["max_date"], first.time_value) self.assertEqual(trend["max_value"], first.value) self.assertEqual(trend["max_trend"], "steady") + with self.subTest("trend1"): trend = trends[1] match_row(trend, rows[1]) @@ -159,10 +186,10 @@ def test_correlation(self): """Request a signal from the /correlation endpoint.""" num_rows = 30 - reference_rows = [self._make_placeholder_row(signal="ref", time_value=20200401 + i, value=i)[0] for i in range(num_rows)] + reference_rows = [CovidcastTestRow.make_default_row(signal="ref", time_value=20200401 + i, value=i) for i in range(num_rows)] first = reference_rows[0] self._insert_rows(reference_rows) - other_rows = [self._make_placeholder_row(signal="other", time_value=20200401 + i, value=i)[0] for i in range(num_rows)] + other_rows = [CovidcastTestRow.make_default_row(signal="other", time_value=20200401 + i, value=i) for i in range(num_rows)] other = other_rows[0] self._insert_rows(other_rows) max_lag = 3 @@ -185,7 +212,7 @@ def test_correlation(self): def test_csv(self): """Request a signal from the /csv endpoint.""" - rows = [self._make_placeholder_row(time_value=20200401 + i, value=i)[0] for i in range(10)] + rows = [CovidcastTestRow.make_default_row(time_value=2020_04_01 + i, value=i) for i in range(10)] first = rows[0] self._insert_rows(rows) @@ -199,13 +226,15 @@ def test_csv(self): self.assertEqual(df.shape, (len(rows), 10)) self.assertEqual(list(df.columns), ["geo_value", "signal", "time_value", "issue", "lag", "value", "stderr", "sample_size", "geo_type", "data_source"]) + def test_backfill(self): """Request a signal from the /backfill endpoint.""" + TEST_DATE_VALUE = 2020_04_01 num_rows = 10 - issue_0 = [self._make_placeholder_row(time_value=20200401 + i, value=i, sample_size=1, lag=0, issue=20200401 + i)[0] for i in range(num_rows)] - issue_1 = [self._make_placeholder_row(time_value=20200401 + i, value=i + 1, sample_size=2, lag=1, issue=20200401 + i + 1)[0] for i in range(num_rows)] - last_issue = [self._make_placeholder_row(time_value=20200401 + i, value=i + 2, sample_size=3, lag=2, issue=20200401 + i + 2)[0] for i in range(num_rows)] # <-- the latest issues + issue_0 = [CovidcastTestRow.make_default_row(time_value=TEST_DATE_VALUE + i, value=i, sample_size=1, lag=0, issue=TEST_DATE_VALUE + i) for i in range(num_rows)] + issue_1 = [CovidcastTestRow.make_default_row(time_value=TEST_DATE_VALUE + i, value=i + 1, sample_size=2, lag=1, issue=TEST_DATE_VALUE + i + 1) for i in range(num_rows)] + last_issue = [CovidcastTestRow.make_default_row(time_value=TEST_DATE_VALUE + i, value=i + 2, sample_size=3, lag=2, issue=TEST_DATE_VALUE + i + 2) for i in range(num_rows)] # <-- the latest issues self._insert_rows([*issue_0, *issue_1, *last_issue]) first = issue_0[0] @@ -231,7 +260,7 @@ def test_meta(self): """Request a signal from the /meta endpoint.""" num_rows = 10 - rows = [self._make_placeholder_row(time_value=20200401 + i, value=i, source="fb-survey", signal="smoothed_cli")[0] for i in range(num_rows)] + rows = [CovidcastTestRow.make_default_row(time_value=2020_04_01 + i, value=i, source="fb-survey", signal="smoothed_cli") for i in range(num_rows)] self._insert_rows(rows) first = rows[0] last = rows[-1] @@ -271,8 +300,8 @@ def test_coverage(self): """Request a signal from the 
/coverage endpoint.""" num_geos_per_date = [10, 20, 30, 40, 44] - dates = [20200401 + i for i in range(len(num_geos_per_date))] - rows = [self._make_placeholder_row(time_value=dates[i], value=i, geo_value=str(geo_value))[0] for i, num_geo in enumerate(num_geos_per_date) for geo_value in range(num_geo)] + dates = [2020_04_01 + i for i in range(len(num_geos_per_date))] + rows = [CovidcastTestRow.make_default_row(time_value=dates[i], value=i, geo_value=str(geo_value)) for i, num_geo in enumerate(num_geos_per_date) for geo_value in range(num_geo)] self._insert_rows(rows) first = rows[0] diff --git a/requirements.api.txt b/requirements.api.txt index d5cc0e63b..6ccafc1e1 100644 --- a/requirements.api.txt +++ b/requirements.api.txt @@ -2,6 +2,7 @@ epiweeks==2.1.2 Flask==2.2.2 itsdangerous<2.1 jinja2==3.0.3 +more_itertools==8.4.0 mysqlclient==2.1.1 newrelic orjson==3.4.7 diff --git a/src/acquisition/covidcast/covidcast_row.py b/src/acquisition/covidcast/covidcast_row.py new file mode 100644 index 000000000..23e19eb57 --- /dev/null +++ b/src/acquisition/covidcast/covidcast_row.py @@ -0,0 +1,132 @@ +from dataclasses import asdict, dataclass +from typing import Any, ClassVar, Dict, List, Optional + +import pandas as pd + + +PANDAS_DTYPES = { + "source": str, + "signal": str, + "time_type": str, + "time_value": "Int64", + "geo_type": str, + "geo_value": str, + "value": float, + "stderr": float, + "sample_size": float, + "missing_value": "Int8", + "missing_stderr": "Int8", + "missing_sample_size": "Int8", + "issue": "Int64", + "lag": "Int64", + "id": "Int64", + "direction": "Int8", + "direction_updated_timestamp": "Int64", + "value_updated_timestamp": "Int64", +} + +@dataclass +class CovidcastRow: + """A container for the values of a single covidcast database row. + + Used for: + - inserting rows into the database + - creating test rows with default fields for testing + - converting from and to formats (dict, csv, df, kwargs) + - creating consistent views, with consistent data types (dict, csv, df) + + The rows are specified in 'v4_schema.sql'. The datatypes are made to match the database. When writing to Pandas, the dtypes match the JIT model.py schema. + """ + + # Arguments. + source: str + signal: str + time_type: str + geo_type: str + time_value: int + geo_value: str + value: float + stderr: float + sample_size: float + # The following three fields are Nans enums from delphi_utils.nans. + missing_value: int + missing_stderr: int + missing_sample_size: int + issue: int + lag: int + # The following three fields are only in the database, but are not ingested at acquisition and not returned by the API. + epimetric_id: Optional[int] = None + direction: Optional[int] = None + value_updated_timestamp: Optional[int] = 0 + + # Classvars.
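+  # Each *_ignore_fields classvar below lists the fields dropped from the corresponding as_*_dict / as_*_df view.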
+ _db_row_ignore_fields: ClassVar = [] + _api_row_ignore_fields: ClassVar = ["epimetric_id", "value_updated_timestamp"] + _api_row_compatibility_ignore_fields: ClassVar = _api_row_ignore_fields + ["source", "time_type", "geo_type"] + + _pandas_dtypes: ClassVar = PANDAS_DTYPES + + def as_dict(self, ignore_fields: Optional[List[str]] = None) -> dict: + d = asdict(self) + if ignore_fields: + for key in ignore_fields: + del d[key] + return d + + def as_api_row_dict(self, ignore_fields: Optional[List[str]] = None) -> dict: + """Returns a dict view into the row with the fields returned by the API server.""" + return self.as_dict(ignore_fields=self._api_row_ignore_fields + (ignore_fields or [])) + + def as_api_compatibility_row_dict(self, ignore_fields: Optional[List[str]] = None) -> dict: + """Returns a dict view into the row with the fields returned by the old API server (the PHP server).""" + return self.as_dict(ignore_fields=self._api_row_compatibility_ignore_fields + (ignore_fields or [])) + + def as_db_row_dict(self, ignore_fields: Optional[List[str]] = None) -> dict: + """Returns a dict view into the row with the fields returned by the database.""" + return self.as_dict(ignore_fields=self._db_row_ignore_fields + (ignore_fields or [])) + + def as_dataframe(self, ignore_fields: Optional[List[str]] = None) -> pd.DataFrame: + df = pd.DataFrame.from_records([self.as_dict(ignore_fields=ignore_fields)]) + # This is to mirror the types in model.py. + df = set_df_dtypes(df, self._pandas_dtypes) + return df + + def as_api_row_df(self, ignore_fields: Optional[List[str]] = None) -> pd.DataFrame: + """Returns a dataframe view into the row with the fields returned by the API server.""" + return self.as_dataframe(ignore_fields=self._api_row_ignore_fields + (ignore_fields or [])) + + def as_api_compatibility_row_df(self, ignore_fields: Optional[List[str]] = None) -> pd.DataFrame: + """Returns a dataframe view into the row with the fields returned by the old API server (the PHP server).""" + return self.as_dataframe(ignore_fields=self._api_row_compatibility_ignore_fields + (ignore_fields or [])) + + def as_db_row_df(self, ignore_fields: Optional[List[str]] = None) -> pd.DataFrame: + """Returns a dataframe view into the row with the fields returned by an all-field database query.""" + return self.as_dataframe(ignore_fields=self._db_row_ignore_fields + (ignore_fields or [])) + + def signal_pair(self): + return f"{self.source}:{self.signal}" + + def geo_pair(self): + return f"{self.geo_type}:{self.geo_value}" + + def time_pair(self): + return f"{self.time_type}:{self.time_value}" + + + +def check_valid_dtype(dtype): + try: + pd.api.types.pandas_dtype(dtype) + except TypeError: + raise ValueError(f"Invalid dtype {dtype}") + + +def set_df_dtypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame: + """Set the dataframe column datatypes.""" + [check_valid_dtype(d) for d in dtypes.values()] + + df = df.copy() + for k, v in dtypes.items(): + if k in df.columns: + df[k] = df[k].astype(v) + return df diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py index 7b88ba00f..0fa936802 100644 --- a/src/acquisition/covidcast/csv_importer.py +++ b/src/acquisition/covidcast/csv_importer.py @@ -1,20 +1,55 @@ """Collects and reads covidcast data from a set of local CSV files.""" # standard library -from datetime import date -import glob import os import re +from dataclasses import dataclass +from datetime import date +from glob import glob +from typing import Iterator, 
NamedTuple, Optional, Tuple # third party -import pandas import epiweeks as epi +import pandas as pd # first party from delphi_utils import Nans from delphi.utils.epiweek import delta_epiweeks +from delphi.epidata.acquisition.covidcast.covidcast_row import CovidcastRow from delphi.epidata.acquisition.covidcast.logger import get_structured_logger +DataFrameRow = NamedTuple('DFRow', [ + ('geo_id', str), + ('value', float), + ('stderr', float), + ('sample_size', float), + ('missing_value', int), + ('missing_stderr', int), + ('missing_sample_size', int) +]) +PathDetails = NamedTuple('PathDetails', [ + ('issue', int), + ('lag', int), + ('source', str), + ("signal", str), + ('time_type', str), + ('time_value', int), + ('geo_type', str), +]) + + +@dataclass +class CsvRowValue: + """A container for the values of a single validated covidcast CSV row.""" + geo_value: str + value: float + stderr: float + sample_size: float + missing_value: int + missing_stderr: int + missing_sample_size: int + + class CsvImporter: """Finds and parses covidcast CSV files.""" @@ -37,6 +72,7 @@ class CsvImporter: MIN_YEAR = 2019 MAX_YEAR = 2030 + # The datatypes expected by pandas.read_csv. Int64 is like float in that it can handle both numbers and nans. DTYPES = { "geo_id": str, "val": float, @@ -47,19 +83,6 @@ class CsvImporter: "missing_sample_size": "Int64" } - # NOTE: this should be a Python 3.7+ `dataclass`, but the server is on 3.4 - # See https://docs.python.org/3/library/dataclasses.html - class RowValues: - """A container for the values of a single covidcast row.""" - - def __init__(self, geo_value, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size): - self.geo_value = geo_value - self.value = value - self.stderr = stderr - self.sample_size = sample_size - self.missing_value = missing_value - self.missing_stderr = missing_stderr - self.missing_sample_size = missing_sample_size @staticmethod def is_sane_day(value): @@ -77,6 +100,7 @@ def is_sane_day(value): return False return date(year=year,month=month,day=day) + @staticmethod def is_sane_week(value): """Return whether `value` is a sane (maybe not valid) YYYYWW epiweek. @@ -92,22 +116,24 @@ def is_sane_week(value): return False return value + @staticmethod - def find_issue_specific_csv_files(scan_dir, glob=glob): + def find_issue_specific_csv_files(scan_dir): logger = get_structured_logger('find_issue_specific_csv_files') - for path in sorted(glob.glob(os.path.join(scan_dir, '*'))): + for path in sorted(glob(os.path.join(scan_dir, '*'))): issuedir_match = CsvImporter.PATTERN_ISSUE_DIR.match(path.lower()) if issuedir_match and os.path.isdir(path): issue_date_value = int(issuedir_match.group(2)) issue_date = CsvImporter.is_sane_day(issue_date_value) if issue_date: logger.info(event='processing csv files from issue', detail=issue_date, file=path) - yield from CsvImporter.find_csv_files(path, issue=(issue_date, epi.Week.fromdate(issue_date)), glob=glob) + yield from CsvImporter.find_csv_files(path, issue=(issue_date, epi.Week.fromdate(issue_date))) else: logger.warning(event='invalid issue directory day', detail=issue_date_value, file=path) + @staticmethod - def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today())), glob=glob): + def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()))): """Recursively search for and yield covidcast-format CSV files. 
scan_dir: the directory to scan (recursively) @@ -123,11 +149,11 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today() issue_value=-1 lag_value=-1 - for path in sorted(glob.glob(os.path.join(scan_dir, '*', '*'))): - + for path in sorted(glob(os.path.join(scan_dir, '*', '*'))): + # safe to ignore this file if not path.lower().endswith('.csv'): - # safe to ignore this file continue + # match a daily or weekly naming pattern daily_match = CsvImporter.PATTERN_DAILY.match(path.lower()) weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower()) @@ -175,7 +201,8 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today() yield (path, None) continue - yield (path, (source, signal, time_type, geo_type, time_value, issue_value, lag_value)) + yield (path, PathDetails(issue_value, lag_value, source, signal, time_type, time_value, geo_type)) + @staticmethod def is_header_valid(columns): @@ -183,8 +210,9 @@ def is_header_valid(columns): return set(columns) >= CsvImporter.REQUIRED_COLUMNS + @staticmethod - def floaty_int(value): + def floaty_int(value: str) -> int: """Cast a string to an int, even if it looks like a float. For example, "-1" and "-1.0" should both result in -1. Non-integer floats @@ -196,6 +224,7 @@ def floaty_int(value): raise ValueError('not an int: "%s"' % str(value)) return int(float_value) + @staticmethod def maybe_apply(func, quantity): """Apply the given function to the given quantity if not null-ish.""" @@ -206,6 +235,7 @@ def maybe_apply(func, quantity): else: return func(quantity) + @staticmethod def validate_quantity(row, attr_quantity): """Take a row and validate a given associated quantity (e.g., val, se, stderr). @@ -219,6 +249,7 @@ def validate_quantity(row, attr_quantity): # val was a string or another data return "Error" + @staticmethod def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=None): """Take a row and validate the missing code associated with @@ -251,9 +282,10 @@ def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=N return missing_entry + @staticmethod - def extract_and_check_row(row, geo_type, filepath=None): - """Extract and return `RowValues` from a CSV row, with sanity checks. + def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[str] = None) -> Tuple[Optional[CsvRowValue], Optional[str]]: + """Extract and return `CsvRowValue` from a CSV row, with sanity checks. Also returns the name of the field which failed sanity check, or None. @@ -330,14 +362,11 @@ def extract_and_check_row(row, geo_type, filepath=None): missing_sample_size = CsvImporter.validate_missing_code(row, sample_size, "sample_size", filepath) # return extracted and validated row values - row_values = CsvImporter.RowValues( - geo_id, value, stderr, sample_size, - missing_value, missing_stderr, missing_sample_size - ) - return (row_values, None) + return (CsvRowValue(geo_id, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size), None) + @staticmethod - def load_csv(filepath, geo_type, pandas=pandas): + def load_csv(filepath: str, details: PathDetails) -> Iterator[Optional[CovidcastRow]]: """Load, validate, and yield data as `RowValues` from a CSV file. 
filepath: the CSV file to be loaded @@ -349,10 +378,10 @@ def load_csv(filepath, geo_type, pandas=pandas): logger = get_structured_logger('load_csv') try: - table = pandas.read_csv(filepath, dtype=CsvImporter.DTYPES) + table = pd.read_csv(filepath, dtype=CsvImporter.DTYPES) except ValueError as e: logger.warning(event='Failed to open CSV with specified dtypes, switching to str', detail=str(e), file=filepath) - table = pandas.read_csv(filepath, dtype='str') + table = pd.read_csv(filepath, dtype='str') if not CsvImporter.is_header_valid(table.columns): logger.warning(event='invalid header', detail=table.columns, file=filepath) @@ -362,9 +391,26 @@ def load_csv(filepath, geo_type, pandas=pandas): table.rename(columns={"val": "value", "se": "stderr", "missing_val": "missing_value", "missing_se": "missing_stderr"}, inplace=True) for row in table.itertuples(index=False): - row_values, error = CsvImporter.extract_and_check_row(row, geo_type, filepath) + csv_row_values, error = CsvImporter.extract_and_check_row(row, details.geo_type, filepath) + if error: logger.warning(event = 'invalid value for row', detail=(str(row), error), file=filepath) yield None continue - yield row_values + + yield CovidcastRow( + details.source, + details.signal, + details.time_type, + details.geo_type, + details.time_value, + csv_row_values.geo_value, + csv_row_values.value, + csv_row_values.stderr, + csv_row_values.sample_size, + csv_row_values.missing_value, + csv_row_values.missing_stderr, + csv_row_values.missing_sample_size, + details.issue, + details.lag, + ) diff --git a/src/acquisition/covidcast/csv_to_database.py b/src/acquisition/covidcast/csv_to_database.py index 34cbad663..842e820c9 100644 --- a/src/acquisition/covidcast/csv_to_database.py +++ b/src/acquisition/covidcast/csv_to_database.py @@ -4,10 +4,12 @@ import argparse import os import time +from logging import Logger +from typing import Callable, Iterable, Optional, Tuple # first party -from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter -from delphi.epidata.acquisition.covidcast.database import Database, CovidcastRow, DBLoadStateException +from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter, PathDetails +from delphi.epidata.acquisition.covidcast.database import Database, DBLoadStateException from delphi.epidata.acquisition.covidcast.file_archiver import FileArchiver from delphi.epidata.acquisition.covidcast.logger import get_structured_logger @@ -28,17 +30,19 @@ def get_argument_parser(): help="filename for log output (defaults to stdout)") return parser -def collect_files(data_dir, specific_issue_date,csv_importer_impl=CsvImporter): + +def collect_files(data_dir: str, specific_issue_date: bool): """Fetch path and data profile details for each file to upload.""" logger= get_structured_logger('collect_files') if specific_issue_date: - results = list(csv_importer_impl.find_issue_specific_csv_files(data_dir)) + results = list(CsvImporter.find_issue_specific_csv_files(data_dir)) else: - results = list(csv_importer_impl.find_csv_files(os.path.join(data_dir, 'receiving'))) + results = list(CsvImporter.find_csv_files(os.path.join(data_dir, 'receiving'))) logger.info(f'found {len(results)} files') return results -def make_handlers(data_dir, specific_issue_date, file_archiver_impl=FileArchiver): + +def make_handlers(data_dir: str, specific_issue_date: bool): if specific_issue_date: # issue-specific uploads are always one-offs, so we can leave all # files in place without worrying about cleaning up @@ -47,7 +51,7 @@ 
def handle_failed(path_src, filename, source, logger): def handle_successful(path_src, filename, source, logger): logger.info(event='archiving as successful',file=filename) - file_archiver_impl.archive_inplace(path_src, filename) + FileArchiver.archive_inplace(path_src, filename) else: # normal automation runs require some shuffling to remove files # from receiving and place them in the archive @@ -59,22 +63,24 @@ def handle_failed(path_src, filename, source, logger): logger.info(event='archiving as failed - ', detail=source, file=filename) path_dst = os.path.join(archive_failed_dir, source) compress = False - file_archiver_impl.archive_file(path_src, path_dst, filename, compress) + FileArchiver.archive_file(path_src, path_dst, filename, compress) # helper to archive a successful file with compression def handle_successful(path_src, filename, source, logger): logger.info(event='archiving as successful',file=filename) path_dst = os.path.join(archive_successful_dir, source) compress = True - file_archiver_impl.archive_file(path_src, path_dst, filename, compress) + FileArchiver.archive_file(path_src, path_dst, filename, compress) + return handle_successful, handle_failed + def upload_archive( - path_details, - database, - handlers, - logger, - csv_importer_impl=CsvImporter): + path_details: Iterable[Tuple[str, Optional[PathDetails]]], + database: Database, + handlers: Tuple[Callable], + logger: Logger + ): """Upload CSVs to the database and archive them using the specified handlers. :path_details: output from CsvImporter.find*_csv_files @@ -89,20 +95,16 @@ def upload_archive( total_modified_row_count = 0 # iterate over each file for path, details in path_details: - logger.info(event='handling',dest=path) + logger.info(event='handling', dest=path) path_src, filename = os.path.split(path) + # file path or name was invalid, source is unknown if not details: - # file path or name was invalid, source is unknown archive_as_failed(path_src, filename, 'unknown',logger) continue - (source, signal, time_type, geo_type, time_value, issue, lag) = details - - csv_rows = csv_importer_impl.load_csv(path, geo_type) - - cc_rows = CovidcastRow.fromCsvRows(csv_rows, source, signal, time_type, geo_type, time_value, issue, lag) - rows_list = list(cc_rows) + csv_rows = CsvImporter.load_csv(path, details) + rows_list = list(csv_rows) all_rows_valid = rows_list and all(r is not None for r in rows_list) if all_rows_valid: try: @@ -111,12 +113,13 @@ def upload_archive( logger.info( "Inserted database rows", row_count = modified_row_count, - source = source, - signal = signal, - geo_type = geo_type, - time_value = time_value, - issue = issue, - lag = lag) + source = details.source, + signal = details.signal, + geo_type = details.geo_type, + time_value = details.time_value, + issue = details.issue, + lag = details.lag + ) if modified_row_count is None or modified_row_count: # else would indicate zero rows inserted total_modified_row_count += (modified_row_count if modified_row_count else 0) database.commit() @@ -131,40 +134,37 @@ def upload_archive( # archive the current file based on validation results if all_rows_valid: - archive_as_successful(path_src, filename, source, logger) + archive_as_successful(path_src, filename, details.source, logger) else: - archive_as_failed(path_src, filename, source,logger) + archive_as_failed(path_src, filename, details.source, logger) return total_modified_row_count -def main( - args, - database_impl=Database, - collect_files_impl=collect_files, - upload_archive_impl=upload_archive): 
+def main(args): """Find, parse, and upload covidcast signals.""" logger = get_structured_logger("csv_ingestion", filename=args.log_file) start_time = time.time() # shortcut escape without hitting db if nothing to do - path_details = collect_files_impl(args.data_dir, args.specific_issue_date) + path_details = collect_files(args.data_dir, args.specific_issue_date) if not path_details: logger.info('nothing to do; exiting...') return logger.info("Ingesting CSVs", csv_count = len(path_details)) - database = database_impl() + database = Database() database.connect() try: - modified_row_count = upload_archive_impl( + modified_row_count = upload_archive( path_details, database, make_handlers(args.data_dir, args.specific_issue_date), - logger) + logger + ) logger.info("Finished inserting/updating database rows", row_count = modified_row_count) finally: database.do_analyze() @@ -175,5 +175,6 @@ def main( "Ingested CSVs into database", total_runtime_in_seconds=round(time.time() - start_time, 2)) + if __name__ == '__main__': main(get_argument_parser().parse_args()) diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index d21a27c35..3beedac82 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -2,69 +2,20 @@ See src/ddl/covidcast.sql for an explanation of each field. """ +import threading +from math import ceil +from multiprocessing import cpu_count +from queue import Queue, Empty +from typing import List # third party import json import mysql.connector -import numpy as np -from math import ceil - -from queue import Queue, Empty -import threading -from multiprocessing import cpu_count # first party import delphi.operations.secrets as secrets - from delphi.epidata.acquisition.covidcast.logger import get_structured_logger - -class CovidcastRow(): - """A container for all the values of a single covidcast row.""" - - @staticmethod - def fromCsvRowValue(row_value, source, signal, time_type, geo_type, time_value, issue, lag): - if row_value is None: return None - return CovidcastRow(source, signal, time_type, geo_type, time_value, - row_value.geo_value, - row_value.value, - row_value.stderr, - row_value.sample_size, - row_value.missing_value, - row_value.missing_stderr, - row_value.missing_sample_size, - issue, lag) - - @staticmethod - def fromCsvRows(row_values, source, signal, time_type, geo_type, time_value, issue, lag): - # NOTE: returns a generator, as row_values is expected to be a generator - return (CovidcastRow.fromCsvRowValue(row_value, source, signal, time_type, geo_type, time_value, issue, lag) - for row_value in row_values) - - def __init__(self, source, signal, time_type, geo_type, time_value, geo_value, value, stderr, - sample_size, missing_value, missing_stderr, missing_sample_size, issue, lag): - self.id = None - self.source = source - self.signal = signal - self.time_type = time_type - self.geo_type = geo_type - self.time_value = time_value - self.geo_value = geo_value # from CSV row - self.value = value # ... - self.stderr = stderr # ... - self.sample_size = sample_size # ... - self.missing_value = missing_value # ... - self.missing_stderr = missing_stderr # ... 
- self.missing_sample_size = missing_sample_size # from CSV row - self.direction_updated_timestamp = 0 - self.direction = None - self.issue = issue - self.lag = lag - - def signal_pair(self): - return f"{self.source}:{self.signal}" - - def geo_pair(self): - return f"{self.geo_type}:{self.geo_value}" +from delphi.epidata.acquisition.covidcast.covidcast_row import CovidcastRow class DBLoadStateException(Exception): @@ -156,7 +107,7 @@ def do_analyze(self): def insert_or_update_bulk(self, cc_rows): return self.insert_or_update_batch(cc_rows) - def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False, suppress_jobs=False): + def insert_or_update_batch(self, cc_rows: List[CovidcastRow], batch_size=2**20, commit_partial=False, suppress_jobs=False): """ Insert new rows into the load table and dispatch into dimension and fact tables. """ diff --git a/src/acquisition/covidcast/test_utils.py b/src/acquisition/covidcast/test_utils.py index 181dfac68..96db2c164 100644 --- a/src/acquisition/covidcast/test_utils.py +++ b/src/acquisition/covidcast/test_utils.py @@ -1,12 +1,151 @@ +from dataclasses import fields +from datetime import date +from typing import Any, Dict, Iterable, List, Optional, Sequence import unittest +import pandas as pd + from delphi_utils import Nans -from delphi.epidata.acquisition.covidcast.database import Database, CovidcastRow +from delphi.epidata.acquisition.covidcast.covidcast_row import CovidcastRow +from delphi.epidata.acquisition.covidcast.database import Database +from delphi.epidata.server.utils.dates import day_to_time_value, time_value_to_day import delphi.operations.secrets as secrets # all the Nans we use here are just one value, so this is a shortcut to it: nmv = Nans.NOT_MISSING.value + +class CovidcastTestRow(CovidcastRow): + @staticmethod + def make_default_row(**kwargs) -> "CovidcastTestRow": + default_args = { + "source": "src", + "signal": "sig", + "time_type": "day", + "geo_type": "county", + "time_value": 2020_02_02, + "geo_value": "01234", + "value": 10.0, + "stderr": 10.0, + "sample_size": 10.0, + "missing_value": Nans.NOT_MISSING.value, + "missing_stderr": Nans.NOT_MISSING.value, + "missing_sample_size": Nans.NOT_MISSING.value, + "issue": 2020_02_02, + "lag": 0, + } + default_args.update(kwargs) + return CovidcastTestRow(**default_args) + + def __post_init__(self): + # Convert time values to ints by default. + if isinstance(self.time_value, date): + self.time_value = day_to_time_value(self.time_value) + if isinstance(self.issue, date): + self.issue = day_to_time_value(self.issue) + if isinstance(self.value_updated_timestamp, date): + self.value_updated_timestamp = day_to_time_value(self.value_updated_timestamp) + + def _sanitize_fields(self, extra_checks: bool = True): + if self.issue and self.issue < self.time_value: + self.issue = self.time_value + + if self.issue: + self.lag = (time_value_to_day(self.issue) - time_value_to_day(self.time_value)).days + else: + self.lag = None + + # This sanity checking is already done in CsvImporter, but it's here so the testing class gets it too. 
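+    # If a value is NaN but its missing_* code still says NOT_MISSING, rewrite the code (NOT_APPLICABLE with extra_checks, OTHER otherwise) so the test row stays internally consistent.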
+ if pd.isna(self.value) and self.missing_value == Nans.NOT_MISSING: + self.missing_value = Nans.NOT_APPLICABLE.value if extra_checks else Nans.OTHER.value + + if pd.isna(self.stderr) and self.missing_stderr == Nans.NOT_MISSING: + self.missing_stderr = Nans.NOT_APPLICABLE.value if extra_checks else Nans.OTHER.value + + if pd.isna(self.sample_size) and self.missing_sample_size == Nans.NOT_MISSING: + self.missing_sample_size = Nans.NOT_APPLICABLE.value if extra_checks else Nans.OTHER.value + + return self + + +def covidcast_rows_from_args(sanitize_fields: bool = False, test_mode: bool = True, **kwargs: Dict[str, Iterable]) -> List[CovidcastTestRow]: + """A convenience constructor for test rows. + + Example: + covidcast_rows_from_args(value=[1, 2, 3], time_value=[1, 2, 3]) will yield + [CovidcastTestRow.make_default_row(value=1, time_value=1), CovidcastTestRow.make_default_row(value=2, time_value=2), CovidcastTestRow.make_default_row(value=3, time_value=3)] + with all the defaults from CovidcastTestRow. + """ + # If any iterables were passed instead of lists, convert them to lists. + kwargs = {key: list(value) for key, value in kwargs.items()} + # All the arg values must be lists of the same length. + assert len(set(len(lst) for lst in kwargs.values())) == 1 + + if sanitize_fields: + return [CovidcastTestRow.make_default_row(**_kwargs)._sanitize_fields(extra_checks=test_mode) for _kwargs in transpose_dict(kwargs)] + else: + return [CovidcastTestRow.make_default_row(**_kwargs) for _kwargs in transpose_dict(kwargs)] + + +def covidcast_rows_from_records(records: Iterable[dict], sanity_check: bool = False) -> List[CovidcastTestRow]: + """A convenience constructor. + + Default is different from from_args, because from_records is usually called on faux-API returns in tests, + where we don't want any values getting default filled in. + + You can use csv.DictReader before this to read a CSV file. 
+ """ + records = list(records) + return [CovidcastTestRow.make_default_row(**record) if not sanity_check else CovidcastTestRow.make_default_row(**record)._sanitize_fields() for record in records] + + +def covidcast_rows_as_dicts(rows: Iterable[CovidcastTestRow], ignore_fields: Optional[List[str]] = None) -> List[dict]: + return [row.as_dict(ignore_fields=ignore_fields) for row in rows] + + +def covidcast_rows_as_dataframe(rows: Iterable[CovidcastTestRow], ignore_fields: Optional[List[str]] = None) -> pd.DataFrame: + if ignore_fields is None: + ignore_fields = [] + + columns = [field.name for field in fields(CovidcastTestRow) if field.name not in ignore_fields] + + if rows: + df = pd.concat([row.as_dataframe(ignore_fields=ignore_fields) for row in rows], ignore_index=True) + return df[columns] + else: + return pd.DataFrame(columns=columns) + + +def covidcast_rows_as_api_row_df(rows: Iterable[CovidcastTestRow]) -> pd.DataFrame: + return covidcast_rows_as_dataframe(rows, ignore_fields=CovidcastTestRow._api_row_ignore_fields) + + +def covidcast_rows_as_api_compatibility_row_df(rows: Iterable[CovidcastTestRow]) -> pd.DataFrame: + return covidcast_rows_as_dataframe(rows, ignore_fields=CovidcastTestRow._api_row_compatibility_ignore_fields) + + +def covidcast_rows_as_db_row_df(rows: Iterable[CovidcastTestRow]) -> pd.DataFrame: + return covidcast_rows_as_dataframe(rows, ignore_fields=CovidcastTestRow._db_row_ignore_fields) + + +def transpose_dict(d: Dict[Any, List[Any]]) -> List[Dict[Any, Any]]: + """Given a dictionary whose values are lists of the same length, turn it into a list of dictionaries whose values are the individual list entries. + + Example: + >>> transpose_dict(dict([["a", [2, 4, 6]], ["b", [3, 5, 7]], ["c", [10, 20, 30]]])) + [{"a": 2, "b": 3, "c": 10}, {"a": 4, "b": 5, "c": 20}, {"a": 6, "b": 7, "c": 30}] + """ + return [dict(zip(d.keys(), values)) for values in zip(*d.values())] + + +def assert_frame_equal_no_order(df1: pd.DataFrame, df2: pd.DataFrame, index: List[str], **kwargs: Any) -> None: + """Assert that two DataFrames are equal, ignoring the order of rows.""" + # Remove any existing index. If it wasn't named, drop it. Set a new index and sort it. + df1 = df1.reset_index().drop(columns="index").set_index(index).sort_index() + df2 = df2.reset_index().drop(columns="index").set_index(index).sort_index() + pd.testing.assert_frame_equal(df1, df2, **kwargs) + + class CovidcastBase(unittest.TestCase): def setUp(self): # use the local test instance of the database @@ -22,45 +161,29 @@ def setUp(self): self.localSetUp() self._db._connection.commit() - def localSetUp(self): - # stub; override in subclasses to perform custom setup. - # runs after tables have been truncated but before database changes have been committed - pass - def tearDown(self): # close and destroy conenction to the database + self.localTearDown() self._db.disconnect(False) del self._db - DEFAULT_TIME_VALUE=2000_01_01 - DEFAULT_ISSUE=2000_01_01 - def _make_placeholder_row(self, **kwargs): - settings = { - 'source': 'src', - 'signal': 'sig', - 'geo_type': 'state', - 'geo_value': 'pa', - 'time_type': 'day', - 'time_value': self.DEFAULT_TIME_VALUE, - 'value': 0.0, - 'stderr': 1.0, - 'sample_size': 2.0, - 'missing_value': nmv, - 'missing_stderr': nmv, - 'missing_sample_size': nmv, - 'issue': self.DEFAULT_ISSUE, - 'lag': 0 - } - settings.update(kwargs) - return (CovidcastRow(**settings), settings) + def localSetUp(self): + # stub; override in subclasses to perform custom setup. 
+ # runs after tables have been truncated but before database changes have been committed + pass + + def localTearDown(self): + # stub; override in subclasses to perform custom teardown. + # runs after database changes have been committed + pass - def _insert_rows(self, rows): + def _insert_rows(self, rows: Sequence[CovidcastTestRow]): # inserts rows into the database using the full acquisition process, including 'dbjobs' load into history & latest tables n = self._db.insert_or_update_bulk(rows) print(f"{n} rows added to load table & dispatched to v4 schema") self._db._connection.commit() # NOTE: this isnt expressly needed for our test cases, but would be if using external access (like through client lib) to ensure changes are visible outside of this db session - def params_from_row(self, row, **kwargs): + def params_from_row(self, row: CovidcastTestRow, **kwargs): ret = { 'data_source': row.source, 'signals': row.signal, @@ -70,14 +193,4 @@ def params_from_row(self, row, **kwargs): 'geo_value': row.geo_value, } ret.update(kwargs) - return ret - - DEFAULT_MINUS=['time_type', 'geo_type', 'source'] - def expected_from_row(self, row, minus=DEFAULT_MINUS): - expected = dict(vars(row)) - # remove columns commonly excluded from output - # nb may need to add source or *_type back in for multiplexed queries - for key in ['id', 'direction_updated_timestamp'] + minus: - del expected[key] - return expected - + return ret \ No newline at end of file diff --git a/src/acquisition/covidcast_nowcast/load_sensors.py b/src/acquisition/covidcast_nowcast/load_sensors.py index f443bbd48..73ce7eee5 100644 --- a/src/acquisition/covidcast_nowcast/load_sensors.py +++ b/src/acquisition/covidcast_nowcast/load_sensors.py @@ -6,7 +6,7 @@ import sqlalchemy import delphi.operations.secrets as secrets -from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter +from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter, PathDetails SENSOR_CSV_PATH = "/common/covidcast_nowcast/receiving/" SUCCESS_DIR = "archive/successful" @@ -52,7 +52,7 @@ def main(csv_path: str = SENSOR_CSV_PATH) -> None: _move_after_processing(filepath, success=True) -def load_and_prepare_file(filepath: str, attributes: tuple) -> pd.DataFrame: +def load_and_prepare_file(filepath: str, attributes: PathDetails) -> pd.DataFrame: """ Read CSV file into a DataFrame and add relevant attributes as new columns to match DB table. @@ -68,15 +68,14 @@ def load_and_prepare_file(filepath: str, attributes: tuple) -> pd.DataFrame: ------- DataFrame with additional attributes added as columns based on filename and current date. 
""" - source, signal, time_type, geo_type, time_value, issue_value, lag_value = attributes data = pd.read_csv(filepath, dtype=CSV_DTYPES) - data["source"] = source - data["signal"] = signal - data["time_type"] = time_type - data["geo_type"] = geo_type - data["time_value"] = time_value - data["issue"] = issue_value - data["lag"] = lag_value + data["source"] = attributes.source + data["signal"] = attributes.signal + data["time_type"] = attributes.time_type + data["geo_type"] = attributes.geo_type + data["time_value"] = attributes.time_value + data["issue"] = attributes.issue + data["lag"] = attributes.lag data["value_updated_timestamp"] = int(time.time()) return data diff --git a/src/client/delphi_epidata.R b/src/client/delphi_epidata.R index 598bba814..be944b857 100644 --- a/src/client/delphi_epidata.R +++ b/src/client/delphi_epidata.R @@ -15,7 +15,7 @@ Epidata <- (function() { # API base url BASE_URL <- 'https://delphi.cmu.edu/epidata/api.php' - client_version <- '0.4.5' + client_version <- '0.4.6' # Helper function to cast values and/or ranges to strings .listitem <- function(value) { diff --git a/src/client/delphi_epidata.js b/src/client/delphi_epidata.js index e92b4abb9..6ef2e9f9c 100644 --- a/src/client/delphi_epidata.js +++ b/src/client/delphi_epidata.js @@ -22,7 +22,7 @@ } })(this, function (exports, fetchImpl, jQuery) { const BASE_URL = "https://delphi.cmu.edu/epidata/"; - const client_version = "0.4.5"; + const client_version = "0.4.6"; // Helper function to cast values and/or ranges to strings function _listitem(value) { diff --git a/src/client/packaging/npm/package.json b/src/client/packaging/npm/package.json index b9e20df9d..60f0e7b3a 100644 --- a/src/client/packaging/npm/package.json +++ b/src/client/packaging/npm/package.json @@ -2,7 +2,7 @@ "name": "delphi_epidata", "description": "Delphi Epidata API Client", "authors": "Delphi Group", - "version": "0.4.5", + "version": "0.4.6", "license": "MIT", "homepage": "https://github.com/cmu-delphi/delphi-epidata", "bugs": { diff --git a/src/client/packaging/pypi/delphi_epidata/__init__.py b/src/client/packaging/pypi/delphi_epidata/__init__.py index 9306b81b7..e8ae5b0ea 100644 --- a/src/client/packaging/pypi/delphi_epidata/__init__.py +++ b/src/client/packaging/pypi/delphi_epidata/__init__.py @@ -1,4 +1,4 @@ from .delphi_epidata import Epidata name = 'delphi_epidata' -__version__ = '0.4.5' +__version__ = '0.4.6' diff --git a/src/client/packaging/pypi/setup.py b/src/client/packaging/pypi/setup.py index 1878dda74..e36b48d67 100644 --- a/src/client/packaging/pypi/setup.py +++ b/src/client/packaging/pypi/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="delphi_epidata", - version="0.4.5", + version="0.4.6", author="David Farrow", author_email="dfarrow0@gmail.com", description="A programmatic interface to Delphi's Epidata API.", diff --git a/src/server/_config.py b/src/server/_config.py index 187d4581a..0be0ee219 100644 --- a/src/server/_config.py +++ b/src/server/_config.py @@ -4,7 +4,7 @@ load_dotenv() -VERSION = "0.4.5" +VERSION = "0.4.6" MAX_RESULTS = int(10e6) MAX_COMPATIBILITY_RESULTS = int(3650) diff --git a/src/server/_pandas.py b/src/server/_pandas.py index 54f8f99dc..68cbc8833 100644 --- a/src/server/_pandas.py +++ b/src/server/_pandas.py @@ -1,6 +1,7 @@ from typing import Dict, Any, Optional import pandas as pd +from flask import request from sqlalchemy import text from sqlalchemy.engine.base import Engine @@ -20,7 +21,7 @@ def as_pandas(query: str, params: Dict[str, Any], db_engine: Engine = engine, pa def print_pandas(df: 
pd.DataFrame): - p = create_printer() + p = create_printer(request.values.get("format")) def gen(): for row in df.to_dict(orient="records"): diff --git a/src/server/_params.py b/src/server/_params.py index a7d36353c..d0b1cda6d 100644 --- a/src/server/_params.py +++ b/src/server/_params.py @@ -446,7 +446,7 @@ def parse_source_signal_sets() -> List[SourceSignalSet]: ds = request.values.get("data_source") if ds: # old version - require_any("signal", "signals", empty=True) + require_any(request, "signal", "signals", empty=True) signals = extract_strings(("signals", "signal")) if len(signals) == 1 and signals[0] == "*": return [SourceSignalSet(ds, True)] @@ -462,7 +462,7 @@ def parse_geo_sets() -> List[GeoSet]: geo_type = request.values.get("geo_type") if geo_type: # old version - require_any("geo_value", "geo_values", empty=True) + require_any(request, "geo_value", "geo_values", empty=True) geo_values = extract_strings(("geo_values", "geo_value")) if len(geo_values) == 1 and geo_values[0] == "*": return [GeoSet(geo_type, True)] @@ -478,7 +478,7 @@ def parse_time_set() -> TimeSet: time_type = request.values.get("time_type") if time_type: # old version - require_all("time_type", "time_values") + require_all(request, "time_type", "time_values") time_values = extract_dates("time_values") if time_values == ["*"]: return TimeSet(time_type, True) diff --git a/src/server/_printer.py b/src/server/_printer.py index 04196c71d..52f959968 100644 --- a/src/server/_printer.py +++ b/src/server/_printer.py @@ -2,7 +2,7 @@ from io import StringIO from typing import Any, Dict, Iterable, List, Optional, Union -from flask import Response, jsonify, request, stream_with_context +from flask import Response, jsonify, stream_with_context from flask.json import dumps import orjson @@ -11,12 +11,10 @@ from .utils.logger import get_structured_logger -def print_non_standard(data): +def print_non_standard(format: str, data): """ prints a non standard JSON message """ - - format = request.values.get("format", "classic") if format == "json": return jsonify(data) @@ -250,8 +248,9 @@ def _end(self): return b"" -def create_printer() -> APrinter: - format: str = request.values.get("format", "classic") +def create_printer(format: str) -> APrinter: + if format is None: + return ClassicPrinter() if format == "tree": return ClassicTreePrinter("signal") if format.startswith("tree-"): diff --git a/src/server/_query.py b/src/server/_query.py index 3c23f94ad..267a78eb1 100644 --- a/src/server/_query.py +++ b/src/server/_query.py @@ -9,9 +9,11 @@ Sequence, Tuple, Union, - cast + cast, ) +from flask import Response +from flask import request from sqlalchemy import text from sqlalchemy.engine import Row @@ -53,7 +55,7 @@ def filter_values( param_key: str, params: Dict[str, Any], formatter=lambda x: x, -): +) -> str: if not values: return "FALSE" # builds a SQL expression to filter strings (ex: locations) @@ -68,7 +70,7 @@ def filter_strings( values: Optional[Sequence[str]], param_key: str, params: Dict[str, Any], -): +) -> str: return filter_values(field, values, param_key, params) @@ -77,7 +79,7 @@ def filter_integers( values: Optional[Sequence[IntRange]], param_key: str, params: Dict[str, Any], -): +) -> str: return filter_values(field, values, param_key, params) @@ -86,7 +88,7 @@ def filter_dates( values: Optional[TimeValues], param_key: str, params: Dict[str, Any], -): +) -> str: ranges = time_values_to_ranges(values) return filter_values(field, ranges, param_key, params, date_string) @@ -198,7 +200,7 @@ def parse_row( fields_string: 
Optional[Sequence[str]] = None, fields_int: Optional[Sequence[str]] = None, fields_float: Optional[Sequence[str]] = None, -): +) -> Dict[str, Any]: keys = set(row.keys()) parsed = dict() if fields_string: @@ -234,7 +236,7 @@ def limit_query(query: str, limit: int) -> str: return full_query -def run_query(p: APrinter, query_tuple: Tuple[str, Dict[str, Any]]): +def run_query(p: APrinter, query_tuple: Tuple[str, Dict[str, Any]]) -> Iterable[Row]: query, params = query_tuple # limit rows + 1 for detecting whether we would have more full_query = text(limit_query(query, p.remaining_rows + 1)) @@ -254,12 +256,12 @@ def execute_queries( fields_int: Sequence[str], fields_float: Sequence[str], transform: Callable[[Dict[str, Any], Row], Dict[str, Any]] = _identity_transform, -): +) -> Response: """ execute the given queries and return the response to send them """ - p = create_printer() + p = create_printer(request.values.get("format")) fields_to_send = set(extract_strings("fields") or []) if fields_to_send: @@ -313,14 +315,14 @@ def execute_query( fields_int: Sequence[str], fields_float: Sequence[str], transform: Callable[[Dict[str, Any], Row], Dict[str, Any]] = _identity_transform, -): +) -> Response: """ execute the given query and return the response to send it """ return execute_queries([(query, params)], fields_string, fields_int, fields_float, transform) -def _join_l(value: Union[str, List[str]]): +def _join_l(value: Union[str, List[str]]) -> str: return ", ".join(value) if isinstance(value, (list, tuple)) else value diff --git a/src/server/_validate.py b/src/server/_validate.py index ffdd15232..957bee09d 100644 --- a/src/server/_validate.py +++ b/src/server/_validate.py @@ -1,12 +1,11 @@ -from typing import List, Optional, Sequence, Tuple, Union +from typing import Optional -from flask import request +from flask import Request from ._exceptions import UnAuthenticatedException, ValidationFailedException -from .utils import IntRange, TimeValues -def resolve_auth_token() -> Optional[str]: +def resolve_auth_token(request: Request) -> Optional[str]: # auth request param if "auth" in request.values: return request.values["auth"] @@ -20,8 +19,8 @@ def resolve_auth_token() -> Optional[str]: return None -def check_auth_token(token: str, optional=False) -> bool: - value = resolve_auth_token() +def check_auth_token(request: Request, token: str, optional=False) -> bool: + value = resolve_auth_token(request) if value is None: if optional: @@ -35,7 +34,7 @@ def check_auth_token(token: str, optional=False) -> bool: return valid_token -def require_all(*values: str) -> bool: +def require_all(request: Request, *values: str) -> bool: """ returns true if all fields are present in the request otherwise raises an exception :returns bool @@ -46,7 +45,7 @@ def require_all(*values: str) -> bool: return True -def require_any(*values: str, empty=False) -> bool: +def require_any(request: Request, *values: str, empty=False) -> bool: """ returns true if any fields are present in the request otherwise raises an exception :returns bool diff --git a/src/server/endpoints/afhsb.py b/src/server/endpoints/afhsb.py index 69c2d2431..92cee145c 100644 --- a/src/server/endpoints/afhsb.py +++ b/src/server/endpoints/afhsb.py @@ -1,6 +1,6 @@ from typing import Dict, List -from flask import Blueprint +from flask import Blueprint, request from .._config import AUTH from .._params import extract_integers, extract_strings @@ -54,8 +54,8 @@ def _split_flu_types(flu_types: List[str]): @bp.route("/", methods=("GET", "POST")) def handle(): - 
check_auth_token(AUTH["afhsb"]) - require_all("locations", "epiweeks", "flu_types") + check_auth_token(request, AUTH["afhsb"]) + require_all(request, "locations", "epiweeks", "flu_types") locations = extract_strings("locations") epiweeks = extract_integers("epiweeks") diff --git a/src/server/endpoints/cdc.py b/src/server/endpoints/cdc.py index 6b7b9450d..e89eb94fb 100644 --- a/src/server/endpoints/cdc.py +++ b/src/server/endpoints/cdc.py @@ -1,4 +1,4 @@ -from flask import Blueprint +from flask import Blueprint, request from .._config import AUTH, NATION_REGION, REGION_TO_STATE from .._params import extract_strings, extract_integers @@ -12,8 +12,8 @@ @bp.route("/", methods=("GET", "POST")) def handle(): - check_auth_token(AUTH["cdc"]) - require_all("locations", "epiweeks") + check_auth_token(request, AUTH["cdc"]) + require_all(request, "locations", "epiweeks") # parse the request locations = extract_strings("locations") diff --git a/src/server/endpoints/covid_hosp_facility.py b/src/server/endpoints/covid_hosp_facility.py index d1c9fad8a..b8e40d036 100644 --- a/src/server/endpoints/covid_hosp_facility.py +++ b/src/server/endpoints/covid_hosp_facility.py @@ -1,4 +1,4 @@ -from flask import Blueprint +from flask import Blueprint, request from .._params import extract_integers, extract_strings from .._query import execute_query, QueryBuilder @@ -10,7 +10,7 @@ @bp.route("/", methods=("GET", "POST")) def handle(): - require_all("hospital_pks", "collection_weeks") + require_all(request, "hospital_pks", "collection_weeks") hospital_pks = extract_strings("hospital_pks") collection_weeks = extract_integers("collection_weeks") publication_dates = extract_integers("publication_dates") diff --git a/src/server/endpoints/covid_hosp_facility_lookup.py b/src/server/endpoints/covid_hosp_facility_lookup.py index 54a3b9183..751dfebb3 100644 --- a/src/server/endpoints/covid_hosp_facility_lookup.py +++ b/src/server/endpoints/covid_hosp_facility_lookup.py @@ -1,4 +1,4 @@ -from flask import Blueprint +from flask import Blueprint, request from .._params import extract_strings from .._query import execute_query, QueryBuilder @@ -10,7 +10,7 @@ @bp.route("/", methods=("GET", "POST")) def handle(): - require_any("state", "ccn", "city", "zip", "fips_code") + require_any(request, "state", "ccn", "city", "zip", "fips_code") state = extract_strings("state") ccn = extract_strings("ccn") city = extract_strings("city") diff --git a/src/server/endpoints/covid_hosp_state_timeseries.py b/src/server/endpoints/covid_hosp_state_timeseries.py index a20e74d25..78931ee68 100644 --- a/src/server/endpoints/covid_hosp_state_timeseries.py +++ b/src/server/endpoints/covid_hosp_state_timeseries.py @@ -1,4 +1,4 @@ -from flask import Blueprint +from flask import Blueprint, request from .._params import extract_integers, extract_strings, extract_date from .._query import execute_query, QueryBuilder @@ -11,7 +11,7 @@ @bp.route("/", methods=("GET", "POST")) def handle(): - require_all("states", "dates") + require_all(request, "states", "dates") states = extract_strings("states") dates = extract_integers("dates") issues = extract_integers("issues") diff --git a/src/server/endpoints/covidcast.py b/src/server/endpoints/covidcast.py index 05f7cfc3f..09b3d3740 100644 --- a/src/server/endpoints/covidcast.py +++ b/src/server/endpoints/covidcast.py @@ -100,7 +100,7 @@ def _verify_argument_time_type_matches(is_day_argument: bool, count_daily_signal @bp.route("/trend", methods=("GET", "POST")) def handle_trend(): - require_all("window", "date") + 
require_all(request, "window", "date") source_signal_sets = parse_source_signal_sets() daily_signals, weekly_signals = count_signal_time_types(source_signal_sets) source_signal_sets, alias_mapper = create_source_signal_alias_mapper(source_signal_sets) @@ -133,7 +133,7 @@ def handle_trend(): q.apply_geo_filters("geo_type", "geo_value", geo_sets) q.apply_time_filter("time_type", "time_value", time_window) - p = create_printer() + p = create_printer(request.values.get("format")) def gen(rows): for key, group in groupby((parse_row(row, fields_string, fields_int, fields_float) for row in rows), lambda row: (row["geo_type"], row["geo_value"], row["source"], row["signal"])): @@ -155,7 +155,7 @@ def gen(rows): @bp.route("/trendseries", methods=("GET", "POST")) def handle_trendseries(): - require_all("window") + require_all(request, "window") source_signal_sets = parse_source_signal_sets() daily_signals, weekly_signals = count_signal_time_types(source_signal_sets) source_signal_sets, alias_mapper = create_source_signal_alias_mapper(source_signal_sets) @@ -181,7 +181,7 @@ def handle_trendseries(): q.apply_geo_filters("geo_type", "geo_value", geo_sets) q.apply_time_filter("time_type", "time_value", time_window) - p = create_printer() + p = create_printer(request.values.get("format")) shifter = lambda x: shift_day_value(x, -basis_shift) if not is_day: @@ -208,7 +208,7 @@ def gen(rows): @bp.route("/correlation", methods=("GET", "POST")) def handle_correlation(): - require_all("reference", "window", "others", "geo") + require_all(request, "reference", "window", "others", "geo") reference = parse_single_source_signal_arg("reference") other_sets = parse_source_signal_arg("others") daily_signals, weekly_signals = count_signal_time_types(other_sets + [reference]) @@ -246,7 +246,7 @@ def handle_correlation(): # week but convert to date for simpler shifting df["time_value"] = to_datetime(df["time_value"].apply(lambda v: time_value_to_week(v).startdate())) - p = create_printer() + p = create_printer(request.values.get("format")) def prepare_data_frame(df): return df[["time_value", "value"]].set_index("time_value") @@ -364,7 +364,7 @@ def handle_backfill(): """ example query: http://localhost:5000/covidcast/backfill?signal=fb-survey:smoothed_cli&time=day:20200101-20220101&geo=state:ny&anchor_lag=60 """ - require_all("geo", "time", "signal") + require_all(request, "geo", "time", "signal") source_signal_set = parse_single_source_signal_arg("signal") daily_signals, weekly_signals = count_signal_time_types([source_signal_set]) source_signal_sets, _ = create_source_signal_alias_mapper([source_signal_set]) @@ -393,7 +393,7 @@ def handle_backfill(): q.apply_geo_filters("geo_type", "geo_value", [geo_set]) q.apply_time_filter("time_type", "time_value", time_set) - p = create_printer() + p = create_printer(request.values.get("format")) def find_anchor_row(rows: List[Dict[str, Any]], issue: int) -> Optional[Dict[str, Any]]: # assume sorted by issue asc diff --git a/src/server/endpoints/covidcast_meta.py b/src/server/endpoints/covidcast_meta.py index 86eeb8b64..92c78017f 100644 --- a/src/server/endpoints/covidcast_meta.py +++ b/src/server/endpoints/covidcast_meta.py @@ -1,7 +1,6 @@ -import sys from typing import Dict, List, Optional -from flask import Blueprint +from flask import Blueprint, request from flask.json import loads from sqlalchemy import text @@ -80,4 +79,4 @@ def handle(): signals = [SourceSignal(v) for v in (extract_strings("signals") or [])] geo_types = extract_strings("geo_types") - return 
create_printer()(filter_fields(fetch_data(time_types, geo_types, signals)))
+    return create_printer(request.values.get("format"))(filter_fields(fetch_data(time_types, geo_types, signals)))
diff --git a/src/server/endpoints/covidcast_nowcast.py b/src/server/endpoints/covidcast_nowcast.py
index d71ff9404..79cb71670 100644
--- a/src/server/endpoints/covidcast_nowcast.py
+++ b/src/server/endpoints/covidcast_nowcast.py
@@ -19,10 +19,10 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all(
+    require_all(request,
         "data_source", "time_type", "geo_type", "time_values", "signals", "sensor_names"
     )
-    require_any("geo_value", "geo_values", empty=True)
+    require_any(request, "geo_value", "geo_values", empty=True)
 
     time_values = extract_dates("time_values")
     as_of = extract_date("as_of")
diff --git a/src/server/endpoints/covidcast_utils/model.py b/src/server/endpoints/covidcast_utils/model.py
index abab0033b..4cc78888e 100644
--- a/src/server/endpoints/covidcast_utils/model.py
+++ b/src/server/endpoints/covidcast_utils/model.py
@@ -202,7 +202,7 @@ def _load_data_sources():
 
 
 data_sources, data_sources_df = _load_data_sources()
-data_source_by_id = {d.source: d for d in data_sources}
+data_sources_by_id = {d.source: d for d in data_sources}
 
 
 def _load_data_signals(sources: List[DataSource]):
@@ -231,7 +231,7 @@ def _load_data_signals(sources: List[DataSource]):
 
 data_signals_by_key = {d.key: d for d in data_signals}
 # also add the resolved signal version to the signal lookup
 for d in data_signals:
-    source = data_source_by_id.get(d.source)
+    source = data_sources_by_id.get(d.source)
     if source and source.uses_db_alias:
         data_signals_by_key[(source.db_source, d.signal)] = d
@@ -261,7 +261,7 @@ def create_source_signal_alias_mapper(source_signals: List[SourceSignalSet]) ->
     alias_to_data_sources: Dict[str, List[DataSource]] = {}
     transformed_sets: List[SourceSignalSet] = []
     for ssset in source_signals:
-        source = data_source_by_id.get(ssset.source)
+        source = data_sources_by_id.get(ssset.source)
         if not source or not source.uses_db_alias:
             transformed_sets.append(ssset)
             continue
diff --git a/src/server/endpoints/delphi.py b/src/server/endpoints/delphi.py
index c365ab1e0..cb7efcd46 100644
--- a/src/server/endpoints/delphi.py
+++ b/src/server/endpoints/delphi.py
@@ -12,7 +12,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("system", "epiweek")
+    require_all(request, "system", "epiweek")
 
     system = request.values["system"]
     epiweek = int(request.values["epiweek"])
@@ -29,4 +29,4 @@ def handle():
         row["forecast"] = loads(row["json"])
         del row["json"]
     # send query
-    return print_non_standard(rows)
+    return print_non_standard(request.values.get("format"), rows)
diff --git a/src/server/endpoints/dengue_nowcast.py b/src/server/endpoints/dengue_nowcast.py
index f77f6bd18..9e89a7fbb 100644
--- a/src/server/endpoints/dengue_nowcast.py
+++ b/src/server/endpoints/dengue_nowcast.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -11,7 +11,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("locations", "epiweeks")
+    require_all(request, "locations", "epiweeks")
 
     locations = extract_strings("locations")
     epiweeks = extract_integers("epiweeks")
diff --git a/src/server/endpoints/dengue_sensors.py b/src/server/endpoints/dengue_sensors.py
index 0837dc3fc..f8286eacd 100644
--- a/src/server/endpoints/dengue_sensors.py
+++ b/src/server/endpoints/dengue_sensors.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._config import AUTH
 from .._params import extract_integers, extract_strings
@@ -12,8 +12,8 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    check_auth_token(AUTH["sensors"])
-    require_all("names", "locations", "epiweeks")
+    check_auth_token(request, AUTH["sensors"])
+    require_all(request, "names", "locations", "epiweeks")
 
     names = extract_strings("names")
     locations = extract_strings("locations")
diff --git a/src/server/endpoints/ecdc_ili.py b/src/server/endpoints/ecdc_ili.py
index b15dc7cb2..724d5d835 100644
--- a/src/server/endpoints/ecdc_ili.py
+++ b/src/server/endpoints/ecdc_ili.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integer, extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -11,7 +11,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("regions", "epiweeks")
+    require_all(request, "regions", "epiweeks")
     regions = extract_strings("regions")
     epiweeks = extract_integers("epiweeks")
     issues = extract_integers("issues")
diff --git a/src/server/endpoints/flusurv.py b/src/server/endpoints/flusurv.py
index 67e842cb8..08b2a14d9 100644
--- a/src/server/endpoints/flusurv.py
+++ b/src/server/endpoints/flusurv.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integer, extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -9,7 +9,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("epiweeks", "locations")
+    require_all(request, "epiweeks", "locations")
 
     epiweeks = extract_integers("epiweeks")
     locations = extract_strings("locations")
diff --git a/src/server/endpoints/fluview.py b/src/server/endpoints/fluview.py
index 75e928c86..262cbeb27 100644
--- a/src/server/endpoints/fluview.py
+++ b/src/server/endpoints/fluview.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List, Tuple
 
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._config import AUTH
 from .._params import (
@@ -21,9 +21,9 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    authorized = check_auth_token(AUTH["fluview"], optional=True)
+    authorized = check_auth_token(request, AUTH["fluview"], optional=True)
 
-    require_all("epiweeks", "regions")
+    require_all(request, "epiweeks", "regions")
 
     epiweeks = extract_integers("epiweeks")
     regions = extract_strings("regions")
diff --git a/src/server/endpoints/fluview_clinicial.py b/src/server/endpoints/fluview_clinicial.py
index e213a1638..b7b248aa0 100644
--- a/src/server/endpoints/fluview_clinicial.py
+++ b/src/server/endpoints/fluview_clinicial.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integer, extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -9,7 +9,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("epiweeks", "regions")
+    require_all(request, "epiweeks", "regions")
 
     epiweeks = extract_integers("epiweeks")
     regions = extract_strings("regions")
diff --git a/src/server/endpoints/fluview_meta.py b/src/server/endpoints/fluview_meta.py
index c3106b7a9..c5ef8c894 100644
--- a/src/server/endpoints/fluview_meta.py
+++ b/src/server/endpoints/fluview_meta.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._printer import create_printer
 from .._query import filter_fields, parse_result
@@ -23,4 +23,4 @@ def gen():
         for row in meta_fluview():
             yield row
 
-    return create_printer()(filter_fields(gen()))
+    return create_printer(request.values.get("format"))(filter_fields(gen()))
diff --git a/src/server/endpoints/gft.py b/src/server/endpoints/gft.py
index 343f565f4..ce9e6d1f0 100644
--- a/src/server/endpoints/gft.py
+++ b/src/server/endpoints/gft.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -11,7 +11,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("locations", "epiweeks")
+    require_all(request, "locations", "epiweeks")
 
     locations = extract_strings("locations")
     epiweeks = extract_integers("epiweeks")
diff --git a/src/server/endpoints/ght.py b/src/server/endpoints/ght.py
index ab858e79c..24ba84c23 100644
--- a/src/server/endpoints/ght.py
+++ b/src/server/endpoints/ght.py
@@ -12,8 +12,8 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    check_auth_token(AUTH["ght"])
-    require_all("locations", "epiweeks", "query")
+    check_auth_token(request, AUTH["ght"])
+    require_all(request, "locations", "epiweeks", "query")
 
     locations = extract_strings("locations")
     epiweeks = extract_integers("epiweeks")
diff --git a/src/server/endpoints/kcdc_ili.py b/src/server/endpoints/kcdc_ili.py
index fc9328898..8561a85e7 100644
--- a/src/server/endpoints/kcdc_ili.py
+++ b/src/server/endpoints/kcdc_ili.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integer, extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -11,7 +11,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("regions", "epiweeks")
+    require_all(request, "regions", "epiweeks")
     regions = extract_strings("regions")
    epiweeks = extract_integers("epiweeks")
     issues = extract_integers("issues")
diff --git a/src/server/endpoints/meta.py b/src/server/endpoints/meta.py
index 154d757e9..8bb4c4890 100644
--- a/src/server/endpoints/meta.py
+++ b/src/server/endpoints/meta.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._printer import print_non_standard
 from .._query import parse_result
@@ -46,4 +46,4 @@ def handle():
         "wiki": wiki,
         "delphi": delphi,
     }
-    return print_non_standard([row])
+    return print_non_standard(request.values.get("format"), [row])
diff --git a/src/server/endpoints/meta_afhsb.py b/src/server/endpoints/meta_afhsb.py
index 43566c1be..8a74b51ca 100644
--- a/src/server/endpoints/meta_afhsb.py
+++ b/src/server/endpoints/meta_afhsb.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._config import AUTH
 from .._printer import print_non_standard
@@ -12,7 +12,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    check_auth_token(AUTH["afhsb"])
+    check_auth_token(request, AUTH["afhsb"])
 
     # build query
     table1 = "afhsb_00to13_state"
@@ -29,4 +29,4 @@ def handle():
         query = f"SELECT DISTINCT `{key}` FROM (select `{key}` from `{table1}` union select `{key}` from `{table2}`) t"
         data[key] = parse_result(query, {}, [], [key])
 
-    return print_non_standard(data)
+    return print_non_standard(request.values.get("format"), data)
diff --git a/src/server/endpoints/meta_norostat.py b/src/server/endpoints/meta_norostat.py
index 2c180382b..789b09021 100644
--- a/src/server/endpoints/meta_norostat.py
+++ b/src/server/endpoints/meta_norostat.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._config import AUTH
 from .._printer import print_non_standard
@@ -12,7 +12,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    check_auth_token(AUTH["norostat"])
+    check_auth_token(request, AUTH["norostat"])
 
     # build query
     query = "SELECT DISTINCT `release_date` FROM `norostat_raw_datatable_version_list`"
@@ -22,4 +22,4 @@ def handle():
     locations = parse_result(query, {}, ["location"])
 
     data = {"releases": releases, "locations": locations}
-    return print_non_standard(data)
+    return print_non_standard(request.values.get("format"), data)
diff --git a/src/server/endpoints/nidss_dengue.py b/src/server/endpoints/nidss_dengue.py
index 8d7c12624..a9c37dc84 100644
--- a/src/server/endpoints/nidss_dengue.py
+++ b/src/server/endpoints/nidss_dengue.py
@@ -1,6 +1,6 @@
 import re
 
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integers, extract_strings
 from .._query import execute_queries, filter_integers
@@ -13,7 +13,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("locations", "epiweeks")
+    require_all(request, "locations", "epiweeks")
 
     locations = extract_strings("locations")
     epiweeks = extract_integers("epiweeks")
diff --git a/src/server/endpoints/nidss_flu.py b/src/server/endpoints/nidss_flu.py
index 8eb7d3b56..55e40e4b3 100644
--- a/src/server/endpoints/nidss_flu.py
+++ b/src/server/endpoints/nidss_flu.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integer, extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -11,7 +11,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("regions", "epiweeks")
+    require_all(request, "regions", "epiweeks")
     regions = extract_strings("regions")
     epiweeks = extract_integers("epiweeks")
     issues = extract_integers("issues")
diff --git a/src/server/endpoints/norostat.py b/src/server/endpoints/norostat.py
index 24867a8d4..7dc06d443 100644
--- a/src/server/endpoints/norostat.py
+++ b/src/server/endpoints/norostat.py
@@ -12,8 +12,8 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    check_auth_token(AUTH["norostat"])
-    require_all("location", "epiweeks")
+    check_auth_token(request, AUTH["norostat"])
+    require_all(request, "location", "epiweeks")
 
     location = request.values["location"]
     epiweeks = extract_integers("epiweeks")
diff --git a/src/server/endpoints/nowcast.py b/src/server/endpoints/nowcast.py
index 77c535ee6..f1f377760 100644
--- a/src/server/endpoints/nowcast.py
+++ b/src/server/endpoints/nowcast.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -11,7 +11,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("locations", "epiweeks")
+    require_all(request, "locations", "epiweeks")
 
     locations = extract_strings("locations")
     epiweeks = extract_integers("epiweeks")
diff --git a/src/server/endpoints/paho_dengue.py b/src/server/endpoints/paho_dengue.py
index e793a7c17..7fa01c88b 100644
--- a/src/server/endpoints/paho_dengue.py
+++ b/src/server/endpoints/paho_dengue.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._params import extract_integer, extract_integers, extract_strings
 from .._query import execute_query, QueryBuilder
@@ -11,7 +11,7 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("regions", "epiweeks")
+    require_all(request, "regions", "epiweeks")
     regions = extract_strings("regions")
     epiweeks = extract_integers("epiweeks")
     issues = extract_integers("issues")
diff --git a/src/server/endpoints/quidel.py b/src/server/endpoints/quidel.py
index 081706190..6de9205b8 100644
--- a/src/server/endpoints/quidel.py
+++ b/src/server/endpoints/quidel.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .._config import AUTH
 from .._params import extract_integers, extract_strings
@@ -12,8 +12,8 @@
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    check_auth_token(AUTH["quidel"])
-    require_all("locations", "epiweeks")
+    check_auth_token(request, AUTH["quidel"])
+    require_all(request, "locations", "epiweeks")
 
     locations = extract_strings("locations")
     epiweeks = extract_integers("epiweeks")
diff --git a/src/server/endpoints/sensors.py b/src/server/endpoints/sensors.py
index f803dd396..cd76ca4d8 100644
--- a/src/server/endpoints/sensors.py
+++ b/src/server/endpoints/sensors.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, Request, request
 
 from .._config import AUTH, GRANULAR_SENSOR_AUTH_TOKENS, OPEN_SENSORS
 from .._exceptions import EpiDataException
@@ -27,8 +27,8 @@
 PHP_INT_MAX = 2147483647
 
 
-def _authenticate(names: List[str]):
-    auth_tokens_presented = (resolve_auth_token() or "").split(",")
+def _authenticate(req: Request, names: List[str]):
+    auth_tokens_presented = (resolve_auth_token(req) or "").split(",")
 
     names = extract_strings("names")
     n_names = len(names)
@@ -103,10 +103,10 @@ def _authenticate(names: List[str]):
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    require_all("names", "locations", "epiweeks")
+    require_all(request, "names", "locations", "epiweeks")
 
     names = extract_strings("names") or []
-    _authenticate(names)
+    _authenticate(request, names)
 
     # parse the request
     locations = extract_strings("locations")
diff --git a/src/server/endpoints/signal_dashboard_coverage.py b/src/server/endpoints/signal_dashboard_coverage.py
index 347d1b0a7..ba71058fd 100644
--- a/src/server/endpoints/signal_dashboard_coverage.py
+++ b/src/server/endpoints/signal_dashboard_coverage.py
@@ -1,7 +1,7 @@
 from typing import List, Dict, Any
 
-from flask import Blueprint
+from flask import Blueprint, request
 
-from .._query import parse_result, filter_fields
+from .._query import parse_result
 from .._printer import print_non_standard
 
 # first argument is the endpoint name
@@ -42,4 +42,4 @@ def fetch_coverage_data() -> Dict[str, Dict[str, List[Dict[str, Any]]]]:
 
 @bp.route("/", methods=("GET", "POST"))
 def handle():
-    return print_non_standard(fetch_coverage_data())
+    return print_non_standard(request.values.get("format"), fetch_coverage_data())
diff --git a/src/server/endpoints/signal_dashboard_status.py b/src/server/endpoints/signal_dashboard_status.py
index 72c06dc52..7d32686c4 100644
--- a/src/server/endpoints/signal_dashboard_status.py
+++ b/src/server/endpoints/signal_dashboard_status.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
 
 from .signal_dashboard_coverage import fetch_coverage_data
 from .._query import parse_row, run_query
@@ -30,7 +30,7 @@ def handle():
         AND enabled_signal.`id` = status.`signal_id`
     """
 
-    p = create_printer()
+    p = create_printer(request.values.get("format"))
 
     def gen(rows, coverage_data):
         for row in rows:
diff --git a/src/server/endpoints/twitter.py b/src/server/endpoints/twitter.py
index 41cbe3492..84cbb2850 100644 --- a/src/server/endpoints/twitter.py +++ b/src/server/endpoints/twitter.py @@ -19,9 +19,9 @@ @bp.route("/", methods=("GET", "POST")) def handle(): - check_auth_token(AUTH["twitter"]) - require_all("locations") - require_any("dates", "epiweeks") + check_auth_token(request, AUTH["twitter"]) + require_all(request, "locations") + require_any(request, "dates", "epiweeks") locations = extract_strings("locations") if "dates" in request.values: diff --git a/src/server/endpoints/wiki.py b/src/server/endpoints/wiki.py index 61139578f..7c98e2ab7 100644 --- a/src/server/endpoints/wiki.py +++ b/src/server/endpoints/wiki.py @@ -11,8 +11,8 @@ @bp.route("/", methods=("GET", "POST")) def handle(): - require_all("articles") - require_any("dates", "epiweeks") + require_all(request, "articles") + require_any(request, "dates", "epiweeks") articles = extract_strings("articles") language = request.values.get("language", "en") diff --git a/tests/acquisition/covidcast/test_covidcast_row.py b/tests/acquisition/covidcast/test_covidcast_row.py new file mode 100644 index 000000000..9462fd4ed --- /dev/null +++ b/tests/acquisition/covidcast/test_covidcast_row.py @@ -0,0 +1,92 @@ +import unittest + +from pandas import DataFrame +from pandas.testing import assert_frame_equal + +from delphi_utils.nancodes import Nans +from delphi.epidata.acquisition.covidcast.covidcast_row import CovidcastRow, set_df_dtypes +from delphi.epidata.acquisition.covidcast.test_utils import ( + CovidcastTestRow, + covidcast_rows_as_api_compatibility_row_df, + covidcast_rows_as_api_row_df, + covidcast_rows_from_args, + transpose_dict, +) + +# py3tester coverage target (equivalent to `import *`) +__test_target__ = 'delphi.epidata.acquisition.covidcast.covidcast_row' + + +class TestCovidcastRows(unittest.TestCase): + expected_df = set_df_dtypes(DataFrame({ + "source": ["src"] * 10, + "signal": ["sig_base"] * 5 + ["sig_other"] * 5, + "time_type": ["day"] * 10, + "geo_type": ["county"] * 10, + "time_value": [2021_05_01 + i for i in range(5)] * 2, + "geo_value": ["01234"] * 10, + "value": range(10), + "stderr": [10.0] * 10, + "sample_size": [10.0] * 10, + "missing_value": [Nans.NOT_MISSING] * 10, + "missing_stderr": [Nans.NOT_MISSING] * 10, + "missing_sample_size": [Nans.NOT_MISSING] * 10, + "issue": [2021_05_01 + i for i in range(5)] * 2, + "lag": [0] * 10, + "direction": [None] * 10 + }), CovidcastRow._pandas_dtypes) + + def test_transpose_dict(self): + assert transpose_dict( + { + "a": [2, 4, 6], + "b": [3, 5, 7], + "c": [10, 20, 30] + } + ) == [ + {"a": 2, "b": 3, "c": 10}, + {"a": 4, "b": 5, "c": 20}, + {"a": 6, "b": 7, "c": 30} + ] + + + def test_CovidcastRow(self): + df = CovidcastTestRow.make_default_row( + signal="sig_base", + value=0.0, + time_value=2021_05_01, + issue=2021_05_01, + ).as_api_row_df() + expected_df = self.expected_df.iloc[0:1] + assert_frame_equal(df, expected_df) + + df = CovidcastTestRow.make_default_row( + signal="sig_base", + value=0.0, + time_value=2021_05_01, + issue=2021_05_01, + ).as_api_compatibility_row_df() + expected_df = self.expected_df.iloc[0:1][df.columns] + assert_frame_equal(df, expected_df) + + + def test_covidcast_rows(self): + covidcast_rows = covidcast_rows_from_args( + signal=["sig_base"] * 5 + ["sig_other"] * 5, + time_value=[2021_05_01 + i for i in range(5)] * 2, + value=list(range(10)), + sanitize_fields = True + ) + df = covidcast_rows_as_api_row_df(covidcast_rows) + expected_df = self.expected_df + assert_frame_equal(df, expected_df) + + covidcast_rows = 
covidcast_rows_from_args( + signal=["sig_base"] * 5 + ["sig_other"] * 5, + time_value=[2021_05_01 + i for i in range(5)] * 2, + value=list(range(10)), + sanitize_fields = True + ) + df = covidcast_rows_as_api_compatibility_row_df(covidcast_rows) + expected_df = self.expected_df[df.columns] + assert_frame_equal(df, expected_df) diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py index a99eb3a1a..0906febd1 100644 --- a/tests/acquisition/covidcast/test_csv_importer.py +++ b/tests/acquisition/covidcast/test_csv_importer.py @@ -2,20 +2,17 @@ # standard library import unittest -from unittest.mock import MagicMock -from unittest.mock import patch +from unittest.mock import MagicMock, patch from datetime import date -import math import numpy as np -import os # third party -import pandas +import pandas as pd import epiweeks as epi from delphi_utils import Nans -from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter from delphi.utils.epiweek import delta_epiweeks +from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter, CsvRowValue, PathDetails # py3tester coverage target __test_target__ = 'delphi.epidata.acquisition.covidcast.csv_importer' @@ -34,6 +31,7 @@ def test_is_sane_day(self): self.assertFalse(CsvImporter.is_sane_day(20200199)) self.assertFalse(CsvImporter.is_sane_day(202015)) + def test_is_sane_week(self): """Sanity check some weeks.""" @@ -44,37 +42,38 @@ def test_is_sane_week(self): self.assertFalse(CsvImporter.is_sane_week(202054)) self.assertFalse(CsvImporter.is_sane_week(20200418)) + + @patch("delphi.epidata.acquisition.covidcast.csv_importer.glob") @patch("os.path.isdir") - def test_find_issue_specific_csv_files(self,os_isdir_mock): + def test_find_issue_specific_csv_files(self, mock_os_isdir: MagicMock, mock_glob: MagicMock): """Recursively explore and find issue specific CSV files.""" # check valid path path_prefix='prefix/to/the/data/issue_20200408' - os_isdir_mock.return_value=True + mock_os_isdir.return_value=True issue_path=path_prefix+'ght/20200408_state_rawsearch.csv' - mock_glob = MagicMock() - mock_glob.glob.side_effect = ([path_prefix], [issue_path]) + mock_glob.side_effect = ([path_prefix], [issue_path]) #check if the day is a valid day. 
issuedir_match= CsvImporter.PATTERN_ISSUE_DIR.match(path_prefix.lower()) issue_date_value = int(issuedir_match.group(2)) self.assertTrue(CsvImporter.is_sane_day(issue_date_value)) - found = set(CsvImporter.find_issue_specific_csv_files(path_prefix, glob=mock_glob)) - self.assertTrue(len(found)>0) + found = set(CsvImporter.find_issue_specific_csv_files(path_prefix)) + self.assertTrue(len(found) > 0) # check unvalid path: path_prefix_invalid='invalid/prefix/to/the/data/issue_20200408' - os_isdir_mock.return_value=False + mock_os_isdir.return_value=False issue_path_invalid=path_prefix_invalid+'ght/20200408_state_rawsearch.csv' - mock_glob_invalid = MagicMock() - mock_glob_invalid.glob.side_effect = ([path_prefix_invalid], [issue_path_invalid]) + mock_glob.side_effect = ([path_prefix_invalid], [issue_path_invalid]) - found = set(CsvImporter.find_issue_specific_csv_files(path_prefix_invalid, glob=mock_glob_invalid)) + found = set(CsvImporter.find_issue_specific_csv_files(path_prefix_invalid)) self.assertFalse(len(found)>0) - def test_find_csv_files(self): + @patch("delphi.epidata.acquisition.covidcast.csv_importer.glob") + def test_find_csv_files(self, mock_glob: MagicMock): """Recursively explore and find CSV files.""" path_prefix = 'prefix/to/the/data/' @@ -98,19 +97,18 @@ def test_find_csv_files(self): # ignored path_prefix + 'ignored/README.md', ] - mock_glob = MagicMock() - mock_glob.glob.return_value = glob_paths + mock_glob.return_value = glob_paths - found = set(CsvImporter.find_csv_files(path_prefix, glob=mock_glob)) + found = set(CsvImporter.find_csv_files(path_prefix)) expected_issue_day=int(date.today().strftime("%Y%m%d")) expected_issue_week=int(str(epi.Week.fromdate(date.today()))) time_value_day = 20200408 expected = set([ - (glob_paths[0], ('fb_survey', 'cli', 'week', 'county', 202015, expected_issue_week, delta_epiweeks(202015, expected_issue_week))), - (glob_paths[1], ('ght', 'rawsearch', 'day', 'state', time_value_day, expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days)), - (glob_paths[2], ('valid', 'sig', 'day', 'nation', time_value_day, expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days)), - (glob_paths[3], ('valid', 'sig', 'day', 'hhs', time_value_day, expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days)), + (glob_paths[0], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'fb_survey', 'cli', 'week', 202015, 'county')), + (glob_paths[1], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'ght', 'rawsearch', 'day', time_value_day, 'state')), + (glob_paths[2], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'nation')), + (glob_paths[3], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hhs')), (glob_paths[4], None), (glob_paths[5], None), (glob_paths[6], None), @@ -118,6 +116,7 @@ def test_find_csv_files(self): ]) self.assertEqual(found, expected) + def test_is_header_valid_allows_extra_columns(self): 
"""Allow and ignore extra columns in the header.""" @@ -126,6 +125,7 @@ def test_is_header_valid_allows_extra_columns(self): self.assertTrue(CsvImporter.is_header_valid(columns)) self.assertTrue(CsvImporter.is_header_valid(columns | {'foo', 'bar'})) + def test_is_header_valid_does_not_depend_on_column_order(self): """Allow columns to appear in any order.""" @@ -134,6 +134,7 @@ def test_is_header_valid_does_not_depend_on_column_order(self): self.assertTrue(CsvImporter.is_header_valid(columns)) + def test_floaty_int(self): """Parse ints that may look like floats.""" @@ -143,6 +144,7 @@ def test_floaty_int(self): with self.assertRaises(ValueError): CsvImporter.floaty_int('-1.1') + def test_maybe_apply(self): """Apply a function to a value as long as it's not null-like.""" @@ -153,6 +155,7 @@ def test_maybe_apply(self): self.assertIsNone(CsvImporter.maybe_apply(float, '')) self.assertIsNone(CsvImporter.maybe_apply(float, None)) + def test_extract_and_check_row(self): """Apply various sanity checks to a row of data.""" @@ -208,37 +211,40 @@ def make_row( self.assertEqual(error, field) success_cases = [ - (make_row(), CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)), - (make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)), - (make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)), - (make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvImporter.RowValues('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)), + (make_row(), CsvRowValue('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)), + (make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvRowValue('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)), + (make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvRowValue('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)), + (make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvRowValue('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)), ] for ((geo_type, row), field) in success_cases: values, error = CsvImporter.extract_and_check_row(row, geo_type) self.assertIsNone(error) - self.assertIsInstance(values, CsvImporter.RowValues) + self.assertIsInstance(values, CsvRowValue) self.assertEqual(values.geo_value, field.geo_value) self.assertEqual(values.value, field.value) self.assertEqual(values.stderr, field.stderr) self.assertEqual(values.sample_size, field.sample_size) - def test_load_csv_with_invalid_header(self): + + @patch("pandas.read_csv") + def test_load_csv_with_invalid_header(self, mock_read_csv): """Bail loading a CSV when the header is invalid.""" data = {'foo': [1, 2, 3]} - mock_pandas = MagicMock() - mock_pandas.read_csv.return_value = pandas.DataFrame(data=data) filepath = 
'path/name.csv' - geo_type = 'state' + details = PathDetails(20200101, 0, "src", "name", "day", 20200101, "state") - rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas)) + mock_read_csv.return_value = pd.DataFrame(data) + rows = list(CsvImporter.load_csv(filepath, details)) - self.assertTrue(mock_pandas.read_csv.called) - self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath) + self.assertTrue(mock_read_csv.called) + self.assertTrue(mock_read_csv.call_args[0][0], filepath) self.assertEqual(rows, [None]) - def test_load_csv_with_valid_header(self): + + @patch("pandas.read_csv") + def test_load_csv_with_valid_header(self, mock_read_csv): """Yield sanity checked `RowValues` from a valid CSV file.""" # one invalid geo_id, but otherwise valid @@ -248,15 +254,14 @@ def test_load_csv_with_valid_header(self): 'se': ['2.1', '2.2', '2.3', '2.4'], 'sample_size': ['301', '302', '303', '304'], } - mock_pandas = MagicMock() - mock_pandas.read_csv.return_value = pandas.DataFrame(data=data) filepath = 'path/name.csv' - geo_type = 'state' + details = PathDetails(20200101, 0, "src", "name", "day", 20200101, "state") - rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas)) + mock_read_csv.return_value = pd.DataFrame(data=data) + rows = list(CsvImporter.load_csv(filepath, details)) - self.assertTrue(mock_pandas.read_csv.called) - self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath) + self.assertTrue(mock_read_csv.called) + self.assertTrue(mock_read_csv.call_args[0][0], filepath) self.assertEqual(len(rows), 4) self.assertEqual(rows[0].geo_value, 'ca') @@ -286,15 +291,14 @@ def test_load_csv_with_valid_header(self): 'missing_stderr': [Nans.NOT_MISSING, Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.NOT_MISSING] + [None], 'missing_sample_size': [Nans.NOT_MISSING] * 2 + [Nans.REGION_EXCEPTION] * 2 + [None] } - mock_pandas = MagicMock() - mock_pandas.read_csv.return_value = pandas.DataFrame(data=data) filepath = 'path/name.csv' - geo_type = 'state' + details = PathDetails(20200101, 0, "src", "name", "day", 20200101, "state") - rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas)) + mock_read_csv.return_value = pd.DataFrame(data) + rows = list(CsvImporter.load_csv(filepath, details)) - self.assertTrue(mock_pandas.read_csv.called) - self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath) + self.assertTrue(mock_read_csv.called) + self.assertTrue(mock_read_csv.call_args[0][0], filepath) self.assertEqual(len(rows), 5) self.assertEqual(rows[0].geo_value, 'ca') diff --git a/tests/acquisition/covidcast/test_csv_to_database.py b/tests/acquisition/covidcast/test_csv_to_database.py index 0b91815fb..938070944 100644 --- a/tests/acquisition/covidcast/test_csv_to_database.py +++ b/tests/acquisition/covidcast/test_csv_to_database.py @@ -4,10 +4,10 @@ import argparse from typing import Iterable import unittest -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch -from delphi.epidata.acquisition.covidcast.csv_to_database import get_argument_parser, main, \ - collect_files, upload_archive, make_handlers +from delphi.epidata.acquisition.covidcast.csv_importer import PathDetails +from delphi.epidata.acquisition.covidcast.csv_to_database import get_argument_parser, main, collect_files, upload_archive, make_handlers # py3tester coverage target __test_target__ = 'delphi.epidata.acquisition.covidcast.csv_to_database' @@ -15,55 +15,62 @@ class UnitTests(unittest.TestCase): """Basic unit tests.""" + _path_details = [ + # a 
good file + ('path/a.csv', PathDetails(20200420, 1, 'src_a', 'sig_a', 'day', 20200419, 'hrr')), + # a file with a data error + ('path/b.csv', PathDetails(202017, 1, 'src_b', 'sig_b', 'week', 202016, 'msa')), + # emulate a file that's named incorrectly + ('path/c.csv', None) + ] def test_get_argument_parser(self): """Return a parser for command-line arguments.""" self.assertIsInstance(get_argument_parser(), argparse.ArgumentParser) - def _path_details(self): - return [ - # a good file - ('path/a.csv', ('src_a', 'sig_a', 'day', 'hrr', 20200419, 20200420, 1)), - # a file with a data error - ('path/b.csv', ('src_b', 'sig_b', 'week', 'msa', 202016, 202017, 1)), - # emulate a file that's named incorrectly - ('path/c.csv', None) - ] - def test_collect_files(self): + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.CsvImporter") + def test_collect_files(self, mock_csv_importer: MagicMock): """Scan the data directory.""" - mock_csv_importer = MagicMock() - mock_csv_importer.find_csv_files.return_value = self._path_details() - collect_files( - "fake_data_dir", - False, # no specific issue - csv_importer_impl=mock_csv_importer) + mock_csv_importer.find_csv_files.return_value = self._path_details + collect_files("fake_data_dir", False) # no specific issue self.assertEqual(mock_csv_importer.find_csv_files.call_count, 1) - - def test_upload_archive(self): + + + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.Database") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.CsvImporter") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.FileArchiver") + def test_upload_archive(self, mock_file_archiver: MagicMock, mock_csv_importer: MagicMock, mock_database: MagicMock): """Upload to the database, and archive.""" - def make_row(value): + def make_row(value: float, details: PathDetails): return MagicMock( + source=details.source, + signal=details.signal, + time_type=details.time_type, + geo_type=details.geo_type, + time_value=details.time_value, + issue=details.issue, + lag=details.lag, geo_value=value, value=value, stderr=value, sample_size=value, ) - def load_csv_impl(path, *args): + def load_csv_impl(path, details): if path == 'path/a.csv': # no validation errors - yield make_row('a1') - yield make_row('a2') - yield make_row('a3') + yield make_row('a1', details) + yield make_row('a2', details) + yield make_row('a3', details) elif path == 'path/b.csv': # one validation error - yield make_row('b1') + yield make_row('b1', details) yield None - yield make_row('b3') + yield make_row('b3', details) else: # fail the test for any other path raise Exception('unexpected path') @@ -72,20 +79,16 @@ def iter_len(l: Iterable) -> int: return len(list(l)) data_dir = 'data_dir' - mock_database = MagicMock() mock_database.insert_or_update_bulk = MagicMock(wraps=iter_len) - mock_csv_importer = MagicMock() mock_csv_importer.load_csv = load_csv_impl - mock_file_archiver = MagicMock() mock_logger = MagicMock() modified_row_count = upload_archive( - self._path_details(), + self._path_details, mock_database, - make_handlers(data_dir, False, - file_archiver_impl=mock_file_archiver), - mock_logger, - csv_importer_impl=mock_csv_importer) + make_handlers(data_dir, False), + mock_logger + ) self.assertEqual(modified_row_count, 3) # verify that appropriate rows were added to the database @@ -94,6 +97,7 @@ def iter_len(l: Iterable) -> int: actual_args = [[(a.source, a.signal, a.time_type, a.geo_type, a.time_value, a.geo_value, a.value, a.stderr, a.sample_size, a.issue, a.lag) for a in 
call.args[0]] for call in call_args_list] + expected_args = [ [('src_a', 'sig_a', 'day', 'hrr', 20200419, 'a1', 'a1', 'a1', 'a1', 20200420, 1), ('src_a', 'sig_a', 'day', 'hrr', 20200419, 'a2', 'a2', 'a2', 'a2', 20200420, 1), @@ -112,109 +116,97 @@ def iter_len(l: Iterable) -> int: ] self.assertEqual(actual_args, expected_args) - def test_main_successful(self): + + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.Database") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.upload_archive") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.collect_files") + def test_main_successful(self, mock_collect_files: MagicMock, mock_upload_archive: MagicMock, mock_database: MagicMock): """Run the main program successfully, then commit changes.""" # TODO: use an actual argparse object for the args instead of a MagicMock args = MagicMock(log_file=None, data_dir='data', specific_issue_date=False) - mock_database = MagicMock() - mock_database.count_all_rows.return_value = 0 - fake_database_impl = lambda: mock_database - mock_collect_files = MagicMock() + # `return_value` because we mocked the class constructor + mock_database.return_value.count_all_rows.return_value = 0 mock_collect_files.return_value = [("a",False)] - mock_upload_archive = MagicMock() - - main( - args, - database_impl=fake_database_impl, - collect_files_impl=mock_collect_files, - upload_archive_impl=mock_upload_archive) + main(args) self.assertTrue(mock_collect_files.called) self.assertEqual(mock_collect_files.call_args[0][0], 'data') self.assertTrue(mock_upload_archive.called) self.assertEqual(mock_upload_archive.call_args[0][0], [("a",False)]) - self.assertTrue(mock_database.connect.called) - self.assertTrue(mock_database.disconnect.called) - self.assertTrue(mock_database.disconnect.call_args[0][0]) + self.assertTrue(mock_database.return_value.connect.called) + self.assertTrue(mock_database.return_value.disconnect.called) + self.assertTrue(mock_database.return_value.disconnect.call_args[0][0]) + - def test_main_unsuccessful(self): + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.Database") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.upload_archive", side_effect=Exception('testing')) + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.collect_files") + def test_main_unsuccessful(self, mock_collect_files: MagicMock, mock_upload_archive: MagicMock, mock_database: MagicMock): """Run the main program with failure, then commit changes.""" # TODO: use an actual argparse object for the args instead of a MagicMock args = MagicMock(log_file=None, data_dir='data', specific_issue_date=False) - mock_database = MagicMock() - mock_database.count_all_rows.return_value = 0 - fake_database_impl = lambda: mock_database - mock_upload_archive = MagicMock(side_effect=Exception('testing')) - mock_collect_files = MagicMock() - mock_collect_files.return_value=[("a",False)] + mock_database.return_value.count_all_rows.return_value = 0 + mock_collect_files.return_value = [("a",False)] with self.assertRaises(Exception): - main( - args, - database_impl=fake_database_impl, - collect_files_impl=mock_collect_files, - upload_archive_impl=mock_upload_archive) + main(args) self.assertTrue(mock_upload_archive.called) self.assertEqual(mock_upload_archive.call_args[0][0], [("a",False)]) - self.assertTrue(mock_database.connect.called) - self.assertTrue(mock_database.disconnect.called) - self.assertTrue(mock_database.disconnect.call_args[0][0]) + 
self.assertTrue(mock_database.return_value.connect.called) + self.assertTrue(mock_database.return_value.disconnect.called) + self.assertTrue(mock_database.return_value.disconnect.call_args[0][0]) + - def test_main_early_exit(self): + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.Database") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.upload_archive") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.collect_files") + def test_main_early_exit(self, mock_collect_files: MagicMock, mock_upload_archive: MagicMock, mock_database: MagicMock): """Run the main program with an empty receiving directory.""" # TODO: use an actual argparse object for the args instead of a MagicMock args = MagicMock(log_file=None, data_dir='data', specific_issue_date=False) - mock_database = MagicMock() mock_database.count_all_rows.return_value = 0 - fake_database_impl = lambda: mock_database - mock_collect_files = MagicMock() mock_collect_files.return_value = [] - mock_upload_archive = MagicMock() - main( - args, - database_impl=fake_database_impl, - collect_files_impl=mock_collect_files, - upload_archive_impl=mock_upload_archive) + main(args) self.assertTrue(mock_collect_files.called) self.assertEqual(mock_collect_files.call_args[0][0], 'data') self.assertFalse(mock_upload_archive.called) - self.assertFalse(mock_database.connect.called) - self.assertFalse(mock_database.disconnect.called) + self.assertFalse(mock_database.return_value.connect.called) + self.assertFalse(mock_database.return_value.disconnect.called) + - def test_database_exception_is_handled(self): + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.Database") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.CsvImporter") + @patch("delphi.epidata.acquisition.covidcast.csv_to_database.FileArchiver") + def test_database_exception_is_handled(self, mock_file_archiver: MagicMock, mock_csv_importer: MagicMock, mock_database: MagicMock): """Gracefully handle database exceptions.""" data_dir = 'data_dir' - mock_database = MagicMock() mock_database.insert_or_update_bulk.side_effect = Exception('testing') - mock_csv_importer = MagicMock() mock_csv_importer.find_csv_files.return_value = [ - ('path/file.csv', ('src', 'sig', 'day', 'hrr', 20200423, 20200424, 1)), + ('path/file.csv', PathDetails(20200424, 1, 'src', 'sig', 'day', 20200423, 'hrr')), ] mock_csv_importer.load_csv.return_value = [ MagicMock(geo_value='geo', value=1, stderr=1, sample_size=1), ] - mock_file_archiver = MagicMock() mock_logger = MagicMock() upload_archive( - collect_files(data_dir, False, csv_importer_impl=mock_csv_importer), + collect_files(data_dir, False), mock_database, - make_handlers(data_dir, False, file_archiver_impl=mock_file_archiver), - mock_logger, - csv_importer_impl=mock_csv_importer, - ) + make_handlers(data_dir, False), + mock_logger + ) # verify that insertions were attempted self.assertTrue(mock_database.insert_or_update_bulk.called) diff --git a/tests/acquisition/covidcast_nowcast/test_load_sensors.py b/tests/acquisition/covidcast_nowcast/test_load_sensors.py index 0fe96ab17..9b0c5181a 100644 --- a/tests/acquisition/covidcast_nowcast/test_load_sensors.py +++ b/tests/acquisition/covidcast_nowcast/test_load_sensors.py @@ -9,6 +9,7 @@ import pandas as pd # first party +from delphi.epidata.acquisition.covidcast.csv_importer import PathDetails from delphi.epidata.acquisition.covidcast_nowcast.load_sensors import main, load_and_prepare_file # py3tester coverage target @@ -20,13 +21,15 @@ class 
UpdateTests(unittest.TestCase): @mock.patch('time.time', mock.MagicMock(return_value=12345)) def test_load_and_prepare_file(self): - test_attributes = ("test_source", - "test_signal", - "test_time_type", - "test_geo_type", - 20201231, - 20210102, - 3) + test_attributes = PathDetails( + 20210102, + 3, + "test_source", + "test_signal", + "test_time_type", + 20201231, + "test_geo_type", + ) test_df = load_and_prepare_file(StringIO("sensor_name,geo_value,value\ntestname,01001,1.5"), test_attributes) pd.testing.assert_frame_equal(test_df, diff --git a/tests/server/test_pandas.py b/tests/server/test_pandas.py index 083162a47..12a9c18cd 100644 --- a/tests/server/test_pandas.py +++ b/tests/server/test_pandas.py @@ -9,7 +9,6 @@ from delphi.epidata.server._pandas import as_pandas from delphi.epidata.server._config import MAX_RESULTS - # py3tester coverage target __test_target__ = "delphi.epidata.server._pandas" diff --git a/tests/server/test_validate.py b/tests/server/test_validate.py index ca45c78e2..22a4f153c 100644 --- a/tests/server/test_validate.py +++ b/tests/server/test_validate.py @@ -4,6 +4,8 @@ import unittest import base64 +from flask import request + # from flask.testing import FlaskClient from delphi.epidata.server._common import app from delphi.epidata.server._validate import ( @@ -34,74 +36,74 @@ def setUp(self): def test_resolve_auth_token(self): with self.subTest("no auth"): with app.test_request_context("/"): - self.assertIsNone(resolve_auth_token()) + self.assertIsNone(resolve_auth_token(request)) with self.subTest("param"): with app.test_request_context("/?auth=abc"): - self.assertEqual(resolve_auth_token(), "abc") + self.assertEqual(resolve_auth_token(request), "abc") with self.subTest("bearer token"): with app.test_request_context("/", headers={"Authorization": "Bearer abc"}): - self.assertEqual(resolve_auth_token(), "abc") + self.assertEqual(resolve_auth_token(request), "abc") with self.subTest("basic token"): userpass = base64.b64encode(b"epidata:abc").decode("utf-8") with app.test_request_context( "/", headers={"Authorization": f"Basic {userpass}"} ): - self.assertEqual(resolve_auth_token(), "abc") + self.assertEqual(resolve_auth_token(request), "abc") def test_check_auth_token(self): with self.subTest("no auth but optional"): with app.test_request_context("/"): - self.assertFalse(check_auth_token("abc", True)) + self.assertFalse(check_auth_token(request, "abc", True)) with self.subTest("no auth but required"): with app.test_request_context("/"): self.assertRaises( - ValidationFailedException, lambda: check_auth_token("abc") + ValidationFailedException, lambda: check_auth_token(request, "abc") ) with self.subTest("auth and required"): with app.test_request_context("/?auth=abc"): - self.assertTrue(check_auth_token("abc")) + self.assertTrue(check_auth_token(request, "abc")) with self.subTest("auth and required but wrong"): with app.test_request_context("/?auth=abc"): self.assertRaises( - UnAuthenticatedException, lambda: check_auth_token("def") + UnAuthenticatedException, lambda: check_auth_token(request, "def") ) with self.subTest("auth and required but wrong but optional"): with app.test_request_context("/?auth=abc"): - self.assertFalse(check_auth_token("def", True)) + self.assertFalse(check_auth_token(request, "def", True)) def test_require_all(self): with self.subTest("all given"): with app.test_request_context("/"): - self.assertTrue(require_all()) + self.assertTrue(require_all(request)) with app.test_request_context("/?abc=abc&def=3"): - 
self.assertTrue(require_all("abc", "def")) + self.assertTrue(require_all(request, "abc", "def")) with self.subTest("missing parameter"): with app.test_request_context("/?abc=abc"): self.assertRaises( - ValidationFailedException, lambda: require_all("abc", "def") + ValidationFailedException, lambda: require_all(request, "abc", "def") ) with self.subTest("missing empty parameter"): with app.test_request_context("/?abc=abc&def="): self.assertRaises( - ValidationFailedException, lambda: require_all("abc", "def") + ValidationFailedException, lambda: require_all(request, "abc", "def") ) def test_require_any(self): with self.subTest("default given"): with app.test_request_context("/"): - self.assertRaises(ValidationFailedException, lambda: require_any("abc")) + self.assertRaises(ValidationFailedException, lambda: require_any(request, "abc")) with self.subTest("one option give"): with app.test_request_context("/?abc=abc"): - self.assertTrue(require_any("abc", "def")) + self.assertTrue(require_any(request, "abc", "def")) with self.subTest("multiple options given"): with app.test_request_context("/?abc=abc&def=d"): - self.assertTrue(require_any("abc", "def")) + self.assertTrue(require_any(request, "abc", "def")) with self.subTest("one options given with is empty"): with app.test_request_context("/?abc="): - self.assertRaises(ValidationFailedException, lambda: require_any("abc")) + self.assertRaises(ValidationFailedException, lambda: require_any(request, "abc")) with self.subTest("one options given with is empty but ok"): with app.test_request_context("/?abc="): - self.assertTrue(require_any("abc", empty=True)) + self.assertTrue(require_any(request, "abc", empty=True))