Skip to content

Commit 5662a79

Browse files
committed
Add additional tests and testdata
1 parent e503e97 commit 5662a79

13 files changed

+28706
-4410
lines changed

src/acquisition/rvdss/pull_historic.py

+70-35
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def extract_captions_of_interest(soup):
105105
The captions from the 'summary' tag require less parsing, but sometimes they
106106
are missing. In that case, use the figure captions
107107
"""
108-
captions = soup.findAll('summary')
108+
captions = soup.find_all('summary')
109109

110110
table_identifiers = ["respiratory","number","positive","abbreviation"]
111111

@@ -321,6 +321,34 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
321321

322322
return(table)
323323

324+
def fix_edge_cases(table,season,caption,current_week):
    """Manually repair known one-off formatting errors in a report table.

    A handful of historic report tables contain typos (bad headers or
    malformed dates) that cause errors later in processing, so they are
    patched here by hand.

    Args:
        table: pandas DataFrame for one table of a weekly report, with
            lowercase column names.
        season: Either the start year of the season as a string
            (e.g. ``'2017'``) or a sequence whose first element is that
            start-year string (e.g. ``['2017', '2018']``).
        caption: Object exposing a ``.text`` attribute holding the table's
            caption; used to tell which virus the table is about.
        current_week: Week number of the report being processed.

    Returns:
        The corrected table. Some fixes modify ``table`` in place and some
        return a new DataFrame, so callers should always use the return value.
    """
    # Accept both the full season sequence and a bare start-year string.
    # Indexing a bare string such as '2017' with [0] would yield '2', which
    # never matches any year below and would silently disable every fix.
    start_year = season if isinstance(season, str) else season[0]

    if start_year == '2017':
        if current_week == 35:
            caption_text = caption.text.lower()
            if "entero" in caption_text:
                # The positive enterovirus table in week 35 of the 2017-2018
                # season has French in the headers, so the French needs to be
                # removed
                table.columns = ['week', 'week end', 'canada tests', 'entero/rhino%', 'at tests',
                                 'entero/rhino%.1', 'qc tests', 'entero/rhino%.2', 'on tests',
                                 'entero/rhino%.3', 'pr tests', 'entero/rhino%.4', 'bc tests',
                                 'entero/rhino%.5']
            elif "adeno" in caption_text:
                # In week 35 of the 2017-2018 season, the positive adenovirus
                # table has ">week end" instead of "week end", so remove the
                # ">" from the column name
                table = table.rename(columns={'>week end':"week end"})
        elif current_week == 47 and "rsv" in caption.text.lower():
            # In week 47 of the 2017-2018 season, a date is written as
            # 201-11-25 instead of 2017-11-25
            table.loc[table['week'] == 47, 'week end'] = "2017-11-25"
    elif start_year == '2015' and current_week == 41:
        # In week 41 of the 2015-2016 season, a date is written in m-d-y
        # format instead of d-m-y
        table = table.replace("10-17-2015","17-10-2015",regex=True)
    elif start_year == '2022' and current_week == 11 and "hmpv" in caption.text.lower():
        # In week 11 of the 2022-2023 season, in the positive hmpv table,
        # a date is written as 022-09-03 instead of 2022-09-03
        table.loc[table['week'] == 35, 'week end'] = "2022-09-03"
    return table
351+
324352
def fetch_one_season_from_report(url):
325353
# From the url, go to the main landing page for a season
326354
# which contains all the links to each week in the season
@@ -397,33 +425,35 @@ def fetch_one_season_from_report(url):
397425
# Make column names lowercase
398426
table.columns=table.columns.str.lower()
399427

400-
# One-off edge cases where tables need to be manually adjusted because
401-
# they will cause errors otherwise
402-
if season[0] == '2017':
403-
if current_week == 35 and "entero" in caption.text.lower():
404-
# The positive enterovirus table in week 35 of the 2017-2018 season has french
405-
# in the headers,so the french needs to be removed
406-
table.columns = ['week', 'week end', 'canada tests', 'entero/rhino%', 'at tests',
407-
'entero/rhino%.1', 'qc tests', 'entero/rhino%.2', 'on tests',
408-
'entero/rhino%.3', 'pr tests', 'entero/rhino%.4', 'bc tests',
409-
'entero/rhino%.5']
410-
elif current_week == 35 and "adeno" in caption.text.lower():
411-
# In week 35 of the 2017-2018, the positive adenovirus table has ">week end"
412-
# instead of "week end", so remove > from the column
413-
table = table.rename(columns={'>week end':"week end"})
414-
elif current_week == 47 and "rsv" in caption.text.lower():
415-
# In week 47 of the 2017-2018 season, a date is written as 201-11-25,
416-
# instead of 2017-11-25
417-
table.loc[table['week'] == 47, 'week end'] = "2017-11-25"
418-
elif season[0] == '2015' and current_week == 41:
419-
# In week 41 of the 2015-2016 season, a date written in m-d-y format not d-m-y
420-
table=table.replace("10-17-2015","17-10-2015",regex=True)
421-
elif season[0] == '2022' and current_week == 11 and "hmpv" in caption.text.lower():
422-
# In week 11 of the 2022-2023 season, in the positive hmpv table,
423-
# a date is written as 022-09-03, instead of 2022-09-03
424-
table.loc[table['week'] == 35, 'week end'] = "2022-09-03"
425-
426-
# check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is always empty
428+
# # One-off edge cases where tables need to be manually adjusted because
429+
# # they will cause errors otherwise
430+
# if season[0] == '2017':
431+
# if current_week == 35 and "entero" in caption.text.lower():
432+
# # The positive enterovirus table in week 35 of the 2017-2018 season has french
433+
# # in the headers,so the french needs to be removed
434+
# table.columns = ['week', 'week end', 'canada tests', 'entero/rhino%', 'at tests',
435+
# 'entero/rhino%.1', 'qc tests', 'entero/rhino%.2', 'on tests',
436+
# 'entero/rhino%.3', 'pr tests', 'entero/rhino%.4', 'bc tests',
437+
# 'entero/rhino%.5']
438+
# elif current_week == 35 and "adeno" in caption.text.lower():
439+
# # In week 35 of the 2017-2018, the positive adenovirus table has ">week end"
440+
# # instead of "week end", so remove > from the column
441+
# table = table.rename(columns={'>week end':"week end"})
442+
# elif current_week == 47 and "rsv" in caption.text.lower():
443+
# # In week 47 of the 2017-2018 season, a date is written as 201-11-25,
444+
# # instead of 2017-11-25
445+
# table.loc[table['week'] == 47, 'week end'] = "2017-11-25"
446+
# elif season[0] == '2015' and current_week == 41:
447+
# # In week 41 of the 2015-2016 season, a date written in m-d-y format not d-m-y
448+
# table=table.replace("10-17-2015","17-10-2015",regex=True)
449+
# elif season[0] == '2022' and current_week == 11 and "hmpv" in caption.text.lower():
450+
# # In week 11 of the 2022-2023 season, in the positive hmpv table,
451+
# # a date is written as 022-09-03, instead of 2022-09-03
452+
# table.loc[table['week'] == 35, 'week end'] = "2022-09-03"
453+
454+
table = fix_edge_cases(table, season[0], caption, current_week)
455+
456+
# check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is always empty
427457
table = drop_ah1_columns(table)
428458

429459
# Rename columns
@@ -490,15 +520,20 @@ def fetch_one_season_from_report(url):
490520
# If not, add the weeks tables into the season table
491521

492522
# check for deduplication pandas
493-
if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
494-
all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
523+
all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
524+
all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
525+
if not number_detections_table.index.isin(all_number_tables.index).any():
526+
all_number_tables=pd.concat([all_number_tables,number_detections_table])
527+
528+
# if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
529+
# all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
495530

496-
if not combined_positive_tables.index.isin(all_positive_tables.index).any():
497-
all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
531+
# if not combined_positive_tables.index.isin(all_positive_tables.index).any():
532+
# all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
498533

499-
if number_table_exists:
500-
if not number_detections_table.index.isin(all_number_tables.index).any():
501-
all_number_tables=pd.concat([all_number_tables,number_detections_table])
534+
# if number_table_exists:
535+
# if not number_detections_table.index.isin(all_number_tables.index).any():
536+
# all_number_tables=pd.concat([all_number_tables,number_detections_table])
502537

503538
return {
504539
"respiratory_detection": all_respiratory_detection_tables,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-24 (week ending June 15, 2024)","date": "2024-06-20", "path": "/respiratory-virus-detections/archive/2024-06-20/"},
2+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-24 (La semaine se terminant le 15 juin 2024)","date": "2024-06-20", "path": "/detections-virus-respiratoires/archive/2024-06-20/"},
3+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-25 (week ending June 22, 2024)","date": "2024-06-27", "path": "/respiratory-virus-detections/archive/2024-06-27/"},
4+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-25 (La semaine se terminant le 22 juin 2024)","date": "2024-06-27", "path": "/detections-virus-respiratoires/archive/2024-06-27/"},
5+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-26 (week ending June 29, 2024)","date": "2024-07-04", "path": "/respiratory-virus-detections/archive/2024-07-04/"},
6+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-26 (La semaine se terminant le 29 juin 2024)","date": "2024-07-04", "path": "/detections-virus-respiratoires/archive/2024-07-04/"},
7+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-27 (week ending July 6, 2024)","date": "2024-07-11", "path": "/respiratory-virus-detections/archive/2024-07-11/"},
8+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-27 (La semaine se terminant le 6 juillet 2024)","date": "2024-07-11", "path": "/detections-virus-respiratoires/archive/2024-07-11/"},
9+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-28 (week ending July 13, 2024)","date": "2024-07-18", "path": "/respiratory-virus-detections/archive/2024-07-18/"},
10+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-28 (La semaine se terminant le 13 juillet 2024)","date": "2024-07-18", "path": "/detections-virus-respiratoires/archive/2024-07-18/"},
11+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-29 (week ending July 20, 2024)","date": "2024-08-01", "path": "/respiratory-virus-detections/archive/2024-08-01/"},
12+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-29 (La semaine se terminant le 20 juillet 2024)","date": "2024-08-01", "path": "/detections-virus-respiratoires/archive/2024-08-01/"},
13+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-30 (week ending July 27, 2024)","date": "2024-08-08", "path": "/respiratory-virus-detections/archive/2024-08-08/"},
14+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-30 (La semaine se terminant le 27 juillet 2024)","date": "2024-08-08", "path": "/detections-virus-respiratoires/archive/2024-08-08/"},
15+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-31 (week ending August 3, 2024)","date": "2024-08-15", "path": "/respiratory-virus-detections/archive/2024-08-15/"},
16+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-31 (La semaine se terminant le 3 août 2024)","date": "2024-08-15", "path": "/detections-virus-respiratoires/archive/2024-08-15/"},
17+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-32 (week ending August 10, 2024)","date": "2024-08-22", "path": "/respiratory-virus-detections/archive/2024-08-22/"},
18+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-32 (La semaine se terminant le 10 août 2024)","date": "2024-08-22", "path": "/detections-virus-respiratoires/archive/2024-08-22/"},
19+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-33 (week ending August 17, 2024)","date": "2024-08-29", "path": "/respiratory-virus-detections/archive/2024-08-29/"},
20+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-33 (La semaine se terminant le 17 août 2024)","date": "2024-08-29", "path": "/detections-virus-respiratoires/archive/2024-08-29/"},
21+
{"prod": "VS", "lang": "en", "Surveillance_Week":"2024-34 (week ending August 24, 2024)","date": "2024-09-05", "path": "/respiratory-virus-detections/archive/2024-09-05/"},
22+
{"prod": "VS", "lang": "fr", "Surveillance_Week":"2024-34 (La semaine se terminant le 24 août 2024)","date": "2024-09-05", "path": "/detections-virus-respiratoires/archive/2024-09-05/"}]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2024-06-20
2+
2024-06-27
3+
2024-07-04
4+
2024-07-11
5+
2024-07-18
6+
2024-08-01
7+
2024-08-08
8+
2024-08-15
9+
2024-08-22
10+
2024-08-29
11+
2024-09-05
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
epiweek,time_value,issue,geo_type,geo_value,flu_tests,fluah1n1pdm09_positive_tests,fluah3_positive_tests,fluauns_positive_tests,flua_positive_tests,flub_positive_tests,rsv_tests,rsv_positive_tests,hpiv_tests,hpiv1_positive_tests,hpiv2_positive_tests,hpiv3_positive_tests,hpiv4_positive_tests,hpivother_positive_tests,adv_tests,adv_positive_tests,hmpv_tests,hmpv_positive_tests,evrv_tests,evrv_positive_tests,hcov_tests,hcov_positive_tests
2+
201703,2017-01-21,2017-01-25,lab,nl,87,0,4,4,8,0,87,7,87,0,1,1,0,0,87,0,87,3,87,1,NA,NA
3+
201703,2017-01-21,2017-01-25,lab,pe,58,0,17,0,17,0,58,6,2,0,0,0,0,0,2,0,2,0,2,0,2,1
4+
201703,2017-01-21,2017-01-25,lab,ns,119,0,0,12,12,0,109,18,20,0,0,1,0,0,20,0,20,0,20,1,20,7
5+
201703,2017-01-21,2017-01-25,lab,nb,247,0,0,43,43,0,249,33,57,0,0,1,0,0,57,2,57,1,57,5,57,7
6+
201703,2017-01-21,2017-01-25,region,atlantic,511,0,21,59,80,0,503,64,166,0,1,3,0,0,166,2,166,4,166,7,79,15
7+
201703,2017-01-21,2017-01-25,lab,région nord est,330,0,0,18,18,0,293,62,0,0,0,0,0,0,0,0,0,0,NA,NA,0,0
8+
201703,2017-01-21,2017-01-25,lab,québec chaudière appalaches,608,0,0,93,93,2,380,83,139,0,0,5,0,0,140,2,108,0,NA,NA,108,0
9+
201703,2017-01-21,2017-01-25,lab,centre du québec,659,0,49,102,151,4,490,118,0,0,0,0,0,0,0,0,0,0,NA,NA,0,0
10+
201703,2017-01-21,2017-01-25,lab,montréal laval,1653,0,0,300,300,6,1101,167,632,1,2,3,3,0,644,24,613,28,NA,NA,549,27
11+
201703,2017-01-21,2017-01-25,lab,ouest du québec,403,0,0,75,75,2,147,33,0,0,0,0,0,0,0,0,0,0,NA,NA,0,0
12+
201703,2017-01-21,2017-01-25,lab,montérégie,342,0,0,69,69,1,285,49,0,0,0,0,0,0,0,0,0,0,NA,NA,0,0
13+
201703,2017-01-21,2017-01-25,region,qc,3995,0,49,657,706,15,2696,512,771,1,2,8,3,0,784,26,721,28,NA,NA,657,27
14+
201703,2017-01-21,2017-01-25,lab,ottawa phl,64,0,14,0,14,0,64,7,64,0,0,1,0,0,64,0,64,9,64,1,64,16
15+
201703,2017-01-21,2017-01-25,lab,cheo ottawa,304,0,0,42,42,0,304,60,29,0,0,0,0,0,29,0,51,3,29,1,29,2
16+
201703,2017-01-21,2017-01-25,lab,kingston phl,76,0,17,0,17,0,76,11,76,1,4,2,0,0,76,1,74,4,76,3,74,11
17+
201703,2017-01-21,2017-01-25,lab,uhn mount sinai hospital,433,0,0,71,71,0,433,24,18,0,0,0,0,0,0,0,18,0,0,0,0,0
18+
201703,2017-01-21,2017-01-25,lab,phol toronto,1559,2,613,1,616,3,1174,102,1174,0,2,7,2,0,1174,9,1169,36,1174,28,1169,98
19+
201703,2017-01-21,2017-01-25,lab,sick kids hospital toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20+
201703,2017-01-21,2017-01-25,lab,sunnybrook womens college hsc,127,0,12,10,22,0,127,10,127,0,1,1,0,0,127,0,127,4,127,1,127,12
21+
201703,2017-01-21,2017-01-25,lab,sault ste marie phl,24,0,4,0,4,0,24,5,24,0,0,2,0,0,24,0,24,1,24,0,24,3
22+
201703,2017-01-21,2017-01-25,lab,timmins phl,14,0,1,0,1,0,14,3,14,0,0,1,0,0,14,0,14,1,14,1,14,2
23+
201703,2017-01-21,2017-01-25,lab,st josephs london,100,0,0,16,16,0,100,6,18,0,1,0,0,0,0,0,18,2,0,0,0,0
24+
201703,2017-01-21,2017-01-25,lab,london phl,399,2,100,2,104,1,399,65,399,0,2,8,4,0,399,6,397,17,399,13,397,57
25+
201703,2017-01-21,2017-01-25,lab,orillia phl,228,0,38,0,38,0,227,42,227,0,1,3,1,0,227,5,226,8,227,15,226,22
26+
201703,2017-01-21,2017-01-25,lab,thunder bay phl,36,0,7,0,7,0,32,6,32,0,0,0,0,0,32,0,32,1,32,2,32,6
27+
201703,2017-01-21,2017-01-25,lab,sudbury phl,62,0,7,0,7,0,62,6,62,0,0,1,0,0,62,0,60,0,62,4,60,5
28+
201703,2017-01-21,2017-01-25,lab,hamilton phl,201,1,54,2,57,3,190,32,190,0,0,2,0,0,190,3,186,6,190,13,186,17
29+
201703,2017-01-21,2017-01-25,lab,peterborough phl,139,0,31,1,32,0,139,21,139,0,0,4,0,0,139,0,139,3,139,9,139,9
30+
201703,2017-01-21,2017-01-25,region,on,3766,5,898,145,1048,7,3365,400,2593,1,11,32,7,0,2557,24,2599,95,2557,91,2541,260
31+
201703,2017-01-21,2017-01-25,lab,mb,333,0,7,11,18,0,331,92,55,0,0,1,0,0,55,0,44,2,55,1,44,11
32+
201703,2017-01-21,2017-01-25,lab,regina,536,0,101,10,111,0,536,113,536,0,0,13,3,0,536,9,536,7,536,28,536,50
33+
201703,2017-01-21,2017-01-25,lab,saskatoon,217,0,0,32,32,1,217,40,37,0,0,1,0,0,37,1,37,0,37,2,37,0
34+
201703,2017-01-21,2017-01-25,lab,sk,753,0,101,42,143,1,753,153,573,0,0,14,3,0,573,10,573,7,573,30,573,50
35+
201703,2017-01-21,2017-01-25,lab,ab,1336,0,202,48,250,4,1336,185,1336,33,0,0,0,0,1336,9,1336,10,1336,30,1336,30
36+
201703,2017-01-21,2017-01-25,region,prairies,2422,0,310,101,411,5,2420,430,1964,33,0,15,3,0,1964,19,1953,19,1964,61,1953,91
37+
201703,2017-01-21,2017-01-25,region,bc,1360,0,55,362,417,20,1360,149,277,2,3,11,6,0,277,2,277,22,277,30,277,27
38+
201703,2017-01-21,2017-01-25,lab,yt,27,0,2,4,6,0,27,7,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
39+
201703,2017-01-21,2017-01-25,lab,nt,32,0,7,1,8,0,32,6,32,1,0,0,0,0,32,1,32,0,32,1,32,3
40+
201703,2017-01-21,2017-01-25,lab,nu,44,0,8,6,14,0,44,2,44,0,3,0,0,0,44,2,44,0,44,16,44,4
41+
201703,2017-01-21,2017-01-25,region,territories,103,0,17,11,28,0,103,15,76,1,3,0,0,0,76,3,76,0,76,17,76,7
42+
201703,2017-01-21,2017-01-25,nation,ca,12157,5,1350,1335,2690,47,10447,1570,5847,38,20,69,19,0,5824,76,5792,168,5040,206,5583,427

0 commit comments

Comments
 (0)