-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgsr_data_report.R
More file actions
58 lines (52 loc) · 2.24 KB
/
gsr_data_report.R
File metadata and controls
58 lines (52 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(argparser)
library(AnvilDataModels)
library(readr)
# Command-line interface: every option is declared on the parser and the
# parsed values end up in `argv`.
parser <- arg_parser(description = "report")
parser <- add_argument(
  parser,
  arg = "--data_file",
  help = "tsv file with data"
)
parser <- add_argument(
  parser,
  arg = "--dd_file",
  help = "json file with GSR data dictionary"
)
parser <- add_argument(
  parser,
  arg = "--dd_table_name",
  help = "name of data dictionary table in dd_file"
)
parser <- add_argument(
  parser,
  arg = "--analysis_file",
  help = "tsv file with analysis table"
)
parser <- add_argument(
  parser,
  arg = "--stop_on_fail",
  help = "return an error code if data_file does not pass checks",
  flag = TRUE
)
argv <- parse_args(parser)

# Hard-coded arguments for running the script by hand (uncomment to use):
# argv <- list(data_file="testdata/gsr_chr1.tsv",
#              dd_file="testdata/gsr_data_model.json",
#              dd_table_name="gsr_files_dd",
#              analysis_file="output_analysis_table.tsv")
# Load the data dictionary and make sure it defines the requested table.
dd_table_name <- argv$dd_table_name
dd <- json_to_dm(argv$dd_file)
stopifnot(dd_table_name %in% names(dd))

# Only the first 1000 rows are read (lines starting with "#" are skipped);
# that sample is what gets checked against the expected column types.
# The result is wrapped in a one-element list named after the dictionary table.
dat <- setNames(
  list(read_tsv(argv$data_file, n_max = 1000, comment = "#")),
  dd_table_name
)
# Read the analysis table (when one was supplied) and use it to resolve the
# data dictionary's conditional requirements before validation.
if (!is.na(argv$analysis_file)) {
  # Every column is read as character so values compare as plain strings.
  analysis <- read_tsv(
    argv$analysis_file,
    col_types = cols(.default = col_character())
  )

  # The "conditions" attribute is named by data-dictionary column; each entry
  # is parsed into a `column` of the analysis table and the `value` it must
  # hold (presumably a "column = value" rule -- confirm against
  # AnvilDataModels).
  cond <- attr(dd[[dd_table_name]], "conditions")
  req <- character()
  for (cond_col in names(cond)) {
    p <- AnvilDataModels:::.parse_condition(cond[[cond_col]])
    # any(x %in% y) always yields a single non-NA logical. A bare `==` inside
    # if() fails when the analysis column is absent (NULL), holds NA, or the
    # table has more than one row.
    if (any(analysis[[p$column]] %in% p$value)) {
      req <- c(req, cond_col)
    }
  }

  if (length(req) > 0) {
    req <- unique(c(attr(dd[[dd_table_name]], "required"), req))
    # can't update attributes on a dm object, so modify a copy of the table
    # and rebuild the dm from it
    tmp <- dd[[dd_table_name]]
    attr(tmp, "required") <- req
    # remove conditions so columns aren't listed twice in check
    attr(tmp, "conditions") <- character()
    tmp <- list(tmp)
    names(tmp) <- dd_table_name
    dd <- dm::as_dm(tmp)
  }
}
# Render the validation report from the sampled data and the (possibly
# updated) data model; the return value is kept as the pass/fail status.
params <- list(
  tables = dat,
  model = dd
)
pass <- custom_render_markdown(
  "data_dictionary_report",
  "data_dictionary_validation",
  parameters = params
)

# Record the status as lower-case text in pass.txt.
pass_text <- tolower(as.character(pass))
writeLines(pass_text, "pass.txt")

# With --stop_on_fail, a failed check ends the script with an error.
if (argv$stop_on_fail) {
  if (!pass) {
    stop("data file not compatible with data model; see data_dictionary_validation.html")
  }
}