-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsubgroupRisk.R
324 lines (256 loc) · 11.2 KB
/
subgroupRisk.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
#' Summarise relative risk
#'
#' Calculate the average relative risk across individuals in a subgroup,
#' or calculate the subgroup specific attributable fraction based on the current relative risks.
#'
#' Attributable fractions are calculated using the method as in Bellis & Jones 2014, which is also equivalent to the
#' method described in the Brennan et al. 2015 SAPM mathematical description paper.
#'
#' @param data A data table of individual characteristics.
#' @param label Character - a label to append to the outcome variable to help identify it in later calculations.
#' @param disease_names Character vector - the names of the diseases for which summaries of relative risk are required.
#' @param af Logical - if TRUE, then attributable fractions are calculated. If FALSE, then the total relative risk
#' is calculated. Defaults to FALSE.
#' @param use_weights Logical - should the calculation account for survey weights. Defaults to FALSE.
#' Weight variable must be called "wt_int".
#' @param year_range Either an integer vector of the years to be selected or "all". Defaults to "all".
#' @param pool Logical - should the years selected be pooled. Defaults to FALSE.
#' @param subgroups Character vector - the variable names of the subgroups used to stratify the estimates.
#' @param mort_or_morb Character - for alcohol related diseases that have separate
#' relative risk curves for mortality and morbidity, should the risks corresponding to
#' mortality ("mort") or morbidity ("morb") be used.
#' @param alc_mort_and_morb Character vector of the names of the
#' alcohol related diseases that have separate risk functions for
#' mortality and morbidity.
#' @param substance Whether to compute relative risks for just alcohol ("alc"),
#' just tobacco ("tob") or joint risks for tobacco and alcohol ("tobalc").
#' @param smooth Logical - should the age patterns in average risk be smoothed with a moving average.
#' For use only if average risk is stratified by single years of age. Defaults to FALSE
#' @param oesoph_subtypes Logical - should the attributable fractions for oesophageal cancer
#' be multiplied by the proportions of each subtype. Defaults to FALSE.
#'
#' @return Returns a data table containing the subgroup specific summaries for each disease.
#'
#' @importFrom data.table := setDT setnames copy .SD .N
#'
#' @export
#'
#'
#' @examples
#' \dontrun{
#' # Simulate individual data
#'
#' # Using the parameters for the Gamma distribution from Kehoe et al. 2012
#' n <- 1e4
#' grams_ethanol_day <- rgamma(n, shape = 0.69, scale = 19.03)
#'
#' data <- data.table(
#' year = 2016,
#' weekmean = grams_ethanol_day * 7 / 8,
#' peakday = 2 * grams_ethanol_day / 8,
#' age = rpois(n, 30),
#' sex = sample(x = c("Male", "Female"), size = n, replace = T),
#' income5cat = "1_lowest income",
#' imd_quintile = "5_most_deprived",
#' kids = "0",
#' social_grade = "C2DE",
#' eduend4cat = "16-18", # age finished education
#' ethnic2cat = "white", # white / non-white
#' employ2cat = "yes", # employed / not
#' wtval = rnorm(n, mean = 60, sd = 5), # weight in kg
#' htval = rnorm(n, mean = 1.7, sd = .1) # height in m
#' )
#'
#' # Disease names
#' alc_disease_names <- c(
#' "Pharynx",
#' "Ischaemic_heart_disease",
#' "LiverCirrhosis",
#' "Transport_injuries",
#' "Alcohol_poisoning",
#' "Alcoholic_gastritis"
#' )
#'
#' # Run basic function without alcohol lags
#' test_data <- RRFunc(
#' data = copy(data),
#' substance = "alc",
#' alc_diseases = alc_disease_names,
#' alc_wholly_chronic_thresholds = c(2, 2),
#' alc_wholly_acute_thresholds = c(3, 3),
#' show_progress = TRUE
#' )
#'
#' # Calculate alcohol attributable fractions
#' test_aafs <- subgroupRisk(
#' data = test_data$data_plus_rr,
#' disease_names = alc_disease_names,
#' af = TRUE,
#' subgroups = "sex"
#' )
#'
#' test_aafs
#' }
subgroupRisk <- function(
data,
label = NULL,
disease_names = c("Pharynx", "Oral_cavity"),
af = FALSE,
use_weights = FALSE,
year_range = "all",
pool = FALSE,
subgroups = c("sex", "age_cat"),
mort_or_morb = "mort",
alc_mort_and_morb = c("Ischaemic_heart_disease", "LiverCirrhosis", "Haemorrhagic_Stroke", "Ischaemic_Stroke"),
substance = c("alc", "tob", "tobalc")[3],
smooth = FALSE,
oesoph_subtypes = FALSE
) {
out <- copy(data) # see if this can be removed from code as inefficient
# To switch-out the mortality risk functions for the morbidity risk functions if necessary
if(substance %fin% c("alc", "tobalc") & mort_or_morb == "morb" & !is.null(alc_mort_and_morb)) {
out[ , (alc_mort_and_morb) := NULL]
for(i in alc_mort_and_morb) {
setnames(out, paste0(i, "_morb"), stringr::str_remove(i, "_morb"))
}
}
if("age_cat" %in% subgroups & !("age_cat" %in% colnames(out))) {
out[ , age_cat := c("12-15", "16-17", "18-24", "25-34", "35-49", "50-64", "65-74", "75-89")[findInterval(age, c(-10, 16, 18, 25, 35, 50, 65, 75))]]
}
# To select a specified range of years of data
if(year_range[1] != "all") {
out <- out[year %in% year_range]
}
# If several years of data are selected, should they be pooled
if(pool == T) {
out[ , year := 1]
}
# Create a weighting variable
# depending on whether survey weights are to be used or not
if(use_weights == F) {
out[, weight := 1 / .N, by = c(subgroups, "year")]
} else {
out[, weight := wt_int / sum(wt_int, na.rm = T), by = c(subgroups, "year")]
}
# List of diseases for which absolute rather than relative risk is used
# These are all the wholly attributable acute and chronic conditions for alcohol
abs_diseases <- c(
"Excessive_Blood_Level_of_Alcohol",
"Toxic_effect_of_alcohol",
"Alcohol_poisoning",
"Evidence_of_alcohol_involvement_determined_by_blood_alcohol_level",
"Acute_intoxication",
"Alcoholic_cardiomyopathy",
"Alcoholic_gastritis",
"Alcoholic_liver_disease",
"Acute_pancreatitis_alcohol_induced",
"Chronic_pancreatitis_alcohol_induced",
"Alcohol_induced_pseudoCushings_syndrome",
"Alcoholic_myopathy",
"Alcoholic_polyneuropathy",
"Maternal_care_for_suspected_damage_to_foetus_from_alcohol",
"Degeneration",
"Mental_and_behavioural_disorders_due_to_use_of_alcohol")
# Standardise the relative risks by subtracting 1 and multiplying by the weight
for (d in disease_names) {
kn <- NROW(out[is.na(get(d)) | is.infinite(get(d))])
testthat::expect_equal(kn, 0, info = paste0("subgroupRisk: NA or infinite values in risk values for ", d, " input into function"))
kn <- NROW(out[get(d) < 0])
testthat::expect_equal(kn, 0, info = paste0("subgroupRisk: negative values in risk values for ", d, " input into function"))
# NEEDS MORE CHECKING TO ENSURE CONSISTENT WITH THE MATHEMATICS
if(isTRUE(af)) {
# For PAF
out[, (paste0(d, "_z")) := weight * (get(d) - 1)]
#out[, (paste0(d, "_z")) := weight * (get(d))]
} else {
# For PIF
out[, (paste0(d, "_z")) := weight * get(d)]
}
}
############################################################
# To prepare for subsequent computation of a PIF
# compute the average relative risk within a subgroup
# compute the average rather than the total, so that when we later calculate the ratio
# of this aggregated relative risk between treatment and control arms,
# the ratio is not influenced by differences in the number of individuals
# i.e. we want to calculate the ratio of the expected value of individual risk in each arm
if(!isTRUE(af)) {
if(!("year" %in% subgroups) & "year" %in% colnames(out)) {
subgroups <- c(subgroups, "year")
}
# calculate average relative risk
out_risk <- out[,
lapply(.SD, function(x) {
sum(x, na.rm = T)
}),
by = subgroups,
.SDcols = paste0(disease_names, "_z")]
# if(any(disease_names %in% colnames(out))) {
# out[ , (disease_names[disease_names %in% colnames(out)]) := NULL]
# }
setnames(out_risk, paste0(disease_names, "_z"), disease_names)
out_risk <- melt(
out_risk,
id.vars = subgroups,
variable.name = "condition",
value.name = paste0("av_risk_", label)
)
kn <- NROW(out_risk[is.na(get(paste0("av_risk_", label))) | is.infinite(get(paste0("av_risk_", label)))])
testthat::expect_equal(kn, 0, info = paste0("subgroupRisk: NA or infinite values in average risk values for ", d, " after calculation"))
kn <- NROW(out_risk[get(paste0("av_risk_", label)) < 0])
testthat::expect_equal(kn, 0, info = paste0("subgroupRisk: negative values in average risk values for ", d, " after calculation"))
# If the average relative risks should be smoothed over age
# for use when using single years of age
if(isTRUE(smooth)) {
out_risk[ , (paste0("av_risk_", label)) := {
kn <- 7 # the width in years of the moving average age window
z <- TTR::SMA(get(paste0("av_risk_", label)), n = kn)
z <- z[!is.na(z)]
z[z < 0] <- 0
c(rep(z[1], kn - 1), z)
}
, by = c(subgroups[subgroups != "age"], "condition")]
}
}
############################################################
# For attributable fractions
if(isTRUE(af)) {
if(!("year" %in% subgroups) & "year" %in% colnames(out)) {
subgroups <- c(subgroups, "year")
}
# calculate attributable fractions, considering residual risk in former smokers
out_risk <- out[,
lapply(.SD, function(x) {
sum(x, na.rm = T) / (sum(x, na.rm = T) + 1)
}),
by = subgroups,
.SDcols = paste0(disease_names, "_z")]
# if(any(disease_names %in% colnames(out_risk))) {
# out_risk[ , (disease_names[disease_names %in% colnames(out_risk)]) := NULL]
# }
##########################
if(oesoph_subtypes == TRUE) {
# Summarise the proportion of oesophageal cancer cases that are scc
# in the same way as the above attributable fractions
out_scc_prop <- out[ , .(prop_scc = sum(weight * prop_scc, na.rm = T) / sum(weight, na.rm = T)), by = subgroups]
# Merge in the proportions of scc into the paf dataset
out_risk <- merge(out_risk, out_scc_prop, all.x = T, all.y = F, by = subgroups)
# Multiple the estimated attributable fractions for Oesophageal Scc and AC by the proportions of each subtype
out_risk[ , Oesophageal_SCC_z := Oesophageal_SCC_z * prop_scc]
if(substance %fin% c("tob")) {
out_risk[ , Oesophageal_AC_z := Oesophageal_AC_z * (1 - prop_scc)]
}
}
###########################
setnames(out_risk, paste0(disease_names, "_z"), disease_names)
out_risk <- melt(
out_risk,
id.vars = subgroups,
variable.name = "condition",
value.name = "af"
)
# Set the AAF = 1 for wholly attributable conditions
out_risk[condition %in% abs_diseases, af := 1]
}
return(out_risk)
}