add stapm version of alcohol binge parameters for Scotland

STAPM · Dec 18, 2022 · 02f23f4 · 02f23f4
1 parent 816dcf7
commit 02f23f4
Show file tree

Hide file tree

Showing 15 changed files with 492 additions and 54 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,6 @@ dev/
 .DS_Store
 *.pdf
 data-raw/binge_params/*.csv
+.rds
+data-raw/binge_params/Scotland/*.rds
+
diff --git a/R/binge_params.R b/R/binge_params.R
@@ -1,5 +1,5 @@
 
-#' Parameters to estimate amount drunk on single occasions - STAPM version
+#' Parameters to estimate amount drunk on single occasions - England - STAPM version
 #' 
 #' As our starting point we use the parameter estimates from Hill-McManus et al 2014 - stored within the `tobalcepi` package as the data object `binge_params`. 
 #' The problem with using these parameters directly in STAPM is that STAPM does not model the individual life-course trajectories of 
@@ -17,14 +17,25 @@
 #'
 #' @source Hill-McManus et al 2014. "Estimation of usual occasion-based individual drinking patterns using diary survey data". https://doi.org/https://doi.org/10.1016/j.drugalcdep.2013.09.022. 
 #'
+"binge_params_stapm"
+
+#' Parameters to estimate amount drunk on single occasions - Scotland - STAPM version
+#' 
+#' Scottish version of `binge_params_stapm` based on the Scottish Health Survey years 2008 to 2019.   
 #'
+#' @docType data
 #'
+#' @format A list of four data tables (1 = Negative binomial regression model for the number of weekly drinking occasions, 
+#' 2 = Fitted Heckman selection model for probability that an individual drinks on at least 3 separate occasions during the diary period, 
+#' 3 = Fitted Heckman outcome regression results for the standard deviation in the quantity of alcohol consumed in a drinking occasion, 
+#' 4 = average height and weight)
 #'
+#' @source Hill-McManus et al 2014. "Estimation of usual occasion-based individual drinking patterns using diary survey data". https://doi.org/https://doi.org/10.1016/j.drugalcdep.2013.09.022. 
 #'
-"binge_params_stapm"
+"binge_params_stapm_scot"
 
 
-#' Parameters to estimate amount drunk on single occassions
+#' Parameters to estimate amount drunk on single occasions
 #' 
 #' We use parameter estimates from Hill-McManus et al 2014 -   
 #' 
@@ -35,20 +46,14 @@
 #' Table 6 - Fitted Heckman outcome regression results for the standard deviation in the quantity of alcohol consumed in a drinking occasion   
 #' 
 #' We do not use parameter estimates from Table 4 - Fitted Tobit regression model for the mean grams of alcohol consumed during a drinking occasion. 
-#' This is because we calculate from the data by dividing the weekly mean alcohol consumption by the estimated number of weekly drinking occassions.  
-#'
-#'
+#' This is because we calculate from the data by dividing the weekly mean alcohol consumption by the estimated number of weekly drinking occasions.  
 #'
 #' @docType data
 #'
 #' @format A list of three data tables (1 = Table 3, 2 = Table 5, 3 = Table 6)
 #'
 #' @source Hill-McManus et al 2014. "Estimation of usual occasion-based individual drinking patterns using diary survey data". https://doi.org/https://doi.org/10.1016/j.drugalcdep.2013.09.022. 
 #'
-#'
-#'
-#'
-#'
 "binge_params"
 
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -80,6 +80,7 @@ reference:
   - WArisk_acute
   - binge_params
   - binge_params_stapm
+  - binge_params_stapm_scot
 - subtitle: "Alcohol lag times"
   contents:
   - AlcLags

diff --git a/data-raw/binge_params/20_working_code.R b/data-raw/binge_params/20_working_code.R
diff --git a/...binge_params/10_HSE_variable_processing.R → ...rams/England/10_HSE_variable_processing.R b/...binge_params/10_HSE_variable_processing.R → ...rams/England/10_HSE_variable_processing.R
diff --git a/data-raw/binge_params/12_Imputation.R → ...-raw/binge_params/England/12_Imputation.R b/data-raw/binge_params/12_Imputation.R → ...-raw/binge_params/England/12_Imputation.R
diff --git a/...raw/binge_params/binge_params_for_stapm.R → ...e_params/England/binge_params_for_stapm.R b/...raw/binge_params/binge_params_for_stapm.R → ...e_params/England/binge_params_for_stapm.R
diff --git a/data-raw/binge_params/README b/data-raw/binge_params/README
@@ -0,0 +1,21 @@
+
+The aim of this code is to process the parameters that allow the modelling of 
+acute harms that are partially attributable to alcohol in STAPM
+
+# This is the SAPM method
+data_sapm_test <- tobalcepi::AlcBinge(data)
+
+adapt the method to predict the number of drinking occassions 
+and the distribution of amount consumed 
+as a function of the average weekly amount of alcohol consumed, age, sex and IMD quintile. 
+
+assign the parameter values from Hill-McManus et al. to each individual 
+and then average these values by age, sex and IMD quintile 
+
+the IMD quintile averages will then reflect the variation among IMD quintiles 
+in the distribution of ethnicity, income, number of children etc.
+
+
+
+
+
diff --git a/data-raw/binge_params/Scotland/10_clean_shes.R b/data-raw/binge_params/Scotland/10_clean_shes.R
@@ -0,0 +1,142 @@
+
+# The aim of this code is to clean the tobacco and alcohol data from the Scottish Health Survey
+
+# note: no questions asked to < 16 year olds in Shes
+
+# Using functions in the hseclean package
+library(hseclean)
+library(data.table)
+library(magrittr)
+
+# Location of Scottish data
+root_dir <- "X:/HAR_PR/PR/Consumption_TA/HSE/Scottish Health Survey (SHeS)/"
+
+# The variables to retain
+keep_vars = c(
+  "hse_id", "wt_int", "psu", "cluster", "year", "quarter",
+  "age", "age_cat", "sex", "imd_quintile",
+  "ethnicity_2cat",
+  "degree", "marstat", "relationship_status", "employ2cat", "social_grade", "kids", "income5cat",
+  "nssec3_lab", "man_nonman", "activity_lstweek", "eduend4cat",
+
+  "hse_mental",
+
+  "weight", "height",
+
+  "drinks_now",
+  "drink_freq_7d", "n_days_drink", "peakday", "binge_cat",
+  "beer_units", "wine_units", "spirit_units", "rtd_units",
+  "weekmean", "drating", "dnoft", "dnnow", "dnany", "dnevr",
+  "perc_spirit_units", "perc_wine_units", "perc_rtd_units", "perc_beer_units",
+  "drinker_cat",
+  #"spirits_pref_cat", "wine_pref_cat", "rtd_pref_cat", "beer_pref_cat",
+  "total_units7_ch",
+
+  # Smoking
+  "cig_smoker_status",
+  "years_since_quit", "years_reg_smoker", "cig_ever",
+  "smk_start_age", "smk_stop_age", "censor_age", 
+  "cigs_per_day", "smoker_cat", "hand_rolled_per_day", "machine_rolled_per_day", "prop_handrolled", "cig_type")
+
+
+# Main processing
+
+cleandata <- function(data) {
+
+  data <- clean_age(data)
+  data <- clean_demographic(data)
+  data <- clean_education(data)
+  data <- clean_economic_status(data)
+  data <- clean_family(data)
+  data <- clean_income(data)
+  data <- clean_health_and_bio(data)
+
+  data <- alc_drink_now_allages(data)
+  data <- alc_weekmean_adult(data)
+  data <- alc_sevenday_adult(data)
+
+  data <- smk_status(data)
+  data <- smk_former(data)
+  data <- smk_life_history(data)
+  data <- smk_amount(data)
+
+  data <- select_data(
+    data,
+    ages = 16:89,
+    years = 2008:2019,
+    keep_vars = keep_vars,
+    complete_vars = c("age", "sex", "imd_quintile", "psu", "cluster", "year")
+  )
+
+  return(data)
+}
+
+shes_data <- combine_years(list(
+  cleandata(read_SHeS_2008(root = root_dir)),
+  cleandata(read_SHeS_2009(root = root_dir)),
+  cleandata(read_SHeS_2010(root = root_dir)),
+  cleandata(read_SHeS_2011(root = root_dir)),
+  cleandata(read_SHeS_2012(root = root_dir)),
+  cleandata(read_SHeS_2013(root = root_dir)),
+  cleandata(read_SHeS_2014(root = root_dir)),
+  cleandata(read_SHeS_2015(root = root_dir)),
+  cleandata(read_SHeS_2016(root = root_dir)),
+  cleandata(read_SHeS_2017(root = root_dir)),
+  cleandata(read_SHeS_2018(root = root_dir)),
+  cleandata(read_SHeS_2019(root = root_dir))
+))
+
+# Load population data for Scotland
+# from here - X:\ScHARR\PR_Mortality_data_TA\data\Processed pop sizes and death rates from VM
+
+scot_pops <- fread("X:/ScHARR/PR_Mortality_data_TA/data/Processed pop sizes and death rates from VM/pop_sizes_scotland_national_v1_2022-12-13_mort.tools_1.5.0.csv")
+setnames(scot_pops, c("pops"), c("N"))
+
+# adjust the survey weights according to the ratio of the real population to the sampled population
+shes_data <- clean_surveyweights(shes_data, pop_data = scot_pops)
+
+# remake age categories
+shes_data[, age_cat := c("16-17",
+                         "18-24",
+                         "25-34",
+                         "35-44",
+                         "45-54",
+                         "55-64",
+                         "65-74",
+                         "75-89")[findInterval(age, c(-1, 18, 25, 35, 45, 55, 65, 75, 1000))]]
+
+setnames(shes_data,
+         c("smk_start_age", "cig_smoker_status", "years_since_quit"),
+         c("start_age", "smk.state", "time_since_quit"))
+
+# Checks on data
+
+shes_data[(spirit_units + wine_units + rtd_units + beer_units) != weekmean]
+
+shes_data[drinker_cat != "abstainer" & weekmean == 0]
+
+# some drinkers have no data for average weekly consumption
+# remove these individuals from the dataset - rather than imputing the missing data
+shes_data <- shes_data[!(drinker_cat != "abstainer" & weekmean == 0)]
+
+shes_data[drinker_cat != "abstainer" & is.na(weekmean)]
+
+# select only rows with complete information on average weekly alcohol consumption
+shes_data <- shes_data[!is.na(weekmean)]
+
+shes_data[smoker_cat != "non_smoker" & cigs_per_day == 0]
+
+# select only rows with complete information on average weekly alcohol consumption
+shes_data <- shes_data[!is.na(smk.state)]
+
+nrow(shes_data)
+
+#shes_data[is.na(drinker_cat)]
+
+######## Write the data
+
+# note the package version so that the data can be tagged with it
+ver <- packageVersion("hseclean")
+
+saveRDS(shes_data, paste0("X:/ScHARR/PR_STAPM/Code/R_packages/tobalcepi/data-raw/binge_params/Scotland/tobalc_consumption_scot_national_2008-2019_v1_", Sys.Date(), "_hseclean_", ver, ".rds"))
+
diff --git a/data-raw/binge_params/Scotland/15_Imputation_shes.R b/data-raw/binge_params/Scotland/15_Imputation_shes.R
@@ -0,0 +1,99 @@
+
+# This code reads the processed health survey datasets for each year
+# and conducts imputation.
+
+# all the variables to be imputed are categorical
+
+library(data.table)
+library(hseclean)
+
+# Load the data
+
+# choose the file output by 10_clean_shes.R
+
+data <- readRDS("X:/ScHARR/PR_STAPM/Code/R_packages/tobalcepi/data-raw/binge_params/Scotland/tobalc_consumption_scot_national_2008-2019_v1_2022-12-18_hseclean_1.9.2.rds")
+
+# sapply(data, class)
+
+# view variables with missingness
+misscheck <- function(var) {
+  x <- table(var, useNA = "ifany")
+  na <- x[which(is.na(names(x)))]
+  if(length(na) == 0) na <- 0
+  perc <- round(100 * na / sum(x), 2)
+  #return(c(paste0(na, " missing obs, ", perc, "%")))
+  return(na)
+}
+
+n_missing <- sapply(data, misscheck)
+missing_vars <- n_missing[which(n_missing > 0)]
+missing_vars
+
+# household equivalised income has the most missingness
+
+# The categorical variables involved
+var_names <- c(
+  "kids",
+  "relationship_status",
+  "ethnicity_2cat",
+  "eduend4cat",
+  "degree",
+  "nssec3_lab",
+  "employ2cat",# complete variable
+  "activity_lstweek",# complete variable
+  "income5cat",
+  "hse_mental",
+  "drinker_cat",
+  "social_grade",
+  "smk.state"
+)
+
+# Note that the imputation wont work unless the variables considered are
+# either subject to imputation or do not contain any missingness (i.e. are complete)
+
+# The variables to be imputed and the method to be used
+var_methods <- rep("", ncol(data))
+
+var_methods[which(var_names == "kids")] <- "polr"
+var_methods[which(var_names == "relationship_status")] <- "polyreg"
+var_methods[which(var_names == "ethnicity_2cat")] <- "logreg"
+var_methods[which(var_names == "eduend4cat")] <- "polyreg"
+var_methods[which(var_names == "degree")] <- "logreg"
+var_methods[which(var_names == "nssec3_lab")] <- "polyreg"
+var_methods[which(var_names == "income5cat")] <- "polr"
+var_methods[which(var_names == "hse_mental")] <- "logreg"
+var_methods[which(var_names == "social_grade")] <- "polyreg"
+
+
+# Set order of factors where needed for imputing as ordered.
+data[ , kids := factor(kids, levels = c("0", "1", "2", "3+"))]
+data[ , income5cat := factor(income5cat, levels = c("1_lowest_income", "2", "3", "4", "5_highest_income"))]
+
+
+# Impute missing values
+
+# Run the imputation
+imp <- impute_data_mice(
+  data = data,
+  var_names = var_names,
+  var_methods = var_methods,
+  n_imputations = 5
+  # for testing just do 1 imputation
+  # but test with more later
+  # for point estimates, apparently 2-10 imputations are enough
+)
+
+data_imp <- copy(imp$data)
+
+
+# note the package version so that the data can be tagged with it
+ver <- packageVersion("hseclean")
+
+saveRDS(data_imp, paste0("X:/ScHARR/PR_STAPM/Code/R_packages/tobalcepi/data-raw/binge_params/Scotland/tobalc_consumption_scot_national_2008-2019_v1_", Sys.Date(), "_hseclean_", ver, "_imputed.rds"))
+
+
+
+
+
+
+