Skip to content

Commit

Permalink
add stapm version of alcohol binge parameters for Scotland
Browse files Browse the repository at this point in the history
  • Loading branch information
dosgillespie committed Dec 18, 2022
1 parent 816dcf7 commit 02f23f4
Show file tree
Hide file tree
Showing 15 changed files with 492 additions and 54 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ dev/
.DS_Store
*.pdf
data-raw/binge_params/*.csv
.rds
data-raw/binge_params/Scotland/*.rds

25 changes: 15 additions & 10 deletions R/binge_params.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

#' Parameters to estimate amount drunk on single occasions - STAPM version
#' Parameters to estimate amount drunk on single occasions - England - STAPM version
#'
#' As our starting point we use the parameter estimates from Hill-McManus et al 2014 - stored within the `tobalcepi` package as the data object `binge_params`.
#' The problem with using these parameters directly in STAPM is that STAPM does not model the individual life-course trajectories of
Expand All @@ -17,14 +17,25 @@
#'
#' @source Hill-McManus et al 2014. "Estimation of usual occasion-based individual drinking patterns using diary survey data". https://doi.org/https://doi.org/10.1016/j.drugalcdep.2013.09.022.
#'
"binge_params_stapm"

#' Parameters to estimate amount drunk on single occasions - Scotland - STAPM version
#'
#' Scottish version of `binge_params_stapm` based on the Scottish Health Survey years 2008 to 2019.
#'
#' @docType data
#'
#' @format A list of four data tables (1 = Negative binomial regression model for the number of weekly drinking occasions,
#' 2 = Fitted Heckman selection model for probability that an individual drinks on at least 3 separate occasions during the diary period,
#' 3 = Fitted Heckman outcome regression results for the standard deviation in the quantity of alcohol consumed in a drinking occasion,
#' 4 = average height and weight)
#'
#' @source Hill-McManus et al 2014. "Estimation of usual occasion-based individual drinking patterns using diary survey data". https://doi.org/https://doi.org/10.1016/j.drugalcdep.2013.09.022.
#'
"binge_params_stapm"
"binge_params_stapm_scot"


#' Parameters to estimate amount drunk on single occassions
#' Parameters to estimate amount drunk on single occasions
#'
#' We use parameter estimates from Hill-McManus et al 2014 -
#'
Expand All @@ -35,20 +46,14 @@
#' Table 6 - Fitted Heckman outcome regression results for the standard deviation in the quantity of alcohol consumed in a drinking occasion
#'
#' We do not use parameter estimates from Table 4 - Fitted Tobit regression model for the mean grams of alcohol consumed during a drinking occasion.
#' This is because we calculate from the data by dividing the weekly mean alcohol consumption by the estimated number of weekly drinking occassions.
#'
#'
#' This is because we calculate from the data by dividing the weekly mean alcohol consumption by the estimated number of weekly drinking occasions.
#'
#' @docType data
#'
#' @format A list of three data tables (1 = Table 3, 2 = Table 5, 3 = Table 6)
#'
#' @source Hill-McManus et al 2014. "Estimation of usual occasion-based individual drinking patterns using diary survey data". https://doi.org/https://doi.org/10.1016/j.drugalcdep.2013.09.022.
#'
#'
#'
#'
#'
"binge_params"


Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ reference:
- WArisk_acute
- binge_params
- binge_params_stapm
- binge_params_stapm_scot
- subtitle: "Alcohol lag times"
contents:
- AlcLags
Expand Down
41 changes: 0 additions & 41 deletions data-raw/binge_params/20_working_code.R

This file was deleted.

File renamed without changes.
21 changes: 21 additions & 0 deletions data-raw/binge_params/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@

The aim of this code is to process the parameters that allow the modelling of
acute harms that are partially attributable to alcohol in STAPM

# This is the SAPM method
data_sapm_test <- tobalcepi::AlcBinge(data)

adapt the method to predict the number of drinking occassions
and the distribution of amount consumed
as a function of the average weekly amount of alcohol consumed, age, sex and IMD quintile.

assign the parameter values from Hill-McManus et al. to each individual
and then average these values by age, sex and IMD quintile

the IMD quintile averages will then reflect the variation among IMD quintiles
in the distribution of ethnicity, income, number of children etc.





142 changes: 142 additions & 0 deletions data-raw/binge_params/Scotland/10_clean_shes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@

# The aim of this code is to clean the tobacco and alcohol data from the Scottish Health Survey

# note: no questions asked to < 16 year olds in Shes

# Using functions in the hseclean package
library(hseclean)
library(data.table)
library(magrittr)

# Location of Scottish data
root_dir <- "X:/HAR_PR/PR/Consumption_TA/HSE/Scottish Health Survey (SHeS)/"

# The variables to retain
keep_vars = c(
"hse_id", "wt_int", "psu", "cluster", "year", "quarter",
"age", "age_cat", "sex", "imd_quintile",
"ethnicity_2cat",
"degree", "marstat", "relationship_status", "employ2cat", "social_grade", "kids", "income5cat",
"nssec3_lab", "man_nonman", "activity_lstweek", "eduend4cat",

"hse_mental",

"weight", "height",

"drinks_now",
"drink_freq_7d", "n_days_drink", "peakday", "binge_cat",
"beer_units", "wine_units", "spirit_units", "rtd_units",
"weekmean", "drating", "dnoft", "dnnow", "dnany", "dnevr",
"perc_spirit_units", "perc_wine_units", "perc_rtd_units", "perc_beer_units",
"drinker_cat",
#"spirits_pref_cat", "wine_pref_cat", "rtd_pref_cat", "beer_pref_cat",
"total_units7_ch",

# Smoking
"cig_smoker_status",
"years_since_quit", "years_reg_smoker", "cig_ever",
"smk_start_age", "smk_stop_age", "censor_age",
"cigs_per_day", "smoker_cat", "hand_rolled_per_day", "machine_rolled_per_day", "prop_handrolled", "cig_type")


# Main processing

cleandata <- function(data) {

data <- clean_age(data)
data <- clean_demographic(data)
data <- clean_education(data)
data <- clean_economic_status(data)
data <- clean_family(data)
data <- clean_income(data)
data <- clean_health_and_bio(data)

data <- alc_drink_now_allages(data)
data <- alc_weekmean_adult(data)
data <- alc_sevenday_adult(data)

data <- smk_status(data)
data <- smk_former(data)
data <- smk_life_history(data)
data <- smk_amount(data)

data <- select_data(
data,
ages = 16:89,
years = 2008:2019,
keep_vars = keep_vars,
complete_vars = c("age", "sex", "imd_quintile", "psu", "cluster", "year")
)

return(data)
}

shes_data <- combine_years(list(
cleandata(read_SHeS_2008(root = root_dir)),
cleandata(read_SHeS_2009(root = root_dir)),
cleandata(read_SHeS_2010(root = root_dir)),
cleandata(read_SHeS_2011(root = root_dir)),
cleandata(read_SHeS_2012(root = root_dir)),
cleandata(read_SHeS_2013(root = root_dir)),
cleandata(read_SHeS_2014(root = root_dir)),
cleandata(read_SHeS_2015(root = root_dir)),
cleandata(read_SHeS_2016(root = root_dir)),
cleandata(read_SHeS_2017(root = root_dir)),
cleandata(read_SHeS_2018(root = root_dir)),
cleandata(read_SHeS_2019(root = root_dir))
))

# Load population data for Scotland
# from here - X:\ScHARR\PR_Mortality_data_TA\data\Processed pop sizes and death rates from VM

scot_pops <- fread("X:/ScHARR/PR_Mortality_data_TA/data/Processed pop sizes and death rates from VM/pop_sizes_scotland_national_v1_2022-12-13_mort.tools_1.5.0.csv")
setnames(scot_pops, c("pops"), c("N"))

# adjust the survey weights according to the ratio of the real population to the sampled population
shes_data <- clean_surveyweights(shes_data, pop_data = scot_pops)

# remake age categories
shes_data[, age_cat := c("16-17",
"18-24",
"25-34",
"35-44",
"45-54",
"55-64",
"65-74",
"75-89")[findInterval(age, c(-1, 18, 25, 35, 45, 55, 65, 75, 1000))]]

setnames(shes_data,
c("smk_start_age", "cig_smoker_status", "years_since_quit"),
c("start_age", "smk.state", "time_since_quit"))

# Checks on data

shes_data[(spirit_units + wine_units + rtd_units + beer_units) != weekmean]

shes_data[drinker_cat != "abstainer" & weekmean == 0]

# some drinkers have no data for average weekly consumption
# remove these individuals from the dataset - rather than imputing the missing data
shes_data <- shes_data[!(drinker_cat != "abstainer" & weekmean == 0)]

shes_data[drinker_cat != "abstainer" & is.na(weekmean)]

# select only rows with complete information on average weekly alcohol consumption
shes_data <- shes_data[!is.na(weekmean)]

shes_data[smoker_cat != "non_smoker" & cigs_per_day == 0]

# select only rows with complete information on average weekly alcohol consumption
shes_data <- shes_data[!is.na(smk.state)]

nrow(shes_data)

#shes_data[is.na(drinker_cat)]

######## Write the data

# note the package version so that the data can be tagged with it
ver <- packageVersion("hseclean")

saveRDS(shes_data, paste0("X:/ScHARR/PR_STAPM/Code/R_packages/tobalcepi/data-raw/binge_params/Scotland/tobalc_consumption_scot_national_2008-2019_v1_", Sys.Date(), "_hseclean_", ver, ".rds"))

99 changes: 99 additions & 0 deletions data-raw/binge_params/Scotland/15_Imputation_shes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@

# This code reads the processed health survey datasets for each year
# and conducts imputation.

# all the variables to be imputed are categorical

library(data.table)
library(hseclean)

# Load the data

# choose the file output by 10_clean_shes.R

data <- readRDS("X:/ScHARR/PR_STAPM/Code/R_packages/tobalcepi/data-raw/binge_params/Scotland/tobalc_consumption_scot_national_2008-2019_v1_2022-12-18_hseclean_1.9.2.rds")

# sapply(data, class)

# view variables with missingness
misscheck <- function(var) {
x <- table(var, useNA = "ifany")
na <- x[which(is.na(names(x)))]
if(length(na) == 0) na <- 0
perc <- round(100 * na / sum(x), 2)
#return(c(paste0(na, " missing obs, ", perc, "%")))
return(na)
}

n_missing <- sapply(data, misscheck)
missing_vars <- n_missing[which(n_missing > 0)]
missing_vars

# household equivalised income has the most missingness

# The categorical variables involved
var_names <- c(
"kids",
"relationship_status",
"ethnicity_2cat",
"eduend4cat",
"degree",
"nssec3_lab",
"employ2cat",# complete variable
"activity_lstweek",# complete variable
"income5cat",
"hse_mental",
"drinker_cat",
"social_grade",
"smk.state"
)

# Note that the imputation wont work unless the variables considered are
# either subject to imputation or do not contain any missingness (i.e. are complete)

# The variables to be imputed and the method to be used
var_methods <- rep("", ncol(data))

var_methods[which(var_names == "kids")] <- "polr"
var_methods[which(var_names == "relationship_status")] <- "polyreg"
var_methods[which(var_names == "ethnicity_2cat")] <- "logreg"
var_methods[which(var_names == "eduend4cat")] <- "polyreg"
var_methods[which(var_names == "degree")] <- "logreg"
var_methods[which(var_names == "nssec3_lab")] <- "polyreg"
var_methods[which(var_names == "income5cat")] <- "polr"
var_methods[which(var_names == "hse_mental")] <- "logreg"
var_methods[which(var_names == "social_grade")] <- "polyreg"


# Set order of factors where needed for imputing as ordered.
data[ , kids := factor(kids, levels = c("0", "1", "2", "3+"))]
data[ , income5cat := factor(income5cat, levels = c("1_lowest_income", "2", "3", "4", "5_highest_income"))]


# Impute missing values

# Run the imputation
imp <- impute_data_mice(
data = data,
var_names = var_names,
var_methods = var_methods,
n_imputations = 5
# for testing just do 1 imputation
# but test with more later
# for point estimates, apparently 2-10 imputations are enough
)

data_imp <- copy(imp$data)


# note the package version so that the data can be tagged with it
ver <- packageVersion("hseclean")

saveRDS(data_imp, paste0("X:/ScHARR/PR_STAPM/Code/R_packages/tobalcepi/data-raw/binge_params/Scotland/tobalc_consumption_scot_national_2008-2019_v1_", Sys.Date(), "_hseclean_", ver, "_imputed.rds"))







Loading

0 comments on commit 02f23f4

Please sign in to comment.