-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinking CH level data sets.R
212 lines (168 loc) · 6.78 KB
/
linking CH level data sets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
## ==========================================================================##
# Project: DACHA
# Team: Improvement Analytics Unit (IAU) at the Health Foundation
# Script: linkage_prep_CHdata.R
# Corresponding author: Liz Crellin ([email protected])
# Descripton: this will bring all the care home level data sets together
# Inputs:
# All the outputs of the previous scripts
# Outputs:
# One row per person data set with carehome level info included
# Notes: To use, need to adjust locations of R scripts and csv files
## ==========================================================================##
# Source relevant scripts ------------------------------------------------
# Set up ####
library(aws.s3)
library(dplyr)
library(tidyverse)
library(lubridate)
## Define the project bucket
project_bucket <- '' # assign project directory
#CH id spine
pseudocqcids <- s3read_using(read.csv,
object = 'CH level data/ch_pseudo_spine.csv',
#this file was generated in SAS direct from the cqc location ids provided by CH software providers
bucket = proj_bucket,
na.strings = c("", "NA")
)
#import the workforce questionnaire data
workforce <- s3read_using(read.csv,
object = 'CH level data/workforce_questionnaire/clean_pseudo/workforce_questionnaire.csv',
#this file was generated in SAS. Standardized to most recent cqc id and then pseudonymised.
bucket = proj_bucket,
na.strings = c("", "NA")
)
#check one row per pseudocqcid
workforce %>%
select(pseudocqcid) %>%
n_distinct()
#rename recorded date for clarity on source
workforce <- workforce %>%
rename(RecordedDate_WF = RecordedDate)
#import the cleaned CQC data
cqcclean <- s3read_using(read.csv,
object = 'CH level data/CQC publicly available data/cqc_data_cleaned.csv',
#cleaned file from 'CQC_data_prep_R'
bucket = proj_bucket,
na.strings = c("", "NA")
)
#check one row per pseudocqcid
cqcclean %>%
select(pseudocqcid) %>%
n_distinct()
#check some vars
cqcclean %>%
group_by(Care_home_) %>%
tally()
cqcclean %>%
group_by(Location_Type_Sector) %>%
tally()
cqcclean %>%
group_by(Location_Inspection_Directorate) %>%
tally()
cqcclean %>%
group_by(Location_Primary_Inspection_Cate) %>%
tally()
#Location_City - remove as too detailed
cqcclean %>%
group_by(Location_Type) %>%
tally()
#remove unneeded cols and rename
cqcclean <- cqcclean %>%
select(!c(X, Care_home_, Location_Type_Sector, Location_Inspection_Directorate,
Location_Primary_Inspection_Cate, Location_City, Location_Type)) %>%
rename(Location_HSCA_start_date_CQC = Location_HSCA_start_date,
Care_homes_beds_CQC = Care_homes_beds,
Publication_Date_CQC = Publication_Date,
Location_Region_CQC = Location_Region,
Location_NHS_Region_CQC = Location_NHS_Region,
Location_Local_Authority_CQC = Location_Local_Authority,
Location_ONSPD_CCG_CQC = Location_ONSPD_CCG,
Service_Type_CQC = service_type,
#ratings can stay as they are as we know they are from CQC
Years_Since_Reg_CQC = years_since_registration,
Serv_user_dementia_CQC = serv_user_dementia
)
#sort dates
cqcclean <- cqcclean %>%
mutate(Location_HSCA_start_date_CQC = ymd(Location_HSCA_start_date_CQC),
Publication_Date_CQC = dmy(Publication_Date_CQC))
#import the cleaned CH residency table
CHR <- s3read_using(read.csv,
object = 'person level data sets for linkage/CHR_pre_linkage.csv',
#cleaned file from 'linkage_prep_CHR.R'
bucket = proj_bucket,
na.strings = c("", "NA")
)
CHR <- CHR %>%
select(-X)
CHR %>%
select(Der_CQC_Location_CHR, Der_CQC_Service_Type_CHR, CCG_Of_Residence_CHR, CareHomeIndicator_CHR) %>%
n_distinct()
#latest record per care home
relevant_record_chr <- CHR %>%
select(Der_Start_Date_CHR, Der_CQC_Location_CHR, Der_CQC_Service_Type_CHR, CCG_Of_Residence_CHR, CareHomeIndicator_CHR) %>%
group_by (Der_CQC_Location_CHR) %>%
arrange(Der_CQC_Location_CHR, Der_Start_Date_CHR) %>%
fill (everything (), .direction = "down") %>% # fills values above where there is NA
slice_tail(n=1) %>% # take most recent record for the CH
select(-Der_Start_Date_CHR) %>%
ungroup () %>%
distinct ()
relevant_record_chr %>%
select(Der_CQC_Location_CHR) %>%
n_distinct()
#this has one row per pseudocqcid.
relevant_record_chr <- relevant_record_chr %>%
rename(Der_CQC_Location_CHRCH = Der_CQC_Location_CHR,
Der_CQC_Service_Type_CHRCH = Der_CQC_Service_Type_CHR,
CCG_Of_Residence_CHRCH = CCG_Of_Residence_CHR,
CareHomeIndicator_CHRCH = CareHomeIndicator_CHR)
#then join
linked_ch_level <- pseudocqcids %>%
left_join(workforce, by = "pseudocqcid") %>%
left_join(cqcclean, by = c("pseudocqcid"))%>%
left_join(relevant_record_chr, by = c( "pseudocqcid" = "Der_CQC_Location_CHRCH"))
#output csv
s3write_using(linked_ch_level,
FUN=write.csv,
object = "Linked data sets/MDS_CH_level.csv",
bucket = proj_bucket)
#Now look to link with the person level MDS
#import it
linked_personlevel <- s3read_using(read.csv,
object = 'Linked data sets/MDS_personlevel.csv',
bucket = proj_bucket,
na.strings = c("", "NA")
)
linked_personlevel <- linked_personlevel %>%
select(-X)
linked_MDS <- linked_personlevel %>%
left_join(linked_ch_level, by = "pseudocqcid")
# now generate new variables - registered bed capacity, years of service registration
linked_MDS <- linked_MDS %>%
mutate(
registered_bed_capacity = case_when (
is.na (Care_homes_beds_CQC) ~ "Missing",
Care_homes_beds_CQC <=24 ~ "0-24",
Care_homes_beds_CQC >= 25 & Care_homes_beds_CQC <=49 ~ "25-49",
TRUE ~ "50+"),
service_reg_yrs = case_when (
is.na(Years_Since_Reg_CQC) ~ "Missing",
Years_Since_Reg_CQC <=10 ~ "Less than 10 years",
TRUE ~ "More than 10 years")
)
#output csv
s3write_using(linked_MDS,
FUN=write.csv,
object = "Linked data sets/MDS.csv",
bucket = proj_bucket)
MDS_variables <- colnames(linked_MDS) %>%
as.data.frame()
s3write_using(MDS_variables, FUN=write.csv, object = "wp4/Linked data sets/MDS_variables.csv", bucket = project_bucket)
#how is linkage looking for workforce survey?
linkcheck <- linked_personlevel %>%
left_join(linked_ch_level, by = "pseudocqcid")
linkcheck %>%
filter(!is.na(RecordedDate_WF)) %>%
n_distinct()