# Data_and_main_analyses.txt

# We assume you use R 3.4.1 or newer.
# Load the required packages:
require(GenomicSEM)
require(lavaan) # tested with version 0.6-4
require(MASS)
require(corpcor)
require(metaSEM)
library(tidyr)
library(plyr)
require(Matrix)
require(parallel)




############### Download data
# example for select number of mental health phenotypes

### Mental health phenotypes
## Age of smoking initiation and cigarettes per day:
# https://conservancy.umn.edu/handle/11299/201564
# files: AgeofInitiation.txt.gz, CigarettesPerDay.txt.gz

## SES
# EA:
# https://www.thessgac.org/data # file: GWAS_EA_excl23andMe.txt
# HI (household income):
# https://www.ccace.ed.ac.uk/node/335 # file: Hill_CB_2016.zip
# TI (Townsend index):
# https://www.ccace.ed.ac.uk/node/335 # file: Hill_CB_2016.zip

### Reference file
# HapMap 3 SNPs with the MHC region removed:
# https://utexas.box.com/s/vkd36n197m8klbaio3yzoxsee6sxo11v # file: w_hm3.noMHC.snplist




########################### Correlation vs. Partial correlation analyses ########################################





################ Munge

# Gzip all summary statistics files before munging.

# Munge the raw GWAS summary statistics into LDSC-ready *.sumstats.gz files.
# Files, trait names and sample sizes (N) are matched by position, so the
# three vectors below must stay in the same order.
library(GenomicSEM)
munge(
  files = c(
    "AgeofInitiation.txt.gz",
    "CigarettesPerDay.txt.gz",
    "GWAS_EA_excl23andMe.txt.gz",
    "Hill2016_UKB_Income_summary_results_21112016",
    "Hill2016_UKB_Townsend_summary_results_08112016"
  ),
  hm3 = "w_hm3.noMHC.snplist",
  trait.names = c("SmkInit", "CigDay", "EA", "HI", "TI"),
  N = c(262990, 263954, 766345, 96900, 112005),
  info.filter = 0.9,
  maf.filter = 0.01
)

################ LDSC

# Munged summary statistics, in the order the traits appear in the
# resulting covariance matrix.
traits <- c(
  "SmkInit.sumstats.gz",
  "EA.sumstats.gz",
  "HI.sumstats.gz",
  "TI.sumstats.gz",
  "CigDay.sumstats.gz"
)
# One NA per trait for both prevalence vectors.
sample.prev <- rep(NA, 5)
population.prev <- rep(NA, 5)
# LD scores and LD-score weights are read from the same folder.
ld <- "eur_w_ld_chr/"
wld <- "eur_w_ld_chr/"
trait.names <- c("SmkInit", "EA", "HI", "TI", "CigDay")
# Multivariable LD score regression across all five traits.
ldsc_matrix <- ldsc(
  traits = traits,
  sample.prev = sample.prev,
  population.prev = population.prev,
  ld = ld,
  wld = wld,
  trait.names = trait.names
)

################ Lavaan Models

# Correlation model: uses the virtual factors T1 and T2.
#   SES      : latent factor measured by HI, EA and TI (HI loading fixed to 1).
#   T1, T2   : single-indicator factors for SmkInit and CigDay; the indicator
#              residual variances are fixed to 0.
#   T1 ~~ SES, T2 ~~ SES : both factors are allowed to covary with SES.
#   a        : covariance between T1 and T2.
#   b, c     : variances of T1 and T2 (so a / sqrt(b * c) is their correlation).
#   d        : residual variance of HI, constrained to be larger than 0.001.
Correlation <- 'SES =~ 1*HI + EA + TI
T1=~ SmkInit
T2=~ CigDay
T1 ~~ SES
T2 ~~ SES
T1 ~~ a*T2
HI ~~ d*HI
d>0.001
SmkInit ~~ 0*SmkInit
CigDay ~~ 0*CigDay
T1 ~~ b*T1
T2 ~~ c*T2
'

# Partial correlation model with the virtual factors T1 and T2.
# Same measurement structure as the correlation model, but T1 and T2 are
# regressed on SES (T1 ~ SES, T2 ~ SES). The labelled parameters a, b and c
# are therefore the residual covariance and residual variances of T1 and T2
# after SES has been partialled out.
Partial_Correlation_SES <- 'SES =~ 1*HI + EA + TI
T1 =~ SmkInit
T2 =~ CigDay
T1 ~  SES
T2 ~  SES
T1 ~~ a*T2
SmkInit ~~ 0*SmkInit
CigDay ~~ 0*CigDay
T1 ~~ b*T1
T2 ~~ c*T2
SES ~~ SES
HI ~~ d*HI
d>0.001
'

################ Genomic SEM output

# Fit both models to the LDSC genetic covariance structure with DWLS.
# `Correlation` and `Partial_Correlation_SES` are the lavaan model strings
# defined in the "Lavaan Models" section.
GenomixSEM_output_correlation <- usermodel(
  ldsc_matrix,
  estimation = "DWLS",
  model = Correlation
)
GenomixSEM_output_Partial_correlation <- usermodel(
  ldsc_matrix,
  estimation = "DWLS",
  model = Partial_Correlation_SES
)

################ Monte Carlo sampling approach
### Significance models latent factor, WITH and WITHOUT partialling out variation of the latent SES factor
### Factor out SES from two traits of interest: SmkInit CigDay
### Function below uses previously generated object named: ldsc_matrix

### The two models we compare.
# Both are generic templates: the two focal phenotypes are called trait1 and
# trait2, so the covariance matrix handed to lavaan must carry exactly those
# two column names (next to HI, EA and TI).

# Correlation model: uses the virtual factors T1 and T2.
#   a    : covariance between T1 and T2.
#   b, c : variances of T1 and T2.
#   d    : residual variance of HI, constrained to be larger than 0.001.
Correlation <- 'SES =~ 1*HI + EA + TI
T1=~ trait1
T2=~ trait2
T1 ~~ SES
T2 ~~ SES
T1 ~~ a*T2
HI ~~ d*HI
d>0.001
trait1 ~~ 0*trait1
trait2 ~~ 0*trait2
T1 ~~ b*T1
T2 ~~ c*T2
'

# Partial correlation model WITH the virtual factors T1 and T2.
# T1 and T2 are regressed on SES, so a, b and c are the residual covariance
# and residual variances of T1 and T2 after SES has been partialled out.
Partial_Correlation_SES <- 'SES =~ 1*HI + EA + TI
T1 =~ trait1
T2 =~ trait2
T1 ~  SES
T2 ~  SES
T1 ~~ a*T2
trait1 ~~ 0*trait1
trait2 ~~ 0*trait2
T1 ~~ b*T1
T2 ~~ c*T2
SES ~~ SES
HI ~~ d*HI
d>0.001
'

# Draw Monte Carlo replicates of a covariance matrix.
#
# Arguments:
#   suffStat : list holding a covariance matrix `S` and the sampling covariance
#              matrix `V` of its lower-triangular elements (diagonal included).
#              If `S` is absent, `S_Full` and `V_Full` are used instead.
#   seed     : optional RNG seed. When NULL the current RNG state is used, so
#              a seed set earlier by the caller is respected.
#   rep      : number of replicates to draw.
#
# Returns a numeric matrix with `rep` rows. Each row is one multivariate-normal
# draw of the lower-triangular elements of S (column-major order, diagonal
# included), with mean S and covariance V.
randomize.cor.v.pcor.latent <- function(suffStat=NULL, seed=NULL, rep=1000) {
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # `[[` matches names exactly; `$` would partially match e.g. `S_Full`.
  if (is.null(suffStat[["S"]])) {
    S <- suffStat[["S_Full"]]
    V <- suffStat[["V_Full"]]
  } else {
    S <- suffStat[["S"]]
    V <- suffStat[["V"]]
  }
  if (is.null(S) || is.null(V)) {
    stop("suffStat must contain S and V (or S_Full and V_Full)", call. = FALSE)
  }

  draws <- MASS::mvrnorm(n = rep, mu = S[lower.tri(S, diag = TRUE)], Sigma = V)

  # mvrnorm() returns a plain vector for a single draw; keep one row per
  # replicate so that row-wise processing also works for rep = 1.
  if (!is.matrix(draws)) {
    draws <- matrix(draws, nrow = 1)
  }
  draws
}

# Fit the partial-correlation and the correlation model to ONE Monte Carlo
# replicate of the covariance matrix.
#
# Arguments:
#   bazrow     : numeric vector with the lower-triangular elements (diagonal
#                included, column-major order) of the replicate matrix.
#   header     : variable names of the matrix, in matrix order; they must
#                match the observed variables used in the two models.
#   model.cor  : lavaan syntax of the correlation model.
#   model.pcor : lavaan syntax of the partial-correlation model.
#                Both models must label exactly one parameter each as
#                'a', 'b' and 'c'.
#
# Returns an 8 x 1 numeric matrix:
#   rows 1-4: a, b, c and a / sqrt(b * c) from the partial-correlation model,
#   rows 5-8: the same four quantities from the correlation model.
# If the matrix cannot be made positive definite, a model cannot be fitted, or
# a label is missing, the affected entries are NA (with a warning) instead of
# aborting the whole Monte Carlo run.
cor.v.pcor.latent <- function(bazrow, header=NULL, model.cor=NULL, model.pcor=NULL) {
  print(bazrow)

  # Structural input errors are programming mistakes: fail loudly.
  n_traits <- length(header)
  if (n_traits == 0) {
    stop("header must name the variables of the covariance matrix",
         call. = FALSE)
  }
  if (length(bazrow) != n_traits * (n_traits + 1) / 2) {
    stop("bazrow must hold the lower-triangular elements (incl. diagonal) ",
         "of a ", n_traits, " x ", n_traits, " matrix", call. = FALSE)
  }
  if (is.null(model.cor) || is.null(model.pcor)) {
    stop("model.cor and model.pcor must both be supplied", call. = FALSE)
  }

  res <- matrix(NA_real_, nrow = 8, ncol = 1)

  # Rebuild the full symmetric matrix and project it onto the nearest
  # positive definite matrix.
  sig <- tryCatch({
    m <- matrix(NA_real_, n_traits, n_traits)
    m[lower.tri(m, diag = TRUE)] <- bazrow
    m[upper.tri(m)] <- t(m)[upper.tri(m)]
    m <- as.matrix(Matrix::nearPD(m)$mat)
    # Name rows as well as columns so lavaan can match the variables.
    dimnames(m) <- list(header, header)
    m
  }, error = function(e) {
    warning("could not build covariance matrix: ", conditionMessage(e),
            call. = FALSE)
    NULL
  })
  if (is.null(sig)) {
    return(res)
  }

  # Fit one model and return c(a, b, c, a / sqrt(b * c)); NA on failure.
  fit_abc <- function(model) {
    tryCatch({
      # sample.nobs = 2 is kept from the original analysis.
      fit <- lavaan::sem(model = model, sample.cov = sig, sample.nobs = 2)
      par_table <- fit@ParTable
      est <- vapply(c("a", "b", "c"), function(lbl) {
        hit <- par_table$est[par_table$label == lbl]
        if (length(hit) == 1L) hit else NA_real_
      }, numeric(1))
      unname(c(est, est[["a"]] / sqrt(est[["b"]] * est[["c"]])))
    }, error = function(e) {
      warning("model fit failed: ", conditionMessage(e), call. = FALSE)
      rep(NA_real_, 4)
    })
  }

  res[1:4, 1] <- fit_abc(model.pcor)
  res[5:8, 1] <- fit_abc(model.cor)
  res
}

# Random sampling: draw the Monte Carlo replicates of the LDSC covariance
# matrix.
cNRepeats <- 1000
rnd <- randomize.cor.v.pcor.latent(ldsc_matrix, seed = 123, rep = cNRepeats)

# The Monte Carlo models refer to the two focal phenotypes as trait1 and
# trait2, so rename them in the header handed to lavaan (order is unchanged).
mc_header <- colnames(ldsc_matrix$S)
mc_header[mc_header == "SmkInit"] <- "trait1"
mc_header[mc_header == "CigDay"] <- "trait2"
stopifnot(all(c("trait1", "trait2", "HI", "EA", "TI") %in% mc_header))

# Parallel alternative (one replicate per row of rnd):
# cl <- makeCluster(8)
# par_results <- parRapply(cl = cl, x = rnd, FUN = cor.v.pcor.latent, header = mc_header, model.cor = Correlation, model.pcor = Partial_Correlation_SES)
# stopCluster(cl)
par_results <- apply(
  rnd, 1,
  FUN = cor.v.pcor.latent,
  header = mc_header,
  model.cor = Correlation,
  model.pcor = Partial_Correlation_SES
)

# One row per replicate, eight estimates per row.
outcome <- as.data.frame(t(matrix(par_results, 8, cNRepeats)))
colnames(outcome) <- c(
  "partial_path_a", "partial_h2_trait1_div2", "partial_h2_trait2_div2",
  "partial_rG",
  "path_a", "h2_trait1_div2", "h2_trait2_div2", "rG"
)
write.table(x = outcome, file = "latent_outcome.txt", quote = FALSE, row.names = FALSE)

# What proportion of correlations is larger than the partial correlations?
# Replicates with missing estimates are dropped from both the numerator and
# the denominator.
outcome_no_NAs <- na.omit(outcome)
Proportion_cor_larger_than_pcor <- mean(outcome_no_NAs$rG > outcome_no_NAs$partial_rG)