### TIPS ###
# We first load all the libraries we will need for this practical. 
# Make sure you have those installed 
# If not then please visit https://www2.unil.ch/popgen/teaching/SISG22/
library(gaston)
library(hierfstat)
library(JGTeach)

Heritability of (simulated) traits from (simulated) panmictic populations

  1. We will use again the data we generated during the first practical, where we used ms to simulate data from a panmictic population. You may remember we simulated for 500 individuals 20 chromosomes 1 Megabase long.
Code
pan<-ms2bed("../pan.txt")


set.seed(87) #If you enter this line, it will insure you can reproduce exactly the same results as in this document.

t1<-make.traits(pan,h2=0.8,n.causal=100,minp=0)

str(t1)
## List of 4
##  $ h2    : num 0.8
##  $ h2hat : num 0.81
##  $ trait :'data.frame':  500 obs. of  2 variables:
##   ..$ gval : num [1:500] -0.0149 -0.2541 -0.2405 0.0505 -0.363 ...
##   ..$ pheno: num [1:500] 0.0936 -0.4572 -0.1738 0.078 -0.6318 ...
##  $ causal:'data.frame':  100 obs. of  3 variables:
##   ..$ chr: int [1:100] 1 13 2 16 2 4 8 14 18 16 ...
##   ..$ id : chr [1:100] "M_1054" "M_19098" "M_2783" "M_23440" ...
##   ..$ efs: num [1:100] -0.000724 -0.142978 -0.008772 -0.110842 -0.177222 ...
t1$h2hat
## [1] 0.8095129
hist(t1$trait$pheno)

with(t1$trait,plot(pheno,gval))

#estimated heritability
with(t1$trait,var(gval)/var(pheno))
## [1] 0.8095129
#check 
t1$h2hat==with(t1$trait,var(gval)/var(pheno))
## [1] TRUE
Answers

The trait shows a normal distribution centered on 0. The correlation between the phenotypes and the breeding values of the individuals is high, as would be expected for a trait with a large heritability.

\(\hat{h^2}\) is the realized heritability, quantified as \(V_A/V_P\), where \(V_A\) is the variance of breeding values and \(V_P\) is the variance of phenotypes, whereas \(h^2\) is the intended heritability of the trait. The two are very close.

Code
t2<-make.traits(pan,h2=0.3,n.causal=100,minp=0)
t2$h2hat
## [1] 0.2920886
with(t2$trait,plot(pheno,gval))

with(t2$trait,var(gval)/var(pheno))
## [1] 0.2920886
t2$h2hat==with(t2$trait,var(gval)/var(pheno))
## [1] TRUE
Answers

The correlation between phenotypes and breeding values is much weaker this time, because the environmental component of the trait is larger. We note that phenotypes vary between \(-2\) and \(2\), while breeding values vary between \(-1\) and \(1.5\).

Next we will use a linear mixed model to estimate the heritability of this trait. This requires a trait and a GRM. We will thus estimate heritability with the 3 GRMS we discussed in the previous lecture.

  1. Compute heritabilities for trait 1 and 2 using the 3 GRMs and function gaston::lmm.aireml. Are the heritability estimates from the 3 GRMS similar?
Code
#first compute the GRMs

#allele sharing GRM
GRM.as.pan<-hierfstat::kinship2grm(hierfstat::beta.dosage(pan,inb=TRUE))

#Kc0 GRM
GRM.c0.pan<-2*(JGTeach::Kc0(pan))

#Standard GRM
std.pan<-gaston::GRM(pan)

#extract heritability from an lmm.aireml object

herit<-(function(x) x$tau/(x$tau+x$sigma2))

#Trait 1

#value to be estimated
t1$h2hat
## [1] 0.8095129
#estimations
herit(rAS<-gaston::lmm.aireml(t1$trait$pheno,K=GRM.as.pan,verbose=FALSE))
## [1] 0.9229796
herit(rC0<-gaston::lmm.aireml(t1$trait$pheno,K=GRM.c0.pan,verbose=FALSE))
## [1] 0.9229086
herit(rStd<-gaston::lmm.aireml(t1$trait$pheno,K=std.pan,verbose=FALSE))
## [1] 0.9999926
# Trait 2

#value to be estimated
t2$h2hat
## [1] 0.2920886
#estimations
herit(lmm.aireml(t2$trait$pheno,K=GRM.as.pan,verbose=FALSE))
## [1] 0.3827383
herit(lmm.aireml(t2$trait$pheno,K=GRM.c0.pan,verbose=FALSE))
## [1] 0.3825026
herit(lmm.aireml(t2$trait$pheno,K=std.pan,verbose=FALSE))
## [1] 0.6107762
Answers

In the previous two questions, we assumed we new the breeding values. It is never the case in reality, and in order to estimate heritabilities, we have to use a mixed model, or a specific breeding design from which we can extract the relevant variance component (more on this in the quantitative genetics module).

The heritabilities estimated with \(GRM_{AS}\) and \(GRM_{c0}\) are almost identical and fairly close to the estimated heritabilities from breedig values. Heritabilities estimated with the standard GRM are large overestimates.

Code
std.pan.maf05<-GRM(pan[,pan@snps$maf>=0.05])

herit(lmm.aireml(t1$trait$pheno,K=std.pan.maf05,verbose=FALSE))
## [1] 0.8932641
herit(lmm.aireml(t2$trait$pheno,K=std.pan.maf05,verbose=FALSE))
## [1] 0.3928383
Answers

Once we filter at maf $ $, the heritability estimates we obtain with the standard GRM are similar to the two othe GRMs and closer to the expected heritability.

Code
p.lm<-association.test(pan,Y=t2$trait$pheno,method="lm")
## Warning in association.test(pan, Y = t2$trait$pheno, method = "lm"): Method = "lm" force
## test = "wald"
pAS.lmm<-association.test(pan,Y=t2$trait$pheno,method="lmm",K=GRM.as.pan,verbose=FALSE)
pc0.lmm<-association.test(pan,Y=t2$trait$pheno,method="lmm",K=GRM.c0.pan,verbose=FALSE)
pstd.lmm<-association.test(pan,Y=t2$trait$pheno,method="lmm",K=std.pan,verbose=FALSE)
pstdmaf05.lmm<-association.test(pan,Y=t2$trait$pheno,method="lmm",K=std.pan.maf05,verbose=FALSE)

# get genomic inflation factor
get.lambda<-function(x) {xchisq<-qchisq(1-x,1);median(xchisq)/qchisq(0.5,1)}

#plot quantile quantile plot of p-values
pval.qplot<-function(x,...){
  n<-length(x)
  plot(-log10(1:n/n),-log10(sort(x)),
       xlab="-log10(theo p)",ylab="-log10 emp(p)",pch=16,...);abline(c(0,1))
}

par(mfrow=c(2,2))
pval.qplot(p.lm$p,main=paste0("Lambda lm \n",round(get.lambda(p.lm$p),digits=3)))
pval.qplot(pAS.lmm$p,main=paste0("Lambda lmm Kas\n",round(get.lambda(pAS.lmm$p),digits=3)))
pval.qplot(pstd.lmm$p,main=paste0("Lambda lmm Std\n",round(get.lambda(pstd.lmm$p),digits=3)))
pval.qplot(pstdmaf05.lmm$p,main=paste0("Lambda Std maf 0.05\n",round(get.lambda(pstdmaf05.lmm$p),digits=3)))

par(mfrow=c(1,1))
Answers

We see that the inflation factor \(\lambda\) for the linear model is larger than 1 (1.11), whereas inflation factors for the mixed models are sligthly below one. Even in this random mating, ideal population, it seems useful to use a mixed model rather than a linear model to test for association.

Code
# The get.herit function creates a trait and estimate 
# its heritability with lmm and the given GRM

get.herit<-function(bed,h2=0.8,n.causal=100,minp=0,GRM=GRM,...){

    herit<-function(x) x$tau/(x$tau+x$sigma2)

  tx<-make.traits(bed=bed,h2=h2,n.causal=n.causal,minp=minp,...)
  est<-herit(lmm.aireml(Y=tx$trait$pheno,K=GRM,verbose=FALSE,...))
  c(tx$h2hat,est)           
}


#for h2=0.8

h2.as<-replicate(20,get.herit(pan,GRM=GRM.as.pan))
h2.c0<-replicate(20,get.herit(pan,GRM=GRM.c0.pan))
h2.std<-replicate(20,get.herit(pan,GRM=std.pan))
h2.std.maf05<-replicate(20,get.herit(pan,GRM=std.pan.maf05))

boxplot(cbind(h2.as[2,],h2.c0[2,],h2.std[2,],h2.std.maf05[2,]))
abline(h=0.8)

#for h2=0.3

h2.as<-replicate(20,get.herit(pan,h2=0.3,GRM=GRM.as.pan))
h2.c0<-replicate(20,get.herit(pan,h2=0.3,GRM=GRM.c0.pan))
h2.std<-replicate(20,get.herit(h2=0.3,pan,GRM=std.pan))
h2.std.maf05<-replicate(20,get.herit(h2=0.3,pan,GRM=std.pan.maf05))

boxplot(cbind(h2.as[2,],h2.c0[2,],h2.std[2,],h2.std.maf05[2,]))
#boxplot(cbind(t(h2.as),t(h2.c0),t(h2.std),t(h2.std.maf05)))
abline(h=0.3)
Answers

Yes, we now see that heritabilities estimated with \(GRM_{AS}\) or \(GRM_{c0}\) are almost unbiased, while heritabilities estimated with the satandar unfiltered GRM are biased. Filtering on maf ans using the standard GRM to estimate heritability help remove the bias.

Heritability from simulated traits using (part of) 1000 genome data

We can redo the same exercice this time using the subset of data we have been using since the begininng of this module. The difference between the 1000 genome data and the simulated data is we have population structure in the 1000 genome data. What will be the effect of these on heritability estimates?

  1. simulate one trait with heritability \(0.5\) and driven by 10 causal loci with \(MAF>=0.01\) using the Eurasian samples (EAS, EUR, SAS) from the 1000 genomes data, and estimate its heritability using the 4 GRMS we used previously (Allele sharing As; c0; standard; and standard filter on \(maf \geq 0.05\)). Discuss the results and their shortcomings
Code
ch22<-read.VCF("../chr22_Mb0_20.recode.vcf.gz")
## ped stats and snps stats have been set. 
## 'p' has been set. 
## 'mu' and 'sigma' have been set.
ch22.M<-readRDS("../matching.ch22.RDS")
samp.desc.url<-"https://www2.unil.ch/popgen/teaching/SISG18/integrated_call_samples_v3.20130502.ALL.panel"
samp.desc<-read.table(samp.desc.url,header=TRUE,stringsAsFactors = TRUE)


EURASIA<-which(samp.desc$super_pop %in% c("EAS","EUR","SAS"))
ch22EURASIA<-ch22[EURASIA,]
ch22EURASIA<-ch22EURASIA[,ch22EURASIA@snps$maf>0]
ch22EURASIA.M<-ch22.M[EURASIA,EURASIA]
Mb<-mean(mat2vec(ch22EURASIA.M))

ch22EURASIA.Kas<-(ch22EURASIA.M-Mb)/(1-Mb)
ch22EURASIA.Kc0<-JGTeach::Kc0(ch22EURASIA.M,matching=TRUE)
ch22EURASIA.std.GRM<-gaston::GRM(ch22EURASIA)
ch22EURASIA.std.GRM.maf05<-gaston::GRM(ch22[EURASIA,ch22@snps$maf>=0.05])

set.seed(13)

t1EURASIA<-make.traits(ch22EURASIA,n.causal=10,h2=0.5,minp=0.01)

t1EURASIA$h2hat
## [1] 0.4793195
#estimations
herit(rASEURASIA<-gaston::lmm.aireml(t1EURASIA$trait$pheno,K=2*ch22EURASIA.Kas,verbose=FALSE))
## [1] 0.5671181
herit(rC0EURASIA<-gaston::lmm.aireml(t1EURASIA$trait$pheno,K=2*ch22EURASIA.Kc0,verbose=FALSE))
## [1] 0.5670315
herit(rStdEURASIA<-gaston::lmm.aireml(t1EURASIA$trait$pheno,K=ch22EURASIA.std.GRM,verbose=FALSE))
## EM step failed to improve likelihood (this should not happen)
## [1] 0.7776692
herit(rStdEURASIAmaf05<-gaston::lmm.aireml(t1EURASIA$trait$pheno,K=ch22EURASIA.std.GRM.maf05,verbose=FALSE))
## [1] 0.5387993
Answers

For this specific trait with intended heritability \(0.5\), heritability estimates from \(AS\) and \(c0\) GRMs are slightly over the mark (\(0.57\)), while the standard GRM give an estimate way of the mark (\(0.78\)). Filtering on MAF help bring the heritability estimate closer to the realized value.

We would need to simulate several traits to verify the tendency observed, feel free to do it!

Code
nsnps<-dim(ch22EURASIA)[2]
p.lm<-association.test(ch22EURASIA,Y=t1EURASIA$trait$pheno,method="lm")
## Warning in association.test(ch22EURASIA, Y = t1EURASIA$trait$pheno, method = "lm"): Method
## = "lm" force test = "wald"
pAS.lmm<-association.test(ch22EURASIA,Y=t1EURASIA$trait$pheno,method="lmm",K=2*ch22EURASIA.Kas,verbose=FALSE)
pc0.lmm<-association.test(ch22EURASIA,Y=t1EURASIA$trait$pheno,method="lmm",K=2*ch22EURASIA.Kc0,verbose=FALSE)
pstd.lmm<-association.test(ch22EURASIA,Y=t1EURASIA$trait$pheno,method="lmm",K=ch22EURASIA.std.GRM,verbose=FALSE)
pstdmaf05.lmm<-association.test(ch22EURASIA,Y=t1EURASIA$trait$pheno,method="lmm",K=ch22EURASIA.std.GRM.maf05,verbose=FALSE)

# get genomic inflation factor
get.lambda<-function(x) {xchisq<-qchisq(1-x,1);median(xchisq)/qchisq(0.5,1)}

#plot quantile quantile plot of p-values
pval.qplot<-function(x,...){
  n<-length(x)
  plot(-log10(1:n/n),-log10(sort(x)),
       xlab="-log10(theo p)",ylab="-log10 emp(p)",pch=16,...);abline(c(0,1))
}

par(mfrow=c(2,2))
pval.qplot(p.lm$p,main=paste0("Lambda lm \n",round(get.lambda(p.lm$p),digits=3)))
pval.qplot(pAS.lmm$p,main=paste0("Lambda lmm Kas\n",round(get.lambda(pAS.lmm$p),digits=3)))
pval.qplot(pstd.lmm$p,main=paste0("Lambda lmm Std\n",round(get.lambda(pstd.lmm$p),digits=3)))
pval.qplot(pstdmaf05.lmm$p,main=paste0("Lambda Std maf 0.05\n",round(get.lambda(pstdmaf05.lmm$p),digits=3)))

par(mfrow=c(1,1))
Answers

The inflation factor for the linear model is again above one, while it is below 1 for the mixed models. Note also the range of \(-log_{10}\) p-values is much larger for the linear model than the others

Code
par(mfrow=c(2,2))
manhattan(x=data.frame(chr=ch22EURASIA@snps$chr,pos=ch22EURASIA@snps$pos,p=p.lm$p),
          thinning=FALSE,pch=16,cex=0.5,main="lm")
abline(v=ch22EURASIA@snps$pos[match(t1EURASIA$causal$id,pAS.lmm$id)],col="red")
abline(h=-log10(5e-08),col="blue",lty=2) #standard genomic threshold
abline(h=-log10(0.05/62939),col="blue") # Bonferroni threshold for 62939 SNPs

manhattan(x=data.frame(chr=ch22EURASIA@snps$chr,pos=ch22EURASIA@snps$pos,p=pAS.lmm$p),
          thinning=FALSE,pch=16,cex=0.5,main="lmm Kas")
abline(v=ch22EURASIA@snps$pos[match(t1EURASIA$causal$id,pAS.lmm$id)],col="red")
abline(h=-log10(5e-08),col="blue",lty=2) #standard genomic threshold
abline(h=-log10(0.05/62939),col="blue") # Bonferroni threshold for 62939 SNPs

manhattan(x=data.frame(chr=ch22EURASIA@snps$chr,pos=ch22EURASIA@snps$pos,p=pstd.lmm$p),
          thinning=FALSE,pch=16,cex=0.5,main="lmm std")
abline(v=ch22EURASIA@snps$pos[match(t1EURASIA$causal$id,pAS.lmm$id)],col="red")
abline(h=-log10(5e-08),col="blue",lty=2) #standard genomic threshold
abline(h=-log10(0.05/62939),col="blue") # Bonferroni threshold for 62939 SNPs

manhattan(x=data.frame(chr=ch22EURASIA@snps$chr,pos=ch22EURASIA@snps$pos,p=pstdmaf05.lmm$p),
          thinning=FALSE,pch=16,cex=0.5,main="lmm std maf05")
abline(v=ch22EURASIA@snps$pos[match(t1EURASIA$causal$id,pAS.lmm$id)],col="red")
abline(h=-log10(5e-08),col="blue",lty=2) #standard genomic threshold
abline(h=-log10(0.05/62939),col="blue") # Bonferroni threshold for 62939 SNPs

par(mfrow=c(1,1))
Answers

In these manhattan plots, the horizontal dashed blue line corresponds to the classical genomic level of significance ( for \(10^6\) SNPs, i.e. \(5 \times 10^-8\)) while the solid blue line corresponds to the 5% threshold adjusted for the number of tests ($ $). The difference between the blue solid and dashed horizontal lines will depend on the number of markers used. The vertical red lines show the position of the causal loci along the 20 MB of chromosome 22.

We first note the difference in y axis scales between the different panels: the two left panels axes go to 50, while the two right panels do not exceed 30.

On all panels, the Manhattan plots show SNPs associated with the traots to be located very close to the causal loci, which is good! The third, fourth and fifth causal loci from the left are detected by all 4 GWAS.

But the GWAS performed with a linear model spuriously identify loci outside of the causal regions, for instance on the left of the first causal loci.

You can get a better feel for how the different models behave by simulating other traits, with different heritabilities, number of causal loci etc…