### TIPS ###
# We first load all the libraries we will need for this practical.
# Make sure you have those installed
# If not then please visit https://www2.unil.ch/popgen/teaching/SISG22/
library(gaston)
library(hierfstat)
library(JGTeach)
library(HardyWeinberg)
hierfstat::ms2bed
, and calculate
for each locus its inbreeding coefficient; use it to test whether the
loci are in Hardy Weinberg Equilibrium; plot \(-log10\) of these p-values against their
expectations under the null hypothesis.# F=1-Ho/He
pan <- ms2bed("../pan.txt")
inb.coeff <- 1 - pan@snps$hz/(2*pan@p*(1-pan@p)) # He = 2p(1-p)
# nb inds
ni <- dim(pan)[1] # or nrow(pan)
x2 <- ni*inb.coeff^2
p.val.x2 <- pchisq(x2,df=1,lower=FALSE) # check ?pchisq
nl <- dim(pan)[2] # or ncol(pan)
# theoretical dist p.val under null
theo.pval <- (1:nl) / nl
plot(-log10(theo.pval),-log10(sort(p.val.x2)),
col="red",cex=0.5,xlab="Theo p val dist",
ylab="emp p-val dist x2", pch=16)
abline(c(0,1))
Is it what you would have expected?
No, since the simulated population is panmictic, we expect the empirical p-values to match the theoretical ones, i.e. the points should be aligned along the \(y=x\) line of the figure. It is clearly not the case.
x <- (0:1000) / 1000 # same as seq(0,1,by=0.001)
# indexing on maf >= 1%
maf01 <- which(pan@snps$maf >= 0.01) # indexing of snps with maf >= 0.01
nl <- length(maf01)
theo.pval <- 1:nl / nl
plot(-log10(theo.pval),-log10(sort(p.val.x2[maf01])),
col="red",cex=0.5,xlab="Theo p val dist",
ylab="emp p-val dist x2", pch=16)
abline(c(0,1))
# highlight outliers in het ~ p plot
plot(pan[,maf01]@p, pan[,maf01]@snps$hz, col="black", pch=16, cex=0.6, ylab = 'Het', xlab='p')
lines(x,2*x*(1-x),col="orange")
outliers <- which(-log10(p.val.x2[maf01])>4)
# two step indexing (first maf01 and then those points that are p-val outliers)
points(pan[,maf01][,outliers]@p, pan[,maf01][,outliers]@snps$hz, col="red", pch=16, cex=0.6)
The minimum expected number per cell is 1 (5) [the proportion of cells with expected counts lower than 5 should not exceed 20%]
Is this rule working here?
par(mfrow=c(1,2))
# what frequency leads to np^2==1 e.g. p=(1/n)^0.5
# nb inds
ni <- dim(pan)[1]
xi <- 1
mafn1 <- which(pan@snps$maf >= (xi/ni)^.5)
nl <- length(mafn1)
theo.pval <- 1:nl/nl
plot(-log10(theo.pval),-log10(sort(p.val.x2[mafn1])),
col="red",cex=0.5,xlab="Theo p val dist", pch = 16,
ylab="emp p-val dist x2",main=expression(np^2>=1));abline(c(0,1))
#what frequency leads to np^2==5
xi <- 5
mafn5 <- which(pan@snps$maf >= (xi/ni)^.5)
nl <- length(mafn5)
theo.pval <- 1:nl/nl
plot(-log10(theo.pval),-log10(sort(p.val.x2[mafn5])),
col="red",cex=0.5,xlab="Theo p val dist", pch=16,
ylab="emp p-val dist x2",main=expression(np^2>=5));abline(c(0,1))
par(mfrow=c(1,1))
If we set the minimum expected count per cell to one (left panel),we see that we still have a few outlier loci on the QQ-plot (points far away fro the \(y=x\) line), while there should be none.
With the rule \(np^2 \geq 5\), we now have all the points aligned to the \(y=x\) line, meaning that for the \(\chi^2\) test to work, the condition we should use is maf \(\geq \sqrt(5/n)\):
n<-c(1:10*10,2:10*10^2,2:10*10^3,2:10*10^4,2:10*10^5)
plot(n,(5/n)^0.5, type ="l",lwd="2",col="red",xlab="Number of inds", ylab="MAF",main="min MAF for HW chi2 test",log="xy")
We would therefore need at least \(50'000\) individuals to insure valid HW \(\chi^2\) tests with a \(0.01\) maf.
HardyWeinberg::HWExactStats
to
obtain the exact p-values for these loci, using first the argument
midp
set to FALSE and then set to TRUE.# what are we cbinding here?
hw.ex <- HWExactStats(cbind(pan@snps$N0, pan@snps$N1, pan@snps$N2), midp=FALSE)
hw.mp <- HWExactStats(cbind(pan@snps$N0, pan@snps$N1, pan@snps$N2), midp=TRUE)
nl <- length(hw.mp)
par(mfrow=c(1,2))
plot(-log10(1:nl/nl), -log10(sort(hw.ex)), cex=0.6, pch=16, col='red',
xlab='theoretical p-val dist', ylab='emp p-val dist exact') ; abline(c(0,1))
plot(-log10(1:nl/nl), -log10(sort(hw.mp)), cex=0.6, pch=16, col='red',
xlab='theoretical p-val dist', ylab='emp p-val dist exact-mp') ; abline(c(0,1))
par(mfrow=c(1,1))
### TIPS
# ';' allows you to specify two commands in one line
With the exact test, the p-values do align to the \(y=x\) line, without filtering on maf. Without correction, the p-values tend to be slightly underestimated. The mid p-value correction makes the the empirical p-values closer to the \(y=x\) line
# read vcf (care for path!)
ch22 <- read.VCF("../chr22_Mb0_20.recode.vcf.gz")
## ped stats and snps stats have been set.
## 'p' has been set.
## 'mu' and 'sigma' have been set.
# description file
samp.desc.file <- "https://www2.unil.ch/popgen/teaching/SISG18/integrated_call_samples_v3.20130502.ALL.panel"
samp.desc <- read.table(samp.desc.file,header=TRUE)
# subset EAS
EAS <- which(samp.desc$super_pop=="EAS")
plot(ch22[EAS,]@p, ch22[EAS,]@snps$hz, col="black", pch=16, cex=0.6, xlab='p',ylab='het',
main='Het~p for EAS')
lines(x, 2*x*(1-x), col="orange")
hw.mp.EAS <- HWExactStats(cbind(ch22[EAS,]@snps$N0, ch22[EAS,]@snps$N1, ch22[EAS,]@snps$N2), midp=TRUE)
nl <- length(hw.mp.EAS) # length of of p-values
plot(-log10(1:nl/nl), -log10(sort(hw.mp.EAS)), cex=0.6, pch=16, xlab='theoretical p-val distr',
ylab='emp p-val dist exact-mp', main='QQplot for EAS' )
abline(c(0,1))
# index outliers
outliers <- which(-log10(hw.mp.EAS) > 6)
plot(ch22[EAS,]@p, ch22[EAS,]@snps$hz, col="black", pch=16, cex=0.6, xlab='p', ylab='Het')
lines(x, 2*x*(1-x), col="orange")
points(ch22[EAS,outliers]@p, ch22[EAS,outliers]@snps$hz, col="red",pch=16,cex=0.6)
The first (QQ-)plot confirms that many loci are not in Hardy Weinberg equilibrium. The second shows that the loci not in HW equilibrium are those further away from the expected orange line, and are not clustered at low maf. These loci should be removed at the Quality Control (QC) step.
ni <- 100
f <- 0.125
pchisq(qchisq(0.95,df=1), df=1, ncp=ni*f^2, lower=FALSE)
## [1] 0.239527
#density of chisq with ncp nf2
x <- seq(0.2,20,0.1)
plot(x, dchisq(x,df=1), type="h", col='#FF000080',lwd =2,
xlab=expression(chi^2), ylab="probability density") # chisq prob dens
lines(x, dchisq(x,df=1,ncp=ni*f^2), type="h", col='#0000FF80', lwd=2) # with ncp
legend('topright', lty=rep(1,2), col=c('#FF000080','#0000FF80'), lwd=c(2,2),legend=c('no ncp','ncp = nf^2'))
abline(v=qchisq(0.95,df=1)) # 95th centile of chisq dist
The red (\(H_0: f=0\)) and blue (\(H_1: f=0.125\)) distributions overlap very much and are very difficult to distinguish one from another. The vertical black line shows the 95% quantile of the red distribution, we see that most of the blue distribution is also left of this line (76.05%), and the power is very low (23.95%). Thus, it is almost impossible to detect \(f=0.125\) with a sample of \(100\) individuals
ns <- 1:10*100
round( pchisq(qchisq(0.95,df=1), df=1, ncp=ns*f^2, lower=FALSE), digits=3)
## [1] 0.240 0.424 0.581 0.705 0.798 0.865 0.911 0.942 0.963 0.977
ni <- 500
f <- 0.125
pchisq( qchisq(0.95,df=1), df=1, ncp=ni*f^2, lower=FALSE)
## [1] 0.7981762
plot(x, dchisq(x,df=1), type="h", col="#FF000080", lwd=2,
xlab=expression(chi^2), ylab="probability density")
#density of chisq with ncp nf2
lines(x, dchisq(x,df=1,ncp=ni*f^2), type="h", col="#0000FF80",lwd=2)
legend('topright', lty=rep(1,2), col=c('#FF000080','#0000FF80'),lwd=c(2,2), legend=c('no ncp','ncp = nf^2'))
abline(v=qchisq(0.95,df=1)) # 95th centile of chisq dist
END OF PRACTICAL 2.