### TIPS ###
# We first load all the libraries we will need for this practical. 
# Make sure you have those installed 
# If not then please visit https://www2.unil.ch/popgen/teaching/SISG22/
library(gaston)
library(hierfstat)
library(JGTeach)
library(HardyWeinberg)

HW \(\chi^2\) tests

  1. With the pan.txt, create a bed object using hierfstat::ms2bed, and calculate for each locus its inbreeding coefficient; use it to test whether the loci are in Hardy Weinberg Equilibrium; plot \(-log10\) of these p-values against their expectations under the null hypothesis.
Code
# F=1-Ho/He
pan <- ms2bed("../pan.txt")
inb.coeff <- 1 - pan@snps$hz/(2*pan@p*(1-pan@p)) # He = 2p(1-p)
# nb inds
ni <- dim(pan)[1] # or nrow(pan)
x2 <- ni*inb.coeff^2 
p.val.x2 <- pchisq(x2,df=1,lower=FALSE) # check ?pchisq
nl <- dim(pan)[2] # or ncol(pan)
# theoretical dist p.val under null
theo.pval <- (1:nl) / nl
plot(-log10(theo.pval),-log10(sort(p.val.x2)),
     col="red",cex=0.5,xlab="Theo p val dist",
     ylab="emp p-val dist x2", pch=16)
abline(c(0,1))

Is it what you would have expected?

Answers

No, since the simulated population is panmictic, we expect the empirical p-values to match the theoretical ones, i.e. the points should be aligned along the \(y=x\) line of the figure. It is clearly not the case.

  1. redo the same but for loci with minor allele count (mac) of at least 10 (maf \(\geq 0.01\); this filtering “rule” is often used in genomic analysis). Identify, using e.g. a different color, the loci that are not in HWE according to this test on the plot of observed heterozygosity against allele frequencies. Do you see a pattern?
Code
x <- (0:1000) / 1000 # same as seq(0,1,by=0.001)
# indexing on maf >= 1%
maf01 <- which(pan@snps$maf >= 0.01) # indexing of snps with maf >= 0.01
nl <- length(maf01)
theo.pval <- 1:nl / nl
plot(-log10(theo.pval),-log10(sort(p.val.x2[maf01])), 
     col="red",cex=0.5,xlab="Theo p val dist",
     ylab="emp p-val dist x2", pch=16)
abline(c(0,1))

# highlight outliers in het ~ p plot 
plot(pan[,maf01]@p, pan[,maf01]@snps$hz, col="black", pch=16, cex=0.6, ylab = 'Het', xlab='p')
lines(x,2*x*(1-x),col="orange") 
outliers <- which(-log10(p.val.x2[maf01])>4) 
# two step indexing (first maf01 and then those points that are p-val outliers)
points(pan[,maf01][,outliers]@p, pan[,maf01][,outliers]@snps$hz, col="red", pch=16, cex=0.6)

Answers First of all, we see that there are now less points departing from the \(y=x\) line of the QQ-plot, but still many do. The second figure shows that these points are concentrated at low \(\leq 0.1\) or high \(\geq 0.9\) allele frequencies, a hint that our filter (maf \(\leq 0.01\)) is not sufficient to remove ill-behaved loci

The minimum expected number per cell is 1 (5) [the proportion of cells with expected counts lower than 5 should not exceed 20%]

Is this rule working here?

Code
par(mfrow=c(1,2))

# what frequency leads to np^2==1 e.g. p=(1/n)^0.5
# nb inds
ni <- dim(pan)[1]
xi <- 1
mafn1 <- which(pan@snps$maf >= (xi/ni)^.5) 
nl <- length(mafn1)
theo.pval <- 1:nl/nl
plot(-log10(theo.pval),-log10(sort(p.val.x2[mafn1])),
     col="red",cex=0.5,xlab="Theo p val dist", pch = 16,
     ylab="emp p-val dist x2",main=expression(np^2>=1));abline(c(0,1))

#what frequency leads to np^2==5
xi <- 5
mafn5 <- which(pan@snps$maf >= (xi/ni)^.5)
nl <- length(mafn5)
theo.pval <- 1:nl/nl
plot(-log10(theo.pval),-log10(sort(p.val.x2[mafn5])),
     col="red",cex=0.5,xlab="Theo p val dist", pch=16,
     ylab="emp p-val dist x2",main=expression(np^2>=5));abline(c(0,1))

par(mfrow=c(1,1))
Answers

If we set the minimum expected count per cell to one (left panel),we see that we still have a few outlier loci on the QQ-plot (points far away fro the \(y=x\) line), while there should be none.

With the rule \(np^2 \geq 5\), we now have all the points aligned to the \(y=x\) line, meaning that for the \(\chi^2\) test to work, the condition we should use is maf \(\geq \sqrt(5/n)\):

n<-c(1:10*10,2:10*10^2,2:10*10^3,2:10*10^4,2:10*10^5)
plot(n,(5/n)^0.5, type ="l",lwd="2",col="red",xlab="Number of inds", ylab="MAF",main="min MAF for HW chi2 test",log="xy")

We would therefore need at least \(50'000\) individuals to insure valid HW \(\chi^2\) tests with a \(0.01\) maf.

HW exact tests

  1. Now use the function HardyWeinberg::HWExactStats to obtain the exact p-values for these loci, using first the argument midp set to FALSE and then set to TRUE.
Code
# what are we cbinding here? 
hw.ex <- HWExactStats(cbind(pan@snps$N0, pan@snps$N1, pan@snps$N2), midp=FALSE)
hw.mp <- HWExactStats(cbind(pan@snps$N0, pan@snps$N1, pan@snps$N2), midp=TRUE)
nl <- length(hw.mp)

par(mfrow=c(1,2))

plot(-log10(1:nl/nl), -log10(sort(hw.ex)), cex=0.6, pch=16, col='red',
     xlab='theoretical p-val dist', ylab='emp p-val dist exact') ; abline(c(0,1))
plot(-log10(1:nl/nl), -log10(sort(hw.mp)), cex=0.6, pch=16, col='red',
      xlab='theoretical p-val dist', ylab='emp p-val dist exact-mp') ; abline(c(0,1))

par(mfrow=c(1,1))
### TIPS 
# ';' allows you to specify two commands in one line  
Answers

With the exact test, the p-values do align to the \(y=x\) line, without filtering on maf. Without correction, the p-values tend to be slightly underestimated. The mid p-value correction makes the the empirical p-values closer to the \(y=x\) line

  1. using the East Asian samples from the 1000 genome project, plot the SNPs heterozygosity against the frequency of the alternate allele for chr22:0-20M. Add to the plot a line showing the expected heterozygosity under Hardy Weinberg Equilibrium; Discuss the resulting figures, compared to what we saw with the simulated data.
Code
# read vcf (care for path!)
ch22 <- read.VCF("../chr22_Mb0_20.recode.vcf.gz") 
## ped stats and snps stats have been set. 
## 'p' has been set. 
## 'mu' and 'sigma' have been set.
# description file
samp.desc.file <- "https://www2.unil.ch/popgen/teaching/SISG18/integrated_call_samples_v3.20130502.ALL.panel"
samp.desc <- read.table(samp.desc.file,header=TRUE)
# subset EAS
EAS <- which(samp.desc$super_pop=="EAS")
plot(ch22[EAS,]@p, ch22[EAS,]@snps$hz, col="black", pch=16, cex=0.6, xlab='p',ylab='het',
     main='Het~p for EAS')
lines(x, 2*x*(1-x), col="orange")

Answers We observed many loci departing from the orange line, showing either heterozygote deficiency (dots below the orange line), or excess (dots above the orange line). Noteworthy are the points aligning to an “upper triangle”, these correspond to loci with only heterozygous genotypes called, most likely genotyping/calling errors. We note these are occurring in a very well cured data set, emphasizing how difficult it is to call genotypes properly from high trouhput sequencing data.
  1. Test for Hardy Weinberg using the exact mid-pvalue test for all these loci.
Code
hw.mp.EAS <- HWExactStats(cbind(ch22[EAS,]@snps$N0, ch22[EAS,]@snps$N1, ch22[EAS,]@snps$N2), midp=TRUE)
Answers As we would expect from the previous plots, many loci turn out to depart significantly from HW equilibrium
  1. plot the p-values of the previous tests against their expectation under the null (Rather than the p-values, -log10 of the p-values is more informative). Identify on the plot of heterozygosity against allele frequencies the loci not conforming to HWE, and discuss the results.
Code
nl <- length(hw.mp.EAS) # length of of p-values
plot(-log10(1:nl/nl), -log10(sort(hw.mp.EAS)), cex=0.6, pch=16, xlab='theoretical p-val distr', 
                                        ylab='emp p-val dist exact-mp', main='QQplot for EAS' )
abline(c(0,1))

# index outliers 
outliers <- which(-log10(hw.mp.EAS) > 6)
plot(ch22[EAS,]@p, ch22[EAS,]@snps$hz, col="black", pch=16, cex=0.6, xlab='p', ylab='Het')
lines(x, 2*x*(1-x), col="orange")
points(ch22[EAS,outliers]@p, ch22[EAS,outliers]@snps$hz, col="red",pch=16,cex=0.6)
Answers

The first (QQ-)plot confirms that many loci are not in Hardy Weinberg equilibrium. The second shows that the loci not in HW equilibrium are those further away from the expected orange line, and are not clustered at low maf. These loci should be removed at the Quality Control (QC) step.

Power to detect HW departure [optional]

  1. [optional] How likely are we to detect departure from HW if \(f=0.125\) with a sample of \(100\) individuals?
Code
ni <- 100
f <- 0.125
pchisq(qchisq(0.95,df=1), df=1, ncp=ni*f^2, lower=FALSE)
## [1] 0.239527
#density of chisq with ncp nf2

x <- seq(0.2,20,0.1)
plot(x, dchisq(x,df=1), type="h", col='#FF000080',lwd =2,
     xlab=expression(chi^2), ylab="probability density") # chisq prob dens
lines(x, dchisq(x,df=1,ncp=ni*f^2), type="h", col='#0000FF80', lwd=2) # with ncp
legend('topright', lty=rep(1,2), col=c('#FF000080','#0000FF80'), lwd=c(2,2),legend=c('no ncp','ncp = nf^2'))
abline(v=qchisq(0.95,df=1)) # 95th centile of chisq dist 
Answers

The red (\(H_0: f=0\)) and blue (\(H_1: f=0.125\)) distributions overlap very much and are very difficult to distinguish one from another. The vertical black line shows the 95% quantile of the red distribution, we see that most of the blue distribution is also left of this line (76.05%), and the power is very low (23.95%). Thus, it is almost impossible to detect \(f=0.125\) with a sample of \(100\) individuals

Code
ns <- 1:10*100 
round( pchisq(qchisq(0.95,df=1), df=1, ncp=ns*f^2, lower=FALSE), digits=3)
##  [1] 0.240 0.424 0.581 0.705 0.798 0.865 0.911 0.942 0.963 0.977
ni <- 500
f <- 0.125
pchisq( qchisq(0.95,df=1), df=1, ncp=ni*f^2, lower=FALSE)
## [1] 0.7981762
plot(x, dchisq(x,df=1), type="h", col="#FF000080", lwd=2,
     xlab=expression(chi^2), ylab="probability density")
#density of chisq with ncp nf2
lines(x, dchisq(x,df=1,ncp=ni*f^2), type="h", col="#0000FF80",lwd=2) 
legend('topright', lty=rep(1,2), col=c('#FF000080','#0000FF80'),lwd=c(2,2), legend=c('no ncp','ncp = nf^2'))
abline(v=qchisq(0.95,df=1)) # 95th centile of chisq dist 
Answers A minimum of 500 individuals would be necessary to detect departure from HW with a power of 80% when \(f=0.125\) if we only have a single biallelic locus. Fortunately, most studies rely on many more loci, and combining these together allows detection of departure from HW with much lower sample sizes.

END OF PRACTICAL 2.