--- title: "Population structure" author: "Jerome Goudet" date: '`r Sys.Date()`' output: pdf_document: toc: yes html_document: number_sections: yes toc: yes toc_depth: 3 --- reproduces slide 146 and illustrates the different shapes a beta distribution can take: ```{r} x<-1:1000/1000 par(mfrow=c(3,3)) plot(x,dbeta(x,1,1),xlab="1,1",ylab="",type="l") plot(x,dbeta(x,2,2),xlab="2,2",ylab="",type="l") plot(x,dbeta(x,4,4),xlab="4,4",ylab="",type="l") plot(x,dbeta(x,2,1),xlab="2,1",ylab="",type="l") plot(x,dbeta(x,4,1),xlab="4,1",ylab="",type="l") plot(x,dbeta(x,1,4),xlab="1,4",ylab="",type="l") plot(x,dbeta(x,.9,.9),xlab=".9,.9",ylab="",type="l") plot(x,dbeta(x,.5,.5),xlab=".5,.5",ylab="",type="l") plot(x,dbeta(x,.5,4),xlab=".5,4",ylab="",type="l") ``` Now load the `hierfstat` library and the FBI dataset, and do a few initial calculations ```{r} library(hierfstat) #points to the folder with dataset setwd("c:/Users/jgoudet/Dropbox/Teaching/SISG2015/") fbiwh02<-read.table("./fbiwh02.txt",header=TRUE) head(fbiwh02) ###some exploration of the dataset bs.fbi<-basic.stats(fbiwh02) names(bs.fbi) bs.fbi$n.ind.samp # nb inds per pop bs.fbi$Hs #gene diversities bs.fbi$overall #different sum stats wcfbi<-wc(fbiwh02) names(wcfbi) wcfbi$FST wcfbi$FIS ``` Individual pca of the fbi dataset ```{r} fbi.ipca<-indpca(fbiwh02) plot(fbi.ipca,col=fbiwh02[,1]) ``` Estimation of the $\beta$'s, slides 151 and followings ```{r} (bfbi<-betas(fbiwh02)) names(bfbi) #now looking at pairs of populations (bfbi12<-betas(fbiwh02[fbiwh02[,1]!=3,])) (bfbi13<-betas(fbiwh02[fbiwh02[,1]!=2,])) (bfbi23<-betas(fbiwh02[fbiwh02[,1]!=1,])) ``` Let's move to hapmap data. We will use only data from Chromosome 2. You should have downloaded and uncompressed the data file \verb"hm_chr2_l.txt.gz" before proceeding ```{r} ############################################### dat<-data.frame(matrix(scan("./hm_chr2_l.txt",skip=1),byrow=TRUE,nrow=209)) rn<-scan("./hm_chr2_l.txt",nlines=1,what=character()) names(dat)<-rn loc.pos<-read.table("pos_loc_chr2.txt",header=T) lpos<-loc.pos[(1:(dim(loc.pos)[1]%/%4))*4,2] head(dat[,1:10]) ``` If everything went well, the data has been loaded into R and we can start working. We now estimate $\beta$ for each snp and population, and store it in object `bi`: ```{r} bhmc2<-betas(dat) #takes a while... #check sweep help page, very useful. bi<-t(with(bhmc2,1-sweep(Hi,2,Hb,FUN="/"))) dim(bi) head(bi) ``` Each column is a different population, and each row is a SNP. Let's plot the first column (Europeans) ```{r} plot(lpos,bi[,1],type="l") ``` Looks like a mess... The solution is to use a sliding window, and to obtain a moving average along this sliding windows: ```{r} m<-2500000 #half a windows size i.e 2.5Mb #lapply apply function to each element of a list ul<-lapply(lpos,function(x) max(which(lpos<=min(max(lpos),x+m)))) ll<-lapply(lpos,function(x) min(which(lpos>=max(lpos[1],x-m)))) all<-cbind(unlist(ll),unlist(ul)) head(all) eur<-unlist(apply(all,1,function(x) mean(bi[x[1]:x[2],1],na.rm=T))) #mov average ceu chi<-unlist(apply(all,1,function(x) mean(bi[x[1]:x[2],2],na.rm=T))) #mov avg chb jap<-unlist(apply(all,1,function(x) mean(bi[x[1]:x[2],3],na.rm=T))) #mov avg jpt yor<-unlist(apply(all,1,function(x) mean(bi[x[1]:x[2],4],na.rm=T))) #mov avg yri ``` (Note that we took the averages of the $\beta$. As an exercice, estimate $\beta$s using the sum of numerators divided by the sums of denominators.) Now we are ready to plot the population specific $\beta$ along chromosome 2: ```{r} posmb<-lpos/1000000 #snp position, in megabase nesnp<-which((all[,2]-all[,1])<400) plot(posmb,eur,ylim=range(c(eur,chi,jap,yor)),type="l",col="blue", xlab="Pos Chrom 2 (Mb)",ylab="Pop Fst",lwd=2) lines(posmb,chi,col="green",lwd=2) lines(posmb,jap,col="yellow",lwd=2) lines(posmb,yor,col="red",lwd=2) lines(posmb[nesnp],eur[nesnp],col="white",lwd=2) lines(posmb[nesnp],chi[nesnp],col="white",lwd=2) lines(posmb[nesnp],jap[nesnp],col="white",lwd=2) lines(posmb[nesnp],yor[nesnp],col="white",lwd=2) abline(v=136.570,lwd=2,lty=2) #lactase gene pos omim 603202 abline(v=63.1,lwd=2,lty=2) #prostate cancer gene omim 611868 lines(posmb,(all[,2]-all[,1])/10000,col="grey") #nb snps . ``` Time allowing, look at \emph{Galba truncatula} dataset. Data set described and analysed in Trouve et al. ```{r} data(gtrunchier) table(gtrunchier[,2:1]) vc.gtr<-with(gtrunchier,varcomp.glob(data.frame(Locality,Patch),gtrunchier[,-c(1,2)])) vc.gtr$F ``` Do we see this structure in an individual based PCA? ```{r} x<-indpca(gtrunchier[,-2],ind.labels=gtrunchier[,2]) #ind label is patch of origin plot(x,col=gtrunchier[,1],cex=0.7) # color is population ```