---
title: "Population structure"
author: "Jerome Goudet"
date: '`r Sys.Date()`'
output:
  pdf_document:
    toc: yes
  html_document:
    number_sections: yes
    toc: yes
    toc_depth: 3
---

reproduces slide 146 and illustrates the different shapes a beta distribution can take:

```{r}
x<-1:1000/1000
par(mfrow=c(3,3))
plot(x,dbeta(x,1,1),xlab="1,1",ylab="",type="l")
plot(x,dbeta(x,2,2),xlab="2,2",ylab="",type="l")
plot(x,dbeta(x,4,4),xlab="4,4",ylab="",type="l")
plot(x,dbeta(x,2,1),xlab="2,1",ylab="",type="l")
plot(x,dbeta(x,4,1),xlab="4,1",ylab="",type="l")
plot(x,dbeta(x,1,4),xlab="1,4",ylab="",type="l")
plot(x,dbeta(x,.9,.9),xlab=".9,.9",ylab="",type="l")
plot(x,dbeta(x,.5,.5),xlab=".5,.5",ylab="",type="l")
plot(x,dbeta(x,.5,4),xlab=".5,4",ylab="",type="l")

```


Now load the `hierfstat` library and the FBI dataset, and do a few initial calculations


```{r} 
library(hierfstat)
#points to the folder with dataset
setwd("c:/Users/jgoudet/Dropbox/Teaching/SISG2015/") 
fbiwh02<-read.table("./fbiwh02.txt",header=TRUE)
head(fbiwh02)
###some exploration of the dataset
bs.fbi<-basic.stats(fbiwh02)
names(bs.fbi)
bs.fbi$n.ind.samp # nb inds per pop
bs.fbi$Hs #gene diversities
bs.fbi$overall #different sum stats
wcfbi<-wc(fbiwh02)
names(wcfbi)
wcfbi$FST
wcfbi$FIS
```


Individual pca of the fbi dataset

```{r} 
fbi.ipca<-indpca(fbiwh02)
plot(fbi.ipca,col=fbiwh02[,1])
```

Estimation of the $\beta$'s, slides 151 and followings


```{r} 
(bfbi<-betas(fbiwh02))
names(bfbi)
#now looking at pairs of populations
(bfbi12<-betas(fbiwh02[fbiwh02[,1]!=3,]))
(bfbi13<-betas(fbiwh02[fbiwh02[,1]!=2,]))
(bfbi23<-betas(fbiwh02[fbiwh02[,1]!=1,]))
```

Let's move to hapmap data. We will use only data from Chromosome 2.  
You should have downloaded and uncompressed the data file \verb"hm_chr2_l.txt.gz" 
before proceeding

```{r} 
###############################################
dat<-data.frame(matrix(scan("./hm_chr2_l.txt",skip=1),byrow=TRUE,nrow=209))
rn<-scan("./hm_chr2_l.txt",nlines=1,what=character())
names(dat)<-rn
loc.pos<-read.table("pos_loc_chr2.txt",header=T)
lpos<-loc.pos[(1:(dim(loc.pos)[1]%/%4))*4,2]
head(dat[,1:10])
```

If everything went well, the data has been loaded into R and we can start working. We now estimate $\beta$ for each snp and population, and store it in object `bi`:

```{r} 
bhmc2<-betas(dat) #takes a while...
#check sweep help page, very useful.
bi<-t(with(bhmc2,1-sweep(Hi,2,Hb,FUN="/")))
dim(bi)
head(bi)
```

Each column is a different population, and each row is a SNP. Let's plot the first column (Europeans)

```{r} 
plot(lpos,bi[,1],type="l")
```

Looks like a mess... The solution is to use a sliding window, and to obtain a moving average along this sliding windows:

```{r} 
m<-2500000 #half a windows size i.e 2.5Mb
#lapply apply function to each element of a list
ul<-lapply(lpos,function(x) max(which(lpos<=min(max(lpos),x+m))))
ll<-lapply(lpos,function(x) min(which(lpos>=max(lpos[1],x-m))))
all<-cbind(unlist(ll),unlist(ul))
head(all)
eur<-unlist(apply(all,1,function(x) mean(bi[x[1]:x[2],1],na.rm=T))) #mov average ceu
chi<-unlist(apply(all,1,function(x) mean(bi[x[1]:x[2],2],na.rm=T))) #mov avg chb
jap<-unlist(apply(all,1,function(x) mean(bi[x[1]:x[2],3],na.rm=T))) #mov avg jpt
yor<-unlist(apply(all,1,function(x) mean(bi[x[1]:x[2],4],na.rm=T))) #mov avg yri
```

(Note that we took the averages of the $\beta$.  As an exercice, estimate $\beta$s using the sum of numerators divided by the sums of denominators.) 

Now we are ready to plot the population specific $\beta$ along chromosome 2:

```{r} 
posmb<-lpos/1000000 #snp position, in megabase
nesnp<-which((all[,2]-all[,1])<400)
plot(posmb,eur,ylim=range(c(eur,chi,jap,yor)),type="l",col="blue",
     xlab="Pos Chrom 2 (Mb)",ylab="Pop Fst",lwd=2)
lines(posmb,chi,col="green",lwd=2)
lines(posmb,jap,col="yellow",lwd=2)
lines(posmb,yor,col="red",lwd=2)
lines(posmb[nesnp],eur[nesnp],col="white",lwd=2)
lines(posmb[nesnp],chi[nesnp],col="white",lwd=2)
lines(posmb[nesnp],jap[nesnp],col="white",lwd=2)
lines(posmb[nesnp],yor[nesnp],col="white",lwd=2)
abline(v=136.570,lwd=2,lty=2) #lactase gene pos omim 603202
abline(v=63.1,lwd=2,lty=2) #prostate cancer gene omim 611868
lines(posmb,(all[,2]-all[,1])/10000,col="grey") #nb snps .
```

Time allowing, look at \emph{Galba truncatula} dataset. Data set described and analysed in Trouve et al. 

```{r} 
data(gtrunchier)
table(gtrunchier[,2:1])
vc.gtr<-with(gtrunchier,varcomp.glob(data.frame(Locality,Patch),gtrunchier[,-c(1,2)]))
vc.gtr$F
```

Do we see this structure in an individual based PCA? 

```{r} 
x<-indpca(gtrunchier[,-2],ind.labels=gtrunchier[,2]) #ind label is patch of origin
plot(x,col=gtrunchier[,1],cex=0.7) # color is population
```