#   Heritability of snake vertebral numbers
##  2011 Josef C Uyeda & Stevan J Arnold
##  (revised Jan 2012)
##  Stage 1_Get data into R and estimate heritability by regression
#Get your data into R by following the following steps
#Save the entire Exercise 1.1 directory, including our data file: R_inland_snake_data.txt
#If needed, set your working directory by
#putting the path to the downloaded folder you created in the following statement. 
#If you opened this R session and script from the Exercise directory, you can use
#relative paths and skip this step.
#Be sure and use the / not the \ spacing convention!
setwd('~/Desktop/exercise1-1-20210712T183359Z-001/exercise1-1/') #You may need to set your working directory to where you downloaded the files

## I. Reading and Viewing the Data

#Now read in your data
thamnophis <- read.table('R_inland_snake_data.txt',header=TRUE)
   
#Our dataset contains garter snake scale counts for mothers and their offspring. Scale counts are good 
#for this sort of comparison because they do not vary with ontogeny. Why is this important? Why does 
#this simplify estimating heritability from a parent-offspring regression?
#View(thamnophis)

#Attach the data frame to the local environment. This will work well for our current
#exercise, but it is generally best practice in your own research to AVOID USING ATTACH 
#as it can lead to confusion over which variables are being used. 
attach(thamnophis)

## II. Data preparation
#Create a vector of all the unique family codes

family.list <- unique(family)
family.list
##   [1]  420  437  438  440  494  497  507  511  512  515  516  522  534  540  541
##  [16]  543  545  549  550  554  556  591  676  686  687  693  697  698  703  704
##  [31]  711  712  713  720  724  729  737  895  896  905  911  916  918  920  924
##  [46]  929  934  939  940  947  976  986  987  998 1004 1005 1008 1011 1044 1045
##  [61] 1046 1047 1048 1052 1056 1058 1060 1169 1180 1185 1186 1200 1212 1227  160
##  [76]  249  259  260  265  267  268  274  277  278  279  283  352  355  358  369
##  [91]  370  371  374  375  376  377  379  385  386  452  470  474  482  485  486
## [106]  904  906  908  917  921  922  923  926  927  930  931  933  935  936  937
## [121]  938  941  942  943  944  946  948  949  950  951  954  981  999 1006 1010
## [136] 1016 1018 1019 1022 1024 1027 1054 1073 1080 1088 1090 1102 1144 1162 1166
## [151] 1167 1178 1181 1182 1195 1198 1199 1209 1210 1214 1215
#Here, each family represents a gravid female snake that was caught and all of her offspring. The 
#first entry for a given family is the mother's trait data. The subsequent entries are her kids. 
#We will use `dplyr` to reorganize the dataset into two sets of columns for each family calculating
#the mother's and the kids' trait means. Male snakes tend to have higher vertebral counts than female 
#snakes. This dataset has already had the effects of sex removed (which explains why some kids have 
#fractional scale counts).

#We will now calculate the heritability of the body vertebrae. We will need to 
#do two things, create a vector of all of the mom's vertebrae counts, and create
#a vector of all the kid's vertebrae counts averaged over families. 

moms <- NULL
kids <- NULL

#Go through a loop with i equal to a each successive value in family.list; copy from 'for' to '}' and paste in R console
for (i in family.list){ 
  byfam <- subset(body,family==i) #Create a vector of body vertebrae counts for family i
  moms <- rbind(moms,byfam[1]) #Save the moms count as the next element in the vector "moms"
  kids <- rbind(kids,mean(byfam[-1], na.rm=TRUE)) #Save the mean of the kids count as the next element in the vector "kids" (you are excluding the 1st element of the vector, which is the mom)
}

## III. Estimate heritability
#Now we can estimate heritabilities from a parent-offspring regression. Plot the mother and offspring body vertebrae 
#and fit a linear regression. Then add a trendline from the linear regression fit. 
#Note that the second line will not run without you filling in the appropriate equation. We will save a bunch of stuff in lm1; 
#to see the summary statistics run "summary(lm1)", we especially want the slope, which is stored in lm1$coefficients.
#stop("You will need to perform a linear regression here")
plot(moms, kids, col='blue', pch=19, cex=1.5)
lm1 <- lm(kids ~ moms)# Fill in the blank using the function lm
abline(lm1, lty=2, lwd=5) #Add regression line to the plot

summary(lm1)
## 
## Call:
## lm(formula = kids ~ moms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7707 -1.5516 -0.0373  1.7237  8.0121 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 110.40290   10.30783  10.711  < 2e-16 ***
## moms          0.33906    0.06204   5.465  1.9e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.698 on 149 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.167,  Adjusted R-squared:  0.1614 
## F-statistic: 29.87 on 1 and 149 DF,  p-value: 1.898e-07
#Questions:
#What parameter is associated with the heritability? 
#How do we obtain our estimate of the heritability for these data (remember the relatedness)? 
#Is this narrow-sense or broad-sense heritability?


#Obtain the estimate of the slope from the coefficients in lm1:
slope <- lm1$coefficients[2] 
#the heritability estimate is then (replace ??? with equations)
h2body <- 2*slope #???
Vpbody <- var(moms, na.rm=TRUE) # Could also include offspring, but non-independent samples
Vabody <- h2body * Vpbody#???  # Use equation for heritability

## IV. Repeat the analysis for tail (and other variables).
#Calculate the heritability of "tail". Try to do it programmatically if you can, with either a 
#for loop or a function.
  
  
## V. Estimating across traits with a function
##Make a function that can quickly calculate the heritability for each trait
#In the user-defined function h2, V is the name of the trait whose heritability will be estimated
h2=function(trait, data=thamnophis, plot=FALSE){
  family.list=unique(family)
  V = thamnophis[, trait]#define V as the vector of the trait data. "trait" will be a character vector giving the column name
    #to be used
  moms=NULL
  kids=NULL
  for (i in family.list){ 
    byfam <- subset(V, family==i) #Create a vector of body vertebrae counts for family i
    moms <- rbind(moms,byfam[1]) #Save the moms count as the next element in the vector "moms"
    kids <- rbind(kids,mean(byfam[-1], na.rm=TRUE)) #Save the mean of the kids count as the next element in the vector "kids" (you are excluding the 1st element of the vector, which is the mom)
  }
  lm1 <- lm(kids ~ moms)# Fill in the blank using the function lm
  
  if(plot){
    plot(moms, kids, col='blue', pch=19, cex=1.5)
    abline(lm1, lty=2, lwd=5)
  }
  
  
  slope=unname(lm1$coeff[2])
  h.squared= 2*slope #??? #Calculate h2 from the slope
  h.squared #return the heritability
}

#test new function called h2:
h2("body")
## [1] 0.6781192
h2("ilab")
## [1] 0.07257756
h2("tail")
## [1] 0.4611018
traits <- colnames(thamnophis)[3:8] #All trait names
sapply(traits, h2)
##        body        tail         mid        ilab        slab        post 
##  0.67811916  0.46110175  0.25110435  0.07257756 -0.08889161  0.35312629
## VI. Uncertainty - Bootstrapping
#How much confidence should we have in these estimates? One way to estimate
#uncertainty and confidence limits with minimal distributional assumptions is
#to conduct a bootstrap.

#Bootstrap the heritability of "body"
plots = FALSE #Turn to `TRUE` if you want to see the bootstrap samples plotted`
family.list=unique(family)
h2.all=NULL
for (j in 1:1000){
  moms=NULL
  kids=NULL
  family.list.boot=sample(family.list,replace=TRUE)
  for (i in family.list.boot){
    byfam=subset(body,family==i) 
    moms=rbind(moms,byfam[1]) 
    kids=rbind(kids,mean(byfam[-1],na.rm=TRUE))
  }
  
  lmfit=lm(kids ~ moms)
  slope=lmfit$coeff[2]
  h.squared=2*slope
  #Save the heritability of replicate i as the ith element in vector h2.all
  h2.all=rbind(h2.all,h.squared)  
  if(plots){
    #Graph showing the variation in points sampled
    plot(moms,kids,xlim=c(150,180),ylim=c(150,180)) 
    #Superimpose on that graph the regression ls
    abline(lmfit$coefficients)
    Sys.sleep(0.1)
  }
}

#Plot a histogram of the heritability estimates and corresponding summary stats 
hist(h2.all, xlim=c(-0.3, 1.3)) 
mean(h2.all)
## [1] 0.6843534
abline(v=mean(h2.all), lwd=5, lty=2)

var(h2.all) #Our estimate of the sampling variance of the mean
##            moms
## moms 0.01723203
sd(h2.all)  #Our estimate of the standard error of the mean
## [1] 0.1312708
##Create a program h2boot that calculates bootstrap values so we can get the value for #each variable quickly; copy the following statements all the way thru the 3rd "}" and #paste into console
h2boot=function(trait, data=thamnophis, boot.reps=1000, plots=FALSE){
  h2.all=NULL
  family.list=data$family
  V <- data[,trait]
  for (j in 1:boot.reps){
    moms=NULL
    kids=NULL
    family.list.boot=sample(family.list,replace=TRUE) #This is the key to bootstrapping
    for (i in family.list.boot){
      byfam=subset(V,family==i) 
      moms=rbind(moms,byfam[1]) 
      kids=rbind(kids,mean(byfam[-1],na.rm=TRUE))
    }
    
    lmfit=lm(kids ~ moms)
    slope=lmfit$coeff[2]
    h2=2*slope
    #Save the heritability of replicate i as the ith element in vector h2.all
    h2.all=rbind(h2.all,h2)  
    if(plots){
      plot(moms,kids,xlim=c(min(V,na.rm=TRUE),max(V,na.rm=TRUE)),ylim=c(min(V,na.rm=TRUE),max(V,na.rm=TRUE))) #Graph showing the variation in points sampled
      #Superimpose on that graph the regression ls
      abline(lmfit$coefficients, lty=2, lwd=5)
      Sys.sleep(0.1)
    }
  }
  hist(h2.all, xlim=c(-0.3, 1.3), breaks=boot.reps/20, main=trait) 
  abline(v=mean(h2.all), lwd=3, lty=2, col="red")
  #Use a double-headed arrow so that out is a global variable not just a local variable
  out<<-list(mean=mean(h2.all),sd=sd(h2.all)) 
  out
}

#Use 'h2boot(tail)' to start the program on 'tail'; hit 'ESC' to end the cycle of boot plots
#but if you hit 'ESC' the histogram and stats won't appear
h2boot("tail", boot.reps=100)

## $mean
## [1] 0.5318461
## 
## $sd
## [1] 0.05101016
par(mfrow=c(3,2))
traits <- colnames(thamnophis)[3:8]
res <- lapply(traits, h2boot)

#Here, we ignored variation in the sizes of families. How should that affect our analyses? 
#End of exercise: What have we learned how to do? This is admittedly a simplistic method of estimating
#heritability and additive genetic variance. We will learn about more sophisticated tools in the lecture
#and lab with Patrick Carter. How might our estimates be biased or an inaccurate estimate of the true
#heritability and additive genetic variation? How could the analysis potentially be improved to get a 
#better estimate?