# Heritability of snake vertebral numbers
## 2011 Josef C Uyeda & Stevan J Arnold
## (revised Jan 2012)
## Stage 1_Get data into R and estimate heritability by regression
#Get your data into R by following the following steps
#Save the entire Exercise 1.1 directory, including our data file: R_inland_snake_data.txt
#If needed, set your working directory by
#putting the path to the downloaded folder you created in the following statement.
#If you opened this R session and script from the Exercise directory, you can use
#relative paths and skip this step.
#Be sure and use the / not the \ spacing convention!
setwd('~/Desktop/exercise1-1-20210712T183359Z-001/exercise1-1/') #You may need to set your working directory to where you downloaded the files
## I. Reading and Viewing the Data
#Now read in your data
thamnophis <- read.table('R_inland_snake_data.txt',header=TRUE)
#Our dataset contains garter snake scale counts for mothers and their offspring. Scale counts are good
#for this sort of comparison because they do not vary with ontogeny. Why is this important? Why does
#this simplify estimating heritability from a parent-offspring regression?
#View(thamnophis)
#Attach the data frame to the local environment. This will work well for our current
#exercise, but it is generally best practice in your own research to AVOID USING ATTACH
#as it can lead to confusion over which variables are being used.
attach(thamnophis)
## II. Data preparation
#Create a vector of all the unique family codes
family.list <- unique(family)
family.list
## [1] 420 437 438 440 494 497 507 511 512 515 516 522 534 540 541
## [16] 543 545 549 550 554 556 591 676 686 687 693 697 698 703 704
## [31] 711 712 713 720 724 729 737 895 896 905 911 916 918 920 924
## [46] 929 934 939 940 947 976 986 987 998 1004 1005 1008 1011 1044 1045
## [61] 1046 1047 1048 1052 1056 1058 1060 1169 1180 1185 1186 1200 1212 1227 160
## [76] 249 259 260 265 267 268 274 277 278 279 283 352 355 358 369
## [91] 370 371 374 375 376 377 379 385 386 452 470 474 482 485 486
## [106] 904 906 908 917 921 922 923 926 927 930 931 933 935 936 937
## [121] 938 941 942 943 944 946 948 949 950 951 954 981 999 1006 1010
## [136] 1016 1018 1019 1022 1024 1027 1054 1073 1080 1088 1090 1102 1144 1162 1166
## [151] 1167 1178 1181 1182 1195 1198 1199 1209 1210 1214 1215
#Here, each family represents a gravid female snake that was caught and all of her offspring. The
#first entry for a given family is the mother's trait data. The subsequent entries are her kids.
#We will use `dplyr` to reorganize the dataset into two sets of columns for each family calculating
#the mother's and the kids' trait means. Male snakes tend to have higher vertebral counts than female
#snakes. This dataset has already had the effects of sex removed (which explains why some kids have
#fractional scale counts).
#We will now calculate the heritability of the body vertebrae. We will need to
#do two things, create a vector of all of the mom's vertebrae counts, and create
#a vector of all the kid's vertebrae counts averaged over families.
moms <- NULL
kids <- NULL
#Go through a loop with i equal to a each successive value in family.list; copy from 'for' to '}' and paste in R console
for (i in family.list){
byfam <- subset(body,family==i) #Create a vector of body vertebrae counts for family i
moms <- rbind(moms,byfam[1]) #Save the moms count as the next element in the vector "moms"
kids <- rbind(kids,mean(byfam[-1], na.rm=TRUE)) #Save the mean of the kids count as the next element in the vector "kids" (you are excluding the 1st element of the vector, which is the mom)
}
## III. Estimate heritability
#Now we can estimate heritabilities from a parent-offspring regression. Plot the mother and offspring body vertebrae
#and fit a linear regression. Then add a trendline from the linear regression fit.
#Note that the second line will not run without you filling in the appropriate equation. We will save a bunch of stuff in lm1;
#to see the summary statistics run "summary(lm1)", we especially want the slope, which is stored in lm1$coefficients.
#stop("You will need to perform a linear regression here")
plot(moms, kids, col='blue', pch=19, cex=1.5)
lm1 <- lm(kids ~ moms)# Fill in the blank using the function lm
abline(lm1, lty=2, lwd=5) #Add regression line to the plot
summary(lm1)
##
## Call:
## lm(formula = kids ~ moms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7707 -1.5516 -0.0373 1.7237 8.0121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 110.40290 10.30783 10.711 < 2e-16 ***
## moms 0.33906 0.06204 5.465 1.9e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.698 on 149 degrees of freedom
## (10 observations deleted due to missingness)
## Multiple R-squared: 0.167, Adjusted R-squared: 0.1614
## F-statistic: 29.87 on 1 and 149 DF, p-value: 1.898e-07
#Questions:
#What parameter is associated with the heritability?
#How do we obtain our estimate of the heritability for these data (remember the relatedness)?
#Is this narrow-sense or broad-sense heritability?
#Obtain the estimate of the slope from the coefficients in lm1:
slope <- lm1$coefficients[2]
#the heritability estimate is then (replace ??? with equations)
h2body <- 2*slope #???
Vpbody <- var(moms, na.rm=TRUE) # Could also include offspring, but non-independent samples
Vabody <- h2body * Vpbody#??? # Use equation for heritability
## IV. Repeat the analysis for tail (and other variables).
#Calculate the heritability of "tail". Try to do it programmatically if you can, with either a
#for loop or a function.
## V. Estimating across traits with a function
##Make a function that can quickly calculate the heritability for each trait
#In the user-defined function h2, V is the name of the trait whose heritability will be estimated
h2=function(trait, data=thamnophis, plot=FALSE){
family.list=unique(family)
V = thamnophis[, trait]#define V as the vector of the trait data. "trait" will be a character vector giving the column name
#to be used
moms=NULL
kids=NULL
for (i in family.list){
byfam <- subset(V, family==i) #Create a vector of body vertebrae counts for family i
moms <- rbind(moms,byfam[1]) #Save the moms count as the next element in the vector "moms"
kids <- rbind(kids,mean(byfam[-1], na.rm=TRUE)) #Save the mean of the kids count as the next element in the vector "kids" (you are excluding the 1st element of the vector, which is the mom)
}
lm1 <- lm(kids ~ moms)# Fill in the blank using the function lm
if(plot){
plot(moms, kids, col='blue', pch=19, cex=1.5)
abline(lm1, lty=2, lwd=5)
}
slope=unname(lm1$coeff[2])
h.squared= 2*slope #??? #Calculate h2 from the slope
h.squared #return the heritability
}
#test new function called h2:
h2("body")
## [1] 0.6781192
h2("ilab")
## [1] 0.07257756
h2("tail")
## [1] 0.4611018
traits <- colnames(thamnophis)[3:8] #All trait names
sapply(traits, h2)
## body tail mid ilab slab post
## 0.67811916 0.46110175 0.25110435 0.07257756 -0.08889161 0.35312629
## VI. Uncertainty - Bootstrapping
#How much confidence should we have in these estimates? One way to estimate
#uncertainty and confidence limits with minimal distributional assumptions is
#to conduct a bootstrap.
#Bootstrap the heritability of "body"
plots = FALSE #Turn to `TRUE` if you want to see the bootstrap samples plotted`
family.list=unique(family)
h2.all=NULL
for (j in 1:1000){
moms=NULL
kids=NULL
family.list.boot=sample(family.list,replace=TRUE)
for (i in family.list.boot){
byfam=subset(body,family==i)
moms=rbind(moms,byfam[1])
kids=rbind(kids,mean(byfam[-1],na.rm=TRUE))
}
lmfit=lm(kids ~ moms)
slope=lmfit$coeff[2]
h.squared=2*slope
#Save the heritability of replicate i as the ith element in vector h2.all
h2.all=rbind(h2.all,h.squared)
if(plots){
#Graph showing the variation in points sampled
plot(moms,kids,xlim=c(150,180),ylim=c(150,180))
#Superimpose on that graph the regression ls
abline(lmfit$coefficients)
Sys.sleep(0.1)
}
}
#Plot a histogram of the heritability estimates and corresponding summary stats
hist(h2.all, xlim=c(-0.3, 1.3))
mean(h2.all)
## [1] 0.6843534
abline(v=mean(h2.all), lwd=5, lty=2)
var(h2.all) #Our estimate of the sampling variance of the mean
## moms
## moms 0.01723203
sd(h2.all) #Our estimate of the standard error of the mean
## [1] 0.1312708
##Create a program h2boot that calculates bootstrap values so we can get the value for #each variable quickly; copy the following statements all the way thru the 3rd "}" and #paste into console
h2boot=function(trait, data=thamnophis, boot.reps=1000, plots=FALSE){
h2.all=NULL
family.list=data$family
V <- data[,trait]
for (j in 1:boot.reps){
moms=NULL
kids=NULL
family.list.boot=sample(family.list,replace=TRUE) #This is the key to bootstrapping
for (i in family.list.boot){
byfam=subset(V,family==i)
moms=rbind(moms,byfam[1])
kids=rbind(kids,mean(byfam[-1],na.rm=TRUE))
}
lmfit=lm(kids ~ moms)
slope=lmfit$coeff[2]
h2=2*slope
#Save the heritability of replicate i as the ith element in vector h2.all
h2.all=rbind(h2.all,h2)
if(plots){
plot(moms,kids,xlim=c(min(V,na.rm=TRUE),max(V,na.rm=TRUE)),ylim=c(min(V,na.rm=TRUE),max(V,na.rm=TRUE))) #Graph showing the variation in points sampled
#Superimpose on that graph the regression ls
abline(lmfit$coefficients, lty=2, lwd=5)
Sys.sleep(0.1)
}
}
hist(h2.all, xlim=c(-0.3, 1.3), breaks=boot.reps/20, main=trait)
abline(v=mean(h2.all), lwd=3, lty=2, col="red")
#Use a double-headed arrow so that out is a global variable not just a local variable
out<<-list(mean=mean(h2.all),sd=sd(h2.all))
out
}
#Use 'h2boot(tail)' to start the program on 'tail'; hit 'ESC' to end the cycle of boot plots
#but if you hit 'ESC' the histogram and stats won't appear
h2boot("tail", boot.reps=100)
## $mean
## [1] 0.5318461
##
## $sd
## [1] 0.05101016
par(mfrow=c(3,2))
traits <- colnames(thamnophis)[3:8]
res <- lapply(traits, h2boot)
#Here, we ignored variation in the sizes of families. How should that affect our analyses?
#End of exercise: What have we learned how to do? This is admittedly a simplistic method of estimating
#heritability and additive genetic variance. We will learn about more sophisticated tools in the lecture
#and lab with Patrick Carter. How might our estimates be biased or an inaccurate estimate of the true
#heritability and additive genetic variation? How could the analysis potentially be improved to get a
#better estimate?