-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathS2_OTNOT_FVT2021_v2.Rmd
244 lines (189 loc) · 8.33 KB
/
S2_OTNOT_FVT2021_v2.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
---
title: "AJB2021_FVT_CEPF_Analysis"
author: "Robert L. Baker"
email: "[email protected]"
affiliation: "https://rlbakerlab.com"
date: "12/23/2020"
abstract: "This document includes an example dataset, code for fitting a logistic growth curve to FVT, generating plots, and then analyzing the data using a parameters as data approach."
tags: [R, FVT, Brassica, AJB, Function-Valued Trait, American Journal of Botany, Development, High throughput phenotyping, HTP]
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
r<-getOption("repos")
r["CRAN"]<-"https://cloud.r-project.org"
options(repos= r)
if(!require(ply)){
install.packages("plyr", dependencies=TRUE)
}
if(!require(lubridate)){
install.packages("lubridate", dependencies=TRUE)
}
```
## Data Background and Explanation
These data were generated at the CEPF at Purdue University in 2020 as part of a larger experiment. Here we analyze a subset of the data from two genotypes of <em>Brassica rapa</em> (L58 & R500) in a single control treatment and the single response variable "TopPlantSurface". Plants were grown in a consistent, growth chamber environment and watering was automated. Each weekday, plants exited the growth chamber on an automated conveyor belt and passed through an array of phenotyping equipment. Data analyzed here are from a camera aimed directly down at the center of each pot. TopPlantSurface is generated from RGB image data and is extracted from the image using custom software specific to the CEPF (but remarkably reminiscent of openCV or PlantCV). TopPlantSurface consists of all the pixels identified as "Plant" from this overhead view and is a likely proxy for plant growth, photosynthetic area, biomass, etc. Also note that these are not raw data; obviously erroneous data points have been dropped. For instance, on one day the watering regime was off and the plants wilted, leading to abnormally small values for "TopPlantSurface". One replicate appeared to be mislabeled and was also dropped from the analysis outlined below.
#### Prep for analysis:
```{r startup, results="hide", message=FALSE}
rm(list=ls()) #remove all objects from the R environment
gc() #can help return memory to R after large objects have been called.
#load necessary packages
library(plyr)
library(lubridate)
library(car)
library(tidyverse)
#load the data (from Sumplemtental Information S1 of this publication)
dat<-read.csv(file="CEPF_TopSurface2020.csv")
```
#### Convert data collection date from mm/dd/yy to "days from planting".
Unfortunately we don't have germination date, which would be better than planting date. Luckily for these genotypes there is typically very little difference in days to germination.
```{r dateconversions}
#take a look at the data
head(dat)
#convert date formats from mm/dd/yy to Julian days:
dat$PlantDate<-as.Date(dat$PlantDate, format="%m/%d/%Y")
dat$PlantDate<-format(dat$PlantDate, "%j")
dat$PlantDate<-as.numeric(dat$PlantDate)
dat$DataDate<-as.Date(dat$DataDate, format="%m/%d/%Y")
dat$DataDate<-format(dat$DataDate, "%j")
dat$DataDate<-as.numeric(dat$DataDate)
#generate a new column, "DataDate", which is the time between planting and the observation, in days
dat$DataDate<-dat$DataDate-dat$PlantDate
#take a look at the data again:
head(dat)
```
#### Visual data inspection:
```{r plot_raw_data}
#dat$DataDate<-as.factor(dat$DataDate) #convert to factor
#get max and min values to set y-axis limits on plots:
min(dat$TopPlantSurface, na.rm=TRUE) #minimum value
max(dat$TopPlantSurface, na.rm=TRUE) #maximum value
dat_split<-split(dat, dat$PlantID) #split the dataframe into multiple dataframes, one for each individual plant (PlantID) and place all of these dataframes in a list of dataframes.
j<-unique(dat$PlantID) #j is a list of all the individual plant IDs
length(j) # the number of plants in the analysis
```
```{r plot data}
plot(NULL, xlim=c(0,48), ylim=c(0,125089), ylab="TopSurface (pixels)", xlab="Days after Planting", main="Plant size during development") #set up an empty plot
#generate a plot where each color is a different plant for data inspection purposes:
for(i in 1:length(j)){
yvalues<-dat_split[[i]][7]
y2values<-yvalues$TopPlantSurface
xvalues<-dat_split[[i]][6]
x2values<-xvalues$DataDate
main<-dat_split[[i]][1]
geno<-dat_split[[i]][2]
points(x2values,y2values, pch=19, col=i)
}
```
#### Fit a logistic growth curve to a single plant
Once you are satisfied that your data can be fit with a specific function, in this case a logistic growth curve, you can fit the curve to the data. First, test this out the curve fitting procedure with an individual plant:
```{r logit_tester, message=FALSE}
#Grab just one plant:
tester<-dat_split[[2]]
head(tester)
#plot it just to make sure:
plot(tester$DataDate, tester$TopPlantSurface)
#fit a self-starting logistic growth model using non-linear regression.
plant.ss<-nls(TopPlantSurface ~ SSlogis(DataDate, phi1, phi2, phi3), data=tester)
#phi1: upper asymptote
#phi2: value of x where y=0.5*phi1 (inflection point)
#phi3: rate parameter
#model summary:
summary(plant.ss)
```
```{r do them all}
#write a function to do the logistic growth curve model fits:
log.fit<-function(TopPlantSurface, DataDate, tester){
y<-tester[,"TopPlantSurface"]
x<-tester[,"DataDate"]
log.ss<-nls(y~SSlogis(x, phi1, ph2, phi3))
A<-summary(log.ss)$coef[1] #phi1: upper asymptote
I<-summary(log.ss)$coef[2] #phi2: value of x where y=0.5*phi1 (inflection point)
r<-summary(log.ss)$coef[3] #phi3: rate parameter
plot(y~x, main="logistic function", xlab="Days Since Planting", ylab="TopPlantSurface (pixels)")
lines(0:max(x), predict(log.ss, data.frame(x=0:max(x))), col="red")
out<-data.frame(cbind(c(Asymptoe=A, Inflection=I, rate=r)))
names(out)[1]<-"Logistic Curve"
return(out)
}
log.fit(TopPlantsurface, DataDate, tester)
```
#### Calculate and plot logistic growth curves for all individual plants
Also, extract parameters describing the logistic growth curve for each plant
```{r do_them_all, message=FALSE}
#Set up some empty lists
ID<-NULL
Geno<-NULL
Max<-NULL
Infl<-NULL
Growth<-NULL
#Set up a plot
plot(1, type="n", xlim=c(9,43), ylim=c(0,120000), main="TopPlantSurface", ylab="TopPlantSurface (pixels)", xlab="Days since Planting")
legend(x="topleft", title="Genotype", legend=c("L58 (open circles)", "R500 (filled circles)"),
pch=21, col=c("cyan", "black"), pt.bg=c("white", "magenta"))
for(i in 1:length(j)){
tester<-dat_split[[i]]
y<-tester[,"TopPlantSurface"]
x<-tester[,"DataDate"]
#fit logistic growth curves to each L58 plant, individually:
if(tester[2,2]=="L58"){
log.ss<-nls(y~SSlogis(x, phi1, phi2, phi3))
#extract parameters & confidence intervals
max<-Confint(log.ss)[[1]]
infl<-Confint(log.ss)[[2]]
growth<-Confint(log.ss)[[3]]
#plot L58 genotypes (open circles):
points(y~x, pch=21, col=i)
lines(0:45, predict(log.ss, data.frame(x=0:45)), col=i)
}
#fit logistic growth curves to each R500 plant, individually:
if(tester[2,2]=="R500"){
log.ss<-nls(y~SSlogis(x, phi1, ph2, ph3))
max<-Confint(log.ss)[[1]]
infl<-Confint(log.ss)[[2]]
growth<-Confint(log.ss)[[3]]
#plot R500 genotypes (closed circles):
points(y~x, pch=21, col="black", bg=i)
lines(0:45, predict(log.ss, data.frame(x=0:45)), col=i)
}
geno<-tester$Genotype[1]
ID<-c(ID,tester[1,1])
Geno<-c(Geno,geno)
Max<-c(Max,max)
Infl<-c(Infl, infl)
Growth<-c(Growth, growth)
}
```
#### Satisfied with the fits, time to work with downstream analyses:
Using a "parameters as data" approach.
```{r downstream analyses}
#dataframe of extracted logistic growth parameters
out<-data.frame(cbind(ID, Geno, Max, Infl, Growth))
out$Geno<-as.factor(out$Geno)
#Basic linear models for each parameter:
Maxmod<-lm(Max~Geno, data=out)
summary(Maxmod)
Inflmod<-lm(Infl~Geno, data=out)
summary(Inflmod)
Growthmod<-lm(Growth~Geno, data=out)
summary(Growthmod)
```
#### Documentation and citations:
```{r, citations, echo=FALSE}
cite<-citation()[1]
print(cite, bibtex=FALSE)
shortRversion <- function() {
rvs <- R.version.string
if(grepl("devel", (st <- R.version$status)))
rvs <- sub(paste0(" ",st," "), "-devel_", rvs, fixed=TRUE)
gsub("[()]", "", gsub(" ", "_", sub(" version ", "-", rvs)))
}
shortRversion()
lub<-citation("lubridate")
print(lub, bibtex=FALSE)
car<-citation("car")
print(car, bibtex=FALSE)
plyr<-citation("plyr")
print(plyr, bibtex=FALSE)
tidy<-citation("tidyverse")
print(tidy, bibtex=FALSE)
```