Prepare "Root type, root architecture" data from TRY for use
The Root type, root architecture data from TRY informs on root growth types.
If you intend to clean more than one or two traits, we recommend the use of the batch pre-processing script. Refer to the TRY main page for details.
If you have questions, suggestions, spot errors, or want to contribute, get in touch with us through planthub@idiv.de.
Author: David Schellenberger Costa
Requirements
To run the script, the following is needed:
TRY data, available here
the data.table library may need to be installed
Code
# load in libraries
library(data.table) # handle large datasets
# clear workspace
# note: ls() only lists objects whose names do not start with a dot, so
# dot-prefixed objects such as `.brd` (used for the working directory below)
# are not removed by this call
rm(list = ls())
Let’s get the TRY data
# move into the PlantHub project folder (adapt this!)
# `.brd` is expected to hold the base directory path -- presumably it is
# defined outside this script; verify for your setup
plantHubDir <- paste0(.brd, "PlantHub")
setwd(plantHubDir)
# load the TRY table from the compressed file (adapt this!)
TRY <- fread("TRY_PlantHub.gz")
# keep only the records of the trait handled in this script
TRYSubset <- TRY[TraitName == "Root type, root architecture"]
To get an overview of the data, we convert the values to lowercase, count how often each distinct value occurs, and show the counts as a table sorted by frequency.
# take the original value strings and lowercase them right away to ease the
# later classification (oriVals == original values)
oriVals <- tolower(TRYSubset$OrigValueStr)
# count how often each distinct value occurs
valueOverview <- table(oriVals)
# show the counts, ordered from the rarest to the most frequent value
sort(valueOverview)
We first remove the root complexity data, because it can’t be compared to the other data. For the remainder, some decoding is necessary here. The actual information on what is coded can be found in the References and Comments columns of the respective datasets. Finally, we remove all remaining numeric values.
# decode coded entries
# some datasets store the root type as a number or as yes/no; the meaning of
# these codes is described in the References and Comments columns of the
# respective datasets
# remove root complexity entries (dataset 115) because hard to compare
oriVals[TRYSubset$DatasetID == 115] <- NA
# decode dataset 37 (numeric codes 1 to 4)
# comparing the character values with a number works because R converts the
# number to a string before comparing
# NOTE(review): the text for code 1 is not covered by any of the search
# patterns defined later, so these entries end up without a category --
# confirm that this is intended
oriVals[TRYSubset$DatasetID == 37 & oriVals == 1] <- "roots with little absorptive function"
oriVals[TRYSubset$DatasetID == 37 & oriVals == 2] <- "roots adventitious, hair-like"
oriVals[TRYSubset$DatasetID == 37 & oriVals == 3] <- "intermediate, with more than one thickened root"
# the text for code 4 deliberately starts with "taproot": the taproot search
# pattern used later is anchored at the start of the string, so a text like
# "one dominant, long taproot" would not be assigned to any category
oriVals[TRYSubset$DatasetID == 37 & oriVals == 4] <- "taproot, one dominant and long"
# decode dataset 668 (numeric codes 1 to 6); only codes 4 and 6 are kept,
# all other codes are discarded
oriVals[TRYSubset$DatasetID == 668 & oriVals == 1] <- NA
oriVals[TRYSubset$DatasetID == 668 & oriVals == 2] <- NA
oriVals[TRYSubset$DatasetID == 668 & oriVals == 3] <- NA
oriVals[TRYSubset$DatasetID == 668 & oriVals == 4] <- "fibrous roots"
oriVals[TRYSubset$DatasetID == 668 & oriVals == 5] <- NA
oriVals[TRYSubset$DatasetID == 668 & oriVals == 6] <- "taproot systems"
# decode yes/no entries; what "yes" and "no" refer to differs between datasets
oriVals[TRYSubset$DatasetID == 445 & oriVals == "no"] <- "no cluster roots"
oriVals[TRYSubset$DatasetID == 445 & oriVals == "yes"] <- "cluster roots"
oriVals[TRYSubset$DatasetID == 243 & oriVals == "no"] <- "no taproots"
oriVals[TRYSubset$DatasetID == 243 & oriVals == "yes"] <- "taproots"
# NOTE(review): only "no" is decoded for dataset 319 -- confirm that "yes"
# does not occur in this dataset
oriVals[TRYSubset$DatasetID == 319 & oriVals == "no"] <- "no cluster roots"
# remove purely numeric values and others that have no lowercase character included
oriVals[!grepl("[[:lower:]]", oriVals)] <- NA
The most important part of the cleaning process is the definition of the search strings to look for. We use regular expressions in some cases to be more inclusive (or exclusive).
# create a vector containing the search strings to look for
# every entry is a regular expression that is matched against the lowercased,
# decoded value strings; the order of the entries matters, because the
# category names are assigned to the search results by position afterwards
searchNames <- c(
"(^tap ?roots?|root tap|^tap$)", # "^" anchors at the start, so "no taproots" is not matched here
"heartroot|intermediate|var\\.root", # "\\." matches a literal dot
"fibrous",
"shallow?|lateral", # the "?" makes the final "w" optional
"adventitious",
"^cluster", # "^" anchors at the start, so "no cluster roots" is not matched here
"contractile",
"no taproots",
"no cluster roots",
"absent|no root"
)
We can now search for the strings defined before and give names to the new categories.
# test every value string against every search pattern; with more than one
# value this gives a logical matrix with one row per value and one column per
# pattern
searchResults <- sapply(searchNames, function(pattern) grepl(pattern, oriVals))
# label the columns with the final category names (same order as searchNames)
categoryLabels <- c(
  "taproot", "heart-root", "fibrous roots", "shallow roots",
  "adventitious roots", "cluster roots", "contractile roots",
  "no taproot", "no cluster roots", "no root"
)
colnames(searchResults) <- categoryLabels
# number of values assigned to each category
colSums(searchResults)
Let’s have a look at the results.
# number of values assigned to each category
colSums(searchResults)
# number of categories each entry was assigned to
matchesPerEntry <- rowSums(searchResults)
# original entries without any match, ordered by how often they occur
sort(table(oriVals[matchesPerEntry < 1]))
# number of entries that were not matched to any category
sum(matchesPerEntry < 1)
# number of entries that were matched to more than one category
sum(matchesPerEntry > 1)
As some of the categories are exclusive, we remove ambiguous entries.
# remove contradictory entries
# 1) an entry flagged as "no root" that also matched any other category is
#    dropped completely (all of its categories are reset)
noRootCol <- grep("no root", colnames(searchResults))
conflictsWithNoRoot <- searchResults[, noRootCol] & rowSums(searchResults) > 1
searchResults[conflictsWithNoRoot, ] <- FALSE
# 2) for the taproot and the cluster root categories, an entry that matched
#    more than one of the columns containing the keyword loses those matches;
#    its other categories are kept
for (keyword in c("taproot", "cluster")) {
  keywordCols <- grep(keyword, colnames(searchResults))
  contradictory <- rowSums(searchResults[, keywordCols]) > 1
  searchResults[contradictory, keywordCols] <- FALSE
}
We can now use the cleaned results data to create a new data vector.
# use the searchResults matrix to create new value strings by concatenating all data found
# for every entry, the names of all categories it was assigned to are joined
# with commas; the result is a character vector with one element per row of
# searchResults
# (the category names are taken directly from the matrix columns, so no
# further helper objects are needed)
categoryNames <- colnames(searchResults)
newVals <- vapply(
  seq_len(nrow(searchResults)),
  function(x) paste(categoryNames[searchResults[x, ]], collapse = ","),
  character(1)
)
# entries without any category yield an empty string; mark them as missing
newVals[newVals == ""] <- NA
We can now transfer the data into the TRY data frame.
# integrate into TRY
# the cleaned values are written by reference into the column CleanedValueStr,
# but only in the rows of this trait; newVals has to hold one value per
# selected row, in the same row order as the subset it was derived from
TRY[TraitName == "Root type, root architecture", CleanedValueStr := newVals]
Although not necessary, we may want to simplify the trait name.
# shorten the trait name to "Root type" for all rows of this trait
# (modifies the TraitName column by reference)
TRY[TraitName == "Root type, root architecture", TraitName := "Root type"]
Let’s write the data to a file.
# build a file name that carries the current date and write the complete
# TRY table to it
outputFile <- paste0("TRY_processed_", Sys.Date(), ".gz")
fwrite(TRY, file = outputFile)