Section 7 Vis–NIR modelling and predictions
A prediction model based on vis–NIR spectra was fitted for each soil property using the selected optimal calibration subset. Each model was developed using a memory-based learning (MBL) algorithm (from the resemble
package). Please check the paper for additional details on the MBL lagorithm used (see section Vis–NIR modelling and predictions subsections 1-3).
7.1 Calibration and validation
First define some basic aspects of the MBL which will be commmon to all the properties for which NIR modeling is going to be performed.
## Add to the train data a variable indicating to what sampling point each
## sample belongs to. This variable will be used internally by the mbl
## function during the internal validations.
train$samplegroup <- substr(train$ID, 2, 100)
## Dissimilarity metric: partial least squares (pls)
dmetric <- "pls"
## Threshold distances to be tested Note: the number of threshold distances
## tested here is large, it could be reduced
diss2test <- seq(0.3, 1.5, by = 0.05)
## Define the minimum and maximum number of neighbors that must be retained
## for each local model. Example: If with a given threshold distance only 70
## neighbors are selected, then the function will be fored to take the
## especified minimum number of neighbors to be included in the model. In
## this case 120. We also set that the maximum number of neighbors that can
## be included can be all the samples in the calibration set.
kminmax <- c(120, nrow(train$spc))
## Regression method: Weighted average pls (wapls)
rmethod <- "wapls1"
## set the maximum and minimum number of pls factors for the local wapls
## regressions
pls.f <- c(minpls = 5, maxpls = 20)
## Adjust some additional parameters that control some aspects of the mbl
ctrl <- mblControl(sm = dmetric, pcSelection = list("opc", 40), valMethod = "NNv",
returnDiss = TRUE, scaled = FALSE, center = TRUE)
We also create some data.frames
were we are going to store the prediction results for the validation set
## Create a data frame and store the validation results Squared correlation
## coefficient (R2) Root mean squared error (RMSE) Mean error (ME) (particle
## size predictions are also back-transformed to the original space to report
## the R2, RMSE and ME).
valrestuls <- data.frame(Property = c("Ca", "alr_clay", "alr_silt", "Clay",
"Silt", "Sand"), RMSE = NA, R2 = NA, ME = NA)
## Create a data frame and store the variance of the residuals for each layer
## (this will be later used for geostatistical analyses)
residualvariances <- data.frame(Property = c("Ca", "alr_Clay", "alr_Silt"),
layerA = NA, layerB = NA, layersAB = NA)
Now we perfom MBL and predictions for soil exchangeable Ca2+:
## Run the MBL and find the optimal threshold distance for neighbor selection
## and predict Ca for validation - MBL fits a wapls model to each sample in
## Xu using the calibration samples {Yr,Xr} - The internal validation
## predictions these are predictions of the nearest samples of each Xu found
## in Xr - Nearest neighbor validation (NNv) -
sbl_ca <- mbl(Yr = train$Ca, Xr = train$spc, Xu = valida$spc, mblCtrl = ctrl,
group = train$samplegroup, dissUsage = "none", k.diss = diss2test, k.range = kminmax,
pls.c = pls.f, method = rmethod)
sbl_ca
## Best threshold distance
idx.best.ca <- which.min(sbl_ca$nnValStats$st.rmse)
best.kdiss.ca <- sbl_ca$nnValStats$k.diss[idx.best.ca]
## Get the predicted values for the validation set
pred_valCa <- getPredictions(sbl_ca)[, idx.best.ca]
## R2
valrestuls$R2[valrestuls$Property == "Ca"] <- cor(valida$Ca, pred_valCa)^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "Ca"] <- mean((valida$Ca - pred_valCa)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "Ca"] <- mean(valida$Ca - pred_valCa)
## Residual variances layer A
residualvariances$layerA[residualvariances$Property == "Ca"] <- var(valida$Ca[valida$layer ==
"A"] - pred_valCa[valida$layer == "A"])
## layer B
residualvariances$layerB[residualvariances$Property == "Ca"] <- var(valida$Ca[valida$layer ==
"B"] - pred_valCa[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[residualvariances$Property == "Ca"] <- var(valida$Ca -
pred_valCa)
MBL and predictions for \(alr(clay)\)…
sbl_alrclay <- mbl(Yr = train$alr_Clay, Xr = train$spc, Xu = valida$spc, mblCtrl = ctrl,
group = train$samplegroup, dissUsage = "none", k.diss = diss2test, k.range = kminmax,
pls.c = pls.f, method = rmethod)
## Best threshold distance
idx.best.alrclay <- which.min(sbl_alrclay$nnValStats$st.rmse)
best.kdiss.alrclay <- sbl_alrclay$nnValStats$k.diss[idx.best.alrclay]
## Get the predicted values for the validation set
pred_val_alrclay <- getPredictions(sbl_alrclay)[, idx.best.alrclay]
## R2
valrestuls$R2[valrestuls$Property == "alr_clay"] <- cor(valida$alr_Clay, pred_val_alrclay)^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "alr_clay"] <- mean((valida$alr_Clay -
pred_val_alrclay)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "alr_clay"] <- mean(valida$alr_Clay - pred_val_alrclay)
## Residual variances layer A
residualvariances$layerA[residualvariances$Property == "alr_Clay"] <- var(valida$alr_Clay[valida$layer ==
"A"] - pred_val_alrclay[valida$layer == "A"])
## layer B
residualvariances$layerB[residualvariances$Property == "alr_Clay"] <- var(valida$alr_Clay[valida$layer ==
"B"] - pred_val_alrclay[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[residualvariances$Property == "alr_Clay"] <- var(valida$alr_Clay -
pred_val_alrclay)
…and \(alr(silt)\)…
sbl_alrsilt <- mbl(Yr = train$alr_Silt, Xr = train$spc, Xu = valida$spc, mblCtrl = ctrl,
group = train$samplegroup, dissUsage = "none", k.diss = diss2test, k.range = kminmax,
pls.c = pls.f, method = rmethod)
## Best threshold distance
idx.best.alrsilt <- which.min(sbl_alrsilt$nnValStats$st.rmse)
best.kdiss.alrsilt <- sbl_alrsilt$nnValStats$k.diss[idx.best.alrsilt]
## Get the predicted values for the validation set
pred_val_alrsilt <- getPredictions(sbl_alrsilt)[, idx.best.alrsilt]
## R2
valrestuls$R2[valrestuls$Property == "alr_silt"] <- cor(valida$alr_Clay, pred_val_alrsilt)^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "alr_silt"] <- mean((valida$alr_Clay -
pred_val_alrsilt)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "alr_silt"] <- mean(valida$alr_Clay - pred_val_alrsilt)
## Residual variances layer A
residualvariances$layerA[residualvariances$Property == "alr_Silt"] <- var(valida$alr_Silt[valida$layer ==
"A"] - pred_val_alrsilt[valida$layer == "A"])
## layer B
residualvariances$layerB[residualvariances$Property == "alr_Silt"] <- var(valida$alr_Silt[valida$layer ==
"B"] - pred_val_alrsilt[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[residualvariances$Property == "alr_Silt"] <- var(valida$alr_Silt -
pred_val_alrsilt)
Save the table of the variance of the residuals into your working directory. This data will be later used in the spatial analyses.
write.table(x = residualvariances, file = "vnir_residual_variances.txt", sep = "\t",
row.names = FALSE)
To asses the accuracies and precisions of the predictions (in the validation set) for particle-size distribution, we need to back-transform the additive log-ratio transformed variables to the original clay, silt and sand contents:
## Alr back-transformations Clay contents
valClay_alr <- 100 * exp(pred_val_alrclay)/(1 + exp(pred_val_alrclay) + exp(pred_val_alrsilt))
## Silt contents
valSilt_alr <- 100 * exp(pred_val_alrsilt)/(1 + exp(pred_val_alrclay) + exp(pred_val_alrsilt))
## Sand contents
valSand_alr <- 100 * 1/(1 + exp(pred_val_alrclay) + exp(pred_val_alrsilt))
Now we can compute the parameters to asses accuracies and precisions:
## Clay content R2
valrestuls$R2[valrestuls$Property == "Clay"] <- (cor(valida$Clay, valClay_alr))^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "Clay"] <- mean((valida$Clay - valClay_alr)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "Clay"] <- mean(valida$Clay - valClay_alr)
## Silt content R2
valrestuls$R2[valrestuls$Property == "Silt"] <- (cor(valida$Silt, valSilt_alr))^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "Silt"] <- mean((valida$Silt - valSilt_alr)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "Silt"] <- mean(valida$Silt - valSilt_alr)
## Sand content R2
valrestuls$R2[valrestuls$Property == "Sand"] <- (cor(valida$Sand, valSand_alr))^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "Sand"] <- mean((valida$Sand - valSand_alr)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "Sand"] <- mean(valida$Sand - valSand_alr)
Examine the valrestuls
object…
valrestuls
7.2 Property predictions in the prediction set
After validating the vis-NIR models we can apply them to the prediction set.
We can start by creating a data.frame
where the predictions will be stored. In this data.frame
the predicted values will be stored under the following variable names: Ca_spec
, alr_Clay_spec
and alr_Silt_spec
:
vnirpredictions <- data.frame(Ca_spec = rep(NA, nrow(pred)), alr_Clay_spec = rep(NA,
nrow(pred)), alr_Silt_spec = rep(NA, nrow(pred)))
vnirpredictions <- data.frame(Ca_spec = rep(NA, nrow(pred)), alr_Clay_spec = rep(NA,
nrow(pred)), alr_Silt_spec = rep(NA, nrow(pred)))
## Add additional relevant information from the original prediction set
vnirpredictions <- cbind(pred[, c("ID", "POINT_X", "POINT_Y", "set", "Ca", "Clay",
"Silt", "Sand", "alr_Clay", "alr_Silt")], vnirpredictions)
## Re-encode the set variable to 'prediction'
vnirpredictions$set <- factor("prediction")
For exchangeable Ca2+ predictions…
# Predict Ca in the prediction set based on the optimized threshold distance
# for neighbor selection
sbl_ca_pred <- mbl(Yr = train$Ca, Xr = train$spc, Xu = pred$spc, mblCtrl = ctrl,
group = train$samplegroup, dissUsage = "none", k.diss = best.kdiss.ca, k.range = kminmax,
pls.c = pls.f, method = rmethod)
## get the predicted values and store them in nirpredictions
vnirpredictions$Ca_spec <- getPredictions(sbl_ca_pred)[, 1]
For \(alr(clay)\) predictions…
sbl_alrclay_pred <- mbl(Yr = train$alr_Clay, Xr = train$spc, Xu = pred$spc,
mblCtrl = ctrl, group = train$samplegroup, dissUsage = "none", k.diss = best.kdiss.alrclay,
k.range = kminmax, pls.c = pls.f, method = rmethod)
## get the predicted values and store them in nirpredictions
vnirpredictions$alr_Clay_spec <- getPredictions(sbl_alrclay_pred)[, 1]
For \(alr(silt)\) predictions…
sbl_alrsilt_pred <- mbl(Yr = train$alr_Silt, Xr = train$spc, Xu = pred$spc,
mblCtrl = ctrl, group = train$samplegroup, dissUsage = "none", k.diss = best.kdiss.alrsilt,
k.range = kminmax, pls.c = pls.f, method = rmethod)
## get the predicted values and store them in nirpredictions
vnirpredictions$alr_Silt_spec <- getPredictions(sbl_alrsilt_pred)[, 1]
Examine the vnirpredictions
object…
vnirpredictions
summary(vnirpredictions)