Section 7 Vis–NIR modelling and predictions


A prediction model based on vis–NIR spectra was fitted for each soil property using the selected optimal calibration subset. Each model was developed using a memory-based learning (MBL) algorithm (from the resemble package). Please check the paper for additional details on the MBL lagorithm used (see section Vis–NIR modelling and predictions subsections 1-3).

7.1 Calibration and validation

First define some basic aspects of the MBL which will be commmon to all the properties for which NIR modeling is going to be performed.

## Add to the train data a variable indicating to what sampling point each
## sample belongs to. This variable will be used internally by the mbl
## function during the internal validations.
train$samplegroup <- substr(train$ID, 2, 100)

## Dissimilarity metric: partial least squares (pls)
dmetric <- "pls"

## Threshold distances to be tested Note: the number of threshold distances
## tested here is large, it could be reduced
diss2test <- seq(0.3, 1.5, by = 0.05)

## Define the minimum and maximum number of neighbors that must be retained
## for each local model.  Example: If with a given threshold distance only 70
## neighbors are selected, then the function will be fored to take the
## especified minimum number of neighbors to be included in the model. In
## this case 120.  We also set that the maximum number of neighbors that can
## be included can be all the samples in the calibration set.
kminmax <- c(120, nrow(train$spc))

## Regression method: Weighted average pls (wapls)
rmethod <- "wapls1"

## set the maximum and minimum number of pls factors for the local wapls
## regressions
pls.f <- c(minpls = 5, maxpls = 20)

## Adjust some additional parameters that control some aspects of the mbl
ctrl <- mblControl(sm = dmetric, pcSelection = list("opc", 40), valMethod = "NNv", 
                    returnDiss = TRUE, scaled = FALSE, center = TRUE)

We also create some data.frames were we are going to store the prediction results for the validation set

## Create a data frame and store the validation results Squared correlation
## coefficient (R2) Root mean squared error (RMSE) Mean error (ME) (particle
## size predictions are also back-transformed to the original space to report
## the R2, RMSE and ME).
valrestuls <- data.frame(Property = c("Ca", "alr_clay", "alr_silt", "Clay", 
                    "Silt", "Sand"), RMSE = NA, R2 = NA, ME = NA)

## Create a data frame and store the variance of the residuals for each layer
## (this will be later used for geostatistical analyses)
residualvariances <- data.frame(Property = c("Ca", "alr_Clay", "alr_Silt"), 
                    layerA = NA, layerB = NA, layersAB = NA)

Now we perfom MBL and predictions for soil exchangeable Ca2+:

## Run the MBL and find the optimal threshold distance for neighbor selection
## and predict Ca for validation - MBL fits a wapls model to each sample in
## Xu using the calibration samples {Yr,Xr} - The internal validation
## predictions these are predictions of the nearest samples of each Xu found
## in Xr - Nearest neighbor validation (NNv) -
sbl_ca <- mbl(Yr = train$Ca, Xr = train$spc, Xu = valida$spc, mblCtrl = ctrl, 
                    group = train$samplegroup, dissUsage = "none", k.diss = diss2test, k.range = kminmax, 
                    pls.c = pls.f, method = rmethod)
sbl_ca

## Best threshold distance
idx.best.ca <- which.min(sbl_ca$nnValStats$st.rmse)
best.kdiss.ca <- sbl_ca$nnValStats$k.diss[idx.best.ca]

## Get the predicted values for the validation set
pred_valCa <- getPredictions(sbl_ca)[, idx.best.ca]

## R2
valrestuls$R2[valrestuls$Property == "Ca"] <- cor(valida$Ca, pred_valCa)^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "Ca"] <- mean((valida$Ca - pred_valCa)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "Ca"] <- mean(valida$Ca - pred_valCa)

## Residual variances layer A
residualvariances$layerA[residualvariances$Property == "Ca"] <- var(valida$Ca[valida$layer == 
                    "A"] - pred_valCa[valida$layer == "A"])
## layer B
residualvariances$layerB[residualvariances$Property == "Ca"] <- var(valida$Ca[valida$layer == 
                    "B"] - pred_valCa[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[residualvariances$Property == "Ca"] <- var(valida$Ca - 
                    pred_valCa)

MBL and predictions for \(alr(clay)\)

sbl_alrclay <- mbl(Yr = train$alr_Clay, Xr = train$spc, Xu = valida$spc, mblCtrl = ctrl, 
                    group = train$samplegroup, dissUsage = "none", k.diss = diss2test, k.range = kminmax, 
                    pls.c = pls.f, method = rmethod)

## Best threshold distance
idx.best.alrclay <- which.min(sbl_alrclay$nnValStats$st.rmse)
best.kdiss.alrclay <- sbl_alrclay$nnValStats$k.diss[idx.best.alrclay]

## Get the predicted values for the validation set
pred_val_alrclay <- getPredictions(sbl_alrclay)[, idx.best.alrclay]

## R2
valrestuls$R2[valrestuls$Property == "alr_clay"] <- cor(valida$alr_Clay, pred_val_alrclay)^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "alr_clay"] <- mean((valida$alr_Clay - 
                    pred_val_alrclay)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "alr_clay"] <- mean(valida$alr_Clay - pred_val_alrclay)

## Residual variances layer A
residualvariances$layerA[residualvariances$Property == "alr_Clay"] <- var(valida$alr_Clay[valida$layer == 
                    "A"] - pred_val_alrclay[valida$layer == "A"])
## layer B
residualvariances$layerB[residualvariances$Property == "alr_Clay"] <- var(valida$alr_Clay[valida$layer == 
                    "B"] - pred_val_alrclay[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[residualvariances$Property == "alr_Clay"] <- var(valida$alr_Clay - 
                    pred_val_alrclay)

…and \(alr(silt)\)

sbl_alrsilt <- mbl(Yr = train$alr_Silt, Xr = train$spc, Xu = valida$spc, mblCtrl = ctrl, 
                    group = train$samplegroup, dissUsage = "none", k.diss = diss2test, k.range = kminmax, 
                    pls.c = pls.f, method = rmethod)

## Best threshold distance
idx.best.alrsilt <- which.min(sbl_alrsilt$nnValStats$st.rmse)
best.kdiss.alrsilt <- sbl_alrsilt$nnValStats$k.diss[idx.best.alrsilt]

## Get the predicted values for the validation set
pred_val_alrsilt <- getPredictions(sbl_alrsilt)[, idx.best.alrsilt]

## R2
valrestuls$R2[valrestuls$Property == "alr_silt"] <- cor(valida$alr_Clay, pred_val_alrsilt)^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "alr_silt"] <- mean((valida$alr_Clay - 
                    pred_val_alrsilt)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "alr_silt"] <- mean(valida$alr_Clay - pred_val_alrsilt)

## Residual variances layer A
residualvariances$layerA[residualvariances$Property == "alr_Silt"] <- var(valida$alr_Silt[valida$layer == 
                    "A"] - pred_val_alrsilt[valida$layer == "A"])
## layer B
residualvariances$layerB[residualvariances$Property == "alr_Silt"] <- var(valida$alr_Silt[valida$layer == 
                    "B"] - pred_val_alrsilt[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[residualvariances$Property == "alr_Silt"] <- var(valida$alr_Silt - 
                    pred_val_alrsilt)

Save the table of the variance of the residuals into your working directory. This data will be later used in the spatial analyses.

write.table(x = residualvariances, file = "vnir_residual_variances.txt", sep = "\t", 
                    row.names = FALSE)

To asses the accuracies and precisions of the predictions (in the validation set) for particle-size distribution, we need to back-transform the additive log-ratio transformed variables to the original clay, silt and sand contents:

## Alr back-transformations Clay contents
valClay_alr <- 100 * exp(pred_val_alrclay)/(1 + exp(pred_val_alrclay) + exp(pred_val_alrsilt))
## Silt contents
valSilt_alr <- 100 * exp(pred_val_alrsilt)/(1 + exp(pred_val_alrclay) + exp(pred_val_alrsilt))
## Sand contents
valSand_alr <- 100 * 1/(1 + exp(pred_val_alrclay) + exp(pred_val_alrsilt))

Now we can compute the parameters to asses accuracies and precisions:

## Clay content R2
valrestuls$R2[valrestuls$Property == "Clay"] <- (cor(valida$Clay, valClay_alr))^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "Clay"] <- mean((valida$Clay - valClay_alr)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "Clay"] <- mean(valida$Clay - valClay_alr)

## Silt content R2
valrestuls$R2[valrestuls$Property == "Silt"] <- (cor(valida$Silt, valSilt_alr))^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "Silt"] <- mean((valida$Silt - valSilt_alr)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "Silt"] <- mean(valida$Silt - valSilt_alr)

## Sand content R2
valrestuls$R2[valrestuls$Property == "Sand"] <- (cor(valida$Sand, valSand_alr))^2
# RMSE
valrestuls$RMSE[valrestuls$Property == "Sand"] <- mean((valida$Sand - valSand_alr)^2)^0.5
# ME
valrestuls$ME[valrestuls$Property == "Sand"] <- mean(valida$Sand - valSand_alr)

Examine the valrestuls object…

valrestuls

7.2 Property predictions in the prediction set

After validating the vis-NIR models we can apply them to the prediction set. We can start by creating a data.frame where the predictions will be stored. In this data.frame the predicted values will be stored under the following variable names: Ca_spec, alr_Clay_spec and alr_Silt_spec:

vnirpredictions <- data.frame(Ca_spec = rep(NA, nrow(pred)), alr_Clay_spec = rep(NA, 
                    nrow(pred)), alr_Silt_spec = rep(NA, nrow(pred)))

vnirpredictions <- data.frame(Ca_spec = rep(NA, nrow(pred)), alr_Clay_spec = rep(NA, 
                    nrow(pred)), alr_Silt_spec = rep(NA, nrow(pred)))

## Add additional relevant information from the original prediction set
vnirpredictions <- cbind(pred[, c("ID", "POINT_X", "POINT_Y", "set", "Ca", "Clay", 
                    "Silt", "Sand", "alr_Clay", "alr_Silt")], vnirpredictions)

## Re-encode the set variable to 'prediction'
vnirpredictions$set <- factor("prediction")

For exchangeable Ca2+ predictions…

# Predict Ca in the prediction set based on the optimized threshold distance
# for neighbor selection
sbl_ca_pred <- mbl(Yr = train$Ca, Xr = train$spc, Xu = pred$spc, mblCtrl = ctrl, 
                    group = train$samplegroup, dissUsage = "none", k.diss = best.kdiss.ca, k.range = kminmax, 
                    pls.c = pls.f, method = rmethod)
## get the predicted values and store them in nirpredictions
vnirpredictions$Ca_spec <- getPredictions(sbl_ca_pred)[, 1]

For \(alr(clay)\) predictions…

sbl_alrclay_pred <- mbl(Yr = train$alr_Clay, Xr = train$spc, Xu = pred$spc, 
                    mblCtrl = ctrl, group = train$samplegroup, dissUsage = "none", k.diss = best.kdiss.alrclay, 
                    k.range = kminmax, pls.c = pls.f, method = rmethod)
## get the predicted values and store them in nirpredictions
vnirpredictions$alr_Clay_spec <- getPredictions(sbl_alrclay_pred)[, 1]

For \(alr(silt)\) predictions…

sbl_alrsilt_pred <- mbl(Yr = train$alr_Silt, Xr = train$spc, Xu = pred$spc, 
                    mblCtrl = ctrl, group = train$samplegroup, dissUsage = "none", k.diss = best.kdiss.alrsilt, 
                    k.range = kminmax, pls.c = pls.f, method = rmethod)
## get the predicted values and store them in nirpredictions
vnirpredictions$alr_Silt_spec <- getPredictions(sbl_alrsilt_pred)[, 1]

Examine the vnirpredictions object…

vnirpredictions
summary(vnirpredictions)