Section 4 Sampling

4.1 Optimal calibration set size identification

In this section we show how the optimal size for a calibration set is identified using the mean squared Euclidean distance ($msd$) between estimates of the probability density functions ($pdfs$) of the whole set of samples and the pdfs of subsets with different sizes.
First we compute the principal components (PCs) of the NIR spectra of the whole set of calibration candidate samples. Then for each component we compute kernel density estimates ($KDE$):

## Apply standard normal variate 
data$spc_snv <- standardNormalVariate(data$spc)

## extract the validation samples into a new set/object
valida <- data[data$set == "validation",]

## remove the validation samples from data
data <- data[data$set == "cal_candidate",]

## --- 2. Perform a principal component analysis  ----
## Compress the data 
pcaall <- orthoProjection(Xr = data$spc_snv, 
                          X2 = NULL, 
                          Yr = NULL, 
                          method = "pca", 
                          pcSelection = list("cumvar", 0.99), 
                          center = TRUE, 
                          scaled = FALSE)
                          
## standardize the socres
pcaall$scores.std <- sweep(pcaall$scores, MARGIN = 2, STATS = pcaall$sc.sdv, FUN = "/")

## compute the max and min of each score for the limits of the density estimations
max.sc <- colMins(pcaall$scores.std)
min.sc <- colMaxs(pcaall$scores.std)

## compute the mean and sd of each score for the comparisons with the samples
mean.sc <- colMeans(pcaall$scores.std)
sd.sc <- colSds(pcaall$scores.std)

# number of points in the density distribution
nxdens <- 500

## matrix where the density values will be stored
ix <- 1
sc.dens <- data.frame(seq(min.sc[ix], max.sc[ix], length = nxdens), 
                      matrix(NA, nxdens, length(min.sc)))
colnames(sc.dens) <- c("x", paste("densc", 1:length(min.sc), sep = ""))

## Kernel density estimates (KDE) of each component 
d.bandwidths <- rep(NA, length(min.sc))
names(d.bandwidths) <- colnames(pcaall$scores.std)
for(i in 1:length(min.sc)){
  idsty <- density(pcaall$scores.std[,i], 
                   bw = "nrd0", 
                   n = nxdens, from = min.sc[i], to = max.sc[i], 
                   kernel = "gaussian")
  sc.dens[,i+1] <- idsty$y
  d.bandwidths[i] <- idsty$bw
}

## Create the vector of different sample set sizes to be tested
css <- seq(10, 400, by = 10)

Now we will iterate over the different calibration set sizes (css). We are going to use the LHS algorithm to select subsets from our set of candiate samples for calibartion. This is done using the (standardized) scores of the principal components (PCs) of the spectra (pcaall$scores.std). For each calibration subset we’ll compute the mean squared Euclidean distance ($msd$) between estimates of the probability density functions ($pdfs$) of the whole set of samples and the $pdfs$ of samples in the subset. The $msd$ is computed using kernel density estimates ($KDE$) of the $pdfs$. To obtain reliable estimates of $msd$ as a function of the sample set size, we will repeat 10 times (repetitions) the whole sampling procedure for each css, the final $msd$ will be the the average of the $msd$s obtained at each iteration. The following pseudo-code summarizes the procedure to compute the $msd$ (see the ‘Calibration samples’ section in our paper for more details):

Input:  PCs of the whole set of candidate samples (p);
        KDEs of the PCs of the whole set of candidate samples;
Output: msd

1 for each repetition do:
2   for each proposed sampling size do:
3     select from the PCs a subset (s);
4     for each component in s and p:
5       compute the msd between KDE(s,component) and KDE(p,component);
6     end
7   end
8 end

9 aggregate the results of the msds obatined for all 
the repetition iterations for each sample set size

First we’ll compute the $msd$s for ecah repetition and we will write each set of results into our working directory.

## Sample with LHS (latin hypercube sampling)

## These three nested loops might take a while
## (aprox. 16 min per repetition, i.e. a bit more than a couple of hours 
## for the whole thing)
## (for reducing the computation time the loops
## can be vectorized using the apply family of functions. 
## Furtheromre parallelization of the loop for the repetitions
## can also be applied)
## We present the computations with nested loops for interpretability 
## reasons

repetitions <- 10

## root name for the results of each iteration
filerootname <- "6pcs_resultsclhs_rep"

## Create the data.frame where the results will be stored 
## at each iteration
results.clhs <-  data.frame(css = css, 
                            msd = rep(NA, length(css)),
                            mndiff = rep(NA, length(css)),
                            sddiff = rep(NA, length(css)))
for(k in 1:repetitions){
  results.clhs[,-1] <- NA
  
  ## Define a file name
  fn <- paste(filerootname, k,".txt", sep = "")
  if(fn %in% list.files()){
    results.clhs <- read.table(fn, header = T, sep = "\t")
  }
  
  iter.p <- 1 + sum(rowSums(!is.na(results.clhs)) == ncol(results.clhs))
  
  for(i in iter.p:length(css)){
    set.seed(k)
    i.calidx <- clhs(x = as.data.frame(pcaall$scores.std),
                     size = css[i], 
                     iter = 10000)
    
    ## Compute the KDEs of each PC
    j.sc.dens <- msd.sc  <- sc.dens 
    for(j in 1:length(min.sc)){
      ## use the same bandwidth (bw) as in the whole set of candidates
      j.sc.dens[,j + 1] <- density(x = pcaall$scores.std[i.calidx,j], 
                                   bw = d.bandwidths[j], 
                                   n = nxdens, 
                                   from = min.sc[j], 
                                   to = max.sc[j], 
                                   kernel = "gaussian")$y
    }
    results.clhs$msd[i] <- mean(colMeans((j.sc.dens[,-1] - sc.dens[,-1])^2, na.rm = T))
    results.clhs$mndiff[i] <- mean(abs(colMeans(pcaall$scores.std[i.calidx,])))
    results.clhs$sddiff[i] <- mean(abs(colSds(pcaall$scores.std[i.calidx,]) - 1))
    
    ## write the results obtained so far...
    if(iter == length(nmsrepsclhs)){
      final.clhs <- final.clhs/iter
      write.table(x = final.clhs, 
                  file = "6pcs_final.clhs.txt", 
                  sep = "\t", 
                  row.names = FALSE)
    }
    
    print(results.clhs[1:i,])
  }
}

After executing the above nested loops, you should have obtained 10 different *.txt files in your working directorty (in this case the root name of the files is 6pcs_resultsclhs_rep[n].txt where [n] represents the repetition number). These files contain the $msd$ results obtained at each repetition iteration. Now we’ll calculate the final $msd$ as the average of the ones obtained at each repetition iteration.

## Specify a file name for the file where the final results will be stored
finalresultsfile <- "6pcs_final.clhs.txt"

nmsrepsclhs <- paste(filerootname, 1:repetitions, ".txt", sep = "")
final.clhs <- 0
for(i in nmsrepsclhs){
  iter <- which(i == nmsrepsclhs)
  results.clhs <- read.table(i, header = T, sep = "\t")
  results.clhs$mndiff <- abs(results.clhs$mndiff)
  final.clhs <- final.clhs + results.clhs

  ## write a table with the final results 
  if(iter == length(nmsrepsclhs)){
    final.clhs <- final.clhs/iter
    write.table(x = final.clhs, 
                file = finalresultsfile, 
                sep = "\t", 
                row.names = FALSE)
  }
}

## Compute the standard devitions of the msd results
final.clhs_sd <- 0
for(i in nmsrepsclhs){
  iter <- which(i == nmsrepsclhs)
  results.clhs <- read.table(i, header = T, sep = "\t")
  results.clhs$mndiff <- abs(results.clhs$mndiff)
  final.clhs_sd <- (results.clhs - final.clhs_sd)^2
  if(iter == length(nmsrepsclhs)){
    final.clhs_sd <- (final.clhs_sd/iter)^0.5
  }
}

After executing the above code, a file with the final $msd$ estimations is writen in your working directory. In addition the standard deviations of the $msd$ for the different set sizes was also computed and stored in the object final.clhs_sd.

4.2 Plotting the results

To plot the $msd$ results using ggplot (Figure 3 in our paper):

final.clhs_plot <- data.frame(final.clhs[,1:2], 
                              sd_lower = final.clhs[,2] - final.clhs_sd[,2],
                              sd_upper = final.clhs[,2] + final.clhs_sd[,2])

p.tmp <- ggplot(final.clhs_plot) + geom_line(aes(x = css, msd, colour = "msd")) + 
  theme_bw() + 
  theme(axis.title.y = element_text(face= "bold.italic", colour = grey(0.2), size=18),
        axis.text.y  = element_text(angle=0, vjust =0.5, hjust =0.5, size=14), 
        legend.title = element_text(colour = "white", size=20)) +
  theme(axis.title.x = element_text(face= "bold", colour = grey(0.2), size=18),
        axis.text.x  = element_text(angle = 0, vjust=0, size=14)) +
  theme(legend.position = "none") + 
  theme(legend.text = element_text(face="bold", colour = grey(0.2), size=18)) +
  labs(y = "msd", x = "Calibration set size") +
  #coord_cartesian(ylim = c(0, 8)) + 
  theme(strip.background = element_rect(fill = "grey"), 
        strip.text.x = element_text(size = 16, colour = "black", angle = 0)) 
        
p.tmp + 
  geom_ribbon(aes(ymin=sd_lower, ymax=sd_upper, x=css, colour = "bands"), alpha = 0.2) +
  scale_colour_manual(name = '', values = c("bands" = NA, "msd" = "black"))

4.3 Final selection of the calibration set

The previous analysis allows to infer an optimal calibration set size. This optimal size is indicated by the point at which no substantial reduction of the $msd$ is observed. In our case this assestment was intuetively done by looking at the $msd$ vs. $css$ plot. We set the optimal calibration set size (ocss) 180 samples:

# Optimal calibration sample set size
ocss <- 180

Once the optimal calibration set size is identified, we can propose/sample different calibration sets (solutions) containing 180 samples. In our paper we proposed 10 different solutions and we selected the one that returned the mimimum $msd$ (this subset will be then used as the final calibration set in later analyses):

## Compute the maximum and minimum of each score for the limits of the
## density estimations
max.sc <- colMins(pcaall$scores.std)
min.sc <- colMaxs(pcaall$scores.std)

## Compute the mean and standard deviation of each score for the comparisons
## with the samples
mean.sc <- colMeans(pcaall$scores.std)
sd.sc <- colSds(pcaall$scores.std)

## Set a number of solutions to test
solutions <- 10

## object where the sample indices of the different solutions will be stored
calidx <- NULL

## object where the msds will be stored
results.clhs.ocss <- rep(NA, length = solutions)

for (i in 1:solutions) {
                    ## set seed for the random number generation
                    set.seed(round(i * exp(4)))
                    
                    ## sample the ith solution
                    i.calidx <- clhs(x = as.data.frame(pcaall$scores.std), size = ocss, iter = 10000)
                    
                    ## store the indices of the selected samples
                    calidx[[i]] <- i.calidx
                    
                    ## Compute the KDEs
                    j.sc.dens <- msd.sc <- sc.dens
                    for (j in 1:length(min.sc)) {
                                        j.sc.dens[, j + 1] <- density(x = pcaall$scores.std[i.calidx, j], bw = d.bandwidths[j], 
                                                            n = nxdens, from = min.sc[j], to = max.sc[j], kernel = "gaussian")$y
                    }
                    ## compute the msds
                    results.clhs.ocss[i] <- mean(colMeans((j.sc.dens[, -1] - sc.dens[, -1])^2, 
                                        na.rm = T))
}

## Identify the index of the best group
plot(results.clhs.ocss)

## best solution
which.min(results.clhs.ocss)

## get the indices of the samples in the final solution
calibration.idx <- calidx[[which.min(results.clhs.ocss)]]

## get the IDs of the samples in the final solution
cal_smpls <- as.character(data$ID[calibration.idx])

Save the IDs of the selected calibration samples into your working directory

writeLines(text = cal_smpls, con = "calibration_samples_ids.txt", sep = "\n")