Improved the current SDA workflow to support the North American runs with 6400 sites. #3340

Open
wants to merge 64 commits into develop
Changes from 43 of 64 commits
81d6c05
add the template for the MCMC qsub job.
Jul 22, 2024
39a886b
added namespaces
Jul 22, 2024
cead5d5
added qsub parallel analysis functions.
Jul 22, 2024
6ce1b39
Merge branch 'develop' of https://github.com/DongchenZ/pecan into dev…
Jul 22, 2024
dc06316
include qsub analysis part into the main function.
Jul 23, 2024
7dc12d3
Update namespace.
Jul 23, 2024
a538129
Update dependencies.
Jul 23, 2024
4c172a1
Update dependencies.
Jul 23, 2024
c7d0ca9
Update namespace.
Jul 23, 2024
f6f1c2a
Apply suggestions.
Jul 23, 2024
cc025a4
Update dependency.
Jul 24, 2024
fecb781
Update dependency.
Jul 24, 2024
b407404
Pull from main branch.
Sep 18, 2024
e9edbd6
Removing the job file template.
Sep 18, 2024
9346ed3
automatic add-up.
Sep 18, 2024
08b93dc
Update the qsub option to submit analysis jobs.
Sep 18, 2024
3b1a250
Using foreach to create Y and R.
Sep 18, 2024
0fc424a
Resolve dimension issue.
Sep 18, 2024
b29173f
Add functions for initializing qsub job submissions during the genera…
Sep 18, 2024
bb8b142
Create a new workflow solely for the North American SDA runs.
Sep 18, 2024
75f02eb
Add qsub functions for parallel SDA workflows.
Sep 18, 2024
b7f72b4
Upgrade the weight calculation function and fix some bugs.
Sep 18, 2024
70d6a90
Update the script for matching the current progress with NA SDA runs.
Sep 18, 2024
12fa426
Rd file.
Sep 18, 2024
cd6360e
Add Rd files.
Sep 18, 2024
98d8046
Remove hard-coded path.
Sep 18, 2024
1870d98
Adding SDA workflow for NA SDA runs.
Sep 18, 2024
abdc1f8
Update documentation associated with the SDA batch job submission fea…
Sep 19, 2024
0050826
Update doc.
Sep 19, 2024
5f1216e
Update dependency.
Sep 19, 2024
be9b047
Update dependency.
Sep 19, 2024
b51cc40
Update job completion detection.
Sep 20, 2024
8cc437e
Add the verbose argument to the function of checking qsub job complet…
Sep 20, 2024
ab56e79
Remove the library call from the functions.
Sep 20, 2024
bb71ff8
Add qsub statements to the creation of the settings in case you are u…
Sep 20, 2024
9ab9b08
Relocate qsub functions for the analysis part and optimize the functi…
Sep 20, 2024
4c0131d
Update the roxygen structure.
Sep 20, 2024
296b0d6
Update dependency.
Sep 20, 2024
daf3558
Update and fix namespaces.
Sep 20, 2024
86a3d16
Fix GitHub checks.
Sep 20, 2024
c43cba4
Fix function usage.
Sep 20, 2024
a683263
Replace read function.
Sep 20, 2024
e8ac7bc
Update modules/assim.sequential/R/Analysis_sda_block.R
infotroph Sep 21, 2024
0224572
Update modules/assim.sequential/DESCRIPTION
infotroph Sep 21, 2024
76f2b72
whitespace
infotroph Sep 21, 2024
345530f
Update roxygen text.
Sep 23, 2024
fcc3804
Revert "Update roxygen text."
Sep 23, 2024
9fa90a8
Merge branch 'PecanProject:develop' into develop
DongchenZ Sep 30, 2024
382e38d
Merge branch 'develop' of https://github.com/DongchenZ/pecan into dev…
Sep 30, 2024
2cdfea8
Update roxygen.
Sep 30, 2024
59c348d
Revert the changes.
Sep 30, 2024
9934011
Merge branch 'PecanProject:develop' into develop
DongchenZ Jan 13, 2025
7962ffd
Pass the printing when the job is finished.
Jan 13, 2025
0a25664
Bug fixes for the indexing when the scale is set to zero.
Jan 13, 2025
aa67032
Update the file deletion arguments.
Jan 13, 2025
6b5543e
Add the feature of exporting all analysis and forecast results.
Jan 13, 2025
54bb331
Reduce the job time limit to request nodes more efficiently. Also, up…
Jan 13, 2025
dd5f44b
Remove the usage of the site name and the feature of parallel extract…
Jan 13, 2025
b796bcd
Attempt to fix github check issue.
Jan 13, 2025
51cb8b5
Attempt to fix github check issue.
Jan 13, 2025
f1536d6
Resolve github check issue.
Jan 13, 2025
bf38bcb
Revert changes.
Jan 13, 2025
3e20018
Revert changes.
Jan 13, 2025
85d0b46
Revert changes.
Jan 13, 2025
7 changes: 5 additions & 2 deletions base/remote/R/check_qsub_status.R
@@ -2,11 +2,12 @@
#'
#' @param run run ID, as an integer
#' @param qstat (string) qstat command for checking job status
#' @param verbose Logical: whether to print progress messages. Default is TRUE.
#' @inheritParams remote.execute.cmd
#'
#' @return `TRUE` if run is marked as DONE, otherwise FALSE.
#' @export
qsub_run_finished <- function(run, host, qstat) {
qsub_run_finished <- function(run, host, qstat, verbose = TRUE) {
if (is.na(run)) {
PEcAn.logger::logger.warn("Job", run, "encountered an error during submission.",
"NOTE that the job will be stamped as 'finished' in BETY.")
@@ -25,7 +26,9 @@ qsub_run_finished <- function(run, host, qstat) {
}

if (length(out) > 0 && substring(out, nchar(out) - 3) == "DONE") {
PEcAn.logger::logger.debug("Job", run, "for run", run_id_string, "finished")
if (verbose) {
PEcAn.logger::logger.debug("Job", run, "for run", run_id_string, "finished")
Member:

PEcAn.logger already has built-in verbosity control via logger.setLevel() -- is a function-specific verbose flag needed here, or is it enough for the user to set the logger level to something higher than debug so that this message isn't printed?

Contributor Author:

  1. I am unsure whether I fully understand how logger.setLevel() works inside the PEcAn.logger package.
  2. For a small number of jobs, I think it will still be helpful to have job info printed instead of creating a progress bar.
}
return(TRUE)
} else {
return(FALSE)
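The thread above weighs a per-call `verbose` flag against PEcAn.logger's global `logger.setLevel()`. A minimal base-R sketch of the two control points, using a hypothetical `log_debug`/`report_job_finished` pair as stand-ins (neither function is part of PEcAn; they only illustrate the mechanism):

```r
# Hypothetical stand-in for PEcAn.logger: a global threshold plus a
# per-call `verbose` flag, mirroring the two options discussed above.
log_level <- new.env()
log_level$threshold <- "DEBUG"

log_debug <- function(...) {
  # Global control: nothing below the threshold is emitted.
  if (identical(log_level$threshold, "DEBUG")) {
    message(paste("DEBUG", ...))
  }
}

report_job_finished <- function(run, verbose = TRUE) {
  # Per-call control: the flag added in this PR skips the message entirely.
  if (verbose) log_debug("Job", run, "finished")
  invisible(TRUE)
}

report_job_finished(1)                  # emits: DEBUG Job 1 finished
report_job_finished(1, verbose = FALSE) # emits nothing
log_level$threshold <- "INFO"
report_job_finished(1)                  # emits nothing either
```

Either mechanism silences the per-job line; the flag keeps the choice local to one call site, while the threshold affects every logger call in the session.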
4 changes: 3 additions & 1 deletion base/remote/man/qsub_run_finished.Rd


@@ -473,6 +473,28 @@ Here is an example of what does a multi-settings pecan xml file look like. The d
<start.date>2012-07-15</start.date>
<end.date>2021-07-15</end.date>
</Obs_Prep>
<batch.settings>
<write.config>
<cores>28</cores>
<folder.num>1</folder.num>
</write.config>
<analysis>
<cores>28</cores>
<folder.num>16</folder.num>
</analysis>
<met.split>
<cores>28</cores>
<folder.num>16</folder.num>
</met.split>
<sda.read>
<cores>28</cores>
<folder.num>16</folder.num>
</sda.read>
<general.job>
<cores>28</cores>
<folder.num>16</folder.num>
</general.job>
</batch.settings>
<spin.up>
<start.date>2004/01/01</start.date>
<end.date>2006/12/31</end.date>
23 changes: 23 additions & 0 deletions book_source/03_topical_pages/03_pecan_xml.Rmd
@@ -826,6 +826,28 @@ The following tags can be used for state data assimilation. More detailed inform
<start.date>2012-07-15</start.date>
<end.date>2021-07-15</end.date>
</Obs_Prep>
<batch.settings>
<write.config>
<cores>28</cores>
<folder.num>1</folder.num>
</write.config>
<analysis>
<cores>28</cores>
<folder.num>16</folder.num>
</analysis>
<met.split>
<cores>28</cores>
<folder.num>16</folder.num>
</met.split>
<sda.read>
<cores>28</cores>
<folder.num>16</folder.num>
</sda.read>
<general.job>
<cores>28</cores>
<folder.num>16</folder.num>
</general.job>
</batch.settings>
<spin.up>
<start.date>2004/01/01</start.date>
<end.date>2006/12/31</end.date>
@@ -853,6 +875,7 @@ The following tags can be used for state data assimilation. More detailed inform
* **_NOTE:_** If TRUE, you must also assign a vector of trait names to pick.trait.params within the sda.enkf function.
* **state.variable** : [required] State variable that is to be assimilated (in PEcAn standard format, with pre-specified variable name, unit, and range). Four variables can be assimilated so far: including Aboveground biomass (AbvGrndWood), LAI, SoilMoistFrac, and Soil carbon (TotSoilCarb).
* **Obs_Prep** : [required] This section will be handled through the SDA_Obs_Assembler function, if you want to proceed with this function, this section is required.
* **batch.settings** : [optional] This section configures the computational resources allocated to each SDA procedure (e.g., the number of CPUs and the number of jobs submitted to the cluster). Procedures include splitting meteorology files, writing configuration files, reading SDA outputs, running the Bayesian MCMC analysis, and removing files (e.g., removing NC files after the first SDA run). These settings substantially improve computational efficiency when the number of sites is very large (e.g., North American runs with 6400 sites), while keeping memory usage to a minimum. The `sda.enkf_Multisite` function will be used if this section is not specified; otherwise the `sda.enkf_NorthAmerica` function will be used.
* **spin.up** : [required] start.date and end.date for model spin up.
* **_NOTE:_** start.date and end.date are distinct from values set in the run tag because spin up can be done over a subset of the run.
* **forecast.time.step** : [optional] start.date and end.date for model spin up.
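As a sketch of how the new tag might be consumed downstream, assuming PEcAn's usual XML-to-nested-list settings parsing (the list literal, coercions, and variable names below are illustrative, not the actual workflow code; only the dispatch rule comes from the documentation above):

```r
# Hand-built stand-in for a parsed pecan.xml settings list; XML parsing
# yields character values, hence the as.numeric() coercion below.
settings <- list(state.data.assimilation = list(
  batch.settings = list(
    analysis  = list(cores = "28", folder.num = "16"),
    met.split = list(cores = "28", folder.num = "16")
  )
))

batch <- settings$state.data.assimilation$batch.settings
if (is.null(batch)) {
  workflow <- "sda.enkf_Multisite"     # default when the tag is absent
} else {
  workflow <- "sda.enkf_NorthAmerica"  # batched qsub workflow
  analysis.cores <- as.numeric(batch$analysis$cores)       # CPUs per job
  analysis.jobs  <- as.numeric(batch$analysis$folder.num)  # jobs submitted
}
workflow
```

Deleting the `batch.settings` element (or omitting it from the XML) flips the dispatch back to the single-machine multisite workflow.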
4 changes: 3 additions & 1 deletion docker/depends/pecan_package_dependencies.csv
@@ -41,6 +41,7 @@
"doParallel","*","modules/data.atmosphere","Suggests",FALSE
"doParallel","*","modules/data.remote","Imports",FALSE
"doSNOW","*","base/remote","Suggests",FALSE
"doSNOW","*","modules/assim.sequential","Suggests",FALSE
"dplR","*","modules/data.land","Imports",FALSE
"dplyr","*","base/qaqc","Imports",FALSE
"dplyr","*","base/remote","Imports",FALSE
@@ -62,6 +63,7 @@
"emdbook","*","modules/assim.sequential","Suggests",FALSE
"exactextractr","*","modules/assim.sequential","Suggests",FALSE
"foreach","*","base/remote","Imports",FALSE
"foreach","*","modules/assim.sequential","Imports",FALSE
"foreach","*","modules/data.atmosphere","Suggests",FALSE
"foreach","*","modules/data.remote","Imports",FALSE
"fs","*","base/db","Imports",FALSE
@@ -83,7 +85,7 @@
"ggmcmc","*","modules/meta.analysis","Suggests",FALSE
"ggplot2","*","base/utils","Suggests",FALSE
"ggplot2","*","base/visualization","Imports",FALSE
"ggplot2","*","modules/assim.sequential","Imports",FALSE
"ggplot2","*","modules/assim.sequential","Suggests",FALSE
"ggplot2","*","modules/benchmark","Imports",FALSE
"ggplot2","*","modules/data.atmosphere","Imports",FALSE
"ggplot2","*","modules/data.remote","Suggests",FALSE
6 changes: 4 additions & 2 deletions modules/assim.sequential/DESCRIPTION
@@ -12,9 +12,9 @@ Description: The Predictive Ecosystem Carbon Analyzer (PEcAn) is a scientific
Imports:
coda,
dplyr,
foreach,
furrr,
future,
ggplot2,
lubridate (>= 1.6.0),
magrittr,
Matrix,
@@ -32,7 +32,9 @@ Imports:
stringr
Suggests:
corrplot,
doSNOW,
exactextractr,
ggplot2,
ggrepel,
emdbook,
glue,
@@ -65,4 +67,4 @@
License: BSD_3_clause + file LICENSE
Copyright: Authors
Encoding: UTF-8
RoxygenNote: 7.3.2
RoxygenNote: 7.3.2
2 changes: 2 additions & 0 deletions modules/assim.sequential/NAMESPACE
@@ -53,6 +53,7 @@ export(sampler_toggle)
export(sda.enkf)
export(sda.enkf.multisite)
export(sda.enkf.original)
export(sda.enkf_NorthAmerica)
export(sda_weights_site)
export(simple.local)
export(tobit.model)
@@ -63,6 +64,7 @@ import(furrr)
import(lubridate)
import(nimble)
importFrom(dplyr,"%>%")
importFrom(foreach,"%dopar%")
importFrom(lubridate,"%m+%")
importFrom(magrittr,"%>%")
importFrom(rlang,.data)
32 changes: 24 additions & 8 deletions modules/assim.sequential/R/Analysis_sda_block.R
@@ -58,9 +58,16 @@ analysis_sda_block <- function (settings, block.list.all, X, obs.mean, obs.cov,

#parallel for loop over each block.
PEcAn.logger::logger.info(paste0("Running MCMC ", "for ", length(block.list.all[[t]]), " blocks"))
if ("try-error" %in% class(try(block.list.all[[t]] <- furrr::future_map(block.list.all[[t]], MCMC_block_function, .progress = T)))) {
PEcAn.logger::logger.severe("Something wrong within the MCMC_block_function function.")
return(0)
if (!is.null(settings$state.data.assimilation$batch.settings$analysis)) {
if ("try-error" %in% class(try(block.list.all[[t]] <- qsub_analysis_submission(settings = settings, block.list = block.list.all[[t]])))) {
PEcAn.logger::logger.severe("Something wrong within the qsub_analysis_submission function.")
return(0)
}
} else {
if ("try-error" %in% class(try(block.list.all[[t]] <- furrr::future_map(block.list.all[[t]], MCMC_block_function, .progress = T)))) {
PEcAn.logger::logger.severe("Something wrong within the MCMC_block_function function.")
return(0)
}
}
PEcAn.logger::logger.info("Completed!")

@@ -77,7 +84,8 @@
mu.a = V$mu.a,
Pa = V$Pa,
Y = Y,
R = R))
R = R,
analysis = V$analysis))
}

##' @title build.block.xy
@@ -104,7 +112,7 @@ build.block.xy <- function(settings, block.list.all, X, obs.mean, obs.cov, t) {
}
#grab basic arguments based on X.
site.ids <- unique(attributes(X)$Site)
var.names <- unique(attributes(X)$dimnames[[2]])
var.names <- unique(colnames(X))
mu.f <- colMeans(X)
Pf <- stats::cov(X)
if (length(diag(Pf)[which(diag(Pf)==0)]) > 0) {
@@ -120,7 +128,8 @@
`rownames<-`(site.ids)
#Finding the distance between the sites
dis.matrix <- sp::spDists(site.locs, longlat = TRUE)
if (!is.null(settings$state.data.assimilation$Localization.FUN)) {
if (!is.null(settings$state.data.assimilation$Localization.FUN) &&
! as.numeric(settings$state.data.assimilation$scalef) == 0) {
Localization.FUN <- get(settings$state.data.assimilation$Localization.FUN)
#turn that into a blocked matrix format
blocked.dis <- block_matrix(dis.matrix %>% as.numeric(), rep(length(var.names), length(site.ids)))
@@ -430,7 +439,6 @@ MCMC_block_function <- function(block) {
conf$addSampler(target = samplerLists[[X.mod.ind]]$target, type = "ess",
control = list(propCov= block$data$pf, adaptScaleOnly = TRUE,
latents = "X", pfOptimizeNparticles = TRUE))

#add toggle Y sampler.
for (i in 1:block$constant$YN) {
conf$addSampler(paste0("y.censored[", i, "]"), 'toggle', control=list(type='RW'))
@@ -615,6 +623,7 @@ block.2.vector <- function (block.list, X, H) {
site.ids <- attributes(X)$Site
mu.f <- mu.a <- c()
Pf <- Pa <- matrix(0, length(site.ids), length(site.ids))
analysis <- X
for (L in block.list) {
ind <- c()
for (id in L$site.ids) {
@@ -623,6 +632,12 @@
#convert mu.f and pf
mu.a[ind] <- mu.f[ind] <- L$update$mufa
Pa[ind, ind] <- Pf[ind, ind] <- L$update$pfa
# MVN sample based on block.
sample <- as.data.frame(mvtnorm::rmvnorm(nrow(X),
L$update$mufa,
L$update$pfa,
method = "svd"))
analysis[,ind] <- sample
#convert mu.a and pa
ind <- intersect(ind, H$H.ind)
mu.a[ind] <- L$update$mua
@@ -631,5 +646,6 @@
return(list(mu.f = mu.f,
Pf = Pf,
mu.a = mu.a,
Pa = Pa))
Pa = Pa,
analysis = analysis))
}
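The `analysis` matrix added to `block.2.vector()` is filled by drawing ensemble members from each block's posterior N(mu.a, Pa). A self-contained sketch of that one step, with made-up sizes and values: the PR uses `mvtnorm::rmvnorm(..., method = "svd")`, while `MASS::mvrnorm` (shipped with R) stands in here.

```r
library(MASS)  # mvrnorm; the PR itself uses mvtnorm::rmvnorm(method = "svd")

set.seed(42)
n.ens <- 100                            # ensemble size (rows of X)
mu.a  <- c(5, 10)                       # posterior mean for one block
Pa    <- matrix(c(2, 0.5, 0.5, 1), 2)   # posterior covariance for the block

analysis <- matrix(NA_real_, n.ens, 4)  # 4 state columns across all sites
ind <- c(2, 3)                          # columns belonging to this block
# One multivariate-normal draw per ensemble member, written block-wise.
analysis[, ind] <- MASS::mvrnorm(n.ens, mu = mu.a, Sigma = Pa)

round(colMeans(analysis[, ind]), 1)     # close to mu.a
```

Because the blocks partition the state columns, looping this assignment over every block (as the real function does) fills the whole analysis ensemble without any cross-block sampling.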
72 changes: 47 additions & 25 deletions modules/assim.sequential/R/Multi_Site_Constructors.R
@@ -6,9 +6,6 @@
##' @param var.names vector names of state variable names.
##' @param X a matrix of state variables. In this matrix rows represent ensembles, while columns show the variables for different sites.
##' @param localization.FUN This is the function that performs the localization of the Pf matrix and it returns a localized matrix with the same dimensions.
##' @param t not used
##' @param blocked.dis passed to `localization.FUN`
##' @param ... passed to `localization.FUN`
Member (infotroph, Sep 21, 2024):

This is undoing a fix I made in #3346 (and apparently forgot to update Rcheck_reference.log, sorry! That's why the checks didn't complain about this being undone.)

Member (infotroph):

...But if you have better descriptions for the parameters, please do improve my wording!

Contributor Author:

Fixed.
##' @description The argument X needs to have an attribute pointing the state variables to their corresponding site. This attribute needs to be called `Site`.
##' At the moment, the cov between state variables at blocks defining the cov between two sites are assumed zero.
##' @return It returns the var-cov matrix of state variables at multiple sites.
@@ -27,15 +24,15 @@ Contruct.Pf <- function(site.ids, var.names, X, localization.FUN=NULL, t=1, bloc
for (site in site.ids){
#let's find out where this cov (for the current site needs to go in the main cov matrix)
pos.in.matrix <- which(attr(X,"Site") %in% site)
#foreach site let's get the Xs
#foreach site let's get the Xs
pf.matrix [pos.in.matrix, pos.in.matrix] <- stats::cov( X [, pos.in.matrix] ,use="complete.obs")
}

# This is where we estimate the cov between state variables of different sites
  #I put this into a separate loop so we can have more control over it
site.cov.orders <- expand.grid(site.ids,site.ids) %>%
dplyr::filter( .data$Var1 != .data$Var2)

for (i in seq_len(nrow(site.cov.orders))){
# first we need to find out where to put it in the big matrix
rows.in.matrix <- which(attr(X,"Site") %in% site.cov.orders[i,1])
@@ -57,13 +54,13 @@

# adding labels to rownames and colnames
labelss <- paste0(rep(var.names, length(site.ids)) %>% as.character(),"(",
rep(site.ids, each=length(var.names)),")")
rep(site.ids, each=length(var.names)),")")

colnames(pf.matrix.out ) <-labelss
rownames(pf.matrix.out ) <-labelss

return(pf.matrix.out)

}

##' @title Construct.R
@@ -82,34 +79,59 @@ Contruct.Pf <- function(site.ids, var.names, X, localization.FUN=NULL, t=1, bloc
##' @export

Construct.R<-function(site.ids, var.names, obs.t.mean, obs.t.cov){

# foreach.
cores <- parallel::detectCores()
cl <- parallel::makeCluster(cores)
doSNOW::registerDoSNOW(cl)
#progress bar
pb <- utils::txtProgressBar(min=1, max=length(site.ids), style=3)
progress <- function(n) utils::setTxtProgressBar(pb, n)
opts <- list(progress=progress)

# keeps Hs of sites
site.specific.Rs <-list()
#
nsite <- length(site.ids)
#
nvariable <- length(var.names)
Y<-c()

for (site in site.ids){
choose <- sapply(var.names, agrep, x=names(obs.t.mean[[site]]), max=1, USE.NAMES = FALSE) %>% unlist
# if there is no obs for this site
if(length(choose) == 0){
next;
}else{
Y <- c(Y, unlist(obs.t.mean[[site]][choose]))
#collecting them
if (ncol(obs.t.mean[[site]]) > 1)
{
site.specific.Rs <- c(site.specific.Rs, list(as.matrix(obs.t.cov[[site]][choose,choose])))
} else {
site.specific.Rs <- c(site.specific.Rs, list(as.matrix(obs.t.cov[[site]][choose])))
}
# fix GitHub checks.
site <- NULL
res <- foreach::foreach(site = site.ids,
.packages=c("Kendall", "purrr"),
.options.snow=opts) %dopar% {
choose <- sapply(var.names, agrep, x=names(obs.t.mean[[site]]), max=1, USE.NAMES = FALSE) %>% unlist
# if there is no obs for this site
if(length(choose) == 0){
return(NA);
}else{
Y <- unlist(obs.t.mean[[site]][choose])
#collecting them
if (ncol(obs.t.mean[[site]]) > 1)
{
site.R <- list(as.matrix(obs.t.cov[[site]][choose,choose]))
} else {
site.R <- list(as.matrix(obs.t.cov[[site]][choose]))
}
}
return(list(site.R = site.R,
site.Y = Y,
choose = choose))
}
for (i in seq_along(site.ids)){
temp <- res[[i]]
if (is.na(temp)) {
next
} else {
Y <- c(Y, unlist(obs.t.mean[[site.ids[i]]][temp$choose]))
site.specific.Rs <- c(site.specific.Rs, temp$site.R)
}
}
#make block matrix out of our collection
R <- Matrix::bdiag(site.specific.Rs) %>% as.matrix()
}

# stop parallel.
parallel::stopCluster(cl)
foreach::registerDoSEQ()
return(list(Y=Y, R=R))
}

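The rewritten `Construct.R` splits observation collection into a parallel map (one result per site, `NA` for sites without observations) and a sequential gather. A base-R sketch of the gather logic under made-up observations; the real code runs the map with `foreach %dopar%` and builds `R` with `Matrix::bdiag()`:

```r
# Made-up per-site observations: site s2 has none and must be skipped.
obs.mean <- list(s1 = list(AbvGrndWood = 100),
                 s2 = list(),
                 s3 = list(AbvGrndWood = 80))
obs.cov  <- list(s1 = 25, s2 = NULL, s3 = 16)

# Map step: one result per site, NA when the site has no observations.
res <- lapply(names(obs.mean), function(site) {
  if (length(obs.mean[[site]]) == 0) return(NA)
  list(site.Y = obs.mean[[site]]$AbvGrndWood,
       site.R = obs.cov[[site]])
})

# Gather step: concatenate Y and collect the per-site R blocks in order.
Y <- c(); Rs <- list()
for (temp in res) {
  if (!is.list(temp)) next   # site without observations
  Y  <- c(Y, temp$site.Y)
  Rs <- c(Rs, list(temp$site.R))
}
# Blocks here are 1x1, so diag() stands in for Matrix::bdiag().
R <- diag(unlist(Rs))
Y  # 100 80
```

Returning `NA` sentinels from the workers keeps the parallel results positionally aligned with `site.ids`, so the gather loop can skip empty sites without losing the site order that the block-diagonal `R` depends on.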