From 3b3230abc715c1e53ff5ed82b62b597093f843c5 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 14:44:18 -0700 Subject: [PATCH 01/10] feat: createSSURGO abstract WSS ZIP file inventory --- R/createSSURGO.R | 283 +++++++++++++++++++++++++++++------------------ 1 file changed, 175 insertions(+), 108 deletions(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index ed127e79..937975f3 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -25,9 +25,9 @@ #' @param overwrite _logical_. Overwrite by re-extracting if directory already exists? Default: #' `FALSE` #' @param quiet _logical_. Passed to `curl::curl_download()`. -#' +#' #' @details When `db="STATSGO"` the `WHERE` argument is not supported. Allowed `areasymbols` include -#' `"US"` and two-letter state codes e.g. `"WY"` for the Wyoming general soils map. +#' `"US"` and two-letter state codes e.g. `"WY"` for the Wyoming general soils map. #' #' @export #' @@ -42,17 +42,17 @@ #' have prefix `soilmu_` (mapunit), `soilsa_` (survey area), `soilsf_` (special features). There #' will also be a TXT file with prefix `soilsf_` describing any special features. Shapefile names #' then have an `a_` (polygon), `l_` (line), `p_` (point) followed by the soil survey area symbol. -#' +#' #' @return _character_. Paths to downloaded ZIP files (invisibly). May not exist if `remove_zip = #' TRUE`. #' @seealso [createSSURGO()] -downloadSSURGO <- function(WHERE = NULL, +downloadSSURGO <- function(WHERE = NULL, areasymbols = NULL, - destdir = tempdir(), - exdir = destdir, + destdir = tempdir(), + exdir = destdir, include_template = FALSE, db = c('SSURGO', 'STATSGO'), - extract = TRUE, + extract = TRUE, remove_zip = FALSE, overwrite = FALSE, quiet = FALSE) { @@ -117,13 +117,14 @@ downloadSSURGO <- function(WHERE = NULL, dir.create(exdir, recursive = TRUE) } - for (i in seq_along(paths2)) { + res <- lapply(seq_along(paths2), function(i) { ssa <- gsub(".*wss_SSA_(.*)_.*", "\\1", paths2[i]) - if ((!dir.exists(file.path(exdir, ssa)) || overwrite) && - length(utils::unzip(paths2[i], exdir = exdir)) == 0) { + lz <- utils::unzip(paths2[i], list = TRUE) + uz <- utils::unzip(paths2[i], exdir = exdir) + if ((!dir.exists(file.path(exdir, ssa)) || overwrite) && length(uz) == 0) { message(paste('Invalid zipfile:', paths2[i])) } - } + }) if (remove_zip) { file.remove(paths2) @@ -156,13 +157,13 @@ downloadSSURGO <- function(WHERE = NULL, #' @param include_spatial _logical_ or _character_. Include spatial data layers in database? #' Default: `TRUE` inserts all spatial tables. If `include_spatial` is a _character_ vector #' containing table names, only that set are written to file. e.g. `include_spatial=c("mupolygon", -#' "featpoint")` writes only the mapunit polygons and special feature points. +#' "featpoint")` writes only the mapunit polygons and special feature points. #' @param include_tabular _logical_ or _character_. Include tabular data layers in database? #' Default: `TRUE` inserts all tabular tables. If `include_tabular` is a _character_ vector #' containing table names, only that set are written to file. e.g. `include_tabular=c("mapunit", #' "muaggatt")` writes only the `mapunit` and `muaggatt` tables. Note that special feature #' descriptions are stored in table `"featdesc"` and metadata for each soil survey area are stored -#' in `"soil_metadata"` tables. +#' in `"soil_metadata"` tables. #' @param dissolve_field _character_. Dissolve geometries to create MULTIPOLYGON features? Column #' name #' specified is the grouping variable. Default: `NULL` does no aggregation, giving 1 `POLYGON` @@ -187,7 +188,7 @@ downloadSSURGO <- function(WHERE = NULL, #' } createSSURGO <- function(filename = NULL, exdir, - conn = NULL, + conn = NULL, pattern = NULL, include_spatial = TRUE, include_tabular = TRUE, @@ -233,22 +234,28 @@ createSSURGO <- function(filename = NULL, stop("package `sf` is required to write spatial datasets to DBI data sources", call. = FALSE) } - f <- list.files(exdir, recursive = TRUE, pattern = pattern, full.names = TRUE) + layer_names <- .get_spatial_layer_names() - # create and add combined vector datasets: - # "soilmu_a", "soilmu_l", "soilmu_p", "soilsa_a", "soilsf_l", "soilsf_p" - f.shp <- f[grepl(".*\\.shp$", f)] - shp.grp <- do.call('rbind', strsplit(gsub(".*soil([musfa]{2})_([apl])_([a-z]{2}\\d{3}|[a-z]{2})\\.shp", "\\1;\\2;\\3", f.shp), ";", fixed = TRUE)) + f <- list.files( + exdir, + recursive = TRUE, + pattern = pattern, + full.names = TRUE + ) - layer_names <- c(`mu_a` = "mupolygon", `mu_l` = "muline", `mu_p` = "mupoint", - `sa_a` = "sapolygon", `sf_l` = "featline", `sf_p` = "featpoint") + inv <- .inventory_ssurgo_files( + f, + exdir = NULL, + pattern = pattern, + layer_names = layer_names, + include_spatial = include_spatial, + include_tabular = include_tabular, + header = header + ) - if (is.character(include_spatial)) { - idx <- paste0(shp.grp[, 1], "_", shp.grp[, 2]) %in% names(layer_names[layer_names %in% include_spatial]) - shp.grp <- shp.grp[idx, ] - f.shp <- f.shp[idx] - include_spatial <- TRUE - } + # inventory method converts partial sets (character vectors) to logical for include_* args + include_spatial <- inv$include_spatial + include_tabular <- inv$include_tabular if ((missing(conn) || is.null(conn)) && !IS_GPKG) { @@ -264,8 +271,8 @@ createSSURGO <- function(filename = NULL, on.exit(DBI::dbDisconnect(conn)) } - if (nrow(shp.grp) >= 1 && ncol(shp.grp) == 3 && include_spatial) { - f.shp.grp <- split(f.shp, list(feature = shp.grp[, 1], geom = shp.grp[, 2]), drop = TRUE) + if (nrow(inv$shp.grp) >= 1 && ncol(inv$shp.grp) == 3 && include_spatial) { + f.shp.grp <- split(inv$f.shp, list(feature = inv$shp.grp[, 1], geom = inv$shp.grp[, 2]), drop = TRUE) if (IS_DUCKDB) { DBI::dbExecute(conn, "INSTALL spatial; LOAD spatial;") @@ -366,45 +373,14 @@ createSSURGO <- function(filename = NULL, on.exit(DBI::dbDisconnect(conn)) } - # create and add combined tabular datasets - f.txt <- f[grepl(".*\\.txt$", f)] - txt.grp <- gsub("\\.txt", "", basename(f.txt)) - - # explicit handling special feature descriptions -> "featdesc" table - txt.grp[grepl("soilsf_t_", txt.grp)] <- "featdesc" - txt.grp[grepl("soil_metadata_", txt.grp)] <- "soil_metadata" - txt.first <- unique(txt.grp[grep("^sdv|^md*s|^Metadata", txt.grp)]) - - f.txt.grp <- split(f.txt, txt.grp) - f.txt.grp[txt.first] <- lapply(f.txt.grp[txt.first], .subset, 1) - - # get table, column, index lookup tables - mstabn <- f.txt.grp[[which(names(f.txt.grp) %in% c("mstab", "mdstattabs", "MetadataTable"))[1]]][[1]] - mstabcn <- f.txt.grp[[which(names(f.txt.grp) %in% c("mstabcol", "mdstattabcols", "MetadataColumnLookup"))[1]]][[1]] - msidxdn <- f.txt.grp[[which(names(f.txt.grp) %in% c("msidxdet", "mdstatidxdet", "MetadataIndexDetail"))[1]]][[1]] - - if (length(mstabn) >= 1) { - mstab <- read.delim(mstabn[1], sep = "|", stringsAsFactors = FALSE, header = header) - mstab_lut <- c(mstab[[1]], "soil_metadata") - names(mstab_lut) <- c(mstab[[5]], "soil_metadata") - } else { - mstab_lut <- names(f.txt.grp) - names(mstab_lut) <- names(f.txt.grp) - } - - if (is.character(include_tabular)) { - f.txt.grp <- f.txt.grp[names(mstab_lut[mstab_lut %in% include_tabular])] - include_tabular <- TRUE - } - if (include_tabular) { - if (length(mstabcn) >= 1) { - mstabcol <- read.delim(mstabcn[1], sep = "|", stringsAsFactors = FALSE, header = header) + if (length(inv$mstabcn) >= 1) { + mstabcol <- read.delim(inv$mstabcn[1], sep = "|", stringsAsFactors = FALSE, header = header) } - if (length(msidxdn) >= 1) { - msidxdet <- read.delim(msidxdn[1], sep = "|", stringsAsFactors = FALSE, header = header) + if (length(inv$msidxdn) >= 1) { + msidxdet <- read.delim(inv$msidxdn[1], sep = "|", stringsAsFactors = FALSE, header = header) } # build type mapping from SSURGO logicaldatatype metadata @@ -420,70 +396,52 @@ createSSURGO <- function(filename = NULL, .ssurgo_type_map <- NULL } - # helper: coerce columns to schema types from mstabcol metadata - .coerce_ssurgo_types <- function(y, tablename, mstabcol, type_map) { - if (is.null(type_map) || length(mstabcol) < 6) return(y) - col_meta <- mstabcol[mstabcol[[1]] == tablename, c(3L, 6L), drop = FALSE] - for (j in seq_len(nrow(col_meta))) { - col <- col_meta[[1L]][j] - rtyp <- type_map[col_meta[[2L]][j]] - if (is.na(rtyp) || !col %in% names(y)) next - y[[col]] <- switch(rtyp, - character = as.character(y[[col]]), - integer = suppressWarnings(as.integer(as.character(y[[col]]))), - numeric = suppressWarnings(as.numeric(as.character(y[[col]]))), - logical = as.logical(y[[col]]) - ) - } - y - } - - lapply(names(f.txt.grp), function(x) { + lapply(names(inv$f.txt.grp), function(x) { if (!is.null(mstabcol)) { - newnames <- mstabcol[[3]][mstabcol[[1]] == mstab_lut[x]] + newnames <- mstabcol[[3]][mstabcol[[1]] == inv$mstab_lut[x]] } if (!is.null(msidxdet)) { - indexPK <- na.omit(msidxdet[[4]][msidxdet[[1]] == mstab_lut[x] & grepl("PK_", msidxdet[[2]])]) - indexDI <- na.omit(msidxdet[[4]][msidxdet[[1]] == mstab_lut[x] & grepl("DI_", msidxdet[[2]])]) + indexPK <- na.omit(msidxdet[[4]][msidxdet[[1]] == inv$mstab_lut[x] & grepl("PK_", msidxdet[[2]])]) + indexDI <- na.omit(msidxdet[[4]][msidxdet[[1]] == inv$mstab_lut[x] & grepl("DI_", msidxdet[[2]])]) } - d <- try(lapply(seq_along(f.txt.grp[[x]]), function(i) { + d <- try(lapply(seq_along(inv$f.txt.grp[[x]]), function(i) { # message(f.txt.grp[[x]][i]) - y <- try(read.delim(f.txt.grp[[x]][i], sep = "|", stringsAsFactors = FALSE, header = header, + y <- try(read.delim(inv$f.txt.grp[[x]][i], sep = "|", stringsAsFactors = FALSE, header = header, na.strings = c("", "NA")), silent = TRUE) if (inherits(y, 'try-error')) { if (!quiet) { - message("File ", f.txt.grp[[x]][i], " contains no data") + message("File ", inv$f.txt.grp[[x]][i], " contains no data") } return(NULL) } else if (length(y) == 1) { - if (grepl("soil_metadata", f.txt.grp[[x]][i])) { + if (grepl("soil_metadata", inv$f.txt.grp[[x]][i])) { y <- data.frame( - areasymbol = toupper(gsub(".*soil_metadata_(.*)\\.txt", "\\1", f.txt.grp[[x]][i])), + areasymbol = toupper(gsub(".*soil_metadata_(.*)\\.txt", "\\1", inv$f.txt.grp[[x]][i])), content = paste(y[[1]], collapse = "\n") ) } else { y <- data.frame(content = y) } } else { - if (!is.null(mstab) && !header) { # preserve headers if present + if (!is.null(inv$mstab) && !header) { # preserve headers if present colnames(y) <- newnames } # enforce schema types from metadata - y <- .coerce_ssurgo_types(y, mstab_lut[x], mstabcol, .ssurgo_type_map) + y <- .coerce_ssurgo_types(y, inv$mstab_lut[x], mstabcol, .ssurgo_type_map) } - if (is.na(mstab_lut[x])) { + if (is.na(inv$mstab_lut[x])) { # readme, version return(NULL) } # remove deeper rules from cointerp for smaller DB size # most people only use depth==0 (default) - if (mstab_lut[x] == "cointerp" && !is.null(maxruledepth)) { + if (inv$mstab_lut[x] == "cointerp" && !is.null(maxruledepth)) { y <- y[y$ruledepth <= maxruledepth, ] } @@ -493,30 +451,30 @@ createSSURGO <- function(filename = NULL, try({ if (i == 1 && isFALSE(append)) { - DBI::dbWriteTable(conn, mstab_lut[x], y, overwrite = overwrite) + DBI::dbWriteTable(conn, inv$mstab_lut[x], y, overwrite = overwrite) } else { - if (DBI::dbExistsTable(conn, mstab_lut[x]) && x %in% txt.first) { + if (DBI::dbExistsTable(conn, inv$mstab_lut[x]) && x %in% txt.first) { # skip writing sdv/mds* metadata tables to avoid uniqueness issues return(FALSE) } - DBI::dbWriteTable(conn, mstab_lut[x], y, append = TRUE) + DBI::dbWriteTable(conn, inv$mstab_lut[x], y, append = TRUE) } }, silent = quiet) }), silent = quiet) - if (length(mstab_lut[x]) && is.na(mstab_lut[x])) { - mstab_lut[x] <- x + if (length(inv$mstab_lut[x]) && is.na(inv$mstab_lut[x])) { + inv$mstab_lut[x] <- x } - if (length(mstab_lut[x]) && !is.na(mstab_lut[x])) { + if (length(inv$mstab_lut[x]) && !is.na(inv$mstab_lut[x])) { # create pkey indices if (!is.null(indexPK) && length(indexPK) > 0) { try({ q <- sprintf("CREATE UNIQUE INDEX IF NOT EXISTS %s ON %s (%s)", - paste0('PK_', mstab_lut[x]), mstab_lut[x], + paste0('PK_', inv$mstab_lut[x]), inv$mstab_lut[x], paste(indexPK, collapse = ",")) - if (DBI::dbExistsTable(conn, mstab_lut[x])) + if (DBI::dbExistsTable(conn, inv$mstab_lut[x])) DBI::dbExecute(conn, q) }, silent = quiet) } @@ -526,8 +484,8 @@ createSSURGO <- function(filename = NULL, for (i in seq_along(indexDI)) { try({ q <- sprintf("CREATE INDEX IF NOT EXISTS %s ON %s (%s)", - paste0('DI_', mstab_lut[x]), mstab_lut[x], indexDI[i]) - if (DBI::dbExistsTable(conn, mstab_lut[x])) + paste0('DI_', inv$mstab_lut[x]), inv$mstab_lut[x], indexDI[i]) + if (DBI::dbExistsTable(conn, inv$mstab_lut[x])) DBI::dbExecute(conn, q) }, silent = quiet) } @@ -540,9 +498,9 @@ createSSURGO <- function(filename = NULL, try(.gpkg_create_contents(conn)) } # update gpkg_contents table entry - if (DBI::dbExistsTable(conn, mstab_lut[x])) { - try(.gpkg_delete_contents(conn, mstab_lut[x])) - try(.gpkg_add_contents(conn, mstab_lut[x])) + if (DBI::dbExistsTable(conn, inv$mstab_lut[x])) { + try(.gpkg_delete_contents(conn, inv$mstab_lut[x])) + try(.gpkg_add_contents(conn, inv$mstab_lut[x])) } } @@ -557,6 +515,115 @@ createSSURGO <- function(filename = NULL, invisible(res) } +.inventory_ssurgo_files <- function(files = list.files(exdir, + recursive = TRUE, + pattern = pattern, + full.names = TRUE), + exdir = NULL, + pattern = NULL, + layer_names = .get_spatial_layer_names(), + include_spatial = TRUE, + include_tabular = TRUE, + header = FALSE) { + + # create and add combined vector datasets: + # "soilmu_a", "soilmu_l", "soilmu_p", "soilsa_a", "soilsf_l", "soilsf_p" + f.shp <- f[grepl(".*\\.shp$", f)] + shp.grp <- do.call('rbind', strsplit( + gsub( + ".*soil([musfa]{2})_([apl])_([a-z]{2}\\d{3}|[a-z]{2})\\.shp", + "\\1;\\2;\\3", + f.shp + ), + ";", + fixed = TRUE + )) + + if (is.character(include_spatial)) { + idx <- paste0(shp.grp[, 1], "_", shp.grp[, 2]) %in% names(layer_names[layer_names %in% include_spatial]) + shp.grp <- shp.grp[idx, ] + f.shp <- f.shp[idx] + include_spatial <- TRUE + } + + # create and add combined tabular datasets + f.txt <- f[grepl(".*\\.txt$", f)] + txt.grp <- gsub("\\.txt", "", basename(f.txt)) + + # explicit handling special feature descriptions -> "featdesc" table + txt.grp[grepl("soilsf_t_", txt.grp)] <- "featdesc" + txt.grp[grepl("soil_metadata_", txt.grp)] <- "soil_metadata" + txt.first <- unique(txt.grp[grep("^sdv|^md*s|^Metadata", txt.grp)]) + + f.txt.grp <- split(f.txt, txt.grp) + f.txt.grp[txt.first] <- lapply(f.txt.grp[txt.first], .subset, 1) + + # get table, column, index lookup tables + mstabn <- f.txt.grp[[which(names(f.txt.grp) %in% c("mstab", "mdstattabs", "MetadataTable"))[1]]][[1]] + mstabcn <- f.txt.grp[[which(names(f.txt.grp) %in% c("mstabcol", "mdstattabcols", "MetadataColumnLookup"))[1]]][[1]] + msidxdn <- f.txt.grp[[which(names(f.txt.grp) %in% c("msidxdet", "mdstatidxdet", "MetadataIndexDetail"))[1]]][[1]] + + if (length(mstabn) >= 1) { + mstab <- read.delim(mstabn[1], sep = "|", stringsAsFactors = FALSE, header = header) + mstab_lut <- c(mstab[[1]], "soil_metadata") + names(mstab_lut) <- c(mstab[[5]], "soil_metadata") + } else { + mstab_lut <- names(f.txt.grp) + names(mstab_lut) <- names(f.txt.grp) + } + + if (is.character(include_tabular)) { + f.txt.grp <- f.txt.grp[names(mstab_lut[mstab_lut %in% include_tabular])] + include_tabular <- TRUE + } + + list( + f.shp = f.shp, + shp.grp = shp.grp, + include_spatial = include_spatial, + f.txt.grp = f.txt.grp, + txt.first = txt.first, + include_tabular = include_tabular, + mstabn = mstabn, + mstabcn = mstabcn, + msidxdn = msidxdn, + mstab = mstab, + mstab_lut = mstab_lut + ) +} + +.get_spatial_layer_names <- function() { + c( + `mu_a` = "mupolygon", + `mu_l` = "muline", + `mu_p` = "mupoint", + `sa_a` = "sapolygon", + `sf_l` = "featline", + `sf_p` = "featpoint" + ) +} + +# helper: coerce columns to schema types from mstabcol metadata +.coerce_ssurgo_types <- function(y, tablename, mstabcol, type_map) { + if (is.null(type_map) || length(mstabcol) < 6) + return(y) + col_meta <- mstabcol[mstabcol[[1]] == tablename, c(3L, 6L), drop = FALSE] + for (j in seq_len(nrow(col_meta))) { + col <- col_meta[[1L]][j] + rtyp <- type_map[col_meta[[2L]][j]] + if (is.na(rtyp) || !col %in% names(y)) + next + y[[col]] <- switch( + rtyp, + character = as.character(y[[col]]), + integer = suppressWarnings(as.integer(as.character(y[[col]]))), + numeric = suppressWarnings(as.numeric(as.character(y[[col]]))), + logical = as.logical(y[[col]]) + ) + } + y +} + ## From https://github.com/brownag/gpkg ----- #' Add, Remove, Update and Create `gpkg_contents` table and records From e4a8443ddce9130762c6fca41012366d9f2f42b0 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 16:13:25 -0700 Subject: [PATCH 02/10] feat(downloadSSURGO): add support for include_spatial and include_tabular using main table names --- R/createSSURGO.R | 97 ++++++++++++++++++++++++++++--------------- man/downloadSSURGO.Rd | 50 ++++++++++++++++------ 2 files changed, 100 insertions(+), 47 deletions(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index 937975f3..b1478bf4 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -8,18 +8,30 @@ #' of `sacatalog` table such as `areasymbol = 'CA067'`, `"areasymbol IN ('CA628', 'CA067')"` or #' `areasymbol LIKE 'CT%'`. #' -#' @param WHERE _character_. A SQL `WHERE` clause expression used to filter records in `sacatalog` table. -#' Alternately `WHERE` can be any spatial object supported by `SDA_spatialQuery()` for defining -#' the target extent. -#' @param areasymbols _character_. Character vector of soil survey area symbols e.g. `c("CA067", "CA077")`. Used -#' in lieu of `WHERE` argument. +#' @param WHERE _character_. A SQL `WHERE` clause expression used to filter records in `sacatalog` +#' table. Alternately `WHERE` can be any spatial object supported by `SDA_spatialQuery()` for +#' defining the target extent. +#' @param areasymbols _character_. Character vector of soil survey area symbols e.g. `c("CA067", +#' "CA077")`. Used in lieu of `WHERE` argument. #' @param destdir _character_. Directory to download ZIP files into. Default `tempdir()`. -#' @param exdir _character_. Directory to extract ZIP archives into. May be a directory that does not yet exist. -#' Each ZIP file will extract to a folder labeled with `areasymbol` in this directory. Default: -#' `destdir` -#' @param include_template _logical_. Include the (possibly state-specific) MS Access template database? -#' Default: `FALSE` -#' @param db _character_. Either `"SSURGO"` (default; detailed soil map) or `"STATSGO"` (general soil map). +#' @param exdir _character_. Directory to extract ZIP archives into. May be a directory that does +#' not yet exist. Each ZIP file will extract to a folder labeled with `areasymbol` in this +#' directory. Default: `destdir` +#' @param include_template _logical_. Include the (possibly state-specific) MS Access template +#' database? Default: `FALSE` +#' @param include_spatial _logical_ or _character_. Extract spatial data layers from ZIP file? +#' Default: `TRUE` inserts all spatial tables. If `include_spatial` is a _character_ vector +#' containing table names, only that set is extracted from the downloaded ZIP files. e.g. +#' `include_spatial=c("mupolygon", "featpoint")` extracts only the shapefiles (with side car +#' files) for mapunit polygons and special feature points. +#' @param include_tabular _logical_ or _character_. Include tabular data layers in database? +#' Default: `TRUE` inserts all tabular tables. If `include_tabular` is a _character_ vector +#' containing table names, only that set is extracted from the downloaded ZIP files. e.g. +#' `include_tabular=c("mapunit", "muaggatt")` writes only the `mapunit` and `muaggatt` tables. +#' Note that special feature descriptions are stored in table `"featdesc"` and metadata for each +#' soil survey area are stored in `"soil_metadata"` tables. +#' @param db _character_. Either `"SSURGO"` (default; detailed soil map) or `"STATSGO"` (general +#' soil map). #' @param extract _logical_. Extract ZIP files to `exdir`? Default: `TRUE` #' @param remove_zip _logical_. Remove ZIP files after extracting? Default: `FALSE` #' @param overwrite _logical_. Overwrite by re-extracting if directory already exists? Default: @@ -28,12 +40,21 @@ #' #' @details When `db="STATSGO"` the `WHERE` argument is not supported. Allowed `areasymbols` include #' `"US"` and two-letter state codes e.g. `"WY"` for the Wyoming general soils map. -#' +#' +#' As in `createSSURGO()`, the `include_spatial` and `include_tabular` arguments either take a +#' logical value (default `TRUE`) or a character vector of the specific table names to include. Note +#' that when used in `downloadSSURGO()` the required metadata files are _always_ extracted to +#' facilitate mapping to user-facing table names. These arguments allow for customizing the files +#' that get extracted from ZIP files, not just filtering on file names (as is implemented with +#' pre-existing `pattern` argument). This can dramatically improve efficiency of extraction and the +#' overall size of the data in `exdir`. These arguments can be used in conjunction with the +#' `pattern` argument to fine-tune the files included in the generated snapshot database. +#' #' @export #' #' @details Pipe-delimited TXT files are found in _/tabular/_ folder extracted from a SSURGO ZIP. -#' The files are named for tables in the SSURGO schema. There is no header / the files do not have -#' column names. See the _Soil Data Access Tables and Columns Report_: +#' The files are named for tables in the SSURGO schema. There is no header and the files do not +#' have column names. See the _Soil Data Access Tables and Columns Report_: #' \url{https://sdmdataaccess.nrcs.usda.gov/documents/TablesAndColumnsReport.pdf} for details on #' tables, column names and metadata including the default sequence of columns used in TXT files. #' The function returns a `try-error` if the `WHERE`/`areasymbols` arguments result in @@ -51,6 +72,8 @@ downloadSSURGO <- function(WHERE = NULL, destdir = tempdir(), exdir = destdir, include_template = FALSE, + include_spatial = TRUE, + include_tabular = TRUE, db = c('SSURGO', 'STATSGO'), extract = TRUE, remove_zip = FALSE, @@ -119,10 +142,25 @@ downloadSSURGO <- function(WHERE = NULL, res <- lapply(seq_along(paths2), function(i) { ssa <- gsub(".*wss_SSA_(.*)_.*", "\\1", paths2[i]) - lz <- utils::unzip(paths2[i], list = TRUE) - uz <- utils::unzip(paths2[i], exdir = exdir) + if (isTRUE(include_spatial) && isTRUE(include_tabular)) { + lz <- NULL + } else { + lz <- utils::unzip(paths2[i], list = TRUE)$Name + # need to pre-extract mstab data to map to real column names + utils::unzip(paths2[i], files = lz[grepl( + "^(mstab|mdstattabs|MetadataTable|mstabcol|mdstattabcol|MetadataColumnLookup|msidxdet|mdstatidxdet|MetadataIndexDetail)$", + tools::file_path_sans_ext(basename(lz)) + )], exdir = exdir) + inv <- .inventory_ssurgo_files(lz, include_spatial = include_spatial, include_tabular = include_tabular) + lz <- unlist(c(inv$f.shp.sc, inv$f.txt.grp)) + } + uz <- utils::unzip(paths2[i], files = lz, exdir = exdir) if ((!dir.exists(file.path(exdir, ssa)) || overwrite) && length(uz) == 0) { message(paste('Invalid zipfile:', paths2[i])) + } else { + if (!quiet) { + message("Extracted: ", paths2[i]) + } } }) @@ -235,18 +273,10 @@ createSSURGO <- function(filename = NULL, } layer_names <- .get_spatial_layer_names() - - f <- list.files( - exdir, - recursive = TRUE, - pattern = pattern, - full.names = TRUE - ) + f <- list.files(exdir, recursive = TRUE, full.names = TRUE) inv <- .inventory_ssurgo_files( - f, - exdir = NULL, - pattern = pattern, + files = f[grepl(pattern, f)], layer_names = layer_names, include_spatial = include_spatial, include_tabular = include_tabular, @@ -453,7 +483,7 @@ createSSURGO <- function(filename = NULL, if (i == 1 && isFALSE(append)) { DBI::dbWriteTable(conn, inv$mstab_lut[x], y, overwrite = overwrite) } else { - if (DBI::dbExistsTable(conn, inv$mstab_lut[x]) && x %in% txt.first) { + if (DBI::dbExistsTable(conn, inv$mstab_lut[x]) && x %in% inv$txt.first) { # skip writing sdv/mds* metadata tables to avoid uniqueness issues return(FALSE) } @@ -515,11 +545,7 @@ createSSURGO <- function(filename = NULL, invisible(res) } -.inventory_ssurgo_files <- function(files = list.files(exdir, - recursive = TRUE, - pattern = pattern, - full.names = TRUE), - exdir = NULL, +.inventory_ssurgo_files <- function(files, pattern = NULL, layer_names = .get_spatial_layer_names(), include_spatial = TRUE, @@ -528,7 +554,7 @@ createSSURGO <- function(filename = NULL, # create and add combined vector datasets: # "soilmu_a", "soilmu_l", "soilmu_p", "soilsa_a", "soilsf_l", "soilsf_p" - f.shp <- f[grepl(".*\\.shp$", f)] + f.shp <- files[grepl(".*\\.shp$", files)] shp.grp <- do.call('rbind', strsplit( gsub( ".*soil([musfa]{2})_([apl])_([a-z]{2}\\d{3}|[a-z]{2})\\.shp", @@ -539,15 +565,17 @@ createSSURGO <- function(filename = NULL, fixed = TRUE )) + f.shp.sc <- character(0) if (is.character(include_spatial)) { idx <- paste0(shp.grp[, 1], "_", shp.grp[, 2]) %in% names(layer_names[layer_names %in% include_spatial]) shp.grp <- shp.grp[idx, ] f.shp <- f.shp[idx] + f.shp.sc <- files[grepl(paste0("soil", shp.grp[1], "_", shp.grp[2], "_", shp.grp[3]), files)] include_spatial <- TRUE } # create and add combined tabular datasets - f.txt <- f[grepl(".*\\.txt$", f)] + f.txt <- files[grepl(".*\\.txt$", files)] txt.grp <- gsub("\\.txt", "", basename(f.txt)) # explicit handling special feature descriptions -> "featdesc" table @@ -580,6 +608,7 @@ createSSURGO <- function(filename = NULL, list( f.shp = f.shp, shp.grp = shp.grp, + f.shp.sc = f.shp.sc, include_spatial = include_spatial, f.txt.grp = f.txt.grp, txt.first = txt.first, diff --git a/man/downloadSSURGO.Rd b/man/downloadSSURGO.Rd index 178335b4..526f3db8 100644 --- a/man/downloadSSURGO.Rd +++ b/man/downloadSSURGO.Rd @@ -10,6 +10,8 @@ downloadSSURGO( destdir = tempdir(), exdir = destdir, include_template = FALSE, + include_spatial = TRUE, + include_tabular = TRUE, db = c("SSURGO", "STATSGO"), extract = TRUE, remove_zip = FALSE, @@ -18,23 +20,36 @@ downloadSSURGO( ) } \arguments{ -\item{WHERE}{\emph{character}. A SQL \code{WHERE} clause expression used to filter records in \code{sacatalog} table. -Alternately \code{WHERE} can be any spatial object supported by \code{SDA_spatialQuery()} for defining -the target extent.} +\item{WHERE}{\emph{character}. A SQL \code{WHERE} clause expression used to filter records in \code{sacatalog} +table. Alternately \code{WHERE} can be any spatial object supported by \code{SDA_spatialQuery()} for +defining the target extent.} -\item{areasymbols}{\emph{character}. Character vector of soil survey area symbols e.g. \code{c("CA067", "CA077")}. Used -in lieu of \code{WHERE} argument.} +\item{areasymbols}{\emph{character}. Character vector of soil survey area symbols e.g. \code{c("CA067", "CA077")}. Used in lieu of \code{WHERE} argument.} \item{destdir}{\emph{character}. Directory to download ZIP files into. Default \code{tempdir()}.} -\item{exdir}{\emph{character}. Directory to extract ZIP archives into. May be a directory that does not yet exist. -Each ZIP file will extract to a folder labeled with \code{areasymbol} in this directory. Default: -\code{destdir}} +\item{exdir}{\emph{character}. Directory to extract ZIP archives into. May be a directory that does +not yet exist. Each ZIP file will extract to a folder labeled with \code{areasymbol} in this +directory. Default: \code{destdir}} -\item{include_template}{\emph{logical}. Include the (possibly state-specific) MS Access template database? -Default: \code{FALSE}} +\item{include_template}{\emph{logical}. Include the (possibly state-specific) MS Access template +database? Default: \code{FALSE}} -\item{db}{\emph{character}. Either \code{"SSURGO"} (default; detailed soil map) or \code{"STATSGO"} (general soil map).} +\item{include_spatial}{\emph{logical} or \emph{character}. Extract spatial data layers from ZIP file? +Default: \code{TRUE} inserts all spatial tables. If \code{include_spatial} is a \emph{character} vector +containing table names, only that set is extracted from the downloaded ZIP files. e.g. +\code{include_spatial=c("mupolygon", "featpoint")} extracts only the shapefiles (with side car +files) for mapunit polygons and special feature points.} + +\item{include_tabular}{\emph{logical} or \emph{character}. Include tabular data layers in database? +Default: \code{TRUE} inserts all tabular tables. If \code{include_tabular} is a \emph{character} vector +containing table names, only that set is extracted from the downloaded ZIP files. e.g. +\code{include_tabular=c("mapunit", "muaggatt")} writes only the \code{mapunit} and \code{muaggatt} tables. +Note that special feature descriptions are stored in table \code{"featdesc"} and metadata for each +soil survey area are stored in \code{"soil_metadata"} tables.} + +\item{db}{\emph{character}. Either \code{"SSURGO"} (default; detailed soil map) or \code{"STATSGO"} (general +soil map).} \item{extract}{\emph{logical}. Extract ZIP files to \code{exdir}? Default: \code{TRUE}} @@ -61,9 +76,18 @@ of \code{sacatalog} table such as \code{areasymbol = 'CA067'}, \code{"areasymbol When \code{db="STATSGO"} the \code{WHERE} argument is not supported. Allowed \code{areasymbols} include \code{"US"} and two-letter state codes e.g. \code{"WY"} for the Wyoming general soils map. +As in \code{createSSURGO()}, the \code{include_spatial} and \code{include_tabular} arguments either take a +logical value (default \code{TRUE}) or a character vector of the specific table names to include. Note +that when used in \code{downloadSSURGO()} the required metadata files are \emph{always} extracted to +facilitate mapping to user-facing table names. These arguments allow for customizing the files +that get extracted from ZIP files, not just filtering on file names (as is implemented with +pre-existing \code{pattern} argument). This can dramatically improve efficiency of extraction and the +overall size of the data in \code{exdir}. These arguments can be used in conjunction with the +\code{pattern} argument to fine-tune the files included in the generated snapshot database. + Pipe-delimited TXT files are found in \emph{/tabular/} folder extracted from a SSURGO ZIP. -The files are named for tables in the SSURGO schema. There is no header / the files do not have -column names. See the \emph{Soil Data Access Tables and Columns Report}: +The files are named for tables in the SSURGO schema. There is no header and the files do not +have column names. See the \emph{Soil Data Access Tables and Columns Report}: \url{https://sdmdataaccess.nrcs.usda.gov/documents/TablesAndColumnsReport.pdf} for details on tables, column names and metadata including the default sequence of columns used in TXT files. The function returns a \code{try-error} if the \code{WHERE}/\code{areasymbols} arguments result in From 500a0d852d20da43e8f55475ac5e1dabffc6dee3 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 17:20:43 -0700 Subject: [PATCH 03/10] docs: update NEWS.md --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 0c65b437..ffd233e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # soilDB 2.9.2 (development) - EDIT base URL (for `get_EDIT_ecoclass_by_geoUnit()` and `make_EDIT_service_URL()`) updated to new USDA-managed server: - SoilWeb-based Web Coverage Services (`soilColor.wcs()`, `ISSR800.wcs()`, `mukey.wcs()`) have been updated with FY26 maps, now including most OCONUS soil surveys (AK, HI, PR, PW, GU, AS, MP) + - `downloadSSURGO()` gains arguments `include_spatial` and `include_tabular` that are analogous to arguments of the same name from `createSSURGO()`. Thanks to feature request from @dylanbeaudette (#470). # soilDB 2.9.1 (2026-04-01) - `ROSETTA()` updates thanks to Todd Skaggs (USDA-ARS): From 281a29a17f3cb00c81fa908f2560c2854cb6ebb8 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 17:59:49 -0700 Subject: [PATCH 04/10] docs: reorganize details in downloadSSURGO --- R/createSSURGO.R | 23 +++++++++++------------ man/downloadSSURGO.Rd | 23 +++++++++++------------ 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index b1478bf4..d7054cdd 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -38,18 +38,6 @@ #' `FALSE` #' @param quiet _logical_. Passed to `curl::curl_download()`. #' -#' @details When `db="STATSGO"` the `WHERE` argument is not supported. Allowed `areasymbols` include -#' `"US"` and two-letter state codes e.g. `"WY"` for the Wyoming general soils map. -#' -#' As in `createSSURGO()`, the `include_spatial` and `include_tabular` arguments either take a -#' logical value (default `TRUE`) or a character vector of the specific table names to include. Note -#' that when used in `downloadSSURGO()` the required metadata files are _always_ extracted to -#' facilitate mapping to user-facing table names. These arguments allow for customizing the files -#' that get extracted from ZIP files, not just filtering on file names (as is implemented with -#' pre-existing `pattern` argument). This can dramatically improve efficiency of extraction and the -#' overall size of the data in `exdir`. These arguments can be used in conjunction with the -#' `pattern` argument to fine-tune the files included in the generated snapshot database. -#' #' @export #' #' @details Pipe-delimited TXT files are found in _/tabular/_ folder extracted from a SSURGO ZIP. @@ -63,6 +51,17 @@ #' have prefix `soilmu_` (mapunit), `soilsa_` (survey area), `soilsf_` (special features). There #' will also be a TXT file with prefix `soilsf_` describing any special features. Shapefile names #' then have an `a_` (polygon), `l_` (line), `p_` (point) followed by the soil survey area symbol. +#' When `db="STATSGO"` the `WHERE` argument is not supported. Allowed `areasymbols` include +#' `"US"` and two-letter state codes e.g. `"WY"` for the Wyoming general soils map. +#' +#' As in `createSSURGO()`, the `include_spatial` and `include_tabular` arguments either take a +#' logical value (default `TRUE`) or a character vector of the specific table names to include. Note +#' that when used in `downloadSSURGO()` the required metadata files are _always_ extracted to +#' facilitate mapping to user-facing table names. These arguments allow for customizing the files +#' that get extracted from ZIP files, not just filtering on file names (as is implemented with +#' pre-existing `pattern` argument). This can dramatically improve efficiency of extraction and the +#' overall size of the data in `exdir`. These arguments can be used in conjunction with the +#' `pattern` argument to fine-tune the files included in the generated snapshot database. #' #' @return _character_. Paths to downloaded ZIP files (invisibly). May not exist if `remove_zip = #' TRUE`. diff --git a/man/downloadSSURGO.Rd b/man/downloadSSURGO.Rd index 526f3db8..01663e0a 100644 --- a/man/downloadSSURGO.Rd +++ b/man/downloadSSURGO.Rd @@ -73,18 +73,6 @@ To specify the Soil Survey Areas you would like to obtain data you use a \code{W of \code{sacatalog} table such as \code{areasymbol = 'CA067'}, \code{"areasymbol IN ('CA628', 'CA067')"} or \verb{areasymbol LIKE 'CT\%'}. -When \code{db="STATSGO"} the \code{WHERE} argument is not supported. Allowed \code{areasymbols} include -\code{"US"} and two-letter state codes e.g. \code{"WY"} for the Wyoming general soils map. - -As in \code{createSSURGO()}, the \code{include_spatial} and \code{include_tabular} arguments either take a -logical value (default \code{TRUE}) or a character vector of the specific table names to include. Note -that when used in \code{downloadSSURGO()} the required metadata files are \emph{always} extracted to -facilitate mapping to user-facing table names. These arguments allow for customizing the files -that get extracted from ZIP files, not just filtering on file names (as is implemented with -pre-existing \code{pattern} argument). This can dramatically improve efficiency of extraction and the -overall size of the data in \code{exdir}. These arguments can be used in conjunction with the -\code{pattern} argument to fine-tune the files included in the generated snapshot database. - Pipe-delimited TXT files are found in \emph{/tabular/} folder extracted from a SSURGO ZIP. The files are named for tables in the SSURGO schema. There is no header and the files do not have column names. See the \emph{Soil Data Access Tables and Columns Report}: @@ -96,6 +84,17 @@ Several ESRI shapefiles are found in the \emph{/spatial/} folder extracted from have prefix \code{soilmu_} (mapunit), \code{soilsa_} (survey area), \code{soilsf_} (special features). There will also be a TXT file with prefix \code{soilsf_} describing any special features. Shapefile names then have an \code{a_} (polygon), \code{l_} (line), \code{p_} (point) followed by the soil survey area symbol. +When \code{db="STATSGO"} the \code{WHERE} argument is not supported. Allowed \code{areasymbols} include +\code{"US"} and two-letter state codes e.g. \code{"WY"} for the Wyoming general soils map. + +As in \code{createSSURGO()}, the \code{include_spatial} and \code{include_tabular} arguments either take a +logical value (default \code{TRUE}) or a character vector of the specific table names to include. Note +that when used in \code{downloadSSURGO()} the required metadata files are \emph{always} extracted to +facilitate mapping to user-facing table names. These arguments allow for customizing the files +that get extracted from ZIP files, not just filtering on file names (as is implemented with +pre-existing \code{pattern} argument). This can dramatically improve efficiency of extraction and the +overall size of the data in \code{exdir}. These arguments can be used in conjunction with the +\code{pattern} argument to fine-tune the files included in the generated snapshot database. } \seealso{ \code{\link[=createSSURGO]{createSSURGO()}} From 2abe58baa65b2fe55681dc521aeb04a842ad2b58 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 20:02:22 -0700 Subject: [PATCH 05/10] fix(.inventory_ssurgo_files): handling for NULL pattern --- R/createSSURGO.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index d7054cdd..577eee3a 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -273,9 +273,14 @@ createSSURGO <- function(filename = NULL, layer_names <- .get_spatial_layer_names() f <- list.files(exdir, recursive = TRUE, full.names = TRUE) + fdx <- rep(TRUE, length(f)) + + if (!is.null(pattern)) { + fdx <- grepl(pattern, f) + } inv <- .inventory_ssurgo_files( - files = f[grepl(pattern, f)], + files = f[fdx], layer_names = layer_names, include_spatial = include_spatial, include_tabular = include_tabular, From 157571f2c8e529661092b006f91f43e7073cf0f7 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 21:07:24 -0700 Subject: [PATCH 06/10] fix(.inventory_ssurgo_files): pass exdir through --- R/createSSURGO.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index 577eee3a..59b2346e 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -150,7 +150,7 @@ downloadSSURGO <- function(WHERE = NULL, "^(mstab|mdstattabs|MetadataTable|mstabcol|mdstattabcol|MetadataColumnLookup|msidxdet|mdstatidxdet|MetadataIndexDetail)$", tools::file_path_sans_ext(basename(lz)) )], exdir = exdir) - inv <- .inventory_ssurgo_files(lz, include_spatial = include_spatial, include_tabular = include_tabular) + inv <- .inventory_ssurgo_files(lz, exdir = exdir, include_spatial = include_spatial, include_tabular = include_tabular) lz <- unlist(c(inv$f.shp.sc, inv$f.txt.grp)) } uz <- utils::unzip(paths2[i], files = lz, exdir = exdir) @@ -281,6 +281,7 @@ createSSURGO <- function(filename = NULL, inv <- .inventory_ssurgo_files( files = f[fdx], + exdir = exdir, layer_names = layer_names, include_spatial = include_spatial, include_tabular = include_tabular, @@ -420,7 +421,7 @@ createSSURGO <- function(filename = NULL, # build type mapping from SSURGO logicaldatatype metadata # mstabcol columns: 1=tabphyname, 2=colsequence, 3=colphyname, 4=collogname, # 5=uomabbrev, 6=logicaldatatype, 7=notnull, 8=fieldsize - if (exists("mstabcol") && length(mstabcol) >= 6) { + if (length(mstabcol) >= 6) { .ssurgo_type_map <- c( String = "character", Choice = "character", Vtext = "character", `Date/Time` = "character", @@ -550,6 +551,7 @@ createSSURGO <- function(filename = NULL, } .inventory_ssurgo_files <- function(files, + exdir, pattern = NULL, layer_names = .get_spatial_layer_names(), include_spatial = TRUE, @@ -596,7 +598,12 @@ createSSURGO <- function(filename = NULL, msidxdn <- f.txt.grp[[which(names(f.txt.grp) %in% c("msidxdet", "mdstatidxdet", "MetadataIndexDetail"))[1]]][[1]] if (length(mstabn) >= 1) { - mstab <- read.delim(mstabn[1], sep = "|", stringsAsFactors = FALSE, header = header) + mstab <- read.delim( + file.path(exdir, mstabn[1]), + sep = "|", + stringsAsFactors = FALSE, + header = header + ) mstab_lut <- c(mstab[[1]], "soil_metadata") names(mstab_lut) <- c(mstab[[5]], "soil_metadata") } else { From 47801eeb5afb35099c9c6813f5bd417250e56752 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 21:28:13 -0700 Subject: [PATCH 07/10] fixup!: fix(.inventory_ssurgo_files): pass exdir through --- R/createSSURGO.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index 59b2346e..0a44554a 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -281,7 +281,6 @@ createSSURGO <- function(filename = NULL, inv <- .inventory_ssurgo_files( files = f[fdx], - exdir = exdir, layer_names = layer_names, include_spatial = include_spatial, include_tabular = include_tabular, From 3a1158345dc7b9a8acb558a0729d9d56d000bbec Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 22:28:20 -0700 Subject: [PATCH 08/10] fix(.inventory_ssurgo_files): spatial layer pattern --- R/createSSURGO.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index 0a44554a..997b7933 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -575,7 +575,10 @@ createSSURGO <- function(filename = NULL, idx <- paste0(shp.grp[, 1], "_", shp.grp[, 2]) %in% names(layer_names[layer_names %in% include_spatial]) shp.grp <- shp.grp[idx, ] f.shp <- f.shp[idx] - f.shp.sc <- files[grepl(paste0("soil", shp.grp[1], "_", shp.grp[2], "_", shp.grp[3]), files)] + f.shp.sc <- files[grepl(paste0( + paste0("soil", shp.grp[, 1], "_", shp.grp[, 2], "_", shp.grp[, 3]), + collapse = "|" + ), files)] include_spatial <- TRUE } From 576908a63e3a1fb936107696fa8fa10c304e2627 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Tue, 5 May 2026 22:37:50 -0700 Subject: [PATCH 09/10] fix: no exdir for createSSURGO --- R/createSSURGO.R | 210 ++++++++++++++++++++++++----------------------- 1 file changed, 107 insertions(+), 103 deletions(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index 997b7933..58db0d93 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -53,7 +53,7 @@ #' then have an `a_` (polygon), `l_` (line), `p_` (point) followed by the soil survey area symbol. #' When `db="STATSGO"` the `WHERE` argument is not supported. Allowed `areasymbols` include #' `"US"` and two-letter state codes e.g. `"WY"` for the Wyoming general soils map. -#' +#' #' As in `createSSURGO()`, the `include_spatial` and `include_tabular` arguments either take a #' logical value (default `TRUE`) or a character vector of the specific table names to include. Note #' that when used in `downloadSSURGO()` the required metadata files are _always_ extracted to @@ -62,7 +62,7 @@ #' pre-existing `pattern` argument). This can dramatically improve efficiency of extraction and the #' overall size of the data in `exdir`. These arguments can be used in conjunction with the #' `pattern` argument to fine-tune the files included in the generated snapshot database. -#' +#' #' @return _character_. Paths to downloaded ZIP files (invisibly). May not exist if `remove_zip = #' TRUE`. #' @seealso [createSSURGO()] @@ -78,43 +78,43 @@ downloadSSURGO <- function(WHERE = NULL, remove_zip = FALSE, overwrite = FALSE, quiet = FALSE) { - + db <- match.arg(toupper(db), c('SSURGO', 'STATSGO')) - + if (!is.null(WHERE) && db == "STATSGO") { stop('custom WHERE clause not supported with db="STATSGO"', call. = FALSE) } - + if (!is.null(areasymbols) && db == "STATSGO") { WHERE <- areasymbols } - + if (is.null(WHERE) && is.null(areasymbols)) { stop('must specify either `WHERE` or `areasymbols` argument', call. = FALSE) } - + if (is.null(WHERE) && !is.null(areasymbols)) { WHERE <- sprintf("areasymbol IN %s", format_SQL_in_statement(areasymbols)) } - + if (!is.character(WHERE)) { # attempt passing WHERE to SDA_spatialQuery res <- suppressMessages(SDA_spatialQuery(WHERE, what = 'areasymbol')) WHERE <- paste("areasymbol IN", format_SQL_in_statement(res$areasymbol)) } - + # make WSS download URLs from areasymbol, template, date urls <- .make_WSS_download_url(WHERE, include_template = include_template, db = db) - + if (inherits(urls, 'try-error')) { message(urls[1]) return(invisible(urls)) } - + if (!dir.exists(destdir)) { dir.create(destdir, recursive = TRUE) } - + # download files for (i in seq_along(urls)) { destfile <- file.path(destdir, basename(urls[i])) @@ -122,23 +122,23 @@ downloadSSURGO <- function(WHERE = NULL, try(curl::curl_download(urls[i], destfile = destfile, quiet = quiet, mode = "wb", handle = .soilDB_curl_handle()), silent = quiet) } } - + paths <- list.files(destdir, pattern = "\\.zip$", full.names = TRUE) paths2 <- paths[grep(".*wss_(SSA|gsmsoil)_(.*)_.*", paths)] - + if (extract) { if (!quiet) { message("Extracting downloaded ZIP files...") } - + if (length(paths2) == 0) { stop("Could not find SSURGO ZIP files in `destdir`: ", destdir, call. = FALSE) } - + if (!dir.exists(exdir)) { dir.create(exdir, recursive = TRUE) } - + res <- lapply(seq_along(paths2), function(i) { ssa <- gsub(".*wss_SSA_(.*)_.*", "\\1", paths2[i]) if (isTRUE(include_spatial) && isTRUE(include_tabular)) { @@ -162,15 +162,15 @@ downloadSSURGO <- function(WHERE = NULL, } } }) - + if (remove_zip) { file.remove(paths2) } } - + invisible(paths2) } - + #' Create a database from SSURGO Exports #' #' The following database types are tested and fully supported: @@ -194,13 +194,13 @@ downloadSSURGO <- function(WHERE = NULL, #' @param include_spatial _logical_ or _character_. Include spatial data layers in database? #' Default: `TRUE` inserts all spatial tables. If `include_spatial` is a _character_ vector #' containing table names, only that set are written to file. e.g. `include_spatial=c("mupolygon", -#' "featpoint")` writes only the mapunit polygons and special feature points. +#' "featpoint")` writes only the mapunit polygons and special feature points. #' @param include_tabular _logical_ or _character_. Include tabular data layers in database? #' Default: `TRUE` inserts all tabular tables. If `include_tabular` is a _character_ vector #' containing table names, only that set are written to file. e.g. `include_tabular=c("mapunit", #' "muaggatt")` writes only the `mapunit` and `muaggatt` tables. Note that special feature #' descriptions are stored in table `"featdesc"` and metadata for each soil survey area are stored -#' in `"soil_metadata"` tables. +#' in `"soil_metadata"` tables. #' @param dissolve_field _character_. Dissolve geometries to create MULTIPOLYGON features? Column #' name #' specified is the grouping variable. Default: `NULL` does no aggregation, giving 1 `POLYGON` @@ -236,16 +236,16 @@ createSSURGO <- function(filename = NULL, header = FALSE, quiet = TRUE, ...) { - + if ((missing(filename) || length(filename) == 0) && missing(conn)) { stop("`filename` should be a path to a .gpkg or .sqlite file to create or append to, or a DBIConnection should be provided via `conn`.") } - + if (missing(conn) || is.null(conn)) { # delete existing file if overwrite=TRUE; does _not_ apply to DBIConnection if (file.exists(filename)) { if (isTRUE(overwrite) && isFALSE(append)) { - file.remove(filename) + file.remove(filename) } else if (isTRUE(overwrite) && isTRUE(append)) { stop("Both overwrite=TRUE and append=TRUE; set only one argument to TRUE", call. = FALSE) } else if (isFALSE(overwrite) && isFALSE(append)) { @@ -253,10 +253,10 @@ createSSURGO <- function(filename = NULL, } } } - + # DuckDB has special spatial format, so it gets custom handling for IS_DUCKDB <- inherits(conn, "duckdb_connection") - + if (inherits(conn, 'SQLiteConnection')) { IS_GPKG <- grepl("\\.gpkg$", conn@dbname, ignore.case = TRUE)[1] filename <- conn@dbname @@ -266,19 +266,19 @@ createSSURGO <- function(filename = NULL, IS_GPKG <- FALSE } } - + if (!IS_DUCKDB && !requireNamespace("sf")) { stop("package `sf` is required to write spatial datasets to DBI data sources", call. = FALSE) - } - + } + layer_names <- .get_spatial_layer_names() f <- list.files(exdir, recursive = TRUE, full.names = TRUE) fdx <- rep(TRUE, length(f)) - + if (!is.null(pattern)) { fdx <- grepl(pattern, f) } - + inv <- .inventory_ssurgo_files( files = f[fdx], layer_names = layer_names, @@ -286,32 +286,32 @@ createSSURGO <- function(filename = NULL, include_tabular = include_tabular, header = header ) - + # inventory method converts partial sets (character vectors) to logical for include_* args include_spatial <- inv$include_spatial include_tabular <- inv$include_tabular - + if ((missing(conn) || is.null(conn)) && !IS_GPKG) { - + if (!requireNamespace("RSQLite")) { stop("package 'RSQLite' is required (when `conn` is not specified)", call. = FALSE) } - + conn <- DBI::dbConnect(DBI::dbDriver("SQLite"), - filename, + filename, loadable.extensions = TRUE) - + # if user did not specify their own connection, close on exit on.exit(DBI::dbDisconnect(conn)) - } - + } + if (nrow(inv$shp.grp) >= 1 && ncol(inv$shp.grp) == 3 && include_spatial) { f.shp.grp <- split(inv$f.shp, list(feature = inv$shp.grp[, 1], geom = inv$shp.grp[, 2]), drop = TRUE) - + if (IS_DUCKDB) { DBI::dbExecute(conn, "INSTALL spatial; LOAD spatial;") } - + for (i in seq_along(f.shp.grp)) { for (j in seq_along(f.shp.grp[[i]])) { lnm <- layer_names[match(gsub(".*soil([musfa]{2}_[apl])_.*", "\\1", f.shp.grp[[i]][j]), @@ -327,10 +327,10 @@ createSSURGO <- function(filename = NULL, } } else { shp <- sf::read_sf(f.shp.grp[[i]][j]) - + colnames(shp) <- tolower(colnames(shp)) sf::st_geometry(shp) <- "geometry" - + .st_write_sf_conn <- function(x, dsn, layer, j, overwrite) { if ((i == 1 && j == 1) && isFALSE(append)) { sf::write_sf( @@ -350,7 +350,7 @@ createSSURGO <- function(filename = NULL, ) } } - + # dissolve on dissolve_field # TODO: add nationalmusym to spatial layer? if (!is.null(dissolve_field) && dissolve_field %in% colnames(shp)) { @@ -366,20 +366,20 @@ createSSURGO <- function(filename = NULL, } y[1] }, do_union = TRUE) - + shp[[paste0(dissolve_field, ".1")]] <- NULL - + destgeom <- "MULTIPOLYGON" if (any(sf::st_geometry_type(shp) %in% c("POINT", "MULTIPOINT"))) { destgeom <- "MULTIPOINT" } else if (any(sf::st_geometry_type(shp) %in% c("LINESTRING", "MULTILINESTRING"))) { destgeom <- "MULTILINESTRING" } - + shp <- sf::st_cast(shp, destgeom) } } - + if (IS_GPKG && missing(conn)) { # writing to SQLiteConnection fails to create proper gpkg_contents entries # so use the path for GPKG only @@ -392,27 +392,27 @@ createSSURGO <- function(filename = NULL, } } } - + if (IS_GPKG) { - + if (!requireNamespace("RSQLite")) { stop("package 'RSQLite' is required (when `conn` is not specified)", call. = FALSE) } - + conn <- DBI::dbConnect(DBI::dbDriver("SQLite"), - filename, + filename, loadable.extensions = TRUE) - + # if user did not specify their own connection, close on exit on.exit(DBI::dbDisconnect(conn)) - } - + } + if (include_tabular) { if (length(inv$mstabcn) >= 1) { mstabcol <- read.delim(inv$mstabcn[1], sep = "|", stringsAsFactors = FALSE, header = header) } - + if (length(inv$msidxdn) >= 1) { msidxdet <- read.delim(inv$msidxdn[1], sep = "|", stringsAsFactors = FALSE, header = header) } @@ -422,7 +422,7 @@ createSSURGO <- function(filename = NULL, # 5=uomabbrev, 6=logicaldatatype, 7=notnull, 8=fieldsize if (length(mstabcol) >= 6) { .ssurgo_type_map <- c( - String = "character", Choice = "character", Vtext = "character", + String = "character", Choice = "character", Vtext = "character", `Date/Time` = "character", Integer = "integer", Float = "numeric", Boolean = "logical" ) @@ -431,21 +431,21 @@ createSSURGO <- function(filename = NULL, } lapply(names(inv$f.txt.grp), function(x) { - + if (!is.null(mstabcol)) { newnames <- mstabcol[[3]][mstabcol[[1]] == inv$mstab_lut[x]] } - + if (!is.null(msidxdet)) { indexPK <- na.omit(msidxdet[[4]][msidxdet[[1]] == inv$mstab_lut[x] & grepl("PK_", msidxdet[[2]])]) indexDI <- na.omit(msidxdet[[4]][msidxdet[[1]] == inv$mstab_lut[x] & grepl("DI_", msidxdet[[2]])]) } - + d <- try(lapply(seq_along(inv$f.txt.grp[[x]]), function(i) { # message(f.txt.grp[[x]][i]) y <- try(read.delim(inv$f.txt.grp[[x]][i], sep = "|", stringsAsFactors = FALSE, header = header, na.strings = c("", "NA")), silent = TRUE) - + if (inherits(y, 'try-error')) { if (!quiet) { message("File ", inv$f.txt.grp[[x]][i], " contains no data") @@ -472,59 +472,59 @@ createSSURGO <- function(filename = NULL, # readme, version return(NULL) } - + # remove deeper rules from cointerp for smaller DB size # most people only use depth==0 (default) if (inv$mstab_lut[x] == "cointerp" && !is.null(maxruledepth)) { y <- y[y$ruledepth <= maxruledepth, ] } - + if ("musym" %in% colnames(y)) { y$musym <- as.character(y$musym) } - + try({ if (i == 1 && isFALSE(append)) { DBI::dbWriteTable(conn, inv$mstab_lut[x], y, overwrite = overwrite) } else { if (DBI::dbExistsTable(conn, inv$mstab_lut[x]) && x %in% inv$txt.first) { - # skip writing sdv/mds* metadata tables to avoid uniqueness issues + # skip writing sdv/mds* metadata tables to avoid uniqueness issues return(FALSE) } DBI::dbWriteTable(conn, inv$mstab_lut[x], y, append = TRUE) } }, silent = quiet) }), silent = quiet) - + if (length(inv$mstab_lut[x]) && is.na(inv$mstab_lut[x])) { inv$mstab_lut[x] <- x } - + if (length(inv$mstab_lut[x]) && !is.na(inv$mstab_lut[x])) { - + # create pkey indices if (!is.null(indexPK) && length(indexPK) > 0) { try({ - q <- sprintf("CREATE UNIQUE INDEX IF NOT EXISTS %s ON %s (%s)", - paste0('PK_', inv$mstab_lut[x]), inv$mstab_lut[x], + q <- sprintf("CREATE UNIQUE INDEX IF NOT EXISTS %s ON %s (%s)", + paste0('PK_', inv$mstab_lut[x]), inv$mstab_lut[x], paste(indexPK, collapse = ",")) if (DBI::dbExistsTable(conn, inv$mstab_lut[x])) DBI::dbExecute(conn, q) }, silent = quiet) } - + # create key indices if (!is.null(indexDI) && length(indexDI) > 0) { for (i in seq_along(indexDI)) { try({ - q <- sprintf("CREATE INDEX IF NOT EXISTS %s ON %s (%s)", + q <- sprintf("CREATE INDEX IF NOT EXISTS %s ON %s (%s)", paste0('DI_', inv$mstab_lut[x]), inv$mstab_lut[x], indexDI[i]) if (DBI::dbExistsTable(conn, inv$mstab_lut[x])) DBI::dbExecute(conn, q) }, silent = quiet) } } - + # for GPKG output, add gpkg_contents (metadata for features and attributes) if (IS_GPKG) { if (!.gpkg_has_contents(conn)) { @@ -537,28 +537,28 @@ createSSURGO <- function(filename = NULL, try(.gpkg_add_contents(conn, inv$mstab_lut[x])) } } - + # TODO: other foreign keys/relationships? ALTER TABLE/ADD CONSTRAINT not available in SQLite # the only way to add a foreign key is via CREATE TABLE which means refactoring above two # steps into a single SQL statement (create table with primary and foreign keys) } }) } - + res <- DBI::dbListTables(conn) invisible(res) } .inventory_ssurgo_files <- function(files, - exdir, + exdir = NULL, pattern = NULL, layer_names = .get_spatial_layer_names(), include_spatial = TRUE, include_tabular = TRUE, header = FALSE) { - + # create and add combined vector datasets: - # "soilmu_a", "soilmu_l", "soilmu_p", "soilsa_a", "soilsf_l", "soilsf_p" + # "soilmu_a", "soilmu_l", "soilmu_p", "soilsa_a", "soilsf_l", "soilsf_p" f.shp <- files[grepl(".*\\.shp$", files)] shp.grp <- do.call('rbind', strsplit( gsub( @@ -569,7 +569,7 @@ createSSURGO <- function(filename = NULL, ";", fixed = TRUE )) - + f.shp.sc <- character(0) if (is.character(include_spatial)) { idx <- paste0(shp.grp[, 1], "_", shp.grp[, 2]) %in% names(layer_names[layer_names %in% include_spatial]) @@ -581,43 +581,47 @@ createSSURGO <- function(filename = NULL, ), files)] include_spatial <- TRUE } - + # create and add combined tabular datasets f.txt <- files[grepl(".*\\.txt$", files)] txt.grp <- gsub("\\.txt", "", basename(f.txt)) - + # explicit handling special feature descriptions -> "featdesc" table txt.grp[grepl("soilsf_t_", txt.grp)] <- "featdesc" txt.grp[grepl("soil_metadata_", txt.grp)] <- "soil_metadata" txt.first <- unique(txt.grp[grep("^sdv|^md*s|^Metadata", txt.grp)]) - + f.txt.grp <- split(f.txt, txt.grp) f.txt.grp[txt.first] <- lapply(f.txt.grp[txt.first], .subset, 1) - + # get table, column, index lookup tables mstabn <- f.txt.grp[[which(names(f.txt.grp) %in% c("mstab", "mdstattabs", "MetadataTable"))[1]]][[1]] mstabcn <- f.txt.grp[[which(names(f.txt.grp) %in% c("mstabcol", "mdstattabcols", "MetadataColumnLookup"))[1]]][[1]] msidxdn <- f.txt.grp[[which(names(f.txt.grp) %in% c("msidxdet", "mdstatidxdet", "MetadataIndexDetail"))[1]]][[1]] - + if (length(mstabn) >= 1) { - mstab <- read.delim( - file.path(exdir, mstabn[1]), - sep = "|", - stringsAsFactors = FALSE, - header = header - ) + mstab <- read.delim( + ifelse( + is.null(exdir), + yes = mstabn[1], + no = file.path(exdir, mstabn[1]) + ), + sep = "|", + stringsAsFactors = FALSE, + header = header + ) mstab_lut <- c(mstab[[1]], "soil_metadata") names(mstab_lut) <- c(mstab[[5]], "soil_metadata") } else { mstab_lut <- names(f.txt.grp) names(mstab_lut) <- names(f.txt.grp) } - + if (is.character(include_tabular)) { f.txt.grp <- f.txt.grp[names(mstab_lut[mstab_lut %in% include_tabular])] include_tabular <- TRUE } - + list( f.shp = f.shp, shp.grp = shp.grp, @@ -644,7 +648,7 @@ createSSURGO <- function(filename = NULL, `sf_p` = "featpoint" ) } - + # helper: coerce columns to schema types from mstabcol metadata .coerce_ssurgo_types <- function(y, tablename, mstabcol, type_map) { if (is.null(type_map) || length(mstabcol) < 6) @@ -665,7 +669,7 @@ createSSURGO <- function(filename = NULL, } y } - + ## From https://github.com/brownag/gpkg ----- #' Add, Remove, Update and Create `gpkg_contents` table and records @@ -678,9 +682,9 @@ createSSURGO <- function(filename = NULL, #' @noRd #' @keywords internal .gpkg_add_contents <- function(con, table_name, description = "", template = NULL) { - + stopifnot(requireNamespace("RSQLite")) - + if (!missing(template) && !is.null(template) && is.list(template) && @@ -691,13 +695,13 @@ createSSURGO <- function(filename = NULL, ex <- c(-180, -90, 180, 90) cr <- 4326 } - + # append to gpkg_contents RSQLite::dbExecute(con, paste0( - "INSERT INTO gpkg_contents (table_name, data_type, identifier, + "INSERT INTO gpkg_contents (table_name, data_type, identifier, description, last_change, - min_x, min_y, max_x, max_y, srs_id) + min_x, min_y, max_x, max_y, srs_id) VALUES ('", table_name , "', 'attributes', '", @@ -722,7 +726,7 @@ createSSURGO <- function(filename = NULL, RSQLite::dbExecute(con, paste0("DELETE FROM gpkg_contents WHERE table_name = '", table_name, "'")) } -#' @description `.gpkg_has_contents()`: Determine if a database has table named `"gpkg_contents"` +#' @description `.gpkg_has_contents()`: Determine if a database has table named `"gpkg_contents"` #' @noRd #' @keywords internal .gpkg_has_contents <- function(con) { @@ -730,7 +734,7 @@ createSSURGO <- function(filename = NULL, isTRUE("gpkg_contents" %in% RSQLite::dbListTables(con)) } -#' @description `.gpkg_has_contents()`: Determine if a database has table named `"gpkg_contents"` +#' @description `.gpkg_has_contents()`: Determine if a database has table named `"gpkg_contents"` #' @noRd #' @keywords internal .gpkg_create_contents <- function(con) { @@ -748,7 +752,7 @@ createSSURGO <- function(filename = NULL, srs_id INTEGER, CONSTRAINT fk_gc_r_srs_id FOREIGN KEY (srs_id) REFERENCES gpkg_spatial_ref_sys(srs_id) )" - + if (!.gpkg_has_contents(con)) { RSQLite::dbExecute(con, q) } else return(1) From 2de26b9ed338d410a545ed2be6692d874355b7d7 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Wed, 6 May 2026 11:35:14 -0700 Subject: [PATCH 10/10] fix: drop = FALSE --- R/createSSURGO.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/createSSURGO.R b/R/createSSURGO.R index 58db0d93..75714f98 100644 --- a/R/createSSURGO.R +++ b/R/createSSURGO.R @@ -573,7 +573,7 @@ createSSURGO <- function(filename = NULL, f.shp.sc <- character(0) if (is.character(include_spatial)) { idx <- paste0(shp.grp[, 1], "_", shp.grp[, 2]) %in% names(layer_names[layer_names %in% include_spatial]) - shp.grp <- shp.grp[idx, ] + shp.grp <- shp.grp[idx, , drop = FALSE] f.shp <- f.shp[idx] f.shp.sc <- files[grepl(paste0( paste0("soil", shp.grp[, 1], "_", shp.grp[, 2], "_", shp.grp[, 3]),