From 80aa7be1c2d3952f6caf18d878dc600d445c6069 Mon Sep 17 00:00:00 2001 From: DavZim Date: Fri, 6 Dec 2024 13:40:43 +0000 Subject: [PATCH 1/3] fix sharedString deletion --- R/wrappers.R | 65 +++++++++++------- tests/testthat/test-wrappers.R | 117 +++++++++++++++++++++++++++++++-- 2 files changed, 156 insertions(+), 26 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index e2d323b9..2d2aa4b4 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -1900,45 +1900,66 @@ deleteDataColumn <- function(wb, sheet, col) { } a <- wb$worksheets[[sheet]]$sheet_data + + # t: if a shared string is used or if the string is a value itself + # v: the shared string index or the string itself (if t == 0) + # in wb$sharedStrings we find the values of the shared strings by index + 1 (0 indexed!) + # check which elements to delete keep <- a$cols != col # if there is no column to delete, exit early if (all(keep)) return(invisible(0)) - # delete cols in cols "col" move higher cols one down + # delete cols in cols "col", move higher cols one down a$cols <- as.integer(a$cols[keep] - 1 * (a$cols[keep] > col)) a$rows <- a$rows[keep] # reduce the shared strings pointers if they are not used anymore - has_t <- !is.na(a$t) & a$t == 1 - used_shared <- a$v[has_t] # a reference to all shared strings - keep_t <- keep[has_t] # these shared strings are kept - keep_t[is.na(keep_t)] <- FALSE - keep_shared <- used_shared[keep_t] - rem_shared <- setdiff(unique(used_shared[!keep_t]), unique(keep_shared)) - for (v in rem_shared) { - to_reduce <- as.numeric(keep_shared) > as.numeric(v) - to_reduce[is.na(to_reduce)] <- FALSE - if (any(to_reduce)) - keep_shared[to_reduce] <- as.character(as.numeric(keep_shared[to_reduce]) - 1) - } - used_shared[keep_t] <- keep_shared - a$v[has_t] <- used_shared - + ss <- data.frame( + # the old index 0 indexed, as used in a$v + old = as.numeric(seq(length(wb$sharedStrings)) - 1), + # will hold the new index 0 indexed, as used in a$v + new = NA, + # the actual strings + string = wb$sharedStrings + ) + + # 1. remove the values from sheet_data (a) a$v <- a$v[keep] a$t <- a$t[keep] - + + # update the shared strings map (ss) with the new indices + + # v_this_sheet etc are the indices that are still used + v_this_sheet <- as.numeric(a$v[!is.na(a$t) & a$t == 1]) + # get all string indices from other sheets, so that strings used in other sheets are not deleted! + v_other_sheets <- unlist(lapply(setdiff(seq_along(wb$worksheets), sheet), function(sh) { + a <- wb$worksheets[[sh]]$sheet_data + as.numeric(a$v[!is.na(a$t) & a$t == 1]) + })) + + idx <- sort(unique(c(v_this_sheet, v_other_sheets))) + ss$new[ss$old %in% idx] <- seq_along(idx) - 1 + + # 2. remove the values from the sharedStrings object + wb$sharedStrings <- wb$sharedStrings[idx + 1] + attr(wb$sharedStrings, "uniqueCount") <- length(idx) + + # 3. reindex the values from the sheet_data to use new shared strings indices + a$v[a$t == 1] <- as.character(ss$new[as.numeric(a$v[a$t == 1]) + 1]) + + # update the shared strings for all other sheets + for (s in setdiff(seq_along(wb$worksheets), sheet)) { + a <- wb$worksheets[[s]]$sheet_data + a$v[a$t == 1] <- as.character(ss$new[as.numeric(a$v[a$t == 1]) + 1]) + } + a$f <- updateFormula(a$f[keep], col = col) a$n_elements <- sum(keep) if ("data_count" %in% names(a)) a$data_count <- length(unique(a$v)) - # remove the unneeded strings from sharedStrings - rv <- as.numeric(rem_shared) + 1 - wb$sharedStrings <- wb$sharedStrings[-rv] - attr(wb$sharedStrings, "uniqueCount") <- length(unique(wb$sharedStrings)) - # adjust styles sheet_name <- wb$sheet_names[[sheet]] this_sheet <- sapply(wb$styleObjects, function(o) { diff --git a/tests/testthat/test-wrappers.R b/tests/testthat/test-wrappers.R index 146c5411..20f54faf 100644 --- a/tests/testthat/test-wrappers.R +++ b/tests/testthat/test-wrappers.R @@ -17,7 +17,7 @@ test_that("int2col and col2int", { test_that("deleteDataColumn basics", { wb <- createWorkbook() addWorksheet(wb, "tester") - + for (i in seq(5)) { mat <- data.frame(x = rep(paste0(int2col(i), i), 10)) writeData(wb, sheet = 1, startRow = 1, startCol = i, mat) @@ -29,8 +29,8 @@ test_that("deleteDataColumn basics", { c("=COUNTA(A2:A11)", "=COUNTA(B2:B11)", "=COUNTA(C2:C11)", "=COUNTA(D2:D11)", "=COUNTA(E2:E11)") ) - - + + deleteDataColumn(wb, 1, col = 3) expect_equal(read.xlsx(wb), data.frame(x = rep("A1", 10), x = "B2", x = "D4", x = "E5", # no C3! @@ -130,7 +130,7 @@ test_that("deleteDataColumn with formatting data", { st <- openxlsx::createStyle(textDecoration = "Bold", fontSize = 20, fontColour = "red") openxlsx::addStyle(wb, 1, style = st, rows = 1, cols = seq(ncol(df))) - + sst <- wb$styleObjects[[1]] sst$rows <- c(1, 1) sst$cols <- c(1, 2) @@ -140,3 +140,112 @@ test_that("deleteDataColumn with formatting data", { expect_equal(wb$styleObjects[[1]], sst) }) + +test_that("deleteDataColumn with shared strings does not crash or change inputs", { + df <- data.frame("Col 1" = "Row 2 Col 1", + "Col 2" = "Row 2 Col 2", + "Col 3" = "Row 2 Col 3", + check.names = FALSE) + + wb <- createWorkbook() + addWorksheet(wb, "tester") + writeData(wb, sheet = 1, startRow = 1, startCol = 1, x = df, colNames = TRUE) + + deleteDataColumn(wb, sheet = 1, col = 2) + + expect_equal( + wb$sharedStrings, + structure( + list( + "Col 1", + "Col 3", + "Row 2 Col 1", + "Row 2 Col 3" + ), + uniqueCount = 4L + ) + ) + expect_equal( + read.xlsx(wb), + data.frame( + "Col 1" = "Row 2 Col 1", + "Col 3" = "Row 2 Col 3" + ) + ) +}) + + +test_that("deleteDataColumn with shared strings on other sheets", { + df <- data.frame("ABC" = "I am a shared string with sheet 2!") + df2 <- data.frame("AB" = "I am a shared string with sheet 2!") + + wb <- createWorkbook() + addWorksheet(wb, "tester") + writeData(wb, sheet = 1, startRow = 1, startCol = 1, x = df, colNames = TRUE) + + simplify <- function(sd) data.frame(rows = sd$rows, cols = sd$cols, t = sd$t, v = sd$v) + expect_equal( + simplify(wb$worksheets[[1]]$sheet_data), + data.frame(rows = c(1, 2), cols = 1, t = 1, v = c("0", "1")) + ) + expect_equal( + wb$sharedStrings, + structure( + list( + "ABC", + "I am a shared string with sheet 2!" + ), + uniqueCount = 2L + ) + ) + + addWorksheet(wb, "tester2") + writeData(wb, sheet = 2, startRow = 1, startCol = 1, x = df2, colNames = TRUE) + + expect_equal( + simplify(wb$worksheets[[2]]$sheet_data), + data.frame(rows = c(1, 2), cols = 1, t = 1, v = c("2", "1")) + ) + expect_equal( + wb$sharedStrings, + structure( + list( + "ABC", + "I am a shared string with sheet 2!", + "AB" + ), + uniqueCount = 3L + ) + ) + + + # deleting from sheet 1 does not delete the string from sheet 2! + deleteDataColumn(wb, sheet = 1, col = 1) + + expect_equal( + simplify(wb$worksheets[[1]]$sheet_data), + data.frame(rows = numeric(0), cols = numeric(0), t = numeric(0), v = character(0)) + ) + + # note on sheet 2, the indices v to the shared strings have to change as well! + expect_equal( + simplify(wb$worksheets[[2]]$sheet_data), + data.frame(rows = c(1, 2), cols = 1, t = 1, v = c("1", "0")) + ) + + expect_equal( + wb$sharedStrings, + structure( + list( + "I am a shared string with sheet 2!", + "AB" + ), + uniqueCount = 2L + ) + ) + + expect_equal( + read.xlsx(wb, sheet = 2), + data.frame(AB = "I am a shared string with sheet 2!") + ) +}) From a722578b400b92ed1dc46e075baff4744078a6e9 Mon Sep 17 00:00:00 2001 From: DavZim Date: Fri, 6 Dec 2024 14:00:56 +0000 Subject: [PATCH 2/3] add safeguard against workbooks without any shared strings --- R/wrappers.R | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index 2d2aa4b4..81914479 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -1916,14 +1916,17 @@ deleteDataColumn <- function(wb, sheet, col) { a$rows <- a$rows[keep] # reduce the shared strings pointers if they are not used anymore - ss <- data.frame( - # the old index 0 indexed, as used in a$v - old = as.numeric(seq(length(wb$sharedStrings)) - 1), - # will hold the new index 0 indexed, as used in a$v - new = NA, - # the actual strings - string = wb$sharedStrings - ) + ss <- data.frame(old = numeric(0), new = numeric(0), string = character(0)) + if (length(wb$sharedStrings) > 0) { + ss <- data.frame( + # the old index 0 indexed, as used in a$v + old = as.numeric(seq(length(wb$sharedStrings)) - 1), + # will hold the new index 0 indexed, as used in a$v + new = NA, + # the actual strings + string = wb$sharedStrings + ) + } # 1. remove the values from sheet_data (a) a$v <- a$v[keep] From d438e0636b3d8d8bb2555c88ebca8b6a698a44c7 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Wed, 15 Jan 2025 18:30:24 +0100 Subject: [PATCH 3/3] cleanup whitespaces in test --- tests/testthat/test-wrappers.R | 72 +++++++++++++++++----------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/tests/testthat/test-wrappers.R b/tests/testthat/test-wrappers.R index 20f54faf..bd66c0d2 100644 --- a/tests/testthat/test-wrappers.R +++ b/tests/testthat/test-wrappers.R @@ -2,22 +2,22 @@ context("Test wrappers") test_that("int2col and col2int", { - + nums <- 2:27 - + chrs <- c("B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "AA") - + expect_equal(chrs, int2col(nums)) expect_equal(nums, col2int(chrs)) - + }) test_that("deleteDataColumn basics", { wb <- createWorkbook() addWorksheet(wb, "tester") - + for (i in seq(5)) { mat <- data.frame(x = rep(paste0(int2col(i), i), 10)) writeData(wb, sheet = 1, startRow = 1, startCol = i, mat) @@ -29,8 +29,8 @@ test_that("deleteDataColumn basics", { c("=COUNTA(A2:A11)", "=COUNTA(B2:B11)", "=COUNTA(C2:C11)", "=COUNTA(D2:D11)", "=COUNTA(E2:E11)") ) - - + + deleteDataColumn(wb, 1, col = 3) expect_equal(read.xlsx(wb), data.frame(x = rep("A1", 10), x = "B2", x = "D4", x = "E5", # no C3! @@ -40,8 +40,8 @@ test_that("deleteDataColumn basics", { c("=COUNTA(A2:A11)", "=COUNTA(B2:B11)", "=COUNTA(C2:C11)", "=COUNTA(D2:D11)") ) - - + + deleteDataColumn(wb, 1, col = 2) expect_equal(read.xlsx(wb), data.frame(x = rep("A1", 10), x = "D4", x = "E5", # no B2! @@ -50,7 +50,7 @@ test_that("deleteDataColumn basics", { setdiff(wb$worksheets[[1]]$sheet_data$f, NA), c("=COUNTA(A2:A11)", "=COUNTA(B2:B11)", "=COUNTA(C2:C11)") ) - + deleteDataColumn(wb, 1, col = 1) expect_equal(read.xlsx(wb), data.frame(x = rep("D4", 10), x = "E5", # no A1! @@ -68,22 +68,22 @@ test_that("deleteDataColumn with more complicated formulae", { addWorksheet(wb, "tester") writeData(wb, sheet = 1, startRow = 1, startCol = 1, x = matrix(c(1, 1), ncol = 1), colNames = FALSE) - + for (c in 2:10) writeFormula(wb, 1, sprintf("%s1 + 1", int2col(c - 1)), startRow = 1, startCol = c) - + for (c in 2:10) writeFormula(wb, 1, sprintf("%s1 + %s2", int2col(c), int2col(c - 1)), startRow = 2, startCol = c) - + for (c in 2:10) writeFormula(wb, 1, sprintf("%s2 + %s2", int2col(c), int2col(c + 1)), startRow = 3, startCol = c) - + deleteDataColumn(wb, 1, 3) # saveWorkbook(wb, "tester.xlsx") # and inspect by hand: expect lots of #REF! - + expect_equal(read.xlsx(wb), data.frame(`1` = 1, check.names = FALSE)) expect_equal( wb$worksheets[[1]]$sheet_data$f, @@ -103,19 +103,19 @@ test_that("deleteDataColumn with wide data", { colnames(df) <- int2col(seq(ncols)) writeData(wb, sheet = 1, startRow = 1, startCol = 1, x = df, colNames = TRUE) expect_equal(read.xlsx(wb), df) - + deleteDataColumn(wb, 1, 2) expect_equal(read.xlsx(wb), df[, -2]) - + deleteDataColumn(wb, 1, 100) expect_equal(read.xlsx(wb), df[, -2]) - + deleteDataColumn(wb, 1, 55) expect_equal(read.xlsx(wb), df[, -c(2, 56)]) # 56 b.c. one col was already taken out - + deleteDataColumn(wb, 1, 1) expect_equal(read.xlsx(wb), df[, -c(1, 2, 56)]) - + # delete all data for (i in seq(ncols - 2)) deleteDataColumn(wb, 1, 1) @@ -127,15 +127,15 @@ test_that("deleteDataColumn with formatting data", { addWorksheet(wb, "tester") df <- data.frame(x = 1:10, y = letters[1:10], z = 10:1) writeData(wb, sheet = 1, startRow = 1, startCol = 1, x = df, colNames = TRUE) - + st <- openxlsx::createStyle(textDecoration = "Bold", fontSize = 20, fontColour = "red") openxlsx::addStyle(wb, 1, style = st, rows = 1, cols = seq(ncol(df))) - + sst <- wb$styleObjects[[1]] sst$rows <- c(1, 1) sst$cols <- c(1, 2) deleteDataColumn(wb, 1, 2) - + expect_length(wb$styleObjects, 1) expect_equal(wb$styleObjects[[1]], sst) @@ -146,13 +146,13 @@ test_that("deleteDataColumn with shared strings does not crash or change inputs" "Col 2" = "Row 2 Col 2", "Col 3" = "Row 2 Col 3", check.names = FALSE) - + wb <- createWorkbook() addWorksheet(wb, "tester") writeData(wb, sheet = 1, startRow = 1, startCol = 1, x = df, colNames = TRUE) - + deleteDataColumn(wb, sheet = 1, col = 2) - + expect_equal( wb$sharedStrings, structure( @@ -178,11 +178,11 @@ test_that("deleteDataColumn with shared strings does not crash or change inputs" test_that("deleteDataColumn with shared strings on other sheets", { df <- data.frame("ABC" = "I am a shared string with sheet 2!") df2 <- data.frame("AB" = "I am a shared string with sheet 2!") - + wb <- createWorkbook() addWorksheet(wb, "tester") writeData(wb, sheet = 1, startRow = 1, startCol = 1, x = df, colNames = TRUE) - + simplify <- function(sd) data.frame(rows = sd$rows, cols = sd$cols, t = sd$t, v = sd$v) expect_equal( simplify(wb$worksheets[[1]]$sheet_data), @@ -198,10 +198,10 @@ test_that("deleteDataColumn with shared strings on other sheets", { uniqueCount = 2L ) ) - + addWorksheet(wb, "tester2") writeData(wb, sheet = 2, startRow = 1, startCol = 1, x = df2, colNames = TRUE) - + expect_equal( simplify(wb$worksheets[[2]]$sheet_data), data.frame(rows = c(1, 2), cols = 1, t = 1, v = c("2", "1")) @@ -217,22 +217,22 @@ test_that("deleteDataColumn with shared strings on other sheets", { uniqueCount = 3L ) ) - - + + # deleting from sheet 1 does not delete the string from sheet 2! deleteDataColumn(wb, sheet = 1, col = 1) - + expect_equal( simplify(wb$worksheets[[1]]$sheet_data), data.frame(rows = numeric(0), cols = numeric(0), t = numeric(0), v = character(0)) ) - + # note on sheet 2, the indices v to the shared strings have to change as well! expect_equal( simplify(wb$worksheets[[2]]$sheet_data), data.frame(rows = c(1, 2), cols = 1, t = 1, v = c("1", "0")) ) - + expect_equal( wb$sharedStrings, structure( @@ -243,7 +243,7 @@ test_that("deleteDataColumn with shared strings on other sheets", { uniqueCount = 2L ) ) - + expect_equal( read.xlsx(wb, sheet = 2), data.frame(AB = "I am a shared string with sheet 2!")