-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy path00_prep_functions.R
161 lines (143 loc) · 4.99 KB
/
00_prep_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#' @title FABIO download
#'
#' @description Download one or more files to a given path, skipping any
#' file that is already present on disk.
#'
#' @param file Files to download.
#' @param link Link(s) to download the files from.
#' @param path Path to download to.
#' @param v Whether to be verbose.
#'
#' @return Vector of \link[utils]{download.file}'s values. Entries for
#' files that were skipped (already present) keep the default value 0.
#'
#' @examples
#' \dontrun{
#' fa_dl(
#'   "Production_Crops_E_All_Data_(Normalized).zip",
#'   "http://fenixservices.fao.org/faostat/static/bulkdownloads/",
#'   "data/fao/"
#' )
#' }
fa_dl <- function(
  file, link, path, v = TRUE) {

  dl <- paste0(link, file)
  dest <- paste0(path, file)

  # Preallocate; download.file() returns 0 on success
  out <- vector("integer", length(file))
  for (i in seq_along(file)) {
    if (!file.exists(dest[i])) {
      out[i] <- download.file(dl[i], dest[i], method = "auto")
    } else if (v) {
      cat("Skipping download, already found:", file[i], "\n")
    }
  }

  out
}
#' @title FABIO extract to RDS
#' @description Extract CSV files from a ZIP archive and convert them to RDS.
#' Handles multiple ZIP files with one CSV each or a single ZIP with multiple
#' CSV files.
#'
#' @param path_in Path to the folder containing the ZIP archive(s).
#' @param files Name(s) of the ZIP archive(s) to extract from.
#' @param path_out Path to write RDS files to.
#' @param name Name of the RDS file to write.
#' @param extr File to extract from the ZIP archive. If not set and there are
#' multiple files only the first one will be extracted. Set to NULL or an empty
#' string to try extracting all. See \link[utils]{unzip} for further details.
#' @param col_types List of vectors with classes for \link[data.table]{fread}.
#' @param stack Whether to stack the CSV files via \link[data.table]{rbindlist}.
#' @param read_method Per-file reader, either "fread" or "read_csv". If NULL,
#' \link[data.table]{fread} is used for all files.
#' @param rm Whether to remove the extracted CSV files.
#' @param v Whether to be verbose.
#' @param ... Fed into \link[utils]{unzip}.
#'
#' @return Vector with created RDS files.
#'
#' @importFrom data.table fread rbindlist
#'
#' @examples
#' \dontrun{
#' fa_extract(
#'   "data/fao/",
#'   "GlobalProduction_2018.1.2.zip",
#'   "data/fao/",
#'   "fish_raw",
#'   extr = "TS_FI_PRODUCTION.csv"
#' )
#' }
fa_extract <- function(
  path_in, files, path_out, name, extr = NULL,
  col_types = NULL, stack = FALSE, read_method = NULL,
  rm = TRUE, v = TRUE, ...) {

  zip <- paste0(path_in, files)
  dest_rds <- paste0(path_out, name, ".rds")

  # (single archive with several targets) OR (no explicit targets) takes
  # the multi-file branch; `&&` binds tighter than `||`, parens for clarity.
  if ((length(zip) == 1 && length(extr) > 1) || is.null(extr)) {
    if (v) cat("Extracting multiple files from a single ZIP archive\n")
    csv <- unzip(zip, extr, exdir = gsub("(.*)/", "\\1", path_out), ...)
  } else {
    if (v) cat("Extracting single files from multiple ZIP archives\n")
    csv <- vector("character", length(zip))
    for (i in seq_along(zip)) {
      # No target given for this archive: fall back to its first entry
      if (is.na(extr[i]) || nchar(extr[i]) == 0) {
        extr[i] <- unzip(zip[i], list = TRUE)[[1]][1]
      }
      # Use the system unzip via decompress_file() so members >= 4GB are
      # not truncated (utils::unzip limitation).
      # if(file.info(zip[i])$size > 200000000) {
      csv[i] <- paste0(path_out, ifelse(!is.na(extr[i]), extr[i], sub("zip", "csv", zip[i])))
      # Parentheses in file names break the system unzip call
      if (grepl("\\(|\\)", zip[i])) file.rename(zip[i], gsub("\\(|\\)", "", zip[i]))
      decompress_file(path_out, gsub("\\(|\\)", "", files[i]))
      # } else { csv[i] <- unzip(zip[i], extr[i], exdir = gsub("(.*)/", "\\1", path_out), ...) }
    }
  }

  rds <- vector("list", length(csv))
  for (i in seq_along(csv)) {
    cat("Reading:", csv[i], "\n")
    if (is.null(read_method) || read_method[i] == "fread") {
      rds[[i]] <- data.table::fread(csv[i], colClasses = col_types[[i]])
    } else if (read_method[i] == "read_csv") {
      rds[[i]] <- data.table::as.data.table(readr::read_csv(csv[i]))
    } else {
      stop("Wrong read_method specified for ", csv[i],
        ". Must be either 'fread' or 'read_csv'. ",
        "If read_method is NULL, fread is used by default. \n")
    }
  }

  if (stack) {
    if (v) cat("Stacking CSV files via data.table::rbindlist()")
    saveRDS(data.table::rbindlist(rds), dest_rds)
  } else {
    for (i in seq_along(csv)) saveRDS(rds[[i]], dest_rds[i])
  }

  if (rm) file.remove(csv)

  dest_rds
}
#' @title Extract zipped files >= 4GB
#' @description Extract files >= 4GB from a ZIP archive without truncation,
#' using the system `unzip` binary (utils::unzip truncates such members).
#'
#' @param directory Path to the folder containing the ZIP archive.
#' @param file File name of the ZIP archive.
#' @param .file_cache Allows to skip uncompression.
#'
#' @return Unzipped content of a ZIP archive.
#'
#' @examples
#' \dontrun{
#' decompress_file(
#'   "./input/fao/",
#'   "GlobalProduction_2018.1.2.zip"
#' )
#' }
#' @source https://stackoverflow.com/questions/42740206/r-possible-truncation-of-4gb-file
decompress_file <- function(directory, file, .file_cache = FALSE) {

  if (.file_cache == TRUE) {
    print("decompression skipped")
  } else {
    # Decompress from within `directory` so the archive's contents land
    # there; simplifies unzip directory location behavior.
    wd <- getwd()
    setwd(directory)
    # Restore the previous working directory even if unzip errors out
    on.exit(setwd(wd), add = TRUE)

    # Run decompression; "-o" overrides existing files without prompting
    decompression <-
      system2("unzip",
              args = c("-o",
                       file),
              stdout = TRUE)
    # uncomment to delete archive once decompressed
    # file.remove(file)

    # Test for success criteria; change the search depending on
    # your implementation. Guard against zero-length unzip output,
    # for which grepl() would yield logical(0) and break the if().
    if (length(decompression) > 0 &&
        grepl("Warning message", tail(decompression, 1))) {
      print(decompression)
    }
  }
}