diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 8c025a6a25f9..2ab9852abc85 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -122,6 +122,7 @@ build() { -DARROW_PACKAGE_PREFIX="${MINGW_PREFIX}" \ -DARROW_PARQUET=ON \ -DARROW_S3=ON \ + -DARROW_AZURE=OFF \ -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_UTF8PROC_USE_SHARED=OFF \ diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index c640629a8d03..feeac3dba47b 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -76,12 +76,14 @@ jobs: run: | brew install sccache ninja brew install openssl@3.0 + brew install libxml2 - name: Build libarrow shell: bash env: {{ macros.github_set_sccache_envvars()|indent(8) }} MACOSX_DEPLOYMENT_TARGET: "11.6" ARROW_GCS: ON + ARROW_AZURE: ON ARROW_DEPENDENCY_SOURCE: BUNDLED CMAKE_GENERATOR: Ninja LIBARROW_MINIMAL: false diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 99a1ff318845..976b843eb2aa 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -28,7 +28,8 @@ URL: https://github.com/apache/arrow/, https://arrow.apache.org/docs/r/ BugReports: https://github.com/apache/arrow/issues Encoding: UTF-8 Language: en-US -SystemRequirements: C++20; for AWS S3 support on Linux, libcurl and openssl (optional); +SystemRequirements: C++20; for AWS S3 support on Linux, libcurl and openssl, and + libxml2 for Azure (optional); cmake >= 3.26 (build-time only, and only for full source build) Biarch: true Imports: diff --git a/r/NAMESPACE b/r/NAMESPACE index f74034c965b7..46c29b3e9370 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -183,6 +183,7 @@ S3method(vec_ptype_full,arrow_fixed_size_list) S3method(vec_ptype_full,arrow_large_list) S3method(vec_ptype_full,arrow_list) export(Array) +export(AzureFileSystem) export(Buffer) export(BufferOutputStream) export(BufferReader) @@ -282,6 +283,7 @@ export(arrow_available) export(arrow_info) export(arrow_table) export(arrow_with_acero) +export(arrow_with_azure) 
export(arrow_with_dataset) export(arrow_with_gcs) export(arrow_with_json) @@ -295,6 +297,7 @@ export(as_data_type) export(as_record_batch) export(as_record_batch_reader) export(as_schema) +export(az_container) export(binary) export(bool) export(boolean) diff --git a/r/R/arrow-info.R b/r/R/arrow-info.R index 699f94dcbdb5..91b46788aab2 100644 --- a/r/R/arrow-info.R +++ b/r/R/arrow-info.R @@ -46,6 +46,7 @@ arrow_info <- function() { json = arrow_with_json(), s3 = arrow_with_s3(), gcs = arrow_with_gcs(), + azure = arrow_with_azure(), utf8proc = "utf8_upper" %in% compute_funcs, re2 = "replace_substring_regex" %in% compute_funcs, vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1)) @@ -128,6 +129,15 @@ arrow_with_gcs <- function() { }) } +#' @rdname arrow_info +#' @export +arrow_with_azure <- function() { + tryCatch(.Call(`_azure_available`), error = function(e) { + return(FALSE) + }) +} + + #' @rdname arrow_info #' @export arrow_with_json <- function() { diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 22e66e2243ec..dac7b4609c56 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1424,6 +1424,10 @@ fs___GcsFileSystem__options <- function(fs) { .Call(`_arrow_fs___GcsFileSystem__options`, fs) } +fs___AzureFileSystem__Make <- function(options) { + .Call(`_arrow_fs___AzureFileSystem__Make`, options) +} + io___Readable__Read <- function(x, nbytes) { .Call(`_arrow_io___Readable__Read`, x, nbytes) } diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 99c09c40dc3b..abbc9cd6c5bf 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -189,6 +189,31 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `default_metadata`: default metadata to write in new objects. #' - `project_id`: the project to use for creating buckets. #' +#' `AzureFileSystem$create()` takes following required argument: +#' +#' - `account_name`: Azure Blob Storage account name. 
+#' +#' `AzureFileSystem$create()` takes the following optional arguments: +#' +#' - `account_key`: Account key of the storage account. Cannot be used with +#' `sas_token`. +#' - `blob_storage_authority`: Hostname of the blob service, defaulting to +#' `"blob.core.windows.net"`. +#' - `blob_storage_scheme`: Either `"http"` or `"https"` (the default). +#' - `client_id`: The client/application ID for Azure Active Directory +#' authentication. If used with `client_secret` and `tenant_id` then it is the +#' application ID for a registered Azure AD application. Otherwise, it is the +#' client ID of a user-assigned managed identity. +#' - `client_secret`: Client secret for Azure Active Directory authentication. +#' Must be provided with both `client_id` and `tenant_id`. +#' - `dfs_storage_authority`: Hostname of the data lake (gen 2) service, +#' defaulting to `"dfs.core.windows.net"`. +#' - `dfs_storage_scheme`: Either `"http"` or `"https"` (the default). +#' - `sas_token`: Shared access signature (SAS) token for the storage account. +#' Cannot be used with `account_key`. +#' - `tenant_id`: Tenant ID for Azure Active Directory authentication. Must +#' be provided with both `client_id` and `client_secret`. +#' #' @section Methods: #' #' - `path(x)`: Create a `SubTreeFileSystem` from the current `FileSystem` @@ -253,6 +278,10 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' (the default), 'ERROR', 'WARN', 'INFO', 'DEBUG' (recommended), 'TRACE', and #' 'OFF'. #' +#' On `AzureFileSystem`, passing no arguments for authentication uses the +#' `DefaultAzureCredential` for authentication, so that several authentication +#' types are tried until one succeeds. +#' #' @usage NULL #' @format NULL #' @docType class @@ -645,6 +674,95 @@ GcsFileSystem$create <- function(anonymous = FALSE, retry_limit_seconds = 15, .. 
fs___GcsFileSystem__Make(anonymous, options) } +#' @usage NULL +#' @format NULL +#' @rdname FileSystem +#' @importFrom utils modifyList +#' @export +AzureFileSystem <- R6Class( + "AzureFileSystem", + inherit = FileSystem +) + +AzureFileSystem$create <- function(account_name, ...) { + options <- list(...) + valid_opts <- c( + "account_key", + "blob_storage_authority", + "blob_storage_scheme", + "client_id", + "client_secret", + "dfs_storage_authority", + "dfs_storage_scheme", + "sas_token", + "tenant_id" + ) + + invalid_opts <- setdiff(names(options), valid_opts) + if (length(invalid_opts)) { + stop( + "Invalid options for AzureFileSystem: ", + oxford_paste(invalid_opts), + call. = FALSE + ) + } + # The c++ code assumes that the various combinations of authentication methods + # have been validated in this function. + if (!is.null(options$tenant_id) || !is.null(options$client_id) || !is.null(options$client_secret)) { + if (is.null(options$client_id)) { + stop( + "`client_id` must be given with `tenant_id` and `client_secret`", + call. = FALSE + ) + } + if (sum(is.null(options$tenant_id), is.null(options$client_secret)) == 1) { + stop( + "Provide only `client_id` to authenticate with ", + "Managed Identity Credential, or provide `client_id`, `tenant_id`, ", + "and`client_secret` to authenticate with Client Secret Credential", + call. = FALSE + ) + } + } else if (!is.null(options$account_key) && !is.null(options$sas_token)) { + stop( + "Cannot specify both `account_key` and `sas_token`", + call. = FALSE + ) + } + + fs___AzureFileSystem__Make(c(account_name = account_name, options)) +} + +#' Connect to an Azure Blob Storage container +#' +#' `az_container` is a convenience function to create an `AzureFileSystem` object +#' that provides a file system interface for blob storage containers in an Azure +#' Storage Account. +#' +#' @param container_path string Container name or path. +#' @param ... Additional connection options, passed to `AzureFileSystem$create()`. 
+#' +#' @return A `SubTreeFileSystem` containing an `AzureFileSystem` and the container's +#' relative path. Note that this function's success does not guarantee that you +#' are authorized to access the container's contents. +#' @examplesIf FALSE +#' container_fs <- az_container( +#' container_path = "arrow-datasets", +#' account_name = azurite_account_name, +#' account_key = azurite_account_key, +#' blob_storage_authority = azurite_blob_storage_authority, +#' blob_storage_scheme = azurite_blob_storage_scheme +#' ) +#' @export +az_container <- function(container_path, ...) { + assert_that(is.string(container_path)) + args <- list2(...) + + fs <- exec(AzureFileSystem$create, !!!args) + + SubTreeFileSystem$create(container_path, fs) +} + #' @usage NULL #' @format NULL #' @rdname FileSystem diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 57e99648ec55..c3f68b221602 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -261,10 +261,11 @@ reference: - title: File systems desc: > - Functions for working with files on S3 and GCS + Functions for working with files on S3, GCS, and Azure contents: - s3_bucket - gs_bucket + - az_container - copy_files - title: Flight diff --git a/r/configure b/r/configure index 9e92eb6b47f2..8724f3eaf991 100755 --- a/r/configure +++ b/r/configure @@ -359,10 +359,14 @@ add_feature_flags () { if arrow_built_with ARROW_S3; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" fi + if arrow_built_with ARROW_AZURE; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZURE" + PKG_LIBS_FEATURES="$PKG_LIBS_FEATURES -lxml2" + fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" fi - if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then + if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3 || arrow_built_with ARROW_AZURE; then # If pkg-config is available it will handle this for us automatically SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" fi diff --git 
a/r/configure.win b/r/configure.win index 16c5ec1bee8d..7315013f29a4 100755 --- a/r/configure.win +++ b/r/configure.win @@ -67,6 +67,7 @@ function configure_binaries() { # pkg-config --libs libcurl GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ -lz -lws2_32 -lnghttp2 -ldbghelp" + # AZURE_LIBS="-lcurl -lssl -lxml2" # Set the right flags to point to and enable arrow/parquet if [ -d "windows/r-libarrow-windows-x86_64-$VERSION" ]; then @@ -94,8 +95,8 @@ function configure_binaries() { # S3, GCS, and re2 support only for Rtools40 (i.e. R >= 4.0) "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'R.version$major >= 4' | grep TRUE >/dev/null 2>&1 if [ $? -eq 0 ]; then - PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3 -DARROW_R_WITH_GCS" - PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS} ${GCS_LIBS}" + PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3 -DARROW_R_WITH_GCS" # -DARROW_R_WITH_AZURE + PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS} ${GCS_LIBS}" # ${AZURE_LIBS} else # It seems that order matters PKG_LIBS="${PKG_LIBS} -lws2_32" @@ -187,6 +188,10 @@ add_feature_flags () { if arrow_built_with ARROW_S3; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" fi + # if arrow_built_with ARROW_AZURE; then + # PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZURE" + # PKG_LIBS_FEATURES="$PKG_LIBS_FEATURES -lxml2" + # fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" fi @@ -292,6 +297,10 @@ function configure_dev() { PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_GCS" fi + # if [ $(cmake_option ARROW_AZURE) -eq 1 ]; then + # PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_AZURE" + # fi + if [ $(cmake_option ARROW_JSON) -eq 1 ]; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON" fi diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index 9acfef109c56..8a78ba7ecaac 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -30,7 +30,7 @@ # Ensure that all machines are sorting the same way 
invisible(Sys.setlocale("LC_COLLATE", "C")) -features <- c("acero", "dataset", "substrait", "parquet", "s3", "gcs", "json") +features <- c("acero", "dataset", "substrait", "parquet", "s3", "gcs", "azure", "json") suppressPackageStartupMessages({ library(decor) diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 349531b75fd9..e7f453bb64e4 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -84,6 +84,7 @@ ${CMAKE_WRAPPER} ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -Dlz4_SOURCE=${lz4_SOURCE:-} \ -DARROW_FILESYSTEM=ON \ -DARROW_GCS=${ARROW_GCS:-OFF} \ + -DARROW_AZURE=${ARROW_AZURE:-$ARROW_DEFAULT_PARAM} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ -DARROW_JSON=${ARROW_JSON:-ON} \ diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 83e7fc652616..0ecb391549f6 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -6,6 +6,7 @@ \alias{LocalFileSystem} \alias{S3FileSystem} \alias{GcsFileSystem} +\alias{AzureFileSystem} \alias{SubTreeFileSystem} \title{FileSystem classes} \description{ @@ -89,6 +90,33 @@ the filesystem encounters errors. Default is 15 seconds. \item \code{default_metadata}: default metadata to write in new objects. \item \code{project_id}: the project to use for creating buckets. } + +\code{AzureFileSystem$create()} takes following required argument: +\itemize{ +\item \code{account_name}: Azure Blob Storage account name. +} + +\code{AzureFileSystem$create()} takes following optional arguments: +\itemize{ +\item \code{account_key}: Account key of the storage account. Cannot be used with +\code{sas_token}. +\item \code{blob_storage_authority}: Hostname of the blob service, defaulting to +\code{"blob.core.windows.net"}. +\item \code{blob_storage_scheme}: Either \code{"http"} or \code{"https"} (the default). +\item \code{client_id}: The client/application ID for Azure Active Directory +authentication. 
If used with \code{client_secret} and \code{tenant_id} then it is the +application ID for a registered Azure AD application. Otherwise, it is the +client ID of a user-assigned managed identity. +\item \code{client_secret}: Client secret for Azure Active Directory authentication. +Must be provided with both \code{client_id} and \code{tenant_id}. +\item \code{dfs_storage_authority}: Hostname of the data lake (gen 2) service, +defaulting to \code{"dfs.core.windows.net"}. +\item \code{dfs_storage_scheme}: Either \code{"http"} or \code{"https"} (the default). +\item \code{sas_token}: Shared access signature (SAS) token for the storage account. +Cannot be used with \code{account_key}. +\item \code{tenant_id}: Tenant ID for Azure Active Directory authentication. Must +be provided with both \code{client_id} and \code{client_secret}. +} } \section{Methods}{ @@ -161,5 +189,12 @@ environment variable \code{ARROW_S3_LOG_LEVEL} (e.g., to running any code that interacts with S3. Possible values include 'FATAL' (the default), 'ERROR', 'WARN', 'INFO', 'DEBUG' (recommended), 'TRACE', and 'OFF'. + +On \code{AzureFileSystem}, passing no arguments for authentication uses the +\code{DefaultAzureCredential} for authentication, so that several authentication +types are tried until one succeeds. + +\code{AzureFileSystem} is not presently supported on Windows due to upstream compatibility +issues between the Azure C++ SDK and the MinGW toolchain. 
} diff --git a/r/man/arrow_info.Rd b/r/man/arrow_info.Rd index a839d3ba8fd2..4e6d12c46cbe 100644 --- a/r/man/arrow_info.Rd +++ b/r/man/arrow_info.Rd @@ -9,6 +9,7 @@ \alias{arrow_with_parquet} \alias{arrow_with_s3} \alias{arrow_with_gcs} +\alias{arrow_with_azure} \alias{arrow_with_json} \title{Report information on the package's capabilities} \usage{ @@ -28,6 +29,8 @@ arrow_with_s3() arrow_with_gcs() +arrow_with_azure() + arrow_with_json() } \value{ diff --git a/r/man/az_container.Rd b/r/man/az_container.Rd new file mode 100644 index 000000000000..a749b4a4e188 --- /dev/null +++ b/r/man/az_container.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filesystem.R +\name{az_container} +\alias{az_container} +\title{Connect to an Azure Blob Storage container} +\usage{ +az_container(container_path, ...) +} +\arguments{ +\item{container_path}{string Container name or path.} + +\item{...}{Additional connection options, passed to \code{AzureFileSystem$create()}.} +} +\value{ +A \code{SubTreeFileSystem} containing an \code{AzureFileSystem} and the container's +relative path. Note that this function's success does not guarantee that you +are authorized to access the container's contents. +} +\description{ +\code{az_container} is a convenience function to create an \code{AzureFileSystem} object +that provides a file system interface for blob storage containers in an Azure +Storage Account. 
+} +\examples{ +\dontshow{if (FALSE) withAutoprint(\{ # examplesIf} +container_fs <- az_container( + container_path = "arrow-datasets", + account_name = azurite_account_name, + account_key = azurite_account_key, + blob_storage_authority = azurite_blob_storage_authority, + blob_storage_scheme = azurite_blob_storage_scheme +) +\dontshow{\}) # examplesIf} +} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 5482c8679f68..eb370907d2fc 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3676,6 +3676,21 @@ extern "C" SEXP _arrow_fs___GcsFileSystem__options(SEXP fs_sexp){ } #endif +// filesystem.cpp +#if defined(ARROW_R_WITH_AZURE) +std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options); +extern "C" SEXP _arrow_fs___AzureFileSystem__Make(SEXP options_sexp){ +BEGIN_CPP11 + arrow::r::Input::type options(options_sexp); + return cpp11::as_sexp(fs___AzureFileSystem__Make(options)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_fs___AzureFileSystem__Make(SEXP options_sexp){ + Rf_error("Cannot call fs___AzureFileSystem__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); +} +#endif + // io.cpp std::shared_ptr io___Readable__Read(const std::shared_ptr& x, int64_t nbytes); extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){ @@ -5725,6 +5740,15 @@ return Rf_ScalarLogical( #endif ); } +extern "C" SEXP _azure_available() { +return Rf_ScalarLogical( +#if defined(ARROW_R_WITH_AZURE) + TRUE +#else + FALSE +#endif +); +} extern "C" SEXP _json_available() { return Rf_ScalarLogical( #if defined(ARROW_R_WITH_JSON) @@ -5741,6 +5765,7 @@ static const R_CallMethodDef CallEntries[] = { { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, { "_gcs_available", (DL_FUNC)& _gcs_available, 0 }, + { "_azure_available", (DL_FUNC)& _azure_available, 0 }, { "_json_available", (DL_FUNC)& _json_available, 0 }, { "_arrow_is_arrow_altrep", (DL_FUNC) &_arrow_is_arrow_altrep, 1}, { "_arrow_test_arrow_altrep_set_string_elt", (DL_FUNC) &_arrow_test_arrow_altrep_set_string_elt, 3}, @@ -6098,6 +6123,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_FinalizeS3", (DL_FUNC) &_arrow_FinalizeS3, 0}, { "_arrow_fs___GcsFileSystem__Make", (DL_FUNC) &_arrow_fs___GcsFileSystem__Make, 2}, { "_arrow_fs___GcsFileSystem__options", (DL_FUNC) &_arrow_fs___GcsFileSystem__options, 1}, + { "_arrow_fs___AzureFileSystem__Make", (DL_FUNC) &_arrow_fs___AzureFileSystem__Make, 1}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 9324a13ce0f4..aa9e4b186cf9 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -37,9 +37,9 @@ const char* r6_class_name::get( return "S3FileSystem"; } else if (type_name == "gcs") { return "GcsFileSystem"; + } else if (type_name == "abfs") { + return "AzureFileSystem"; // Uncomment these once R6 classes for 
these filesystems are added - // } else if (type_name == "abfs") { - // return "AzureBlobFileSystem"; // } else if (type_name == "hdfs") { // return "HadoopFileSystem"; } else if (type_name == "subtree") { @@ -526,3 +526,56 @@ cpp11::list fs___GcsFileSystem__options(const std::shared_ptr } #endif + +#if defined(ARROW_R_WITH_AZURE) +#include + +// [[azure::export]] +std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options) { + fs::AzureOptions azure_opts; + + // Set account name + azure_opts.account_name = cpp11::as_cpp(options["account_name"]); + + if (!Rf_isNull(options["blob_storage_authority"])) { + azure_opts.blob_storage_authority = + cpp11::as_cpp(options["blob_storage_authority"]); + } + if (!Rf_isNull(options["dfs_storage_authority"])) { + azure_opts.dfs_storage_authority = + cpp11::as_cpp(options["dfs_storage_authority"]); + } + if (!Rf_isNull(options["blob_storage_scheme"])) { + azure_opts.blob_storage_scheme = + cpp11::as_cpp(options["blob_storage_scheme"]); + } + if (!Rf_isNull(options["dfs_storage_scheme"])) { + azure_opts.dfs_storage_scheme = + cpp11::as_cpp(options["dfs_storage_scheme"]); + } + // Validation of the different auth paths happens in the R code. 
+ if (!Rf_isNull(options["client_id"])) { + if (Rf_isNull(options["tenant_id"]) && Rf_isNull(options["client_secret"])) { + StopIfNotOk(azure_opts.ConfigureManagedIdentityCredential( + cpp11::as_cpp(options["client_id"]))); + } else if (!Rf_isNull(options["tenant_id"]) && !Rf_isNull(options["client_secret"])) { + StopIfNotOk(azure_opts.ConfigureClientSecretCredential( + cpp11::as_cpp(options["tenant_id"]), + cpp11::as_cpp(options["client_id"]), + cpp11::as_cpp(options["client_secret"]))); + } + } else if (!Rf_isNull(options["account_key"])) { + StopIfNotOk(azure_opts.ConfigureAccountKeyCredential( + cpp11::as_cpp(options["account_key"]))); + } else if (!Rf_isNull(options["sas_token"])) { + StopIfNotOk(azure_opts.ConfigureSASCredential( + cpp11::as_cpp(options["sas_token"]))); + } else { + StopIfNotOk(azure_opts.ConfigureDefaultCredential()); + } + + auto io_context = MainRThread::GetInstance().CancellableIOContext(); + return ValueOrStop(fs::AzureFileSystem::Make(azure_opts, io_context)); +} + +#endif diff --git a/r/tests/testthat/helper-filesystems.R b/r/tests/testthat/helper-filesystems.R index 7b37abf764b0..9fba086a18e3 100644 --- a/r/tests/testthat/helper-filesystems.R +++ b/r/tests/testthat/helper-filesystems.R @@ -25,12 +25,18 @@ #' returns a URI containing the filesystem scheme (e.g. 's3://', 'gs://'), the #' absolute path, and any necessary connection options as URL query parameters. test_filesystem <- function(name, fs, path_formatter, uri_formatter) { - # NOTE: it's important that we label these tests with name of filesystem so + # NOTE 1: it's important that we label these tests with name of filesystem so # that we can differentiate the different calls to these test in the output. 
- test_that(sprintf("read/write Feather on %s using URIs", name), { - write_feather(example_data, uri_formatter("test.feather")) - expect_identical(read_feather(uri_formatter("test.feather")), example_data) - }) + + # NOTE 2: as far as I can tell, Azure doesn't support passing a URI directly + # like we can do in S3/GCS. Skipping any tests that rely on this feature + # for name == "azure". + if (name != "azure") { + test_that(sprintf("read/write Feather on %s using URIs", name), { + write_feather(example_data, uri_formatter("test.feather")) + expect_identical(read_feather(uri_formatter("test.feather")), example_data) + }) + } test_that(sprintf("read/write Feather on %s using Filesystem", name), { write_feather(example_data, fs$path(path_formatter("test2.feather"))) @@ -72,11 +78,13 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) }) - test_that(sprintf("read/write Parquet on %s", name), { - skip_if_not_available("parquet") - write_parquet(example_data, fs$path(path_formatter("test.parquet"))) - expect_identical(read_parquet(uri_formatter("test.parquet")), example_data) - }) + if (name != "azure") { + test_that(sprintf("read/write Parquet on %s", name), { + skip_if_not_available("parquet") + write_parquet(example_data, fs$path(path_formatter("test.parquet"))) + expect_identical(read_parquet(uri_formatter("test.parquet")), example_data) + }) + } if (arrow_with_dataset()) { make_temp_dir <- function() { @@ -85,39 +93,41 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { normalizePath(path, winslash = "/") } - test_that(sprintf("open_dataset with an %s file (not directory) URI", name), { - skip_if_not_available("parquet") - expect_identical( - open_dataset(uri_formatter("test.parquet")) |> collect() |> arrange(int), - example_data |> arrange(int) - ) - }) - - test_that(sprintf("open_dataset with vector of %s file URIs", name), { - expect_identical( - open_dataset( - c(uri_formatter("test.feather"), 
uri_formatter("test2.feather")), - format = "feather" - ) |> - arrange(int) |> - collect(), - rbind(example_data, example_data) |> arrange(int) - ) - }) - - test_that(sprintf("open_dataset errors if passed URIs mixing %s and local fs", name), { - td <- make_temp_dir() - expect_error( - open_dataset( - c( - uri_formatter("test.feather"), - paste0("file://", file.path(td, "fake.feather")) + if (name != "azure") { + test_that(sprintf("open_dataset with an %s file (not directory) URI", name), { + skip_if_not_available("parquet") + expect_identical( + open_dataset(uri_formatter("test.parquet")) |> collect() |> arrange(int), + example_data |> arrange(int) + ) + }) + + test_that(sprintf("open_dataset with vector of %s file URIs", name), { + expect_identical( + open_dataset( + c(uri_formatter("test.feather"), uri_formatter("test2.feather")), + format = "feather" + ) |> + arrange(int) |> + collect(), + rbind(example_data, example_data) |> arrange(int) + ) + }) + + test_that(sprintf("open_dataset errors if passed URIs mixing %s and local fs", name), { + td <- make_temp_dir() + expect_error( + open_dataset( + c( + uri_formatter("test.feather"), + paste0("file://", file.path(td, "fake.feather")) + ), + format = "feather" ), - format = "feather" - ), - "Vectors of URIs for different file systems are not supported" - ) - }) + "Vectors of URIs for different file systems are not supported" + ) + }) + } # Dataset test setup, cf. 
test-dataset.R first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") @@ -167,18 +177,21 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { write_dataset(ds, fs$path(path_formatter("new_dataset_dir"))) expect_length(fs$ls(path_formatter("new_dataset_dir")), 1) }) - + if (name != "azure") { + test_that(sprintf("copy files with %s", name), { + td <- make_temp_dir() + copy_files(uri_formatter("hive_dir"), td) + expect_length(dir(td), 2) + ds <- open_dataset(td) + expect_identical( + ds |> select(int, dbl, lgl) |> collect() |> arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) + ) + }) + } test_that(sprintf("copy files with %s", name), { td <- make_temp_dir() - copy_files(uri_formatter("hive_dir"), td) - expect_length(dir(td), 2) - ds <- open_dataset(td) - expect_identical( - ds |> select(int, dbl, lgl) |> collect() |> arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) - ) - - # Let's copy the other way and use a SubTreeFileSystem rather than URI + copy_files(fs$path(path_formatter("hive_dir")), td) copy_files(td, fs$path(path_formatter("hive_dir2"))) ds2 <- open_dataset(fs$path(path_formatter("hive_dir2"))) expect_identical( diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R new file mode 100644 index 000000000000..378444791981 --- /dev/null +++ b/r/tests/testthat/test-azure.R @@ -0,0 +1,247 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("azure") + +# test_filesystem requires dplyr +library(dplyr) + +# This test script depends on ./ci/scripts/install_azurite.sh +skip_if_not(nzchar(Sys.which("azurite")), message = "azurite is not installed.") + +# Use default azurite credentials, +# see https://learn.microsoft.com/en-us/azure/storage/common/storage-connect-azurite?tabs=blob-storage +azurite_account_name <- "devstoreaccount1" +# Note that this is a well-known default credential for local development on Azurite. +azurite_account_key <- "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" +azurite_blob_host <- "127.0.0.1" +azurite_blob_port <- "10000" +azurite_blob_storage_authority <- sprintf("%s:%s", azurite_blob_host, azurite_blob_port) +azurite_blob_storage_scheme <- "http" + +pid_azurite <- sys::exec_background( + "azurite", + c("azurite", "--inMemoryPersistence", "--blobHost", azurite_blob_host), + std_out = FALSE +) +# Kill azurite background process once tests have finished running. +withr::defer(tools::pskill(pid_azurite)) + +# Helper functions for Azure URIs and paths +azure_uri <- function(...) { + endpoint <- sprintf("%s%s%s", azurite_blob_host, "%3A", azurite_blob_port) + template <- "abfs://%s:%s@%s?endpoint=%s" + # URL encode the account key because it contains reserved characters + encoded_key <- curl::curl_escape(azurite_account_key) + sprintf(template, azurite_account_name, encoded_key, azure_path(...), endpoint) +} + +azure_path <- function(...) 
{ + # 'dir' is the container name (following the convention in the s3 tests). + paste(dir, ..., sep = "/") +} + +fs <- AzureFileSystem$create( + account_name = azurite_account_name, + account_key = azurite_account_key, + blob_storage_authority = azurite_blob_storage_authority, + blob_storage_scheme = azurite_blob_storage_scheme +) + +# (1) CreateDir and DeleteDir work correctly +dir <- "test" +fs$CreateDir(dir) +# Clean up when we're all done +withr::defer(fs$DeleteDir(dir)) + +# (2) Run default filesystem tests on azure filesystem + +# TODO: As far as I can tell, there is no way to pass an Azurite URI to write_feather +# (or any other read/write helper), so some of the test_filesystem tests can't be run +# with AzureFilesystem. Some tests below cover some of the skipped cases in +# test_filesystem. +test_filesystem("azure", fs, azure_path, azure_uri) + +# (3) Test write/read parquet + +example_data <- tibble::tibble( + int = c(1:3, NA_integer_, 5:10), + dbl = c(1:8, NA, 10) + 0.1, + dbl2 = rep(5, 10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + false = logical(10), + chr = letters[c(1:5, NA, 7:10)], + fct = factor(letters[c(1:4, NA, NA, 7:10)]) +) + +test_that("read/write Parquet on azure", { + skip_if_not_available("parquet") + write_parquet(example_data, fs$path(azure_path("test.parquet"))) + expect_identical(read_parquet(fs$path(azure_path("test.parquet"))), example_data) +}) + +# (4) open_dataset with a vector of azure file paths + +# TODO: I couldn't pass a vector of paths similar to the original test in +# test_filesystem, but you can pass a folder containing many files. 
+write_feather(example_data, fs$path(azure_path("openmulti/dataset1.feather"))) +write_feather(example_data, fs$path(azure_path("openmulti/dataset2.feather"))) + +open_multi_fs <- arrow:::az_container( + container_path = azure_path("openmulti"), + account_name = azurite_account_name, + account_key = azurite_account_key, + blob_storage_authority = azurite_blob_storage_authority, + blob_storage_scheme = azurite_blob_storage_scheme +) + +test_that("open_dataset with AzureFileSystem folder", { + expect_identical( + open_dataset( + open_multi_fs, + format = "feather" + ) |> + arrange(int) |> + collect(), + rbind(example_data, example_data) |> arrange(int) + ) +}) + +# (5) Check that multiple valid combinations of options can be used to +# instantiate AzureFileSystem. + +fs1 <- AzureFileSystem$create(account_name = "fake-account-name") +expect_s3_class(fs1, "AzureFileSystem") + +fs2 <- AzureFileSystem$create(account_name = "fake-account-name", account_key = "fakeaccountkey") +expect_s3_class(fs2, "AzureFileSystem") + + +fs3 <- AzureFileSystem$create( + account_name = "fake-account", + account_key = "fakeaccount", + blob_storage_authority = "fake-blob-authority", + dfs_storage_authority = "fake-dfs-authority", + blob_storage_scheme = "https", + dfs_storage_scheme = "https" +) +expect_s3_class(fs3, "AzureFileSystem") + +fs4 <- AzureFileSystem$create( + account_name = "fake-account-name", + sas_token = "fakesastoken" +) +expect_s3_class(fs4, "AzureFileSystem") + +fs5 <- AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_id = "fake-client-id", + client_secret = "fake-client-secret" +) +expect_s3_class(fs5, "AzureFileSystem") + +fs6 <- AzureFileSystem$create( + account_name = "fake-account-name", + client_id = "fake-client-id" +) +expect_s3_class(fs6, "AzureFileSystem") + +# (6) Check that invalid argument combinations are caught upfront +# with appropriate error message. 
+ +error_msg_1 <- "`client_id` must be given with `tenant_id` and `client_secret`" +error_msg_2 <- "Provide only `client_id` to authenticate with Managed Identity Credential, or provide `client_id`, `tenant_id`, and`client_secret` to authenticate with Client Secret Credential" # nolint + +test_that("client_id must be specified with account_name and tenant_id", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id" + ), + error_msg_1, + fixed = TRUE + ) +}) + +test_that("client_id must be specified with account_name and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + client_secret = "fake-client-secret" + ), + error_msg_1, + fixed = TRUE + ) +}) + +test_that("client_secret must not be provided with client_id", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + client_id = "fake-client-id", + client_secret = "fake-client-secret" + ), + error_msg_2, + fixed = TRUE + ) +}) + +test_that("client_id must be specified with account_name, tenant_id, and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_secret = "fake-client-secret" + ), + error_msg_1, + fixed = TRUE + ) +}) + + +test_that("client_id must be provided alone or with tenant_id and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_id = "fake-client-id" + ), + error_msg_2, + fixed = TRUE + ) +}) + +test_that("cannot specify both account_key and sas_token", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + account_key = "fakeaccount", + sas_token = "fakesastoken" + ), + "Cannot specify both `account_key` and `sas_token`", + fixed = TRUE + ) +}) + +test_that("at a minimum account_name must be passed", { + expect_error( + AzureFileSystem$create(), + 'argument 
"account_name" is missing, with no default', + fixed = TRUE + ) +}) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index ba705e03ad7e..802de8a517c7 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -605,6 +605,7 @@ build_libarrow <- function(src_dir, dst_dir) { env_var_list <- c( env_var_list, ARROW_S3 = Sys.getenv("ARROW_S3", "ON"), + ARROW_AZURE = Sys.getenv("ARROW_AZURE", "ON"), # ARROW_GCS = Sys.getenv("ARROW_GCS", "ON"), ARROW_WITH_ZSTD = Sys.getenv("ARROW_WITH_ZSTD", "ON") ) @@ -615,6 +616,11 @@ build_libarrow <- function(src_dir, dst_dir) { env_var_list <- c(env_var_list, ARROW_MIMALLOC = Sys.getenv("ARROW_MIMALLOC", "OFF")) } + if (on_windows) { + # Disable azure on windows due to issues building azure c++ sdk with mingw. + env_var_list <- c(env_var_list, ARROW_AZURE = Sys.getenv("ARROW_AZURE", "OFF")) + } + env_var_list <- with_cloud_support(env_var_list) env_var_list <- with_wasm_support(env_var_list) @@ -800,6 +806,7 @@ turn_off_all_optional_features <- function(env_var_list) { "ARROW_DATASET" = "OFF", # depends on parquet "ARROW_S3" = "OFF", "ARROW_GCS" = "OFF", + "ARROW_AZURE" = "OFF", "ARROW_WITH_GOOGLE_CLOUD_CPP" = "OFF", "ARROW_WITH_NLOHMANN_JSON" = "OFF", "ARROW_SUBSTRAIT" = "OFF", @@ -921,18 +928,22 @@ with_wasm_support <- function(env_var_list) { with_cloud_support <- function(env_var_list) { arrow_s3 <- is_feature_requested("ARROW_S3", env_var_list) arrow_gcs <- is_feature_requested("ARROW_GCS", env_var_list) - - if (arrow_s3 || arrow_gcs) { - # User wants S3 or GCS support. 
- # Make sure that we have curl and openssl system libs - feats <- c( - if (arrow_s3) "S3", - if (arrow_gcs) "GCS" - ) - start_msg <- paste(feats, collapse = "/") - off_flags <- paste("ARROW_", feats, "=OFF", sep = "", collapse = " and ") - print_warning <- function(msg) { - # Utility to assemble warning message in the console + arrow_azure <- is_feature_requested("ARROW_AZURE", env_var_list) + + if (arrow_s3 || arrow_gcs || arrow_azure) { + # User wants S3 or GCS or Azure support. + # Make sure that we have curl, openssl, and libxml2 system libs + # Utility to assemble warning message in the console + print_warning <- function( + msg, + feats = c( + if (arrow_s3) "S3", + if (arrow_gcs) "GCS", + if (arrow_azure) "AZURE" + ), + start_msg = paste(feats, collapse = "/") + ) { + off_flags <- paste("ARROW_", feats, "=OFF", sep = "", collapse = " and ") cat("**** ", start_msg, " support ", msg, "; building with ", off_flags, "\n") } @@ -942,16 +953,23 @@ with_cloud_support <- function(env_var_list) { print_warning("requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb)") arrow_s3 <- FALSE arrow_gcs <- FALSE + arrow_azure <- FALSE } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) { print_warning("requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew)") arrow_s3 <- FALSE arrow_gcs <- FALSE + arrow_azure <- FALSE + } else if (arrow_azure && !cmake_find_package("libxml2", NULL, env_var_list)) { + # libxml2 is only needed for Azure, so only probe for it (and only warn) when Azure was requested; S3 and GCS stay enabled. + print_warning("requires libxml2-devel (rpm), libxml2-dev (deb), or libxml2 (brew)", "AZURE") + arrow_azure <- FALSE } } # Update the build flags env_var_list <- replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF")) - replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF")) + env_var_list <- replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF")) + replace(env_var_list, "ARROW_AZURE", ifelse(arrow_azure, "ON", "OFF")) } cmake_find_package <- function(pkg, version = NULL, env_var_list) { diff --git
a/r/vignettes/developers/binary_features.Rmd b/r/vignettes/developers/binary_features.Rmd index ed6c7180f5b1..fa242e8f73eb 100644 --- a/r/vignettes/developers/binary_features.Rmd +++ b/r/vignettes/developers/binary_features.Rmd @@ -31,11 +31,11 @@ users with a fully-featured experience out of the box. ### Current binary feature set -| Platform | S3 | GCS | Configured in | -|----------|----|----|---------------| +| Platform | S3 | GCS | Azure | Configured in | +|----------|----|----|-------|---------------| -| macOS (ARM64, x86_64) | ON | ON | `dev/tasks/r/github.packages.yml` | -| Windows | ON | ON | `ci/scripts/PKGBUILD` | -| Linux (x86_64) | ON | ON | `compose.yaml` (`ubuntu-cpp-static`) | +| macOS (ARM64, x86_64) | ON | ON | ON | `dev/tasks/r/github.packages.yml` | +| Windows | ON | ON | OFF | `ci/scripts/PKGBUILD` | +| Linux (x86_64) | ON | ON | ON | `compose.yaml` (`ubuntu-cpp-static`) | ### Exceptions to our build defaults @@ -48,6 +48,9 @@ our prebuilt binaries because: user machines 3. **Parity across platforms** - users get the same features regardless of OS +Azure is always set to OFF for Windows because of a low-level incompatibility +with MinGW. The `azure-identity-cpp` SDK for Azure relies on the Windows +Implementation Library (WIL), and this lacks stable support for MinGW. ## Feature configuration in source builds of libarrow @@ -85,12 +88,15 @@ When `LIBARROW_MINIMAL=false`, the following additional features are enabled | Feature | CMake Flag | Default | |---------|------------|---------| | S3 | `ARROW_S3` | `$ARROW_DEFAULT_PARAM` | +| Azure | `ARROW_AZURE` | `$ARROW_DEFAULT_PARAM` | | Jemalloc | `ARROW_JEMALLOC` | `$ARROW_DEFAULT_PARAM` | | Brotli | `ARROW_WITH_BROTLI` | `$ARROW_DEFAULT_PARAM` | | BZ2 | `ARROW_WITH_BZ2` | `$ARROW_DEFAULT_PARAM` | | Zlib | `ARROW_WITH_ZLIB` | `$ARROW_DEFAULT_PARAM` | | Zstd | `ARROW_WITH_ZSTD` | `$ARROW_DEFAULT_PARAM` | +Note that `ARROW_AZURE` is always OFF on Windows.
+ ### Features that require explicit opt-in GCS (Google Cloud Storage) is **always off by default**, even when diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index d13fc53db1ee..4a08dbd770b9 100644 --- a/r/vignettes/developers/setup.Rmd +++ b/r/vignettes/developers/setup.Rmd @@ -71,18 +71,18 @@ There are five major steps to the process. ### Step 1 - Install dependencies -When building libarrow, by default, system dependencies will be used if suitable versions are found. If system dependencies are not present, libarrow will build them during its own build process. The only dependencies that you need to install _outside_ of the build process are [cmake](https://cmake.org/) (for configuring the build) and [openssl](https://www.openssl.org/) if you are building with S3 support. +When building libarrow, by default, system dependencies will be used if suitable versions are found. If system dependencies are not present, libarrow will build them during its own build process. The only dependencies that you need to install _outside_ of the build process are [cmake](https://cmake.org/) (for configuring the build), [openssl](https://www.openssl.org/) and [curl](https://curl.se/libcurl/) if you are building with S3 and GCS support, and [libxml2](https://gitlab.gnome.org/GNOME/libxml2/-/wikis/home) if you're building with Azure support. For a faster build, you may choose to pre-install more C++ library dependencies (such as [lz4](http://lz4.github.io/lz4/), [zstd](https://facebook.github.io/zstd/), etc.) on the system so that they don't need to be built from source in the libarrow build. 
#### Ubuntu ```{bash, save=run & ubuntu} -sudo apt install -y cmake libcurl4-openssl-dev libssl-dev +sudo apt install -y cmake libcurl4-openssl-dev libssl-dev libxml2-dev ``` #### macOS ```{bash, save=run & macos} -brew install cmake openssl +brew install cmake openssl libxml2 ``` ### Step 2 - Configure the libarrow build @@ -155,6 +155,7 @@ To enable optional features including: S3 support, an alternative memory allocat -DARROW_GCS=ON \ -DARROW_MIMALLOC=ON \ -DARROW_S3=ON \ + -DARROW_AZURE=ON \ -DARROW_WITH_BROTLI=ON \ -DARROW_WITH_BZ2=ON \ -DARROW_WITH_LZ4=ON \ @@ -228,6 +229,7 @@ cmake \ -DARROW_MIMALLOC=ON \ -DARROW_PARQUET=ON \ -DARROW_S3=ON \ + -DARROW_AZURE=ON \ -DARROW_WITH_BROTLI=ON \ -DARROW_WITH_BZ2=ON \ -DARROW_WITH_LZ4=ON \ diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index 52652ad7e9ed..cb981ef5e130 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -1,29 +1,30 @@ --- -title: "Using cloud storage (S3, GCS)" +title: "Using cloud storage (S3, GCS, Azure)" description: > Learn how to work with data sets stored in an - Amazon S3 bucket or on Google Cloud Storage + Amazon S3 bucket, on Google Cloud Storage, or on Azure output: rmarkdown::html_vignette --- -Working with data stored in cloud storage systems like [Amazon Simple Storage Service](https://docs.aws.amazon.com/s3/) (S3) and [Google Cloud Storage](https://cloud.google.com/storage/docs) (GCS) is a very common task. Because of this, the Arrow C++ library provides a toolkit aimed to make it as simple to work with cloud storage as it is to work with the local filesystem. +Working with data stored in cloud storage systems like [Amazon Simple Storage Service](https://docs.aws.amazon.com/s3/) (S3), [Google Cloud Storage](https://cloud.google.com/storage/docs) (GCS), and [Microsoft Azure](https://azure.microsoft.com) is a very common task. 
Because of this, the Arrow C++ library provides a toolkit aimed to make it as simple to work with cloud storage as it is to work with the local filesystem. -To make this work, the Arrow C++ library contains a general-purpose interface for file systems, and the arrow package exposes this interface to R users. For instance, if you want to you can create a `LocalFileSystem` object that allows you to interact with the local file system in the usual ways: copying, moving, and deleting files, obtaining information about files and folders, and so on (see `help("FileSystem", package = "arrow")` for details). In general you probably don't need this functionality because you already have tools for working with your local file system, but this interface becomes much more useful in the context of remote file systems. Currently there is a specific implementation for Amazon S3 provided by the `S3FileSystem` class, and another one for Google Cloud Storage provided by `GcsFileSystem`. +To make this work, the Arrow C++ library contains a general-purpose interface for file systems, and the arrow package exposes this interface to R users. For instance, if you want to you can create a `LocalFileSystem` object that allows you to interact with the local file system in the usual ways: copying, moving, and deleting files, obtaining information about files and folders, and so on (see `help("FileSystem", package = "arrow")` for details). In general you probably don't need this functionality because you already have tools for working with your local file system, but this interface becomes much more useful in the context of remote file systems. Currently there is a specific implementation for Amazon S3 provided by the `S3FileSystem` class, one for Google Cloud Storage provided by `GcsFileSystem`, and another for Microsoft Azure provided by the `AzureFileSystem` class. -This article provides an overview of working with both S3 and GCS data using the Arrow toolkit. 
+This article provides an overview of working with S3, GCS, and Azure data using the Arrow toolkit. -## S3 and GCS support +## S3, GCS, and Azure support -Before you start, make sure that your arrow installation has support for S3 and/or GCS enabled. You can check whether support is enabled via helper functions: +Before you start, make sure that your arrow installation has support for S3, GCS, and/or Azure enabled. You can check whether support is enabled via helper functions: ```r arrow_with_s3() arrow_with_gcs() +arrow_with_azure() ``` If these return `TRUE` then the relevant support is enabled. -CRAN builds of arrow include S3 support but not GCS support. If you need GCS support, you can install arrow with full features using one of the following methods: +CRAN builds of arrow include S3 and Azure support but not GCS support. If you need GCS support, you can install arrow with full features using one of the following methods: ```r # Option 1: Install from R-universe @@ -36,15 +37,15 @@ Sys.setenv("NOT_CRAN" = "true") install.packages("arrow", type = "source") ``` -On Linux, S3 and GCS support is not always enabled by default when installing from source, and there are additional system requirements involved. See the [installation article](./install.html) for details. +On Linux, S3, GCS, and Azure support is not always enabled by default when installing from source, and there are additional system requirements involved. See the [installation article](./install.html) for details. Note that it is not currently possible to work with Azure on Windows. ## Connecting to cloud storage One way of working with filesystems is to create `?FileSystem` objects. `?S3FileSystem` objects can be created with the `s3_bucket()` function, which automatically detects the bucket's AWS region. Similarly, `?GcsFileSystem` objects -can be created with the `gs_bucket()` function. 
The resulting -`FileSystem` will consider paths relative to the bucket's path (so for example +can be created with the `gs_bucket()` function and `?AzureFileSystem` objects can be created with the `az_container()` function. The resulting +`FileSystem` will consider paths relative to the bucket/container's path (so for example you don't need to prefix the bucket path when listing a directory). With a `FileSystem` object, you can point to specific files in it with the `$path()` method @@ -52,7 +53,7 @@ and pass the result to file readers and writers (`read_parquet()`, `write_feathe Often the reason users work with cloud storage in real world analysis is to access large data sets. An example of this is discussed in the [datasets article](./dataset.html), but new users may prefer to work with a much smaller data set while learning how the arrow cloud storage interface works. To that end, the examples in this article rely on a multi-file Parquet dataset that stores a copy of the `diamonds` data made available through the [`ggplot2`](https://ggplot2.tidyverse.org/) package, documented in `help("diamonds", package = "ggplot2")`. The cloud storage version of this data set consists of 5 Parquet files totaling less than 1MB in size. -The diamonds data set is hosted on both S3 and GCS, in a bucket named `arrow-datasets`. To create an S3FileSystem object that refers to that bucket, use the following command: +The diamonds data set is hosted on both S3 and GCS, in a bucket named `arrow-datasets`. To create an `S3FileSystem` object that refers to that bucket, use the following command: ```r bucket <- s3_bucket("arrow-datasets") @@ -147,7 +148,7 @@ june2019 <- SubTreeFileSystem$create("s3://arrow-datasets/nyc-taxi/year=2019/mon ## Connecting directly with a URI -In most use cases, the easiest and most natural way to connect to cloud storage in arrow is to use the FileSystem objects returned by `s3_bucket()` and `gs_bucket()`, especially when multiple file operations are required. 
However, in some cases you may want to download a file directly by specifying the URI. This is permitted by arrow, and functions like `read_parquet()`, `write_feather()`, `open_dataset()` etc will all accept URIs to cloud resources hosted on S3 or GCS. The format of an S3 URI is as follows: +In most use cases, the easiest and most natural way to connect to cloud storage in arrow is to use the FileSystem objects returned by `s3_bucket()`, `gs_bucket()`, and `az_container()`, especially when multiple file operations are required. However, in some cases you may want to download a file directly by specifying the URI. This is permitted by arrow, and functions like `read_parquet()`, `write_feather()`, `open_dataset()` etc will all accept URIs to cloud resources hosted on S3, GCS, or Azure. The format of an S3 URI is as follows: ``` s3://[access_key:secret_key@]bucket/path[?region=] @@ -160,6 +161,12 @@ gs://[access_key:secret_key@]bucket/path gs://anonymous@bucket/path ``` +For Azure, the URI format looks like this: + +``` +abfs://container@account_name.dfs.core.windows.net/path +``` + For example, the Parquet file storing the "good cut" diamonds that we downloaded earlier in the article is available on both S3 and CGS. The relevant URIs are as follows: ```r @@ -258,6 +265,21 @@ df <- read_parquet("gs://anonymous@arrow-datasets/diamonds/cut=Good/part-0.parqu +### Azure Authentication + +By default, `AzureFileSystem$create()` and `az_container()` use the [DefaultAzureCredential]( https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential) for authentication. This will try several different types of authentication, using the first one that succeeds. 
As with GCS, a simple way to authenticate with Azure is to first use the [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/?view=azure-cli-latest) to log in and set up default credentials: + +``` +az login +``` + +It is possible to use other forms of authentication with Azure when calling `AzureFileSystem$create()` and `az_container()`. + +- Passing `client_id` on its own will use [`ManagedIdentityCredential`](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/overview) to authenticate. +- Passing `client_id` with `tenant_id` and `client_secret` will use [`ClientSecretCredential`](https://learn.microsoft.com/en-us/entra/identity-platform/app-objects-and-service-principals?tabs=browser) to authenticate. +- Passing `sas_token` will use a shared access signature (SAS) token for the storage account. +- Passing `account_key` will use the account key for the storage account. + ## Using a proxy server If you need to use a proxy server to connect to an S3 bucket, you can provide @@ -329,10 +351,8 @@ variables, you can set environment variable `AWS_EC2_METADATA_DISABLED` to Sys.setenv(AWS_EC2_METADATA_DISABLED = TRUE) ``` - ## Further reading -- To learn more about `FileSystem` classes, including `S3FileSystem` and `GcsFileSystem`, see `help("FileSystem", package = "arrow")`. -- To see a data analysis example that relies on data hosted on cloud storage, see the [dataset article](./dataset.html). - +- To learn more about `FileSystem` classes, including `S3FileSystem`, `GcsFileSystem`, and `AzureFileSystem`, see `help("FileSystem", package = "arrow")`. +- To see a data analysis example that relies on data hosted on cloud storage, see the [dataset article](./dataset.html). diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index a058975ccf19..099ed6d3580c 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -32,13 +32,14 @@ exception, as it ships with gcc 4.8.
### Libraries -Optional support for reading from cloud storage--AWS S3 and -Google Cloud Storage (GCS)--requires additional system dependencies: +Optional support for reading from cloud storage--AWS S3, +Google Cloud Storage (GCS), and Azure--requires additional system dependencies: * CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb) * OpenSSL >= 3.0: install `openssl-devel` (rpm) or `libssl-dev` (deb) +* libxml2 (Azure only): install `libxml2-devel` (rpm) or `libxml2-dev` (deb) -The prebuilt binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support. +The prebuilt binaries come with S3, GCS, and Azure support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3, GCS, and Azure support in the build if the prerequisites are not met--installation will succeed but without S3, GCS, or Azure functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3, GCS, and Azure support. ## Install release version (easy way) @@ -99,9 +100,9 @@ install.packages("arrow") This installs the source version of the R package, but during the installation process will check for compatible libarrow binaries that we host and use those if available. 
If no binary is available or can't be found, then this option falls back onto method 2 below (full source build), but setting the environment variable results in a more fully-featured build than default. -The libarrow binaries include support for AWS S3 and GCS, so they require the -libcurl and openssl libraries installed separately, as noted above. -If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build (with S3 and GCS support disabled). +The libarrow binaries include support for AWS S3, GCS, and Azure, so they require the +libcurl and openssl libraries installed separately (along with libxml2 for Azure), as noted above. +If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build (with S3, GCS, and Azure support disabled). If the internet access of your computer doesn't allow downloading the libarrow binaries (e.g. if access is limited to CRAN), you can first identify the right source and version by trying to install on the offline computer: @@ -204,19 +205,19 @@ information about dependencies and minimum versions. If downloading dependencies at build time is not an option, as when building on a system that is disconnected or behind a firewall, there are a few options. See "Offline builds" below. -#### Dependencies for S3 and GCS support +#### Dependencies for S3, GCS, and Azure support -Support for working with data in S3 and GCS is not enabled in the default +Support for working with data in S3, GCS, and Azure is not enabled in the default source build, and it has additional system requirements as described above. To enable it, set the environment variable `LIBARROW_MINIMAL=false` or `NOT_CRAN=true` to choose the full-featured build, or more selectively set -`ARROW_S3=ON` and/or `ARROW_GCS=ON`. +`ARROW_S3=ON`, `ARROW_GCS=ON`, and/or `ARROW_AZURE=ON`. 
-When either feature is enabled, the install script will check for the presence -of the required dependencies, and if the prerequisites are met, it will turn -off S3 and GCS support--installation will succeed but without S3 or GCS +When one of these features is enabled, the install script will check for the presence +of the required dependencies, and if the prerequisites are not met, it will turn +off S3, GCS, and Azure support--installation will succeed but without S3, GCS, or Azure functionality. If afterwards you install the missing system requirements, -you'll need to reinstall the package in order to enable S3 and GCS support. +you'll need to reinstall the package in order to enable S3, GCS, and Azure support. ### Advanced configuration @@ -233,6 +234,7 @@ default values are shown below. | ---| --- | :-: | | `ARROW_S3` | S3 support (if dependencies are met)* | `OFF` | | `ARROW_GCS` | GCS support (if dependencies are met)* | `OFF` | +| `ARROW_AZURE` | Azure support (if dependencies are met)* | `OFF` | | `ARROW_JEMALLOC` | The `jemalloc` memory allocator | `ON` | | `ARROW_MIMALLOC` | The `mimalloc` memory allocator | `ON` | | `ARROW_PARQUET` | | `ON` |