Skip to content

Add insta / snapshot testing to CLI & set up AWS mock #13672

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 48 commits into from
Mar 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
2887bb9
Do not normalize values
blaginin Nov 26, 2024
813d634
Fix tests & update docs
blaginin Nov 26, 2024
c3de620
Prettier
blaginin Nov 26, 2024
0e11d36
Merge branch 'main' into bugfix/do-not-normalize-values
blaginin Dec 1, 2024
cb7da8c
Merge branch 'main' into bugfix/do-not-normalize-values
blaginin Dec 2, 2024
7c2b3fe
Lowercase config params
blaginin Dec 3, 2024
9146e4b
Add snap to CLI & set up AWS mock
blaginin Dec 5, 2024
9d856c3
Refactor tests
blaginin Dec 6, 2024
0574ab8
Unify transform and parse
blaginin Dec 9, 2024
2045495
Merge branch 'main' into bugfix/do-not-normalize-values
blaginin Dec 9, 2024
06c013d
Fix tests
blaginin Dec 9, 2024
4870219
Merge branch 'main' into cli-snap
blaginin Dec 10, 2024
a02625e
Merge branch 'bugfix/do-not-normalize-values' into cli-snap
blaginin Dec 10, 2024
65809a7
Setup CLI
blaginin Dec 10, 2024
05a562f
Show minio output
blaginin Dec 10, 2024
36f8550
Format Cargo.toml
blaginin Dec 10, 2024
921f229
Do not hardcode AWS params
blaginin Dec 11, 2024
107c515
Test options parsing
blaginin Dec 11, 2024
2d430a0
Add allow http
blaginin Dec 11, 2024
4b56506
Fix aws build
blaginin Dec 12, 2024
1581e7a
Merge branch 'main' into cli-snap
blaginin Dec 12, 2024
ad9734e
Fix ip
blaginin Dec 13, 2024
0b24daa
Remove slash ☠️
blaginin Dec 13, 2024
5f29c00
Format cargo toml
blaginin Dec 13, 2024
a3826d4
Remove integration_setup.bash
blaginin Dec 13, 2024
9cb1c99
Update docs
blaginin Dec 13, 2024
53c9c51
Do not hardcode test names
blaginin Dec 13, 2024
eb87f60
Merge branch 'main' into cli-snap
blaginin Dec 17, 2024
855315f
Relock cargo
blaginin Dec 17, 2024
fd3240d
Merge branch 'main' into cli-snap
blaginin Dec 20, 2024
17e8486
Merge branch 'main' into cli-snap
blaginin Jan 9, 2025
7d4050d
Remove aws sdk and set up minio in-place
blaginin Jan 9, 2025
fe2d345
Merge main
blaginin Jan 16, 2025
c037577
Nit: Add missing ready local to the docs
blaginin Jan 16, 2025
7293add
Merge main
blaginin Feb 4, 2025
680b474
Merge main
blaginin Mar 5, 2025
3c2ead2
Merge branch 'main' into cli-snap
blaginin Mar 5, 2025
7922db7
Fix backslash test
blaginin Mar 5, 2025
348e881
Add missing backslash
blaginin Mar 5, 2025
4238b41
put integration scripts in a separate folder
blaginin Mar 5, 2025
a880c42
Move s3 tests from extended to rust flow
blaginin Mar 7, 2025
5f80368
Reorganise the docs
blaginin Mar 7, 2025
5cafb77
Prettier
blaginin Mar 7, 2025
7e3ed3f
Do not use rust container to get docker
blaginin Mar 7, 2025
194ce80
Add missing protobuf
blaginin Mar 7, 2025
631a7bd
Merge remote-tracking branch 'apache/main' into cli-snap
alamb Mar 7, 2025
fc81ac9
revert change to extended.yml
alamb Mar 7, 2025
83f81c5
Merge remote-tracking branch 'apache/main' into cli-snap
alamb Mar 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 26 additions & 6 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -171,21 +171,41 @@ jobs:
name: cargo test (amd64)
needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: stable
run: rustup toolchain install stable
- name: Install Protobuf Compiler
run: sudo apt-get install -y protobuf-compiler
- name: Setup Minio - S3-compatible storage
run: |
docker run -d --name minio-container \
-p 9000:9000 \
-e MINIO_ROOT_USER=TEST-DataFusionLogin -e MINIO_ROOT_PASSWORD=TEST-DataFusionPassword \
-v $(pwd)/datafusion/core/tests/data:/source quay.io/minio/minio \
server /data
docker exec minio-container /bin/sh -c "\
mc ready local
mc alias set localminio http://localhost:9000 TEST-DataFusionLogin TEST-DataFusionPassword && \
mc mb localminio/data && \
mc cp -r /source/* localminio/data"
- name: Run tests (excluding doctests)
run: cargo test --profile ci --exclude datafusion-examples --exclude ffi_example_table_provider --exclude datafusion-benchmarks --workspace --lib --tests --bins --features serde,avro,json,backtrace,integration-tests
env:
RUST_BACKTRACE: 1
AWS_ENDPOINT: http://127.0.0.1:9000
AWS_ACCESS_KEY_ID: TEST-DataFusionLogin
AWS_SECRET_ACCESS_KEY: TEST-DataFusionPassword
TEST_STORAGE_INTEGRATION: 1
AWS_ALLOW_HTTP: true
run: cargo test --profile ci --exclude datafusion-examples --exclude ffi_example_table_provider --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests
- name: Verify Working Directory Clean
run: git diff --exit-code
- name: Minio Output
if: ${{ !cancelled() }}
run: docker logs minio-container

linux-test-example:
name: cargo examples (amd64)
Expand Down
49 changes: 49 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

75 changes: 75 additions & 0 deletions datafusion-cli/CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Development instructions

## Running Tests

Tests can be run using `cargo`

```shell
cargo test
```

## Running Storage Integration Tests

By default, storage integration tests are not run. To run them you will need to set `TEST_STORAGE_INTEGRATION=1` and
then provide the necessary configuration for that object store.

For some of the tests, [snapshots](https://datafusion.apache.org/contributor-guide/testing.html#snapshot-testing) are used.

### AWS

To test the S3 integration against [Minio](https://github.com/minio/minio)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested the instructions locally and they worked great 👍


First start up a container with Minio and load test files.

```shell
docker run -d \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we consider having the tests themselves start the containers they need?
In Trino land this is done with testcontainers-java and it works awesomely well from developer experience perspective.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it’s a nice improvement if tests just handle the setup automatically! Created an issue to discuss: #15092

--name datafusion-test-minio \
-p 9000:9000 \
-e MINIO_ROOT_USER=TEST-DataFusionLogin \
-e MINIO_ROOT_PASSWORD=TEST-DataFusionPassword \
-v $(pwd)/../datafusion/core/tests/data:/source \
quay.io/minio/minio server /data

docker exec datafusion-test-minio /bin/sh -c "\
mc ready local
mc alias set localminio http://localhost:9000 TEST-DataFusionLogin TEST-DataFusionPassword && \
mc mb localminio/data && \
mc cp -r /source/* localminio/data"
```

Setup environment

```shell
export TEST_STORAGE_INTEGRATION=1
export AWS_ACCESS_KEY_ID=TEST-DataFusionLogin
export AWS_SECRET_ACCESS_KEY=TEST-DataFusionPassword
export AWS_ENDPOINT=http://127.0.0.1:9000
export AWS_ALLOW_HTTP=true
```

Note that `AWS_ENDPOINT` is set without slash at the end.

Run tests

```shell
cargo test
```
2 changes: 2 additions & 0 deletions datafusion-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,7 @@ url = { workspace = true }
[dev-dependencies]
assert_cmd = "2.0"
ctor = { workspace = true }
insta = { version = "1.41.1", features = ["glob", "filters"] }
insta-cmd = "0.6.0"
predicates = "3.0"
rstest = { workspace = true }
123 changes: 104 additions & 19 deletions datafusion-cli/tests/cli_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,46 +17,131 @@

use std::process::Command;

use assert_cmd::prelude::{CommandCargoExt, OutputAssertExt};
use predicates::prelude::predicate;
use rstest::rstest;

use insta::{glob, Settings};
use insta_cmd::{assert_cmd_snapshot, get_cargo_bin};
use std::{env, fs};

fn cli() -> Command {
Command::new(get_cargo_bin("datafusion-cli"))
}

fn make_settings() -> Settings {
let mut settings = Settings::clone_current();
settings.set_prepend_module_to_snapshot(false);
settings.add_filter(r"Elapsed .* seconds\.", "[ELAPSED]");
settings.add_filter(r"DataFusion CLI v.*", "[CLI_VERSION]");
settings
}

#[cfg(test)]
#[ctor::ctor]
fn init() {
// Enable RUST_LOG logging configuration for tests
let _ = env_logger::try_init();
}

// Disabled due to https://github.com/apache/datafusion/issues/10793
#[cfg(not(target_family = "windows"))]
#[rstest]
#[case::exec_from_commands(
["--command", "select 1", "--format", "json", "-q"],
"[{\"Int64(1)\":1}]\n"
)]
#[case::exec_multiple_statements(
["--command", "select 1; select 2;", "--format", "json", "-q"],
"[{\"Int64(1)\":1}]\n[{\"Int64(2)\":2}]\n"
"statements",
["--command", "select 1; select 2;", "-q"],
)]
#[case::exec_backslash(
["--file", "tests/data/backslash.txt", "--format", "json", "-q"],
"[{\"Utf8(\\\"\\\\\\\")\":\"\\\\\",\"Utf8(\\\"\\\\\\\\\\\")\":\"\\\\\\\\\",\"Utf8(\\\"\\\\\\\\\\\\\\\\\\\\\\\")\":\"\\\\\\\\\\\\\\\\\\\\\",\"Utf8(\\\"dsdsds\\\\\\\\\\\\\\\\\\\")\":\"dsdsds\\\\\\\\\\\\\\\\\",\"Utf8(\\\"\\\\t\\\")\":\"\\\\t\",\"Utf8(\\\"\\\\0\\\")\":\"\\\\0\",\"Utf8(\\\"\\\\n\\\")\":\"\\\\n\"}]\n"
"backslash",
["--file", "tests/sql/backslash.sql", "--format", "json", "-q"],
)]
#[case::exec_from_files(
["--file", "tests/data/sql.txt", "--format", "json", "-q"],
"[{\"Int64(1)\":1}]\n"
"files",
["--file", "tests/sql/select.sql", "-q"],
)]
#[case::set_batch_size(
["--command", "show datafusion.execution.batch_size", "--format", "json", "-q", "-b", "1"],
"[{\"name\":\"datafusion.execution.batch_size\",\"value\":\"1\"}]\n"
"batch_size",
["--command", "show datafusion.execution.batch_size", "-q", "-b", "1"],
)]
#[test]
fn cli_quick_test<'a>(
#[case] snapshot_name: &'a str,
#[case] args: impl IntoIterator<Item = &'a str>,
#[case] expected: &str,
) {
let mut cmd = Command::cargo_bin("datafusion-cli").unwrap();
let mut settings = make_settings();
settings.set_snapshot_suffix(snapshot_name);
let _bound = settings.bind_to_scope();

let mut cmd = cli();
cmd.args(args);
cmd.assert().stdout(predicate::eq(expected));

assert_cmd_snapshot!(cmd);
}

#[rstest]
#[case("csv")]
#[case("tsv")]
#[case("table")]
#[case("json")]
#[case("nd-json")]
#[case("automatic")]
#[test]
fn test_cli_format<'a>(#[case] format: &'a str) {
let mut settings = make_settings();
settings.set_snapshot_suffix(format);
let _bound = settings.bind_to_scope();

let mut cmd = cli();
cmd.args(["--command", "select 1", "-q", "--format", format]);

assert_cmd_snapshot!(cmd);
}

#[tokio::test]
async fn test_cli() {
if env::var("TEST_STORAGE_INTEGRATION").is_err() {
eprintln!("Skipping external storages integration tests");
return;
}

let settings = make_settings();
let _bound = settings.bind_to_scope();

glob!("sql/integration/*.sql", |path| {
let input = fs::read_to_string(path).unwrap();
assert_cmd_snapshot!(cli().pass_stdin(input))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW we have had really nice luck in influxdb_iox using insta, and instead of using the external snap shot testing we used inline snapshots (so the expected results are inline with the test code)

It looks like this:

   fn test_union_not_nested() {
        let plan = Arc::new(UnionExec::new(vec![other_node()]));
        let opt = NestedUnion;
        insta::assert_yaml_snapshot!(
            OptimizationTest::new(plan, opt),
            @r#"
        input:
          - " UnionExec"
          - "   EmptyExec"
        output:
          Ok:
            - " UnionExec"
            - "   EmptyExec"
        "#
        );
    }

Is that possible with assert_cmt_snapshot?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is absolutely possible! My idea was to make it closer to slt and separate the code from the data. Setting up the suite is quite hard (we need to create a new user, upload a test file, etc.), but with the current approach, we only need to do it once rather than for every single test.

However, I'm happy to make changes if you'd prefer it to be inline?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally prefer it to be inline, but perhaps we can do that as a follow on PR

(the way we do this in influxdb_iox is that we have a test function that returns a string and then compare the string with the assert_yaml_snapshot -- that way the setup code isn't replicated and we can still have the tests inline

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is absolutely possible! My idea was to make it closer to slt and separate the code from the data.

FWIW I think .slt keeps the tests setup and expected output together (the slt driver is separate for sure, but most of the tests are self contained in the .slt file

});
}

#[tokio::test]
async fn test_aws_options() {
// Separate test is needed to pass aws as options in sql and not via env

if env::var("TEST_STORAGE_INTEGRATION").is_err() {
eprintln!("Skipping external storages integration tests");
return;
}

let settings = make_settings();
let _bound = settings.bind_to_scope();

let access_key_id =
env::var("AWS_ACCESS_KEY_ID").expect("AWS_ACCESS_KEY_ID is not set");
let secret_access_key =
env::var("AWS_SECRET_ACCESS_KEY").expect("AWS_SECRET_ACCESS_KEY is not set");
let endpoint_url = env::var("AWS_ENDPOINT").expect("AWS_ENDPOINT is not set");

let input = format!(
r#"CREATE EXTERNAL TABLE CARS
STORED AS CSV
LOCATION 's3://data/cars.csv'
OPTIONS(
'aws.access_key_id' '{}',
'aws.secret_access_key' '{}',
'aws.endpoint' '{}',
'aws.allow_http' 'true'
);

SELECT * FROM CARS limit 1;
"#,
access_key_id, secret_access_key, endpoint_url
);

assert_cmd_snapshot!(cli().env_clear().pass_stdin(input));
}
25 changes: 25 additions & 0 deletions datafusion-cli/tests/snapshots/aws_options.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
source: tests/cli_integration.rs
info:
program: datafusion-cli
args: []
stdin: "CREATE EXTERNAL TABLE CARS\nSTORED AS CSV\nLOCATION 's3://data/cars.csv'\nOPTIONS(\n 'aws.access_key_id' 'TEST-DataFusionLogin',\n 'aws.secret_access_key' 'TEST-DataFusionPassword',\n 'aws.endpoint' 'http://127.0.0.1:9000',\n 'aws.allow_http' 'true'\n);\n\nSELECT * FROM CARS limit 1;\n"
---
success: true
exit_code: 0
----- stdout -----
[CLI_VERSION]
0 row(s) fetched.
[ELAPSED]

+-----+-------+---------------------+
| car | speed | time |
+-----+-------+---------------------+
| red | 20.0 | 1996-04-12T12:05:03 |
+-----+-------+---------------------+
1 row(s) fetched.
[ELAPSED]

\q

----- stderr -----
Loading
Loading