From 5d215e90205b20ed8eefedaeea98bad1ebec21ad Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Sat, 1 Feb 2025 20:48:45 -0800 Subject: [PATCH 01/12] [tmpnet] Deploy collectors with golang to simplify cross-repo use Previously, prometheus and promtail were installed and launched by with bash scripts. Migrating installation to nix and launch to golang enables directly sharing the functionality with subnet-evm and hypersdk. No more having to copy and maintain copies of the scripts in multiple repos. --- .../run-monitored-tmpnet-cmd/action.yml | 26 +- scripts/run_prometheus.sh | 93 ------- scripts/run_promtail.sh | 91 ------- tests/fixture/e2e/env.go | 4 + tests/fixture/e2e/flags.go | 44 ++-- tests/fixture/e2e/metrics_link.go | 2 +- tests/fixture/tmpnet/node_process.go | 7 +- tests/fixture/tmpnet/start_collectors.go | 242 ++++++++++++++++++ 8 files changed, 280 insertions(+), 229 deletions(-) delete mode 100755 scripts/run_prometheus.sh delete mode 100755 scripts/run_promtail.sh create mode 100644 tests/fixture/tmpnet/start_collectors.go diff --git a/.github/actions/run-monitored-tmpnet-cmd/action.yml b/.github/actions/run-monitored-tmpnet-cmd/action.yml index d701a55ab3fc..ee221569c53b 100644 --- a/.github/actions/run-monitored-tmpnet-cmd/action.yml +++ b/.github/actions/run-monitored-tmpnet-cmd/action.yml @@ -44,23 +44,6 @@ runs: # Ensure promtail and prometheus are available - name: Install nix uses: ./.github/actions/install-nix - - name: Start prometheus - # Only run for the original repo; a forked repo won't have access to the monitoring credentials - if: (inputs.prometheus_username != '') - shell: bash - # Assumes calling project has a nix flake that ensures a compatible prometheus - run: nix develop --impure --command bash -x ./scripts/run_prometheus.sh - env: - PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }} - PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }} - - name: Start promtail - if: (inputs.prometheus_username != '') - shell: bash - # Assumes calling project has a nix flake that ensures a compatible promtail - run: nix develop --impure --command bash -x ./scripts/run_promtail.sh - env: - LOKI_USERNAME: ${{ inputs.loki_username }} - LOKI_PASSWORD: ${{ inputs.loki_password }} - name: Notify of metrics availability if: (inputs.prometheus_username != '') shell: bash @@ -71,9 +54,14 @@ runs: FILTER_BY_OWNER: ${{ inputs.filter_by_owner }} - name: Run command shell: bash - run: ${{ inputs.run_env }} ${{ inputs.run }} + # --impure ensures the env vars are accessible to the command + run: ${{ inputs.run_env }} nix develop --impure --command bash -x ${{ inputs.run }} env: - TMPNET_DELAY_NETWORK_SHUTDOWN: true # Ensure shutdown waits for a final metrics scrape + TMPNET_ENABLE_COLLECTORS: true + LOKI_USERNAME: ${{ inputs.loki_username }} + LOKI_PASSWORD: ${{ inputs.loki_password }} + PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }} + PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }} GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }} GH_WORKFLOW: ${{ inputs.workflow }} GH_RUN_ID: ${{ inputs.run_id }} diff --git a/scripts/run_prometheus.sh b/scripts/run_prometheus.sh deleted file mode 100755 index 92585a10cb76..000000000000 --- a/scripts/run_prometheus.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -# - Starts a prometheus instance in agent-mode to collect metrics from nodes running -# locally and in CI. -# -# - promtail will remain running in the background and will forward metrics to the -# specified prometheus endpoint. -# -# - Each node is configured with a file written to ~/.tmpnet/prometheus/file_sd_configs -# -# - To stop the running instance: -# $ kill -9 `cat ~/.tmpnet/promtheus/run.pid` && rm ~/.tmpnet/promtail/run.pid - -# e.g., -# PROMETHEUS_USERNAME= PROMETHEUS_PASSWORD= ./scripts/run_prometheus.sh -if ! [[ "$0" =~ scripts/run_prometheus.sh ]]; then - echo "must be run from repository root" - exit 255 -fi - -CMD=prometheus - -if ! command -v "${CMD}" &> /dev/null; then - echo "prometheus not found, have you run 'nix develop'?" - echo "To install nix: https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix" - exit 1 -fi - -PROMETHEUS_WORKING_DIR="${HOME}/.tmpnet/prometheus" -PIDFILE="${PROMETHEUS_WORKING_DIR}"/run.pid - -# First check if an agent-mode prometheus is already running. A single instance can collect -# metrics from all local temporary networks. -if pgrep --pidfile="${PIDFILE}" -f 'prometheus.*enable-feature=agent' &> /dev/null; then - echo "prometheus is already running locally with --enable-feature=agent" - exit 0 -fi - -PROMETHEUS_URL="${PROMETHEUS_URL:-https://prometheus-poc.avax-dev.network}" -if [[ -z "${PROMETHEUS_URL}" ]]; then - echo "Please provide a value for PROMETHEUS_URL" - exit 1 -fi - -PROMETHEUS_USERNAME="${PROMETHEUS_USERNAME:-}" -if [[ -z "${PROMETHEUS_USERNAME}" ]]; then - echo "Please provide a value for PROMETHEUS_USERNAME" - exit 1 -fi - -PROMETHEUS_PASSWORD="${PROMETHEUS_PASSWORD:-}" -if [[ -z "${PROMETHEUS_PASSWORD}" ]]; then - echo "Please provide a value for PROMETHEUS_PASSWORD" - exit 1 -fi - -# Configure prometheus -FILE_SD_PATH="${PROMETHEUS_WORKING_DIR}/file_sd_configs" -mkdir -p "${FILE_SD_PATH}" - -CONFIG_PATH="${PROMETHEUS_WORKING_DIR}/prometheus.yaml" -cat > "${CONFIG_PATH}" < prometheus.log 2>&1 & -echo $! > "${PIDFILE}" -echo "prometheus started with pid $(cat "${PIDFILE}")" -# shellcheck disable=SC2016 -echo 'To stop prometheus: "kill -SIGTERM `cat ~/.tmpnet/prometheus/run.pid` && rm ~/.tmpnet/prometheus/run.pid"' diff --git a/scripts/run_promtail.sh b/scripts/run_promtail.sh deleted file mode 100755 index 43030bd8aad3..000000000000 --- a/scripts/run_promtail.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -# - Starts a promtail instance to collect logs from nodes running locally and in CI. -# -# - promtail will remain running in the background and will forward logs to the -# specified Loki endpoint. -# -# - Each node is configured with a file written to ~/.tmpnet/promtail/file_sd_configs/ -# -# - To stop the running instance: -# $ kill -9 `cat ~/.tmpnet/promtail/run.pid` && rm ~/.tmpnet/promtail/run.pid - -# e.g., -# LOKI_USERNAME= LOKI_PASSWORD= ./scripts/run_promtail.sh -if ! [[ "$0" =~ scripts/run_promtail.sh ]]; then - echo "must be run from repository root" - exit 255 -fi - -CMD=promtail - -if ! command -v "${CMD}" &> /dev/null; then - echo "promtail not found, have you run 'nix develop'?" - echo "To install nix: https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix" - exit 1 -fi - -PROMTAIL_WORKING_DIR="${HOME}/.tmpnet/promtail" -PIDFILE="${PROMTAIL_WORKING_DIR}"/run.pid - -# First check if promtail is already running. A single instance can -# collect logs from all local temporary networks. -if pgrep --pidfile="${PIDFILE}" &> /dev/null; then - echo "promtail is already running" - exit 0 -fi - -LOKI_URL="${LOKI_URL:-https://loki-poc.avax-dev.network}" -if [[ -z "${LOKI_URL}" ]]; then - echo "Please provide a value for LOKI_URL" - exit 1 -fi - -LOKI_USERNAME="${LOKI_USERNAME:-}" -if [[ -z "${LOKI_USERNAME}" ]]; then - echo "Please provide a value for LOKI_USERNAME" - exit 1 -fi - -LOKI_PASSWORD="${LOKI_PASSWORD:-}" -if [[ -z "${LOKI_PASSWORD}" ]]; then - echo "Please provide a value for LOKI_PASSWORD" - exit 1 -fi - -# Configure promtail -FILE_SD_PATH="${PROMTAIL_WORKING_DIR}/file_sd_configs" -mkdir -p "${FILE_SD_PATH}" - -CONFIG_PATH="${PROMTAIL_WORKING_DIR}/promtail.yaml" -cat > "${CONFIG_PATH}" < promtail.log 2>&1 & -echo $! > "${PIDFILE}" -echo "promtail started with pid $(cat "${PIDFILE}")" -# shellcheck disable=SC2016 -echo 'To stop promtail: "kill -SIGTERM `cat ~/.tmpnet/promtail/run.pid` && rm ~/.tmpnet/promtail/run.pid"' diff --git a/tests/fixture/e2e/env.go b/tests/fixture/e2e/env.go index d0ca308e7145..55d22e36632b 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/e2e/env.go @@ -130,6 +130,10 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } } + if flagVars.EnableCollectors() { + require.NoError(tmpnet.EnsureCollectorsRunning(tc.Log())) + } + // Start a new network if network == nil { network = desiredNetwork diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/e2e/flags.go index 09209444d658..893a7346d573 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/e2e/flags.go @@ -15,25 +15,17 @@ import ( "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" ) -const ( - // Ensure that this value takes into account the scrape_interval - // defined in scripts/run_prometheus.sh. - networkShutdownDelay = 12 * time.Second - - delayNetworkShutdownEnvName = "TMPNET_DELAY_NETWORK_SHUTDOWN" -) - type FlagVars struct { - avalancheGoExecPath string - pluginDir string - networkDir string - reuseNetwork bool - delayNetworkShutdown bool - startNetwork bool - stopNetwork bool - restartNetwork bool - nodeCount int - activateFortuna bool + avalancheGoExecPath string + pluginDir string + networkDir string + reuseNetwork bool + enableCollectors bool + startNetwork bool + stopNetwork bool + restartNetwork bool + nodeCount int + activateFortuna bool } func (v *FlagVars) AvalancheGoExecPath() (string, error) { @@ -81,10 +73,14 @@ func (v *FlagVars) RestartNetwork() bool { return v.restartNetwork } +func (v *FlagVars) EnableCollectors() bool { + return v.enableCollectors +} + func (v *FlagVars) NetworkShutdownDelay() time.Duration { - if v.delayNetworkShutdown { + if v.enableCollectors { // Only return a non-zero value if the delay is enabled. - return networkShutdownDelay + return tmpnet.NetworkShutdownDelay } return 0 } @@ -152,10 +148,10 @@ func RegisterFlags() *FlagVars { "[optional] restart an existing network previously started with --reuse-network. Useful for ensuring a network is running with the current state of binaries on disk. Ignored if a network is not already running or --stop-network is provided.", ) flag.BoolVar( - &vars.delayNetworkShutdown, - "delay-network-shutdown", - cast.ToBool(GetEnvWithDefault(delayNetworkShutdownEnvName, "false")), - "[optional] whether to delay network shutdown to allow a final metrics scrape.", + &vars.enableCollectors, + "enable-collectors", + cast.ToBool(GetEnvWithDefault("TMPNET_ENABLE_COLLECTORS", "false")), + "[optional] whether to enable collectors of logs and metrics from nodes of the temporary network.", ) flag.BoolVar( &vars.startNetwork, diff --git a/tests/fixture/e2e/metrics_link.go b/tests/fixture/e2e/metrics_link.go index 716938520066..2042d0e461e1 100644 --- a/tests/fixture/e2e/metrics_link.go +++ b/tests/fixture/e2e/metrics_link.go @@ -49,7 +49,7 @@ var _ = ginkgo.AfterEach(func() { // Extend the end time by the shutdown delay (a proxy for the metrics // scrape interval) to maximize the chances of the specified duration // including all metrics relevant to the current spec. - endTime := time.Now().Add(networkShutdownDelay).UnixMilli() + endTime := time.Now().Add(tmpnet.NetworkShutdownDelay).UnixMilli() metricsLink := tmpnet.MetricsLinkForNetwork( env.GetNetwork().UUID, strconv.FormatInt(startTime, 10), diff --git a/tests/fixture/tmpnet/node_process.go b/tests/fixture/tmpnet/node_process.go index 0ecade0cf8a0..ccaf562d7086 100644 --- a/tests/fixture/tmpnet/node_process.go +++ b/tests/fixture/tmpnet/node_process.go @@ -226,7 +226,12 @@ func (p *NodeProcess) getProcess() (*os.Process, error) { return nil, nil } - proc, err := os.FindProcess(p.pid) + return getProcess(p.pid) +} + +// getProcess retrieves the process if it is running. +func getProcess(pid int) (*os.Process, error) { + proc, err := os.FindProcess(pid) if err != nil { return nil, fmt.Errorf("failed to find process: %w", err) } diff --git a/tests/fixture/tmpnet/start_collectors.go b/tests/fixture/tmpnet/start_collectors.go new file mode 100644 index 000000000000..21591cea2fc1 --- /dev/null +++ b/tests/fixture/tmpnet/start_collectors.go @@ -0,0 +1,242 @@ +// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. + +package tmpnet + +import ( + "errors" + "fmt" + "io/fs" + "os" + "os/exec" + "path/filepath" + "strconv" + "time" + + "go.uber.org/zap" + + "github.com/ava-labs/avalanchego/utils/logging" + "github.com/ava-labs/avalanchego/utils/perms" +) + +type configGeneratorFunc func(workingDir string, username string, password string) string + +// Use a delay slightly longer than the 10s scrape interval configured for prometheus to ensure a final scrape before shutdown +const NetworkShutdownDelay = 12 * time.Second + +func EnsureCollectorsRunning(log logging.Logger) error { + if err := ensurePrometheusRunning(log); err != nil { + return err + } + return ensurePromtailRunning(log) +} + +func ensurePrometheusRunning(log logging.Logger) error { + return ensureCollectorRunning( + log, + "prometheus", + "--config.file=prometheus.yaml --storage.agent.path=./data --web.listen-address=localhost:0 --enable-feature=agent", + "PROMETHEUS", + func(workingDir string, username string, password string) string { + return fmt.Sprintf(` +global: + scrape_interval: 10s # Default is every 1 minute. + evaluation_interval: 10s # The default is every 1 minute. + scrape_timeout: 5s # The default is every 10s + +scrape_configs: + - job_name: "avalanchego" + metrics_path: "/ext/metrics" + file_sd_configs: + - files: + - '%s/file_sd_configs/*.json' + +remote_write: + - url: "https://prometheus-poc.avax-dev.network/api/v1/write" + basic_auth: + username: "%s" + password: "%s" +`, workingDir, username, password) + }, + ) +} + +func ensurePromtailRunning(log logging.Logger) error { + return ensureCollectorRunning( + log, + "promtail", + "-config.file=promtail.yaml", + "LOKI", + func(workingDir string, username string, password string) string { + return fmt.Sprintf(` +server: + http_listen_port: 0 + grpc_listen_port: 0 + +positions: + filename: %s/positions.yaml + +client: + url: "https://loki-poc.avax-dev.network/api/prom/push" + basic_auth: + username: "%s" + password: "%s" + +scrape_configs: + - job_name: "avalanchego" + file_sd_configs: + - files: + - '%s/file_sd_configs/*.json' +`, workingDir, username, password, workingDir) + }, + ) +} + +func ensureCollectorRunning( + log logging.Logger, + cmdName string, + args string, + baseEnvName string, + configGenerator configGeneratorFunc, +) error { + tmpnetDir, err := getTmpnetPath() + if err != nil { + return err + } + workingDir := filepath.Join(tmpnetDir, cmdName) + pidFilename := "run.pid" + pidPath := filepath.Join(workingDir, pidFilename) + + if err := os.MkdirAll(workingDir, perms.ReadWriteExecute); err != nil { + return fmt.Errorf("failed to create %s dir: %w", cmdName, err) + } + + if err := os.MkdirAll(filepath.Join(workingDir, "file_sd_configs"), perms.ReadWriteExecute); err != nil { + return fmt.Errorf("failed to create promtail file_sd_configs dir: %w", err) + } + + // Read the PID from the file + pidData, err := os.ReadFile(pidPath) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return fmt.Errorf("failed to read %s PID file %s: %w", cmdName, pidPath, err) + } + if len(pidData) > 0 { + pid, err := strconv.Atoi(string(pidData)) + if err != nil { + return fmt.Errorf("failed to parse %s PID: %w", cmdName, err) + } + process, err := getProcess(pid) + if err != nil { + return err + } + if process != nil { + log.Info(cmdName + " is already running") + return nil + } + } + + // Remove the pid file to avoid conflicting with the new one starting + if err := os.Remove(pidPath); err != nil { + if !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("failed to remove stale pid file: %w", err) + } + } else { + log.Info("deleted stale "+cmdName+" pid file", + zap.String("path", pidPath), + ) + } + + // TODO(marun) Maybe collect errors instead of returning them 1-by-1? + if _, err := exec.LookPath(cmdName); err != nil { + return fmt.Errorf("%s command not found. Maybe run 'nix develop'?", cmdName) + } + + usernameEnvVar := baseEnvName + "_USERNAME" + username := getEnv(usernameEnvVar, "") + if len(username) == 0 { + return fmt.Errorf("%s env var not set", usernameEnvVar) + } + + passwordEnvVar := baseEnvName + "_PASSWORD" + password := getEnv(passwordEnvVar, "") + if len(password) == 0 { + return fmt.Errorf("%s var not set", passwordEnvVar) + } + + confFilename := cmdName + ".yaml" + confPath := filepath.Join(workingDir, confFilename) + log.Info("writing "+cmdName+" config", + zap.String("path", confPath), + ) + config := configGenerator(workingDir, username, password) + if err := os.WriteFile(confPath, []byte(config), perms.ReadWrite); err != nil { + return err + } + + fullCmd := "nohup " + cmdName + " " + args + " > " + cmdName + ".log 2>&1 & echo -n \"$!\" > " + pidFilename + log.Info("starting "+cmdName, + zap.String("workingDir", workingDir), + zap.String("fullCmd", fullCmd), + ) + + // TODO(marun) Figure out a way to redirect stdout and stderr of a detached child process without a bash shell + cmd := exec.Command("bash", "-c", fullCmd) + configureDetachedProcess(cmd) // Ensure the child process will outlive its parent + cmd.Dir = workingDir + + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start %s: %w", cmdName, err) + } + + var pid string + // TODO(marun) Use a context instead + for { + if fileExistsAndNotEmpty(pidPath) { + var err error + pid, err = readFileContents(pidPath) + if err != nil { + return fmt.Errorf("failed to read pid file: %w", err) + } + break + } + time.Sleep(100 * time.Millisecond) + } + log.Info(cmdName+" started", + zap.String("pid", pid), + ) + + killMsg := fmt.Sprintf("To stop %s: kill -SIGTERM $(cat %s) && rm %s", cmdName, pidPath, pidPath) + log.Info(killMsg) + + return nil +} + +// Function to check if a file exists and is not empty +func fileExistsAndNotEmpty(filename string) bool { + fileInfo, err := os.Stat(filename) + if err != nil { + if os.IsNotExist(err) { + return false + } + fmt.Printf("Error stating file: %v\n", err) + return false + } + return fileInfo.Size() > 0 +} + +// Function to read the contents of a file +func readFileContents(filename string) (string, error) { + content, err := os.ReadFile(filename) + if err != nil { + return "", err + } + return string(content), nil +} + +// TODO(marun) Put this somewhere standard +func getEnv(key, fallback string) string { + if value, ok := os.LookupEnv(key); ok { + return value + } + return fallback +} From 5b36ecf21188d16457d3f75f8ae565d719fcfa01 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Sat, 22 Feb 2025 17:11:00 +0100 Subject: [PATCH 02/12] fixup: Refactor for review --- .../run-monitored-tmpnet-cmd/action.yml | 12 +- bin/tmpnetctl | 13 + tests/e2e/README.md | 29 ++ tests/fixture/e2e/env.go | 2 +- tests/fixture/e2e/flags.go | 27 +- tests/fixture/tmpnet/README.md | 88 ++-- tests/fixture/tmpnet/cmd/main.go | 33 +- tests/fixture/tmpnet/collectors.go | 419 ++++++++++++++++++ tests/fixture/tmpnet/start_collectors.go | 242 ---------- tests/fixture/tmpnet/utils.go | 9 + tests/upgrade/upgrade_test.go | 6 + 11 files changed, 582 insertions(+), 298 deletions(-) create mode 100755 bin/tmpnetctl create mode 100644 tests/fixture/tmpnet/collectors.go delete mode 100644 tests/fixture/tmpnet/start_collectors.go diff --git a/.github/actions/run-monitored-tmpnet-cmd/action.yml b/.github/actions/run-monitored-tmpnet-cmd/action.yml index ee221569c53b..dc633653e880 100644 --- a/.github/actions/run-monitored-tmpnet-cmd/action.yml +++ b/.github/actions/run-monitored-tmpnet-cmd/action.yml @@ -41,9 +41,15 @@ inputs: runs: using: composite steps: - # Ensure promtail and prometheus are available - - name: Install nix - uses: ./.github/actions/install-nix + # - Ensure promtail and prometheus are available + # - Avoid using the install-nix custom action since a relative + # path wouldn't be resolveable from other repos and an absolute + # path would require setting a version. + - uses: cachix/install-nix-action@v30 + with: + github_access_token: ${{ inputs.github_token }} + - run: nix develop --command echo "dependencies installed" + shell: bash - name: Notify of metrics availability if: (inputs.prometheus_username != '') shell: bash diff --git a/bin/tmpnetctl b/bin/tmpnetctl new file mode 100755 index 000000000000..d9cd8b8bb595 --- /dev/null +++ b/bin/tmpnetctl @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Ensure the go command is run from the root of the repository +AVALANCHE_PATH=$(cd "$( dirname "${BASH_SOURCE[0]}" )"; cd .. && pwd ) +cd "${AVALANCHE_PATH}" + +# Build if needed +if [[ ! -f ./build/tmpnetctl ]]; then + ./scripts/build_tmpnetctl.sh +fi +./build/tmpnetctl diff --git a/tests/e2e/README.md b/tests/e2e/README.md index 1851ffd3141e..464b0f55462f 100644 --- a/tests/e2e/README.md +++ b/tests/e2e/README.md @@ -107,3 +107,32 @@ these bootstrap checks during development, set the ```bash E2E_SKIP_BOOTSTRAP_CHECKS=1 ./bin/ginkgo -v ./tests/e2e ... ``` + +## Monitoring + +It is possible to enable collection of logs and metrics from the +temporary networks used for e2e testing by: + + - Supplying `--enable-collectors` as an argument to the test suite + - Starting collectors in advance of a test run with `tmpnetctl + start-collectors` + +Both methods require: + + - Auth credentials to be supplied as env vars: + - `PROMETHEUS_USERNAME` + - `PROMETHEUS_PASSWORD` + - `LOKI_USERNAME` + - `LOKI_PASSWORD` + - The availability in the path of binaries for promtail and prometheus + - Starting a development shell with `nix develop` is one way to + ensure this and requires the [installation of + nix](https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix). + +Once started, the collectors will continue to run in the background +until stopped by `tmpnetctl stop-collectors`. + +The results of collection will be viewable at +https://grafana-poc.avax-dev.network. + +For more detail, see the [tmpnet docs](../tmpnet/README.md#monitoring). diff --git a/tests/fixture/e2e/env.go b/tests/fixture/e2e/env.go index 55d22e36632b..35eea8478a2f 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/e2e/env.go @@ -131,7 +131,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } if flagVars.EnableCollectors() { - require.NoError(tmpnet.EnsureCollectorsRunning(tc.Log())) + require.NoError(tmpnet.EnsureCollectorsRunning(tc.DefaultContext(), tc.Log())) } // Start a new network diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/e2e/flags.go index 893a7346d573..aebc0cdd9c0b 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/e2e/flags.go @@ -101,14 +101,6 @@ func (v *FlagVars) ActivateFortuna() bool { return v.activateFortuna } -func GetEnvWithDefault(envVar, defaultVal string) string { - val := os.Getenv(envVar) - if len(val) == 0 { - return defaultVal - } - return val -} - func RegisterFlags() *FlagVars { vars := FlagVars{} flag.StringVar( @@ -123,7 +115,7 @@ func RegisterFlags() *FlagVars { flag.StringVar( &vars.pluginDir, "plugin-dir", - GetEnvWithDefault(tmpnet.AvalancheGoPluginDirEnvName, os.ExpandEnv("$HOME/.avalanchego/plugins")), + tmpnet.GetEnvWithDefault(tmpnet.AvalancheGoPluginDirEnvName, os.ExpandEnv("$HOME/.avalanchego/plugins")), fmt.Sprintf( "[optional] the dir containing VM plugins. Also possible to configure via the %s env variable.", tmpnet.AvalancheGoPluginDirEnvName, @@ -147,12 +139,7 @@ func RegisterFlags() *FlagVars { false, "[optional] restart an existing network previously started with --reuse-network. Useful for ensuring a network is running with the current state of binaries on disk. Ignored if a network is not already running or --stop-network is provided.", ) - flag.BoolVar( - &vars.enableCollectors, - "enable-collectors", - cast.ToBool(GetEnvWithDefault("TMPNET_ENABLE_COLLECTORS", "false")), - "[optional] whether to enable collectors of logs and metrics from nodes of the temporary network.", - ) + SetEnableCollectorsFlag(&vars.enableCollectors) flag.BoolVar( &vars.startNetwork, "start-network", @@ -180,3 +167,13 @@ func RegisterFlags() *FlagVars { return &vars } + +// Enable reuse by the upgrade job +func SetEnableCollectorsFlag(p *bool) { + flag.BoolVar( + p, + "enable-collectors", + cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_ENABLE_COLLECTORS", "false")), + "[optional] whether to enable collectors of logs and metrics from nodes of the temporary network.", + ) +} diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index b6a0880d611e..dccde8c668b5 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -24,18 +24,24 @@ repositories. The functionality in this package is grouped by logical purpose into the following non-test files: -| Filename | Types | Purpose | -|:------------------|:------------|:-----------------------------------------------| -| defaults.go | | Defines common default configuration | -| flags.go | FlagsMap | Simplifies configuration of avalanchego flags | -| genesis.go | | Creates test genesis | -| network.go | Network | Orchestrates and configures temporary networks | -| network_config.go | Network | Reads and writes network configuration | -| node.go | Node | Orchestrates and configures nodes | -| node_config.go | Node | Reads and writes node configuration | -| node_process.go | NodeProcess | Orchestrates node processes | -| subnet.go | Subnet | Orchestrates subnets | -| utils.go | | Defines shared utility functions | +| Filename | Types | Purpose | +|:----------------------------|:------------|:----------------------------------------------------| +| collectors.go | | Starts and stops collectors for logs and metrics | +| defaults.go | | Defines common default configuration | +| detached_process_default.go | | Configures detached processes for darwin and linux | +| detached_process_windows.go | | No-op detached process configuration for windows | +| flags.go | FlagsMap | Simplifies configuration of avalanchego flags | +| genesis.go | | Creates test genesis | +| kube.go | | Library for Kubernetes interaction | +| local_network.go | | Defines configuration for the default local network | +| network.go | Network | Orchestrates and configures temporary networks | +| network_config.go | Network | Reads and writes network configuration | +| network_test.go | | Simple test round-tripping Network serialization | +| node.go | Node | Orchestrates and configures nodes | +| node_config.go | Node | Reads and writes node configuration | +| node_process.go | NodeProcess | Orchestrates node processes | +| subnet.go | Subnet | Orchestrates subnets | +| utils.go | | Defines shared utility functions | ## Usage @@ -280,35 +286,54 @@ shared. ### Example usage ```bash -# Start prometheus to collect metrics -PROMETHEUS_USERNAME= PROMETHEUS_PASSWORD= ./scripts/run_prometheus.sh +# Start a nix shell to ensure the availability of promtail and prometheus. +nix develop -# Start promtail to collect logs -LOKI_USERNAME= LOKI_PASSWORD= ./scripts/run_promtail.sh +# Enable collection of logs and metrics +PROMETHEUS_USERNAME= \ +PROMETHEUS_PASSWORD= \ +LOKI_USERNAME= \ +LOKI_PASSWORD= \ +./bin/tmpnetctl start-collectors # Network start emits link to grafana displaying collected logs and metrics ./bin/tmpnetctl start-network -# Configure metrics collection from a local node binding to the default API -# port of 9650 and storing its logs in ~/.avalanchego/logs. The script will -# also emit a link to grafana. -./scripts/configure-local-metrics-collection.sh +# When done with the network, stop the collectors +./bin/tmpnetctl stop-collectors ``` +### Starting collectors + +Collectors for logs and metrics can be started by `tmpnetctl +start-collectors`: + + - Requires that the following env vars be set + - `PROMETHEUS_USERNAME` + - `PROMETHEUS_PASSWORD` + - `LOKI_USERNAME` + - `LOKI_PASSWORD` + - Requires that binaries for promtail and prometheus be available in the path + - Starting a development shell with `nix develop` is one way to + ensure this and requires the [installation of + nix](https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix). + - Starts prometheus in agent mode configured to scrape metrics from + configured nodes and forward them to + https://prometheus-poc.avax-dev.network. + - Starts promtail configured to collect logs from configured nodes + and forward them to https://loki-poc.avax-dev.network. + +### Stopping collectors + +Collectors for logs and metrics can be stopped by `tmpnetctl +stop-collectors`: + ### Metrics collection When a node is started, configuration enabling collection of metrics from the node is written to `~/.tmpnet/prometheus/file_sd_configs/[network uuid]-[node id].json`. -The `scripts/run_prometheus.sh` script starts prometheus in agent mode -configured to scrape metrics from configured nodes and forward the -metrics to a persistent prometheus instance. The script requires that -the `PROMETHEUS_USERNAME` and `PROMETHEUS_PASSWORD` env vars be set. By -default the prometheus instance at -https://prometheus-poc.avax-dev.network will be targeted and -this can be overridden via the `PROMETHEUS_URL` env var. - ### Log collection Nodes log are stored at `~/.tmpnet/networks/[network id]/[node @@ -320,13 +345,6 @@ collection of logs for the node is written to `~/.tmpnet/promtail/file_sd_configs/[network uuid]-[node id].json`. -The `scripts/run_promtail.sh` script starts promtail configured to -collect logs from configured nodes and forward the results to loki. The -script requires that the `LOKI_USERNAME` and `LOKI_PASSWORD` env vars be -set. By default the loki instance at -https://loki-poc.avax-dev.network will be targeted and this -can be overridden via the `LOKI_URL` env var. - ### Labels The logs and metrics collected for temporary networks will have the diff --git a/tests/fixture/tmpnet/cmd/main.go b/tests/fixture/tmpnet/cmd/main.go index 1e5ebbf39735..6257c449ac38 100644 --- a/tests/fixture/tmpnet/cmd/main.go +++ b/tests/fixture/tmpnet/cmd/main.go @@ -16,7 +16,6 @@ import ( "go.uber.org/zap" "github.com/ava-labs/avalanchego/tests" - "github.com/ava-labs/avalanchego/tests/fixture/e2e" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" "github.com/ava-labs/avalanchego/utils/logging" "github.com/ava-labs/avalanchego/version" @@ -124,7 +123,7 @@ func main() { startNetworkCmd.PersistentFlags().StringVar( &pluginDir, "plugin-dir", - e2e.GetEnvWithDefault(tmpnet.AvalancheGoPluginDirEnvName, os.ExpandEnv("$HOME/.avalanchego/plugins")), + tmpnet.GetEnvWithDefault(tmpnet.AvalancheGoPluginDirEnvName, os.ExpandEnv("$HOME/.avalanchego/plugins")), "[optional] the dir containing VM plugins", ) startNetworkCmd.PersistentFlags().Uint8Var(&nodeCount, "node-count", tmpnet.DefaultNodeCount, "Number of nodes the network should initially consist of") @@ -167,6 +166,36 @@ func main() { } rootCmd.AddCommand(restartNetworkCmd) + startCollectorsCmd := &cobra.Command{ + Use: "start-collectors", + Short: "Start log and metric collectors for local process-based nodes", + RunE: func(*cobra.Command, []string) error { + ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout) + defer cancel() + log, err := tests.LoggerForFormat("", rawLogFormat) + if err != nil { + return err + } + return tmpnet.EnsureCollectorsRunning(ctx, log) + }, + } + rootCmd.AddCommand(startCollectorsCmd) + + stopCollectorsCmd := &cobra.Command{ + Use: "stop-collectors", + Short: "Stop log and metric collectors for local process-based nodes", + RunE: func(*cobra.Command, []string) error { + ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout) + defer cancel() + log, err := tests.LoggerForFormat("", rawLogFormat) + if err != nil { + return err + } + return tmpnet.EnsureCollectorsStopped(ctx, log) + }, + } + rootCmd.AddCommand(stopCollectorsCmd) + if err := rootCmd.Execute(); err != nil { fmt.Fprintf(os.Stderr, "tmpnetctl failed: %v\n", err) os.Exit(1) diff --git a/tests/fixture/tmpnet/collectors.go b/tests/fixture/tmpnet/collectors.go new file mode 100644 index 000000000000..ac4625e7b8f5 --- /dev/null +++ b/tests/fixture/tmpnet/collectors.go @@ -0,0 +1,419 @@ +// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. + +package tmpnet + +import ( + "context" + "errors" + "fmt" + "io/fs" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" + "time" + + "go.uber.org/zap" + + "github.com/ava-labs/avalanchego/utils/logging" + "github.com/ava-labs/avalanchego/utils/perms" +) + +type configGeneratorFunc func(workingDir string, username string, password string) string + +const ( + collectorTickerInterval = 100 * time.Millisecond + + prometheusScrapeInterval = 10 * time.Second + + prometheusCmd = "prometheus" + promtailCmd = "promtail" + + // Use a delay slightly longer than the scrape interval to ensure a final scrape before shutdown + NetworkShutdownDelay = prometheusScrapeInterval + 2*time.Second +) + +// EnsureCollectorsRunning ensures collectors are running to collect logs and metrics from local nodes. +func EnsureCollectorsRunning(ctx context.Context, log logging.Logger) error { + if _, ok := ctx.Deadline(); !ok { + return errors.New("unable to start collectors with a context without a deadline") + } + if err := ensurePrometheusRunning(ctx, log); err != nil { + return err + } + if err := ensurePromtailRunning(ctx, log); err != nil { + return err + } + + log.Info("To stop: tmpnetctl stop-collectors") + + return nil +} + +// EnsureCollectorsStopped ensures collectors are not running. +func EnsureCollectorsStopped(ctx context.Context, log logging.Logger) error { + if _, ok := ctx.Deadline(); !ok { + return errors.New("unable to start collectors with a context without a deadline") + } + for _, cmdName := range []string{prometheusCmd, promtailCmd} { + // Determine if the process is running + workingDir, err := getWorkingDir(cmdName) + if err != nil { + return err + } + pidPath := getPIDPath(workingDir) + proc, err := processFromPIDFile(workingDir, pidPath) + if err != nil { + return err + } + if proc == nil { + log.Info("collector not running", + zap.String("cmd", cmdName), + ) + continue + } + + log.Info("sending SIGTERM to collector process", + zap.String("cmdName", cmdName), + zap.Int("pid", proc.Pid), + ) + if err := proc.Signal(syscall.SIGTERM); err != nil { + return fmt.Errorf("failed to send SIGTERM to pid %d: %w", proc.Pid, err) + } + + log.Info("waiting for collector process to stop", + zap.String("cmdName", cmdName), + zap.Int("pid", proc.Pid), + ) + ticker := time.NewTicker(collectorTickerInterval) + defer ticker.Stop() + for { + p, err := getProcess(proc.Pid) + if err != nil { + return fmt.Errorf("failed to retrieve process: %w", err) + } + if p == nil { + // Process is no longer running + + // Attempt to clear the PID file. Not critical that it is removed, just good housekeeping. + if err := clearStalePIDFile(log, cmdName, pidPath); err != nil { + log.Warn("failed to remove stale PID file", + zap.String("cmd", cmdName), + zap.String("pidFile", pidPath), + zap.Error(err), + ) + } + + break + } + + select { + case <-ctx.Done(): + return fmt.Errorf("failed to see %s stop before timeout: %w", cmdName, ctx.Err()) + case <-ticker.C: + } + } + log.Info("collector stopped", + zap.String("cmdName", cmdName), + ) + } + + return nil +} + +// ensurePrometheusRunning ensures an agent-mode prometheus process is running to collect metrics from local nodes. +func ensurePrometheusRunning(ctx context.Context, log logging.Logger) error { + return ensureCollectorRunning( + ctx, + log, + prometheusCmd, + "--config.file=prometheus.yaml --storage.agent.path=./data --web.listen-address=localhost:0 --enable-feature=agent", + "PROMETHEUS", + func(workingDir string, username string, password string) string { + return fmt.Sprintf(` +global: + scrape_interval: %v # Default is every 1 minute. + evaluation_interval: 10s # The default is every 1 minute. + scrape_timeout: 5s # The default is every 10s + +scrape_configs: + - job_name: "avalanchego" + metrics_path: "/ext/metrics" + file_sd_configs: + - files: + - '%s/file_sd_configs/*.json' + +remote_write: + - url: "https://prometheus-poc.avax-dev.network/api/v1/write" + basic_auth: + username: "%s" + password: "%s" +`, prometheusScrapeInterval, workingDir, username, password) + }, + ) +} + +// ensurePromtailRunning ensures a promtail process is running to collect logs from local nodes. +func ensurePromtailRunning(ctx context.Context, log logging.Logger) error { + return ensureCollectorRunning( + ctx, + log, + promtailCmd, + "-config.file=promtail.yaml", + "LOKI", + func(workingDir string, username string, password string) string { + return fmt.Sprintf(` +server: + http_listen_port: 0 + grpc_listen_port: 0 + +positions: + filename: %s/positions.yaml + +client: + url: "https://loki-poc.avax-dev.network/api/prom/push" + basic_auth: + username: "%s" + password: "%s" + +scrape_configs: + - job_name: "avalanchego" + file_sd_configs: + - files: + - '%s/file_sd_configs/*.json' +`, workingDir, username, password, workingDir) + }, + ) +} + +func getWorkingDir(cmdName string) (string, error) { + tmpnetDir, err := getTmpnetPath() + if err != nil { + return "", err + } + return filepath.Join(tmpnetDir, cmdName), nil +} + +func getPIDPath(workingDir string) string { + return filepath.Join(workingDir, "run.pid") +} + +// ensureCollectorRunning starts a collector process if it is not already running. +func ensureCollectorRunning( + ctx context.Context, + log logging.Logger, + cmdName string, + args string, + baseEnvName string, + configGenerator configGeneratorFunc, +) error { + // Determine paths + workingDir, err := getWorkingDir(cmdName) + if err != nil { + return err + } + pidPath := getPIDPath(workingDir) + + // Ensure required paths exist + if err := os.MkdirAll(workingDir, perms.ReadWriteExecute); err != nil { + return fmt.Errorf("failed to create %s dir: %w", cmdName, err) + } + if err := os.MkdirAll(filepath.Join(workingDir, "file_sd_configs"), perms.ReadWriteExecute); err != nil { + return fmt.Errorf("failed to create %s file_sd_configs dir: %w", cmdName, err) + } + + // Check if the process is already running + if process, err := processFromPIDFile(cmdName, pidPath); err != nil { + return err + } else if process != nil { + log.Info(cmdName + " is already running") + return nil + } + + // Clear any stale pid file + if err := clearStalePIDFile(log, cmdName, pidPath); err != nil { + return err + } + + // Check if the specified command is available in the path + if _, err := exec.LookPath(cmdName); err != nil { + return fmt.Errorf("%s command not found. Maybe run 'nix develop'?", cmdName) + } + + // Write the collector config file + if err := writeConfigFile(log, cmdName, workingDir, baseEnvName, configGenerator); err != nil { + return err + } + + // Start the collector + return startCollector(ctx, log, cmdName, args, workingDir, pidPath) +} + +// processFromPIDFile attempts to retrieve a running process from the specified PID file. +func processFromPIDFile(cmdName string, pidPath string) (*os.Process, error) { + pid, err := getPID(cmdName, pidPath) + if err != nil { + return nil, err + } + if pid == 0 { + return nil, nil + } + return getProcess(pid) +} + +// getPID attempts to read the PID of the collector from a PID file. +func getPID(cmdName string, pidPath string) (int, error) { + pidData, err := os.ReadFile(pidPath) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return 0, fmt.Errorf("failed to read %s PID file %s: %w", cmdName, pidPath, err) + } + if len(pidData) == 0 { + return 0, nil + } + pid, err := strconv.Atoi(string(pidData)) + if err != nil { + return 0, fmt.Errorf("failed to parse %s PID: %w", cmdName, err) + } + return pid, nil +} + +// clearStalePIDFile remove an existing pid file to avoid conflicting with a new process. +func clearStalePIDFile(log logging.Logger, cmdName string, pidPath string) error { + if err := os.Remove(pidPath); err != nil { + if !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("failed to remove stale pid file: %w", err) + } + } else { + log.Info("deleted stale "+cmdName+" pid file", + zap.String("path", pidPath), + ) + } + return nil +} + +// writeConfigFile writes the configuration file for a collector +func writeConfigFile( + log logging.Logger, + cmdName string, + workingDir string, + baseEnvName string, + configGenerator configGeneratorFunc, +) error { + // Retrieve the credentials for the command + username, password, err := getCredentials(baseEnvName) + if err != nil { + return err + } + + // Generate configuration for the command to its working dir + confFilename := cmdName + ".yaml" + confPath := filepath.Join(workingDir, confFilename) + log.Info("writing "+cmdName+" config", + zap.String("path", confPath), + ) + config := configGenerator(workingDir, username, password) + return os.WriteFile(confPath, []byte(config), perms.ReadWrite) +} + +// getCredentials retrieves the username and password for the given base env name. +func getCredentials(baseEnvName string) (string, string, error) { + usernameEnvVar := baseEnvName + "_USERNAME" + username := GetEnvWithDefault(usernameEnvVar, "") + if len(username) == 0 { + return "", "", fmt.Errorf("%s env var not set", usernameEnvVar) + } + passwordEnvVar := baseEnvName + "_PASSWORD" + password := GetEnvWithDefault(passwordEnvVar, "") + if len(password) == 0 { + return "", "", fmt.Errorf("%s var not set", passwordEnvVar) + } + return username, password, nil +} + +// Start a collector. Use bash to execute the command in the background and enable +// stderr and stdout redirection to a log file. +// +// Ideally this would be possible without bash, but it does not seem possible to +// have this process open a log file, set cmd.Stdout cmd.Stderr to that file, and +// then have the child process be able to write to that file once the parent +// process exits. Attempting to do so resulted in an empty log file. +func startCollector( + ctx context.Context, + log logging.Logger, + cmdName string, + args string, + workingDir string, + pidPath string, +) error { + fullCmd := "nohup " + cmdName + " " + args + " > " + cmdName + ".log 2>&1 & echo -n \"$!\" > " + pidPath + log.Info("starting "+cmdName, + zap.String("workingDir", workingDir), + zap.String("fullCmd", fullCmd), + ) + + cmd := exec.Command("bash", "-c", fullCmd) + configureDetachedProcess(cmd) // Ensure the child process will outlive its parent + cmd.Dir = workingDir + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start %s: %w", cmdName, err) + } + + // Wait for PID file to be written. It's not enough to check for the PID of cmd + // because the PID we want is a child of the process that cmd represents. + if pid, err := waitForPIDFile(ctx, cmdName, pidPath); err != nil { + return err + } else { + log.Info(cmdName+" started", + zap.String("pid", pid), + ) + } + + return nil +} + +// waitForPIDFile waits for the PID file to be written as an indication of process start. +func waitForPIDFile(ctx context.Context, cmdName string, pidPath string) (string, error) { + var ( + ticker = time.NewTicker(collectorTickerInterval) + pid string + ) + defer ticker.Stop() + for { + if fileExistsAndNotEmpty(pidPath) { + var err error + pid, err = readFileContents(pidPath) + if err != nil { + return "", fmt.Errorf("failed to read pid file: %w", err) + } + break + } + select { + case <-ctx.Done(): + return "", fmt.Errorf("failed to wait for %s to start before timeout: %w", cmdName, ctx.Err()) + case <-ticker.C: + } + } + return pid, nil +} + +func fileExistsAndNotEmpty(filename string) bool { + fileInfo, err := os.Stat(filename) + if err != nil { + if os.IsNotExist(err) { + return false + } + fmt.Printf("Error stating file: %v\n", err) + return false + } + return fileInfo.Size() > 0 +} + +func readFileContents(filename string) (string, error) { + content, err := os.ReadFile(filename) + if err != nil { + return "", err + } + return string(content), nil +} diff --git a/tests/fixture/tmpnet/start_collectors.go b/tests/fixture/tmpnet/start_collectors.go deleted file mode 100644 index 21591cea2fc1..000000000000 --- a/tests/fixture/tmpnet/start_collectors.go +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. -// See the file LICENSE for licensing terms. - -package tmpnet - -import ( - "errors" - "fmt" - "io/fs" - "os" - "os/exec" - "path/filepath" - "strconv" - "time" - - "go.uber.org/zap" - - "github.com/ava-labs/avalanchego/utils/logging" - "github.com/ava-labs/avalanchego/utils/perms" -) - -type configGeneratorFunc func(workingDir string, username string, password string) string - -// Use a delay slightly longer than the 10s scrape interval configured for prometheus to ensure a final scrape before shutdown -const NetworkShutdownDelay = 12 * time.Second - -func EnsureCollectorsRunning(log logging.Logger) error { - if err := ensurePrometheusRunning(log); err != nil { - return err - } - return ensurePromtailRunning(log) -} - -func ensurePrometheusRunning(log logging.Logger) error { - return ensureCollectorRunning( - log, - "prometheus", - "--config.file=prometheus.yaml --storage.agent.path=./data --web.listen-address=localhost:0 --enable-feature=agent", - "PROMETHEUS", - func(workingDir string, username string, password string) string { - return fmt.Sprintf(` -global: - scrape_interval: 10s # Default is every 1 minute. - evaluation_interval: 10s # The default is every 1 minute. - scrape_timeout: 5s # The default is every 10s - -scrape_configs: - - job_name: "avalanchego" - metrics_path: "/ext/metrics" - file_sd_configs: - - files: - - '%s/file_sd_configs/*.json' - -remote_write: - - url: "https://prometheus-poc.avax-dev.network/api/v1/write" - basic_auth: - username: "%s" - password: "%s" -`, workingDir, username, password) - }, - ) -} - -func ensurePromtailRunning(log logging.Logger) error { - return ensureCollectorRunning( - log, - "promtail", - "-config.file=promtail.yaml", - "LOKI", - func(workingDir string, username string, password string) string { - return fmt.Sprintf(` -server: - http_listen_port: 0 - grpc_listen_port: 0 - -positions: - filename: %s/positions.yaml - -client: - url: "https://loki-poc.avax-dev.network/api/prom/push" - basic_auth: - username: "%s" - password: "%s" - -scrape_configs: - - job_name: "avalanchego" - file_sd_configs: - - files: - - '%s/file_sd_configs/*.json' -`, workingDir, username, password, workingDir) - }, - ) -} - -func ensureCollectorRunning( - log logging.Logger, - cmdName string, - args string, - baseEnvName string, - configGenerator configGeneratorFunc, -) error { - tmpnetDir, err := getTmpnetPath() - if err != nil { - return err - } - workingDir := filepath.Join(tmpnetDir, cmdName) - pidFilename := "run.pid" - pidPath := filepath.Join(workingDir, pidFilename) - - if err := os.MkdirAll(workingDir, perms.ReadWriteExecute); err != nil { - return fmt.Errorf("failed to create %s dir: %w", cmdName, err) - } - - if err := os.MkdirAll(filepath.Join(workingDir, "file_sd_configs"), perms.ReadWriteExecute); err != nil { - return fmt.Errorf("failed to create promtail file_sd_configs dir: %w", err) - } - - // Read the PID from the file - pidData, err := os.ReadFile(pidPath) - if err != nil && !errors.Is(err, os.ErrNotExist) { - return fmt.Errorf("failed to read %s PID file %s: %w", cmdName, pidPath, err) - } - if len(pidData) > 0 { - pid, err := strconv.Atoi(string(pidData)) - if err != nil { - return fmt.Errorf("failed to parse %s PID: %w", cmdName, err) - } - process, err := getProcess(pid) - if err != nil { - return err - } - if process != nil { - log.Info(cmdName + " is already running") - return nil - } - } - - // Remove the pid file to avoid conflicting with the new one starting - if err := os.Remove(pidPath); err != nil { - if !errors.Is(err, fs.ErrNotExist) { - return fmt.Errorf("failed to remove stale pid file: %w", err) - } - } else { - log.Info("deleted stale "+cmdName+" pid file", - zap.String("path", pidPath), - ) - } - - // TODO(marun) Maybe collect errors instead of returning them 1-by-1? - if _, err := exec.LookPath(cmdName); err != nil { - return fmt.Errorf("%s command not found. Maybe run 'nix develop'?", cmdName) - } - - usernameEnvVar := baseEnvName + "_USERNAME" - username := getEnv(usernameEnvVar, "") - if len(username) == 0 { - return fmt.Errorf("%s env var not set", usernameEnvVar) - } - - passwordEnvVar := baseEnvName + "_PASSWORD" - password := getEnv(passwordEnvVar, "") - if len(password) == 0 { - return fmt.Errorf("%s var not set", passwordEnvVar) - } - - confFilename := cmdName + ".yaml" - confPath := filepath.Join(workingDir, confFilename) - log.Info("writing "+cmdName+" config", - zap.String("path", confPath), - ) - config := configGenerator(workingDir, username, password) - if err := os.WriteFile(confPath, []byte(config), perms.ReadWrite); err != nil { - return err - } - - fullCmd := "nohup " + cmdName + " " + args + " > " + cmdName + ".log 2>&1 & echo -n \"$!\" > " + pidFilename - log.Info("starting "+cmdName, - zap.String("workingDir", workingDir), - zap.String("fullCmd", fullCmd), - ) - - // TODO(marun) Figure out a way to redirect stdout and stderr of a detached child process without a bash shell - cmd := exec.Command("bash", "-c", fullCmd) - configureDetachedProcess(cmd) // Ensure the child process will outlive its parent - cmd.Dir = workingDir - - if err := cmd.Start(); err != nil { - return fmt.Errorf("failed to start %s: %w", cmdName, err) - } - - var pid string - // TODO(marun) Use a context instead - for { - if fileExistsAndNotEmpty(pidPath) { - var err error - pid, err = readFileContents(pidPath) - if err != nil { - return fmt.Errorf("failed to read pid file: %w", err) - } - break - } - time.Sleep(100 * time.Millisecond) - } - log.Info(cmdName+" started", - zap.String("pid", pid), - ) - - killMsg := fmt.Sprintf("To stop %s: kill -SIGTERM $(cat %s) && rm %s", cmdName, pidPath, pidPath) - log.Info(killMsg) - - return nil -} - -// Function to check if a file exists and is not empty -func fileExistsAndNotEmpty(filename string) bool { - fileInfo, err := os.Stat(filename) - if err != nil { - if os.IsNotExist(err) { - return false - } - fmt.Printf("Error stating file: %v\n", err) - return false - } - return fileInfo.Size() > 0 -} - -// Function to read the contents of a file -func readFileContents(filename string) (string, error) { - content, err := os.ReadFile(filename) - if err != nil { - return "", err - } - return string(content), nil -} - -// TODO(marun) Put this somewhere standard -func getEnv(key, fallback string) string { - if value, ok := os.LookupEnv(key); ok { - return value - } - return fallback -} diff --git a/tests/fixture/tmpnet/utils.go b/tests/fixture/tmpnet/utils.go index 1ed097c864ac..29b36af1330e 100644 --- a/tests/fixture/tmpnet/utils.go +++ b/tests/fixture/tmpnet/utils.go @@ -9,6 +9,7 @@ import ( "errors" "fmt" "net" + "os" "syscall" "time" @@ -125,3 +126,11 @@ func NodesToIDs(nodes ...*Node) []ids.NodeID { } return nodeIDs } + +func GetEnvWithDefault(envVar, defaultVal string) string { + val := os.Getenv(envVar) + if len(val) == 0 { + return defaultVal + } + return val +} diff --git a/tests/upgrade/upgrade_test.go b/tests/upgrade/upgrade_test.go index 2db31b983cfd..99679ccf08d0 100644 --- a/tests/upgrade/upgrade_test.go +++ b/tests/upgrade/upgrade_test.go @@ -22,6 +22,7 @@ func TestUpgrade(t *testing.T) { var ( avalancheGoExecPath string avalancheGoExecPathToUpgradeTo string + enableCollectors bool ) func init() { @@ -37,6 +38,7 @@ func init() { "", "avalanchego executable path to upgrade to", ) + e2e.SetEnableCollectorsFlag(&enableCollectors) } var _ = ginkgo.Describe("[Upgrade]", func() { @@ -51,6 +53,10 @@ var _ = ginkgo.Describe("[Upgrade]", func() { require.NoError(err) network.Genesis = genesis + if enableCollectors { + require.NoError(tmpnet.EnsureCollectorsRunning(tc.DefaultContext(), tc.Log())) + } + e2e.StartNetwork( tc, network, From eaf59a2274d40527cc6d53a38ff0b0ad1a2ff0a5 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 24 Feb 2025 10:11:06 +0100 Subject: [PATCH 03/12] fixup: Cleanup for consistency - ensure started/stopped -> start/stop collectors - use the same helpers between collector start and node process --- .../run-monitored-tmpnet-cmd/action.yml | 4 +- tests/e2e/README.md | 2 +- tests/fixture/e2e/env.go | 4 +- tests/fixture/e2e/flags.go | 18 ++--- tests/fixture/e2e/metrics_link.go | 4 -- tests/fixture/tmpnet/README.md | 36 +++++----- tests/fixture/tmpnet/cmd/main.go | 4 +- .../{collectors.go => monitor_processes.go} | 65 ++++++++++++------- tests/fixture/tmpnet/node_process.go | 39 ++++++----- tests/upgrade/upgrade_test.go | 8 +-- 10 files changed, 98 insertions(+), 86 deletions(-) rename tests/fixture/tmpnet/{collectors.go => monitor_processes.go} (83%) diff --git a/.github/actions/run-monitored-tmpnet-cmd/action.yml b/.github/actions/run-monitored-tmpnet-cmd/action.yml index dc633653e880..205e1d677116 100644 --- a/.github/actions/run-monitored-tmpnet-cmd/action.yml +++ b/.github/actions/run-monitored-tmpnet-cmd/action.yml @@ -50,6 +50,7 @@ runs: github_access_token: ${{ inputs.github_token }} - run: nix develop --command echo "dependencies installed" shell: bash + # TODO(marun) Stop emitting this annotation so that any annotation that appears can be assumed to be actionable - name: Notify of metrics availability if: (inputs.prometheus_username != '') shell: bash @@ -63,7 +64,7 @@ runs: # --impure ensures the env vars are accessible to the command run: ${{ inputs.run_env }} nix develop --impure --command bash -x ${{ inputs.run }} env: - TMPNET_ENABLE_COLLECTORS: true + TMPNET_START_COLLECTORS: true LOKI_USERNAME: ${{ inputs.loki_username }} LOKI_PASSWORD: ${{ inputs.loki_password }} PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }} @@ -79,3 +80,4 @@ runs: if: always() with: name: ${{ inputs.artifact_prefix }}-tmpnet-data + # TODO(marun) Check that collection is working by querying prometheus and loki with the GH_* labels above diff --git a/tests/e2e/README.md b/tests/e2e/README.md index 464b0f55462f..1a4b212d0282 100644 --- a/tests/e2e/README.md +++ b/tests/e2e/README.md @@ -113,7 +113,7 @@ E2E_SKIP_BOOTSTRAP_CHECKS=1 ./bin/ginkgo -v ./tests/e2e ... It is possible to enable collection of logs and metrics from the temporary networks used for e2e testing by: - - Supplying `--enable-collectors` as an argument to the test suite + - Supplying `--start-collectors` as an argument to the test suite - Starting collectors in advance of a test run with `tmpnetctl start-collectors` diff --git a/tests/fixture/e2e/env.go b/tests/fixture/e2e/env.go index 35eea8478a2f..6f359b499ef0 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/e2e/env.go @@ -130,8 +130,8 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } } - if flagVars.EnableCollectors() { - require.NoError(tmpnet.EnsureCollectorsRunning(tc.DefaultContext(), tc.Log())) + if flagVars.StartCollectors() { + require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log())) } // Start a new network diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/e2e/flags.go index aebc0cdd9c0b..b313f627cdbd 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/e2e/flags.go @@ -20,7 +20,7 @@ type FlagVars struct { pluginDir string networkDir string reuseNetwork bool - enableCollectors bool + startCollectors bool startNetwork bool stopNetwork bool restartNetwork bool @@ -73,12 +73,12 @@ func (v *FlagVars) RestartNetwork() bool { return v.restartNetwork } -func (v *FlagVars) EnableCollectors() bool { - return v.enableCollectors +func (v *FlagVars) StartCollectors() bool { + return v.startCollectors } func (v *FlagVars) NetworkShutdownDelay() time.Duration { - if v.enableCollectors { + if v.startCollectors { // Only return a non-zero value if the delay is enabled. return tmpnet.NetworkShutdownDelay } @@ -139,7 +139,7 @@ func RegisterFlags() *FlagVars { false, "[optional] restart an existing network previously started with --reuse-network. Useful for ensuring a network is running with the current state of binaries on disk. Ignored if a network is not already running or --stop-network is provided.", ) - SetEnableCollectorsFlag(&vars.enableCollectors) + SetStartCollectorsFlag(&vars.startCollectors) flag.BoolVar( &vars.startNetwork, "start-network", @@ -169,11 +169,11 @@ func RegisterFlags() *FlagVars { } // Enable reuse by the upgrade job -func SetEnableCollectorsFlag(p *bool) { +func SetStartCollectorsFlag(p *bool) { flag.BoolVar( p, - "enable-collectors", - cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_ENABLE_COLLECTORS", "false")), - "[optional] whether to enable collectors of logs and metrics from nodes of the temporary network.", + "start-collectors", + cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_COLLECTORS", "false")), + "[optional] whether to start collectors of logs and metrics from nodes of the temporary network.", ) } diff --git a/tests/fixture/e2e/metrics_link.go b/tests/fixture/e2e/metrics_link.go index 2042d0e461e1..c7a164df0130 100644 --- a/tests/fixture/e2e/metrics_link.go +++ b/tests/fixture/e2e/metrics_link.go @@ -24,16 +24,12 @@ var EmitMetricsLink bool // will be emitted at the end of spec execution. If the test uses a // private network, it can disable this behavior by setting // EmitMetricsLink to false. -// -// TODO(marun) Make this conditional on metrics collection being enabled var _ = ginkgo.BeforeEach(func() { EmitMetricsLink = true }) // This event handler attempts to emit a metrics link scoped to the duration // of the current spec. -// -// TODO(marun) Make this conditional on metrics collection being enabled var _ = ginkgo.AfterEach(func() { tc := NewTestContext() env := GetEnv(tc) diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index dccde8c668b5..1a90f71a9228 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -24,24 +24,24 @@ repositories. The functionality in this package is grouped by logical purpose into the following non-test files: -| Filename | Types | Purpose | -|:----------------------------|:------------|:----------------------------------------------------| -| collectors.go | | Starts and stops collectors for logs and metrics | -| defaults.go | | Defines common default configuration | -| detached_process_default.go | | Configures detached processes for darwin and linux | -| detached_process_windows.go | | No-op detached process configuration for windows | -| flags.go | FlagsMap | Simplifies configuration of avalanchego flags | -| genesis.go | | Creates test genesis | -| kube.go | | Library for Kubernetes interaction | -| local_network.go | | Defines configuration for the default local network | -| network.go | Network | Orchestrates and configures temporary networks | -| network_config.go | Network | Reads and writes network configuration | -| network_test.go | | Simple test round-tripping Network serialization | -| node.go | Node | Orchestrates and configures nodes | -| node_config.go | Node | Reads and writes node configuration | -| node_process.go | NodeProcess | Orchestrates node processes | -| subnet.go | Subnet | Orchestrates subnets | -| utils.go | | Defines shared utility functions | +| Filename | Types | Purpose | +|:----------------------------|:------------|:------------------------------------------------------------| +| defaults.go | | Defines common default configuration | +| detached_process_default.go | | Configures detached processes for darwin and linux | +| detached_process_windows.go | | No-op detached process configuration for windows | +| flags.go | FlagsMap | Simplifies configuration of avalanchego flags | +| genesis.go | | Creates test genesis | +| kube.go | | Library for Kubernetes interaction | +| local_network.go | | Defines configuration for the default local network | +| monitor_processes.go | | Enables collection of logs and metrics from local processes | +| network.go | Network | Orchestrates and configures temporary networks | +| network_config.go | Network | Reads and writes network configuration | +| network_test.go | | Simple test round-tripping Network serialization | +| node.go | Node | Orchestrates and configures nodes | +| node_config.go | Node | Reads and writes node configuration | +| node_process.go | NodeProcess | Orchestrates node processes | +| subnet.go | Subnet | Orchestrates subnets | +| utils.go | | Defines shared utility functions | ## Usage diff --git a/tests/fixture/tmpnet/cmd/main.go b/tests/fixture/tmpnet/cmd/main.go index 6257c449ac38..bdd00c397cc0 100644 --- a/tests/fixture/tmpnet/cmd/main.go +++ b/tests/fixture/tmpnet/cmd/main.go @@ -176,7 +176,7 @@ func main() { if err != nil { return err } - return tmpnet.EnsureCollectorsRunning(ctx, log) + return tmpnet.StartCollectors(ctx, log) }, } rootCmd.AddCommand(startCollectorsCmd) @@ -191,7 +191,7 @@ func main() { if err != nil { return err } - return tmpnet.EnsureCollectorsStopped(ctx, log) + return tmpnet.StopCollectors(ctx, log) }, } rootCmd.AddCommand(stopCollectorsCmd) diff --git a/tests/fixture/tmpnet/collectors.go b/tests/fixture/tmpnet/monitor_processes.go similarity index 83% rename from tests/fixture/tmpnet/collectors.go rename to tests/fixture/tmpnet/monitor_processes.go index ac4625e7b8f5..c9cf2cb78700 100644 --- a/tests/fixture/tmpnet/collectors.go +++ b/tests/fixture/tmpnet/monitor_processes.go @@ -21,7 +21,7 @@ import ( "github.com/ava-labs/avalanchego/utils/perms" ) -type configGeneratorFunc func(workingDir string, username string, password string) string +type configGeneratorFunc func(workingDir string, serviceDiscoveryDir string, username string, password string) string const ( collectorTickerInterval = 100 * time.Millisecond @@ -35,15 +35,15 @@ const ( NetworkShutdownDelay = prometheusScrapeInterval + 2*time.Second ) -// EnsureCollectorsRunning ensures collectors are running to collect logs and metrics from local nodes. -func EnsureCollectorsRunning(ctx context.Context, log logging.Logger) error { +// StartCollectors ensures collectors are running to collect logs and metrics from local nodes. +func StartCollectors(ctx context.Context, log logging.Logger) error { if _, ok := ctx.Deadline(); !ok { return errors.New("unable to start collectors with a context without a deadline") } - if err := ensurePrometheusRunning(ctx, log); err != nil { + if err := startPrometheus(ctx, log); err != nil { return err } - if err := ensurePromtailRunning(ctx, log); err != nil { + if err := startPromtail(ctx, log); err != nil { return err } @@ -53,7 +53,7 @@ func EnsureCollectorsRunning(ctx context.Context, log logging.Logger) error { } // EnsureCollectorsStopped ensures collectors are not running. -func EnsureCollectorsStopped(ctx context.Context, log logging.Logger) error { +func StopCollectors(ctx context.Context, log logging.Logger) error { if _, ok := ctx.Deadline(); !ok { return errors.New("unable to start collectors with a context without a deadline") } @@ -123,15 +123,15 @@ func EnsureCollectorsStopped(ctx context.Context, log logging.Logger) error { return nil } -// ensurePrometheusRunning ensures an agent-mode prometheus process is running to collect metrics from local nodes. -func ensurePrometheusRunning(ctx context.Context, log logging.Logger) error { - return ensureCollectorRunning( +// startPrometheus ensures an agent-mode prometheus process is running to collect metrics from local nodes. +func startPrometheus(ctx context.Context, log logging.Logger) error { + return startCollector( ctx, log, prometheusCmd, "--config.file=prometheus.yaml --storage.agent.path=./data --web.listen-address=localhost:0 --enable-feature=agent", "PROMETHEUS", - func(workingDir string, username string, password string) string { + func(_ string, serviceDiscoveryDir string, username string, password string) string { return fmt.Sprintf(` global: scrape_interval: %v # Default is every 1 minute. @@ -143,27 +143,27 @@ scrape_configs: metrics_path: "/ext/metrics" file_sd_configs: - files: - - '%s/file_sd_configs/*.json' + - '%s/*.json' remote_write: - url: "https://prometheus-poc.avax-dev.network/api/v1/write" basic_auth: username: "%s" password: "%s" -`, prometheusScrapeInterval, workingDir, username, password) +`, prometheusScrapeInterval, serviceDiscoveryDir, username, password) }, ) } -// ensurePromtailRunning ensures a promtail process is running to collect logs from local nodes. -func ensurePromtailRunning(ctx context.Context, log logging.Logger) error { - return ensureCollectorRunning( +// startPromtail ensures a promtail process is running to collect logs from local nodes. +func startPromtail(ctx context.Context, log logging.Logger) error { + return startCollector( ctx, log, promtailCmd, "-config.file=promtail.yaml", "LOKI", - func(workingDir string, username string, password string) string { + func(workingDir string, serviceDiscoveryDir string, username string, password string) string { return fmt.Sprintf(` server: http_listen_port: 0 @@ -182,8 +182,8 @@ scrape_configs: - job_name: "avalanchego" file_sd_configs: - files: - - '%s/file_sd_configs/*.json' -`, workingDir, username, password, workingDir) + - '%s/*.json' +`, workingDir, username, password, serviceDiscoveryDir) }, ) } @@ -196,12 +196,20 @@ func getWorkingDir(cmdName string) (string, error) { return filepath.Join(tmpnetDir, cmdName), nil } +func getServiceDiscoveryDir(cmdName string) (string, error) { + tmpnetDir, err := getTmpnetPath() + if err != nil { + return "", err + } + return filepath.Join(tmpnetDir, cmdName, "file_sd_configs"), nil +} + func getPIDPath(workingDir string) string { return filepath.Join(workingDir, "run.pid") } -// ensureCollectorRunning starts a collector process if it is not already running. -func ensureCollectorRunning( +// startCollector starts a collector process if it is not already running. +func startCollector( ctx context.Context, log logging.Logger, cmdName string, @@ -247,8 +255,8 @@ func ensureCollectorRunning( return err } - // Start the collector - return startCollector(ctx, log, cmdName, args, workingDir, pidPath) + // Start the process + return startCollectorProcess(ctx, log, cmdName, args, workingDir, pidPath) } // processFromPIDFile attempts to retrieve a running process from the specified PID file. @@ -313,7 +321,11 @@ func writeConfigFile( log.Info("writing "+cmdName+" config", zap.String("path", confPath), ) - config := configGenerator(workingDir, username, password) + serviceDiscoveryDir, err := getServiceDiscoveryDir(cmdName) + if err != nil { + return err + } + config := configGenerator(workingDir, serviceDiscoveryDir, username, password) return os.WriteFile(confPath, []byte(config), perms.ReadWrite) } @@ -332,14 +344,14 @@ func getCredentials(baseEnvName string) (string, string, error) { return username, password, nil } -// Start a collector. Use bash to execute the command in the background and enable +// Start a collector process. Use bash to execute the command in the background and enable // stderr and stdout redirection to a log file. // // Ideally this would be possible without bash, but it does not seem possible to // have this process open a log file, set cmd.Stdout cmd.Stderr to that file, and // then have the child process be able to write to that file once the parent // process exits. Attempting to do so resulted in an empty log file. -func startCollector( +func startCollectorProcess( ctx context.Context, log logging.Logger, cmdName string, @@ -370,6 +382,9 @@ func startCollector( ) } + // TODO(marun) Perform a readiness check + // TODO(marun) Check that the log is not empty + return nil } diff --git a/tests/fixture/tmpnet/node_process.go b/tests/fixture/tmpnet/node_process.go index ccaf562d7086..7013de4561f3 100644 --- a/tests/fixture/tmpnet/node_process.go +++ b/tests/fixture/tmpnet/node_process.go @@ -273,18 +273,13 @@ func (p *NodeProcess) writeMonitoringConfig() error { "gh_job_id": os.Getenv("GH_JOB_ID"), } - tmpnetDir, err := getTmpnetPath() - if err != nil { - return err - } - prometheusConfig := []FlagsMap{ { "targets": []string{strings.TrimPrefix(p.node.URI, "http://")}, "labels": commonLabels, }, } - if err := p.writeMonitoringConfigFile(tmpnetDir, "prometheus", prometheusConfig); err != nil { + if err := p.writeMonitoringConfigFile(prometheusCmd, prometheusConfig); err != nil { return err } @@ -298,36 +293,40 @@ func (p *NodeProcess) writeMonitoringConfig() error { "labels": promtailLabels, }, } - return p.writeMonitoringConfigFile(tmpnetDir, "promtail", promtailConfig) + return p.writeMonitoringConfigFile(promtailCmd, promtailConfig) } // Return the path for this node's prometheus configuration. -func (p *NodeProcess) getMonitoringConfigPath(tmpnetDir string, name string) string { +func (p *NodeProcess) getMonitoringConfigPath(name string) (string, error) { // Ensure a unique filename to allow config files to be added and removed // by multiple nodes without conflict. - return filepath.Join(tmpnetDir, name, "file_sd_configs", fmt.Sprintf("%s_%s.json", p.node.NetworkUUID, p.node.NodeID)) -} - -// Ensure the removal of the prometheus configuration file for this node. -func (p *NodeProcess) removeMonitoringConfig() error { - tmpnetDir, err := getTmpnetPath() + serviceDiscoveryDir, err := getServiceDiscoveryDir(name) if err != nil { - return err + return "", err } + return filepath.Join(serviceDiscoveryDir, fmt.Sprintf("%s_%s.json", p.node.NetworkUUID, p.node.NodeID)), nil +} - for _, name := range []string{"promtail", "prometheus"} { - configPath := p.getMonitoringConfigPath(tmpnetDir, name) +// Ensure the removal of the monitoring configuration files for this node. +func (p *NodeProcess) removeMonitoringConfig() error { + for _, name := range []string{promtailCmd, prometheusCmd} { + configPath, err := p.getMonitoringConfigPath(name) + if err != nil { + return err + } if err := os.Remove(configPath); err != nil && !errors.Is(err, fs.ErrNotExist) { return fmt.Errorf("failed to remove %s config: %w", name, err) } } - return nil } // Write the configuration for a type of monitoring (e.g. prometheus, promtail). -func (p *NodeProcess) writeMonitoringConfigFile(tmpnetDir string, name string, config []FlagsMap) error { - configPath := p.getMonitoringConfigPath(tmpnetDir, name) +func (p *NodeProcess) writeMonitoringConfigFile(name string, config []FlagsMap) error { + configPath, err := p.getMonitoringConfigPath(name) + if err != nil { + return err + } dir := filepath.Dir(configPath) if err := os.MkdirAll(dir, perms.ReadWriteExecute); err != nil { diff --git a/tests/upgrade/upgrade_test.go b/tests/upgrade/upgrade_test.go index 99679ccf08d0..162c7887fa6c 100644 --- a/tests/upgrade/upgrade_test.go +++ b/tests/upgrade/upgrade_test.go @@ -22,7 +22,7 @@ func TestUpgrade(t *testing.T) { var ( avalancheGoExecPath string avalancheGoExecPathToUpgradeTo string - enableCollectors bool + startCollectors bool ) func init() { @@ -38,7 +38,7 @@ func init() { "", "avalanchego executable path to upgrade to", ) - e2e.SetEnableCollectorsFlag(&enableCollectors) + e2e.SetStartCollectorsFlag(&startCollectors) } var _ = ginkgo.Describe("[Upgrade]", func() { @@ -53,8 +53,8 @@ var _ = ginkgo.Describe("[Upgrade]", func() { require.NoError(err) network.Genesis = genesis - if enableCollectors { - require.NoError(tmpnet.EnsureCollectorsRunning(tc.DefaultContext(), tc.Log())) + if startCollectors { + require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log())) } e2e.StartNetwork( From 2fa844bf0723d17f2537eb1041c2252180644503 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 24 Feb 2025 12:07:48 +0100 Subject: [PATCH 04/12] fixup: More cleanup --- tests/fixture/tmpnet/monitor_processes.go | 307 +++++++++++++--------- 1 file changed, 187 insertions(+), 120 deletions(-) diff --git a/tests/fixture/tmpnet/monitor_processes.go b/tests/fixture/tmpnet/monitor_processes.go index c9cf2cb78700..114e3351d8a4 100644 --- a/tests/fixture/tmpnet/monitor_processes.go +++ b/tests/fixture/tmpnet/monitor_processes.go @@ -7,7 +7,9 @@ import ( "context" "errors" "fmt" + "io" "io/fs" + "net/http" "os" "os/exec" "path/filepath" @@ -16,37 +18,63 @@ import ( "time" "go.uber.org/zap" + "k8s.io/apimachinery/pkg/util/wait" "github.com/ava-labs/avalanchego/utils/logging" "github.com/ava-labs/avalanchego/utils/perms" ) -type configGeneratorFunc func(workingDir string, serviceDiscoveryDir string, username string, password string) string - const ( collectorTickerInterval = 100 * time.Millisecond + // TODO(marun) Maybe use dynamic HTTP ports to avoid the possibility of them being already bound? + + // Prometheus configuration + prometheusCmd = "prometheus" + defaultPrometheusURL = "https://prometheus-poc.avax-dev.network" prometheusScrapeInterval = 10 * time.Second + prometheusHTTPPort = 9090 - prometheusCmd = "prometheus" - promtailCmd = "promtail" + // Promtail configuration + promtailCmd = "promtail" + defaultLokiURL = "https://loki-poc.avax-dev.network" + promtailHTTPPort = 3101 // Use a delay slightly longer than the scrape interval to ensure a final scrape before shutdown NetworkShutdownDelay = prometheusScrapeInterval + 2*time.Second ) +var ( + prometheusListenAddress = fmt.Sprintf("127.0.0.1:%d", prometheusHTTPPort) + prometheusReadinessURL = fmt.Sprintf("http://%s/-/ready", prometheusListenAddress) + + promtailReadinessURL = fmt.Sprintf("http://127.0.0.1:%d/ready", promtailHTTPPort) +) + // StartCollectors ensures collectors are running to collect logs and metrics from local nodes. func StartCollectors(ctx context.Context, log logging.Logger) error { if _, ok := ctx.Deadline(); !ok { return errors.New("unable to start collectors with a context without a deadline") } - if err := startPrometheus(ctx, log); err != nil { + if err := startPromtail(ctx, log); err != nil { return err } - if err := startPromtail(ctx, log); err != nil { + if err := startPrometheus(ctx, log); err != nil { return err } + // Wait for readiness. These checks are performed separately from start to + // minimize time to readiness. + readinessURLs := map[string]string{ + promtailCmd: promtailReadinessURL, + prometheusCmd: prometheusReadinessURL, + } + for cmdName, readinessURLs := range readinessURLs { + if err := waitForReadiness(ctx, log, cmdName, readinessURLs); err != nil { + return err + } + } + log.Info("To stop: tmpnetctl stop-collectors") return nil @@ -57,7 +85,7 @@ func StopCollectors(ctx context.Context, log logging.Logger) error { if _, ok := ctx.Deadline(); !ok { return errors.New("unable to start collectors with a context without a deadline") } - for _, cmdName := range []string{prometheusCmd, promtailCmd} { + for _, cmdName := range []string{promtailCmd, prometheusCmd} { // Determine if the process is running workingDir, err := getWorkingDir(cmdName) if err != nil { @@ -87,12 +115,10 @@ func StopCollectors(ctx context.Context, log logging.Logger) error { zap.String("cmdName", cmdName), zap.Int("pid", proc.Pid), ) - ticker := time.NewTicker(collectorTickerInterval) - defer ticker.Stop() - for { + if err := pollUntilContextCancel(ctx, func(_ context.Context) (bool, error) { p, err := getProcess(proc.Pid) if err != nil { - return fmt.Errorf("failed to retrieve process: %w", err) + return false, fmt.Errorf("failed to retrieve process: %w", err) } if p == nil { // Process is no longer running @@ -105,15 +131,10 @@ func StopCollectors(ctx context.Context, log logging.Logger) error { zap.Error(err), ) } - - break - } - - select { - case <-ctx.Done(): - return fmt.Errorf("failed to see %s stop before timeout: %w", cmdName, ctx.Err()) - case <-ticker.C: } + return p == nil, nil + }); err != nil { + return err } log.Info("collector stopped", zap.String("cmdName", cmdName), @@ -125,14 +146,24 @@ func StopCollectors(ctx context.Context, log logging.Logger) error { // startPrometheus ensures an agent-mode prometheus process is running to collect metrics from local nodes. func startPrometheus(ctx context.Context, log logging.Logger) error { - return startCollector( - ctx, - log, - prometheusCmd, - "--config.file=prometheus.yaml --storage.agent.path=./data --web.listen-address=localhost:0 --enable-feature=agent", - "PROMETHEUS", - func(_ string, serviceDiscoveryDir string, username string, password string) string { - return fmt.Sprintf(` + cmdName := prometheusCmd + + args := fmt.Sprintf( + "--config.file=prometheus.yaml --web.listen-address=%s --enable-feature=agent --storage.agent.path=./data", + prometheusListenAddress, + ) + + username, password, err := getCollectorCredentials(cmdName) + if err != nil { + return err + } + + serviceDiscoveryDir, err := getServiceDiscoveryDir(cmdName) + if err != nil { + return err + } + + config := fmt.Sprintf(` global: scrape_interval: %v # Default is every 1 minute. evaluation_interval: 10s # The default is every 1 minute. @@ -146,34 +177,49 @@ scrape_configs: - '%s/*.json' remote_write: - - url: "https://prometheus-poc.avax-dev.network/api/v1/write" + - url: "%s/api/v1/write" basic_auth: username: "%s" password: "%s" -`, prometheusScrapeInterval, serviceDiscoveryDir, username, password) - }, - ) +`, prometheusScrapeInterval, serviceDiscoveryDir, getPrometheusURL(), username, password) + + return startCollector(ctx, log, cmdName, args, config) } // startPromtail ensures a promtail process is running to collect logs from local nodes. func startPromtail(ctx context.Context, log logging.Logger) error { - return startCollector( - ctx, - log, - promtailCmd, - "-config.file=promtail.yaml", - "LOKI", - func(workingDir string, serviceDiscoveryDir string, username string, password string) string { - return fmt.Sprintf(` + cmdName := promtailCmd + + args := fmt.Sprintf( + "--config.file=prometheus.yaml --web.listen-address=%s --enable-feature=agent --storage.agent.path=./data", + prometheusListenAddress, + ) + + username, password, err := getCollectorCredentials(cmdName) + if err != nil { + return err + } + + workingDir, err := getWorkingDir(cmdName) + if err != nil { + return err + } + + serviceDiscoveryDir, err := getServiceDiscoveryDir(cmdName) + if err != nil { + return err + } + + config := fmt.Sprintf(` server: - http_listen_port: 0 + http_listen_port: %d grpc_listen_port: 0 positions: filename: %s/positions.yaml client: - url: "https://loki-poc.avax-dev.network/api/prom/push" + url: "%s/api/prom/push" basic_auth: username: "%s" password: "%s" @@ -183,9 +229,9 @@ scrape_configs: file_sd_configs: - files: - '%s/*.json' -`, workingDir, username, password, serviceDiscoveryDir) - }, - ) +`, promtailHTTPPort, workingDir, getLokiURL(), username, password, serviceDiscoveryDir) + + return startCollector(ctx, log, cmdName, args, config) } func getWorkingDir(cmdName string) (string, error) { @@ -214,8 +260,7 @@ func startCollector( log logging.Logger, cmdName string, args string, - baseEnvName string, - configGenerator configGeneratorFunc, + config string, ) error { // Determine paths workingDir, err := getWorkingDir(cmdName) @@ -251,7 +296,12 @@ func startCollector( } // Write the collector config file - if err := writeConfigFile(log, cmdName, workingDir, baseEnvName, configGenerator); err != nil { + confFilename := cmdName + ".yaml" + confPath := filepath.Join(workingDir, confFilename) + log.Info("writing "+cmdName+" config", + zap.String("path", confPath), + ) + if err := os.WriteFile(confPath, []byte(config), perms.ReadWrite); err != nil { return err } @@ -301,36 +351,26 @@ func clearStalePIDFile(log logging.Logger, cmdName string, pidPath string) error return nil } -// writeConfigFile writes the configuration file for a collector -func writeConfigFile( - log logging.Logger, - cmdName string, - workingDir string, - baseEnvName string, - configGenerator configGeneratorFunc, -) error { - // Retrieve the credentials for the command - username, password, err := getCredentials(baseEnvName) - if err != nil { - return err - } +func getPrometheusURL() string { + return GetEnvWithDefault("PROMETHEUS_URL", defaultPrometheusURL) +} - // Generate configuration for the command to its working dir - confFilename := cmdName + ".yaml" - confPath := filepath.Join(workingDir, confFilename) - log.Info("writing "+cmdName+" config", - zap.String("path", confPath), - ) - serviceDiscoveryDir, err := getServiceDiscoveryDir(cmdName) - if err != nil { - return err - } - config := configGenerator(workingDir, serviceDiscoveryDir, username, password) - return os.WriteFile(confPath, []byte(config), perms.ReadWrite) +func getLokiURL() string { + return GetEnvWithDefault("LOKI_URL", defaultPrometheusURL) } -// getCredentials retrieves the username and password for the given base env name. -func getCredentials(baseEnvName string) (string, string, error) { +// getCollectorCredentials retrieves the username and password for the command. +func getCollectorCredentials(cmdName string) (string, string, error) { + var baseEnvName string + switch cmdName { + case prometheusCmd: + baseEnvName = "PROMETHEUS" + case promtailCmd: + baseEnvName = "LOKI" + default: + return "", "", fmt.Errorf("unsupported cmd: %s", cmdName) + } + usernameEnvVar := baseEnvName + "_USERNAME" username := GetEnvWithDefault(usernameEnvVar, "") if len(username) == 0 { @@ -359,7 +399,8 @@ func startCollectorProcess( workingDir string, pidPath string, ) error { - fullCmd := "nohup " + cmdName + " " + args + " > " + cmdName + ".log 2>&1 & echo -n \"$!\" > " + pidPath + logFilename := cmdName + ".log" + fullCmd := "nohup " + cmdName + " " + args + " > " + logFilename + " 2>&1 & echo -n \"$!\" > " + pidPath log.Info("starting "+cmdName, zap.String("workingDir", workingDir), zap.String("fullCmd", fullCmd), @@ -372,63 +413,89 @@ func startCollectorProcess( return fmt.Errorf("failed to start %s: %w", cmdName, err) } - // Wait for PID file to be written. It's not enough to check for the PID of cmd - // because the PID we want is a child of the process that cmd represents. - if pid, err := waitForPIDFile(ctx, cmdName, pidPath); err != nil { + // Wait for PID file + var pid int + if err := pollUntilContextCancel(ctx, func(_ context.Context) (bool, error) { + var err error + pid, err = getPID(cmdName, pidPath) + if err != nil { + log.Warn("failed to read PID file", + zap.String("cmd", cmdName), + zap.String("pidPath", pidPath), + zap.Error(err), + ) + } + return pid != 0, nil + }); err != nil { return err - } else { - log.Info(cmdName+" started", - zap.String("pid", pid), - ) } + log.Info(cmdName+" started", + zap.Int("pid", pid), + ) - // TODO(marun) Perform a readiness check - // TODO(marun) Check that the log is not empty + // Wait for non-empty log file. An empty log file should only occur if the command + // invocation is not correctly redirecting stderr and stdout to the expected file. + logPath := filepath.Join(workingDir, logFilename) + if err := pollUntilContextCancel(ctx, func(_ context.Context) (bool, error) { + logData, err := os.ReadFile(logPath) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return false, fmt.Errorf("failed to read log file %s for %s: %w", logPath, cmdName, err) + } + return len(logData) != 0, nil + }); err != nil { + return fmt.Errorf("empty log file %s for %s indicates misconfiguration: %w", logPath, cmdName, err) + } return nil } -// waitForPIDFile waits for the PID file to be written as an indication of process start. -func waitForPIDFile(ctx context.Context, cmdName string, pidPath string) (string, error) { - var ( - ticker = time.NewTicker(collectorTickerInterval) - pid string - ) - defer ticker.Stop() - for { - if fileExistsAndNotEmpty(pidPath) { - var err error - pid, err = readFileContents(pidPath) - if err != nil { - return "", fmt.Errorf("failed to read pid file: %w", err) - } - break - } - select { - case <-ctx.Done(): - return "", fmt.Errorf("failed to wait for %s to start before timeout: %w", cmdName, ctx.Err()) - case <-ticker.C: - } +// checkReadiness retrieves the provided URL and indicates whether it returned 200 +func checkReadiness(ctx context.Context, url string) (bool, string, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return false, "", err } - return pid, nil -} -func fileExistsAndNotEmpty(filename string) bool { - fileInfo, err := os.Stat(filename) + resp, err := http.DefaultClient.Do(req) if err != nil { - if os.IsNotExist(err) { - return false - } - fmt.Printf("Error stating file: %v\n", err) - return false + return false, "", fmt.Errorf("request failed: %w", err) } - return fileInfo.Size() > 0 -} + defer resp.Body.Close() -func readFileContents(filename string) (string, error) { - content, err := os.ReadFile(filename) + body, err := io.ReadAll(resp.Body) if err != nil { - return "", err + return false, "", fmt.Errorf("failed to read response: %w", err) } - return string(content), nil + + return resp.StatusCode == http.StatusOK, string(body), nil +} + +// waitForReadiness waits until the given readiness URL returns 200 +func waitForReadiness(ctx context.Context, log logging.Logger, cmdName string, readinessURL string) error { + log.Info("waiting for "+cmdName+" readiness", + zap.String("url", readinessURL), + ) + if err := pollUntilContextCancel(ctx, func(_ context.Context) (bool, error) { + ready, body, err := checkReadiness(ctx, readinessURL) + if err == nil { + return ready, nil + } + log.Warn("failed to check readiness", + zap.String("cmd", cmdName), + zap.String("url", readinessURL), + zap.String("body", body), + zap.Error(err), + ) + return false, nil + }); err != nil { + return err + } + log.Info(cmdName+" ready", + zap.String("url", readinessURL), + ) + return nil +} + +func pollUntilContextCancel(ctx context.Context, condition wait.ConditionWithContextFunc) error { + return wait.PollUntilContextCancel(ctx, collectorTickerInterval, true /* immediate */, condition) } From 86eba76567a58718d8766adfdad54ae9c1f33653 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 24 Feb 2025 14:23:44 +0100 Subject: [PATCH 05/12] fixup: Add README for action --- tests/fixture/tmpnet/README.md | 69 ++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index 1a90f71a9228..f402f652fb88 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -373,6 +373,75 @@ additional labels will be applied: These labels are sourced from Github Actions' `github` context as per https://docs.github.com/en/actions/learn-github-actions/contexts#github-context. +### CI Collection + +A [custom github +action](../../../.github/actions/run-monitored-tmpnet-cmd/action.yml) +exists to simplify collection of logs and metrics from CI. The action +takes care of invoking a nix shell to ensure the availability of +binary dependencies, configures tmpnet to collect metrics and ensures +that the tmpnet path is collected as a github artifact to aid in troubleshooting. + +Example usage: + +```yaml +- name: Run e2e tests + + # A qualified path is required for use outside of avalanchego + # e.g. `ava-labs/avalanchego/.github/actions/run-monitored-tmpnet-cmd@[sha or tag]` + uses: ./.github/actions/run-monitored-tmpnet-cmd # + + with: + # This needs to be the path to a bash script + run: ./scripts/tests.e2e.sh + + # Env vars for the script need to be provided via run_env as a space-separated string + # e.g. `MY_VAR1=foo MY_VAR2=bar` + run_env: E2E_SERIAL=1 + + # Sets the prefix of the artifact containing the tmpnet network dir for this job. + # Only required if a workflow uses this action more than once so that each artifact + # will have a unique name. + artifact_prefix: e2e + + # These credentials are mandatory + prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }} + prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }} + loki_username: ${{ secrets.LOKI_ID || '' }} + loki_password: ${{ secrets.LOKI_PASSWORD || '' }} +``` + +The action assumes a nix flake file in the repo root that enables +availability of promtail and prometheus. The following is a minimal +flake file that inherits from avalanchego's flake: + +```nix +{ + # To use: + # - install nix: https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix + # - run `nix develop` or use direnv (https://direnv.net/) + # - for quieter direnv output, set `export DIRENV_LOG_FORMAT=` + + description = "VM development environment"; + + inputs = { + nixpkgs.url = "https://flakehub.com/f/NixOS/nixpkgs/0.2405.*.tar.gz"; + # Make sure to set a SHA or tag to the desired version + avalanchego.url = "github:ava-labs/avalanchego?ref=[sha or tag]"; + }; + + outputs = { self, nixpkgs, avalanchego, ... }: + let + allSystems = builtins.attrNames avalanchego.devShells; + forAllSystems = nixpkgs.lib.genAttrs allSystems; + in { + devShells = forAllSystems (system: { + default = avalanchego.devShells.${system}.default; + }); + }; +} +``` + ### Viewing #### Local networks From 58d5431a8dbd6d4305e8c9e5e95e57dfb343e6b2 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 24 Feb 2025 14:58:38 +0100 Subject: [PATCH 06/12] fixup: Fix args --- tests/fixture/tmpnet/monitor_processes.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/fixture/tmpnet/monitor_processes.go b/tests/fixture/tmpnet/monitor_processes.go index 114e3351d8a4..ebcf53b0c4f9 100644 --- a/tests/fixture/tmpnet/monitor_processes.go +++ b/tests/fixture/tmpnet/monitor_processes.go @@ -149,7 +149,8 @@ func startPrometheus(ctx context.Context, log logging.Logger) error { cmdName := prometheusCmd args := fmt.Sprintf( - "--config.file=prometheus.yaml --web.listen-address=%s --enable-feature=agent --storage.agent.path=./data", + "--config.file=%s.yaml --web.listen-address=%s --enable-feature=agent --storage.agent.path=./data", + cmdName, prometheusListenAddress, ) @@ -190,10 +191,7 @@ remote_write: func startPromtail(ctx context.Context, log logging.Logger) error { cmdName := promtailCmd - args := fmt.Sprintf( - "--config.file=prometheus.yaml --web.listen-address=%s --enable-feature=agent --storage.agent.path=./data", - prometheusListenAddress, - ) + args := fmt.Sprintf("-config.file=%s.yaml", cmdName) username, password, err := getCollectorCredentials(cmdName) if err != nil { From 00a63dcd720acb5d41d7a182f05eb3383bdfea0e Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 24 Feb 2025 15:04:36 +0100 Subject: [PATCH 07/12] fixup: Remove apostrophe to avoid github's shite formatting --- tests/fixture/tmpnet/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index f402f652fb88..0ef0d8b63ea5 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -413,7 +413,7 @@ Example usage: The action assumes a nix flake file in the repo root that enables availability of promtail and prometheus. The following is a minimal -flake file that inherits from avalanchego's flake: +flake file that inherits from the avalanchego flake: ```nix { From 3d3f7de68eb07053b92cc83cf2230a0da035a57a Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 24 Feb 2025 16:14:16 +0100 Subject: [PATCH 08/12] fixup: Move promtail readiness check to after nodes have started --- tests/fixture/e2e/env.go | 14 +++++++++---- tests/fixture/tmpnet/monitor_processes.go | 25 ++++++++++------------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/tests/fixture/e2e/env.go b/tests/fixture/e2e/env.go index 6f359b499ef0..694ff2af6f5c 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/e2e/env.go @@ -79,6 +79,11 @@ func (te *TestEnvironment) Marshal() []byte { func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork *tmpnet.Network) *TestEnvironment { require := require.New(tc) + // Start collectors for any command but stop + if flagVars.StartCollectors() && !flagVars.StopNetwork() { + require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log())) + } + var network *tmpnet.Network // Need to load the network if it is being stopped or reused if flagVars.StopNetwork() || flagVars.ReuseNetwork() { @@ -130,10 +135,6 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } } - if flagVars.StartCollectors() { - require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log())) - } - // Start a new network if network == nil { network = desiredNetwork @@ -151,6 +152,11 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork ) } + // Once one or more nodes are running it should be safe to wait for promtail to report readiness + if flagVars.StartCollectors() { + require.NoError(tmpnet.WaitForPromtailReadiness(tc.DefaultContext(), tc.Log())) + } + if flagVars.StartNetwork() { os.Exit(0) } diff --git a/tests/fixture/tmpnet/monitor_processes.go b/tests/fixture/tmpnet/monitor_processes.go index ebcf53b0c4f9..639c053f637e 100644 --- a/tests/fixture/tmpnet/monitor_processes.go +++ b/tests/fixture/tmpnet/monitor_processes.go @@ -63,16 +63,9 @@ func StartCollectors(ctx context.Context, log logging.Logger) error { return err } - // Wait for readiness. These checks are performed separately from start to - // minimize time to readiness. - readinessURLs := map[string]string{ - promtailCmd: promtailReadinessURL, - prometheusCmd: prometheusReadinessURL, - } - for cmdName, readinessURLs := range readinessURLs { - if err := waitForReadiness(ctx, log, cmdName, readinessURLs); err != nil { - return err - } + log.Info("skipping promtail readiness check until one or more nodes have written their service discovery configuration") + if err := waitForReadiness(ctx, log, prometheusCmd, prometheusReadinessURL); err != nil { + return err } log.Info("To stop: tmpnetctl stop-collectors") @@ -80,6 +73,12 @@ func StartCollectors(ctx context.Context, log logging.Logger) error { return nil } +// WaitForPromtailReadiness waits until prometheus is ready. It can only succeed after +// one or more nodes have written their service discovery configuration. +func WaitForPromtailReadiness(ctx context.Context, log logging.Logger) error { + return waitForReadiness(ctx, log, promtailCmd, promtailReadinessURL) +} + // EnsureCollectorsStopped ensures collectors are not running. func StopCollectors(ctx context.Context, log logging.Logger) error { if _, ok := ctx.Deadline(); !ok { @@ -354,7 +353,7 @@ func getPrometheusURL() string { } func getLokiURL() string { - return GetEnvWithDefault("LOKI_URL", defaultPrometheusURL) + return GetEnvWithDefault("LOKI_URL", defaultLokiURL) } // getCollectorCredentials retrieves the username and password for the command. @@ -488,9 +487,7 @@ func waitForReadiness(ctx context.Context, log logging.Logger, cmdName string, r }); err != nil { return err } - log.Info(cmdName+" ready", - zap.String("url", readinessURL), - ) + log.Info(cmdName + " ready") return nil } From ceef695bf01479a37c223878b6e1bd2329489d0d Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 24 Feb 2025 23:27:15 +0100 Subject: [PATCH 09/12] fixup: Inline step uploading tmpnet network data --- .github/actions/run-monitored-tmpnet-cmd/action.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/actions/run-monitored-tmpnet-cmd/action.yml b/.github/actions/run-monitored-tmpnet-cmd/action.yml index 205e1d677116..a5dcce720240 100644 --- a/.github/actions/run-monitored-tmpnet-cmd/action.yml +++ b/.github/actions/run-monitored-tmpnet-cmd/action.yml @@ -75,9 +75,18 @@ runs: GH_RUN_NUMBER: ${{ inputs.run_number }} GH_RUN_ATTEMPT: ${{ inputs.run_attempt }} GH_JOB_ID: ${{ inputs.job }} - - name: Upload tmpnet network dir - uses: ./.github/actions/upload-tmpnet-artifact + # This step is duplicated from upload-tmpnet-artifact for the same + # reason as the nix installation. There doesn't appear to be an + # easy way to composee custom actions for use by other repos + # without running into versioning issues. + - name: Upload tmpnet data + uses: actions/upload-artifact@v4 if: always() with: name: ${{ inputs.artifact_prefix }}-tmpnet-data + path: | + ~/.tmpnet/networks + ~/.tmpnet/prometheus/prometheus.log + ~/.tmpnet/promtail/promtail.log + if-no-files-found: error # TODO(marun) Check that collection is working by querying prometheus and loki with the GH_* labels above From fa7b613ee9263a7c5fe65d5e1eb0839c9bec2787 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Tue, 25 Feb 2025 00:30:39 +0100 Subject: [PATCH 10/12] fixup: Update README to mention the need to vendor tmpnet --- tests/fixture/tmpnet/README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index 0ef0d8b63ea5..dda7944ea5ae 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -411,9 +411,9 @@ Example usage: loki_password: ${{ secrets.LOKI_PASSWORD || '' }} ``` -The action assumes a nix flake file in the repo root that enables +The action requires a flake.nix file in the repo root that enables availability of promtail and prometheus. The following is a minimal -flake file that inherits from the avalanchego flake: +flake that inherits from the avalanchego flake: ```nix { @@ -442,6 +442,18 @@ flake file that inherits from the avalanchego flake: } ``` +The action also requires being able to invoke tmpnetctl via `go +run`. Use of a `tools.go` file that imports tmpnetctl is suggested to +enable this: + +```golang +package tools + +import ( + _ "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/cmd" // tmpnetctl +) +``` + ### Viewing #### Local networks From 9cf8d2ff00e10af064bfa8bd08b3ebe36fcfc612 Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 4 Mar 2025 15:16:49 +0100 Subject: [PATCH 11/12] Update tests/fixture/e2e/flags.go Co-authored-by: Stephen Buttolph Signed-off-by: maru --- tests/fixture/e2e/flags.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/e2e/flags.go index b313f627cdbd..f1cc9391a9b7 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/e2e/flags.go @@ -79,7 +79,8 @@ func (v *FlagVars) StartCollectors() bool { func (v *FlagVars) NetworkShutdownDelay() time.Duration { if v.startCollectors { - // Only return a non-zero value if the delay is enabled. + // Only return a non-zero value if we want to ensure the collectors have + // a chance to collect the metrics at the end of the test. return tmpnet.NetworkShutdownDelay } return 0 From 9c408ffc51aaabc5ca1527374b4ba096d369ccde Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Tue, 4 Mar 2025 15:37:44 +0100 Subject: [PATCH 12/12] fixup: Respond to review comments --- tests/fixture/tmpnet/monitor_processes.go | 162 ++++++++++++---------- 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/tests/fixture/tmpnet/monitor_processes.go b/tests/fixture/tmpnet/monitor_processes.go index 639c053f637e..50771356fe7d 100644 --- a/tests/fixture/tmpnet/monitor_processes.go +++ b/tests/fixture/tmpnet/monitor_processes.go @@ -33,24 +33,19 @@ const ( prometheusCmd = "prometheus" defaultPrometheusURL = "https://prometheus-poc.avax-dev.network" prometheusScrapeInterval = 10 * time.Second - prometheusHTTPPort = 9090 + prometheusListenAddress = "127.0.0.1:9090" + prometheusReadinessURL = "http://" + prometheusListenAddress + "/-/ready" // Promtail configuration - promtailCmd = "promtail" - defaultLokiURL = "https://loki-poc.avax-dev.network" - promtailHTTPPort = 3101 + promtailCmd = "promtail" + defaultLokiURL = "https://loki-poc.avax-dev.network" + promtailHTTPPort = "3101" + promtailReadinessURL = "http://127.0.0.1:" + promtailHTTPPort + "/ready" // Use a delay slightly longer than the scrape interval to ensure a final scrape before shutdown NetworkShutdownDelay = prometheusScrapeInterval + 2*time.Second ) -var ( - prometheusListenAddress = fmt.Sprintf("127.0.0.1:%d", prometheusHTTPPort) - prometheusReadinessURL = fmt.Sprintf("http://%s/-/ready", prometheusListenAddress) - - promtailReadinessURL = fmt.Sprintf("http://127.0.0.1:%d/ready", promtailHTTPPort) -) - // StartCollectors ensures collectors are running to collect logs and metrics from local nodes. func StartCollectors(ctx context.Context, log logging.Logger) error { if _, ok := ctx.Deadline(); !ok { @@ -103,7 +98,7 @@ func StopCollectors(ctx context.Context, log logging.Logger) error { } log.Info("sending SIGTERM to collector process", - zap.String("cmdName", cmdName), + zap.String("cmd", cmdName), zap.Int("pid", proc.Pid), ) if err := proc.Signal(syscall.SIGTERM); err != nil { @@ -111,28 +106,32 @@ func StopCollectors(ctx context.Context, log logging.Logger) error { } log.Info("waiting for collector process to stop", - zap.String("cmdName", cmdName), + zap.String("cmd", cmdName), zap.Int("pid", proc.Pid), ) - if err := pollUntilContextCancel(ctx, func(_ context.Context) (bool, error) { - p, err := getProcess(proc.Pid) - if err != nil { - return false, fmt.Errorf("failed to retrieve process: %w", err) - } - if p == nil { - // Process is no longer running - - // Attempt to clear the PID file. Not critical that it is removed, just good housekeeping. - if err := clearStalePIDFile(log, cmdName, pidPath); err != nil { - log.Warn("failed to remove stale PID file", - zap.String("cmd", cmdName), - zap.String("pidFile", pidPath), - zap.Error(err), - ) + err = pollUntilContextCancel( + ctx, + func(_ context.Context) (bool, error) { + p, err := getProcess(proc.Pid) + if err != nil { + return false, fmt.Errorf("failed to retrieve process: %w", err) } - } - return p == nil, nil - }); err != nil { + if p == nil { + // Process is no longer running + + // Attempt to clear the PID file. Not critical that it is removed, just good housekeeping. + if err := clearStalePIDFile(log, cmdName, pidPath); err != nil { + log.Warn("failed to remove stale PID file", + zap.String("cmd", cmdName), + zap.String("pidFile", pidPath), + zap.Error(err), + ) + } + } + return p == nil, nil + }, + ) + if err != nil { return err } log.Info("collector stopped", @@ -209,7 +208,7 @@ func startPromtail(ctx context.Context, log logging.Logger) error { config := fmt.Sprintf(` server: - http_listen_port: %d + http_listen_port: %s grpc_listen_port: 0 positions: @@ -278,7 +277,9 @@ func startCollector( if process, err := processFromPIDFile(cmdName, pidPath); err != nil { return err } else if process != nil { - log.Info(cmdName + " is already running") + log.Info("collector already running", + zap.String("cmd", cmdName), + ) return nil } @@ -295,7 +296,8 @@ func startCollector( // Write the collector config file confFilename := cmdName + ".yaml" confPath := filepath.Join(workingDir, confFilename) - log.Info("writing "+cmdName+" config", + log.Info("writing collector config", + zap.String("cmd", cmdName), zap.String("path", confPath), ) if err := os.WriteFile(confPath, []byte(config), perms.ReadWrite); err != nil { @@ -341,7 +343,8 @@ func clearStalePIDFile(log logging.Logger, cmdName string, pidPath string) error return fmt.Errorf("failed to remove stale pid file: %w", err) } } else { - log.Info("deleted stale "+cmdName+" pid file", + log.Info("deleted stale collector pid file", + zap.String("cmd", cmdName), zap.String("path", pidPath), ) } @@ -398,7 +401,8 @@ func startCollectorProcess( ) error { logFilename := cmdName + ".log" fullCmd := "nohup " + cmdName + " " + args + " > " + logFilename + " 2>&1 & echo -n \"$!\" > " + pidPath - log.Info("starting "+cmdName, + log.Info("starting collector", + zap.String("cmd", cmdName), zap.String("workingDir", workingDir), zap.String("fullCmd", fullCmd), ) @@ -412,34 +416,43 @@ func startCollectorProcess( // Wait for PID file var pid int - if err := pollUntilContextCancel(ctx, func(_ context.Context) (bool, error) { - var err error - pid, err = getPID(cmdName, pidPath) - if err != nil { - log.Warn("failed to read PID file", - zap.String("cmd", cmdName), - zap.String("pidPath", pidPath), - zap.Error(err), - ) - } - return pid != 0, nil - }); err != nil { + err := pollUntilContextCancel( + ctx, + func(_ context.Context) (bool, error) { + var err error + pid, err = getPID(cmdName, pidPath) + if err != nil { + log.Warn("failed to read PID file", + zap.String("cmd", cmdName), + zap.String("pidPath", pidPath), + zap.Error(err), + ) + } + return pid != 0, nil + }, + ) + if err != nil { return err } - log.Info(cmdName+" started", + log.Info("started collector", + zap.String("cmd", cmdName), zap.Int("pid", pid), ) // Wait for non-empty log file. An empty log file should only occur if the command // invocation is not correctly redirecting stderr and stdout to the expected file. logPath := filepath.Join(workingDir, logFilename) - if err := pollUntilContextCancel(ctx, func(_ context.Context) (bool, error) { - logData, err := os.ReadFile(logPath) - if err != nil && !errors.Is(err, fs.ErrNotExist) { - return false, fmt.Errorf("failed to read log file %s for %s: %w", logPath, cmdName, err) - } - return len(logData) != 0, nil - }); err != nil { + err = pollUntilContextCancel( + ctx, + func(_ context.Context) (bool, error) { + logData, err := os.ReadFile(logPath) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return false, fmt.Errorf("failed to read log file %s for %s: %w", logPath, cmdName, err) + } + return len(logData) != 0, nil + }, + ) + if err != nil { return fmt.Errorf("empty log file %s for %s indicates misconfiguration: %w", logPath, cmdName, err) } @@ -469,25 +482,32 @@ func checkReadiness(ctx context.Context, url string) (bool, string, error) { // waitForReadiness waits until the given readiness URL returns 200 func waitForReadiness(ctx context.Context, log logging.Logger, cmdName string, readinessURL string) error { - log.Info("waiting for "+cmdName+" readiness", + log.Info("waiting for collector readiness", + zap.String("cmd", cmdName), zap.String("url", readinessURL), ) - if err := pollUntilContextCancel(ctx, func(_ context.Context) (bool, error) { - ready, body, err := checkReadiness(ctx, readinessURL) - if err == nil { - return ready, nil - } - log.Warn("failed to check readiness", - zap.String("cmd", cmdName), - zap.String("url", readinessURL), - zap.String("body", body), - zap.Error(err), - ) - return false, nil - }); err != nil { + err := pollUntilContextCancel( + ctx, + func(_ context.Context) (bool, error) { + ready, body, err := checkReadiness(ctx, readinessURL) + if err == nil { + return ready, nil + } + log.Warn("failed to check readiness", + zap.String("cmd", cmdName), + zap.String("url", readinessURL), + zap.String("body", body), + zap.Error(err), + ) + return false, nil + }, + ) + if err != nil { return err } - log.Info(cmdName + " ready") + log.Info("collector ready", + zap.String("cmd", cmdName), + ) return nil }