Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrates upgrades tests into Cloud Build #4037

Merged
merged 12 commits into from
Dec 6, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -28,6 +28,9 @@ bin
/site/public
/test

# Allow upgrade test directory
!/test/upgrade

# Created by .ignore support plugin (hsz.mobi)
### Go template
# Binaries for programs and plugins
169 changes: 163 additions & 6 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -233,9 +233,18 @@ steps:
# End to end tests
#

# wait for us to be the oldest ongoing build before we run e2es
- name: gcr.io/cloud-builders/gcloud
id: e2e-wait-to-become-leader
# Build and Push upgrade test
- name: make-docker
id: push-upgrade-test
dir: test/upgrade
env: ['REGISTRY=${_REGISTRY}']
args: [push]
waitFor:
- push-images

# Wait for us to be the oldest ongoing build before we run upgrade and e2e tests
- name: gcr.io/google.com/cloudsdktool/cloud-sdk
id: wait-to-become-leader
waitFor: [push-images]
script: |
#!/usr/bin/env bash
@@ -258,10 +267,157 @@ steps:
- BUILD_ID=$BUILD_ID
- TRIGGER_NAME=$TRIGGER_NAME

# Run the upgrade tests parallel, fail this step if any of the tests fail
- name: gcr.io/google.com/cloudsdktool/cloud-sdk
id: submit-upgrade-test-cloud-build
dir: test/upgrade
entrypoint: bash
args:
- -c
- |
#!/usr/bin/env bash
set -e
set -o pipefail
export KUBECONFIG="/root/.kube/config"
mkdir -p /go/src/agones.dev/ /root/.kube/
ln -s /workspace /go/src/agones.dev/agones
cd /go/src/agones.dev/agones/test/upgrade

pids=()
typeset -A waitPids # Associative array for mapping `kubectl wait job` pid -> `kubectl wait job` output log name
tmpdir=$(mktemp -d)
trap 'rm -rf -- "$tmpdir"' EXIT SIGTERM

# Update image tags to include the current build version.
DevVersion="${_BASE_VERSION}-dev-$(git rev-parse --short=7 HEAD)"
export DevVersion
sed "s/\${DevVersion}/${DevVersion}/" upgradeTest.yaml > "${tmpdir}"/upgradeTest.yaml
sed "s/\${DevVersion}/${DevVersion}/" versionMap.yaml > "${tmpdir}"/versionMap.yaml

# Kill all currently running child processes on exit or if a non-zero signal is seen
trap 'echo Cleaning up any remaining running pids: $(jobs -p) ; kill $(jobs -p) 2> /dev/null || :' EXIT SIGTERM

cloudProducts=("generic" "gke-autopilot")
declare -A versionsAndRegions=( [1.31]=us-east1 [1.30]=us-central1 [1.29]=us-west1 )

for cloudProduct in "${cloudProducts[@]}"
do
for version in "${!versionsAndRegions[@]}"
do
region=${versionsAndRegions[$version]}
if [ "$cloudProduct" = generic ]
then
testCluster="standard-upgrade-test-cluster-${version//./-}"
else
testCluster="gke-autopilot-upgrade-test-cluster-${version//./-}"
fi
testClusterLocation="${region}"

gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID"

if [ "$cloudProduct" = gke-autopilot ] ; then
# For autopilot clusters use evictable "balloon" pods to keep a buffer in node pool autoscaling.
kubectl apply -f evictablePods.yaml
fi

# Clean up any existing job / namespace / apiservice from previous run
echo Checking if resources from a previous build of upgrade-test-runner exist and need to be cleaned up on cluster "${testCluster}".
if kubectl get jobs | grep upgrade-test-runner ; then
echo Deleting job from previous run of upgrade-test-runner on cluster "${testCluster}".
kubectl delete job upgrade-test-runner
kubectl wait --for=delete pod -l job-name=upgrade-test-runner --timeout=5m
fi

# Check if there are any dangling game servers.
if kubectl get gs | grep ".*"; then
# Remove any finalizers so that dangling game servers can be manually deleted.
kubectl get gs -o=custom-columns=:.metadata.name --no-headers | xargs kubectl patch gs -p '{"metadata":{"finalizers":[]}}' --type=merge
sleep 5
echo Deleting game servers from previous run of upgrade-test-runner on cluster "${testCluster}".
kubectl delete gs -l app=sdk-client-test
fi

if kubectl get po -l app=sdk-client-test | grep ".*"; then
echo Deleting pods from previous run of upgrade-test-runner on cluster "${testCluster}".
kubectl delete po -l app=sdk-client-test
kubectl wait --for=delete pod -l app=sdk-client-test --timeout=5m
fi

# The v1.allocation.agones.dev apiservice does not get removed automatically and will prevent the namespace from terminating.
if kubectl get apiservice | grep v1.allocation.agones.dev ; then
echo Deleting v1.allocation.agones.dev from previous run of upgrade-test-runner on cluster "${testCluster}".
kubectl delete apiservice v1.allocation.agones.dev
fi

if kubectl get namespace | grep agones-system ; then
echo Deleting agones-system namespace from previous run of upgrade-test-runner on cluster "${testCluster}".
kubectl delete namespace agones-system
kubectl wait --for=delete ns agones-system --timeout=5m
fi

if kubectl get crds | grep agones ; then
echo Deleting crds from previous run of upgrade-test-runner on cluster "${testCluster}".
kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd
fi

echo kubectl apply -f permissions.yaml on cluster "${testCluster}"
kubectl apply -f permissions.yaml
echo kubectl apply -f versionMap.yaml on cluster "${testCluster}"
kubectl apply -f "${tmpdir}"/versionMap.yaml
echo kubectl apply -f gameserverTemplate.yaml on cluster "${testCluster}"
kubectl apply -f gameserverTemplate.yaml

echo kubectl apply -f upgradeTest.yaml on cluster "${testCluster}"
kubectl apply -f "${tmpdir}"/upgradeTest.yaml

# We need to wait for job pod to be created and ready before we can wait on the job itself.
# TODO: Once all test clusters are at Kubernetes Version >= 1.31 use `kubectl wait --for=create` instead of sleep.
# kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m
sleep 10s
kubectl wait --for=condition=ready pod -l job-name=upgrade-test-runner --timeout=5m

echo Wait for job upgrade-test-runner to complete or fail on cluster "${testCluster}"
kubectl wait job/upgrade-test-runner --timeout=20m --for jsonpath='{.status.conditions[*].status}'=True -o jsonpath='{.status.conditions[*].type}' | tee "${tmpdir}"/"${testCluster}".log &
waitPid=$!
pids+=( "$waitPid" )
waitPids[$waitPid]="${tmpdir}"/"${testCluster}".log
done
done

for pid in "${pids[@]}"; do
# This block executes when the process exits and pid status==0
if wait $pid; then
outputLog="${waitPids[$pid]}"
# wait for output to finish writing to file
until [ -s "$outputLog" ]; do sleep 1; done
output=$(<"${outputLog}")
echo "${outputLog}": "${output}"

# "Complete" is successful job run.
# Version 1.31 has "SuccessCriteriaMet" as the first completion status returned, or "FailureTarget" in case of failure.
if [ "$output" == "Complete" ] || [ "$output" == "SuccessCriteriaMet" ] ; then
continue
else
exit 1
fi
# This block executes when the process exits and pid status!=0
else
status=$?
outputLog="${waitPids[$pid]}"
echo "One of the upgrade tests pid $pid from cluster log $outputLog exited with a non-zero status ${status}."
exit $status
fi
done
echo "End of Upgrade Tests"

waitFor:
- wait-to-become-leader
- push-upgrade-test

# cancel all the orphan e2e test cloud builds, fail to cancel any of the build will fail this whole build
- name: gcr.io/cloud-builders/gcloud
id: cancel-orphan-e2e-tests
waitFor: [e2e-wait-to-become-leader]
waitFor: [wait-to-become-leader]
script: |
#!/usr/bin/env bash
until gcloud builds list --ongoing --filter "tags:'e2e-test'" --format="value(id)" | xargs --no-run-if-empty gcloud builds cancel
@@ -386,7 +542,7 @@ steps:
#
- name: gcr.io/cloud-builders/gcloud
id: cleanup-services
waitFor: [e2e-wait-to-become-leader]
waitFor: [wait-to-become-leader]
allowFailure: true
entrypoint: bash
args:
@@ -400,14 +556,15 @@ steps:
done

substitutions:
_BASE_VERSION: 1.46.0
_CACHE_BUCKET: agones-build-cache
_HTMLTEST_CACHE_KEY: htmltest-0.10.1
_CPP_SDK_BUILD_CACHE_KEY: cpp-sdk-build
_CPP_SDK_CONFORMANCE_CACHE_KEY: cpp-sdk-conformance
_RUST_SDK_BUILD_CACHE_KEY: rust-sdk-build
_REGISTRY: us-docker.pkg.dev/${PROJECT_ID}/ci
tags: [ci, 'commit-${COMMIT_SHA}']
timeout: 18000s # 5h: 3h (e2e-wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else)
timeout: 18000s # 5h: 3h (wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else)
queueTtl: 259200s # 72h
images:
- ${_REGISTRY}/agones-controller
2 changes: 1 addition & 1 deletion test/sdk/go/Makefile
Original file line number Diff line number Diff line change
@@ -29,7 +29,7 @@ project_path := $(dir $(mkfile_path))
root_path = $(realpath $(project_path)/)
# Because go mod init in the Dockerfile installs the most recently released version of Agones, this
# will need to be built and pushed post-release. During DEV it will be built at DEV - 1.
release_version = 1.44.0
release_version = 1.45.0
server_tag := $(REGISTRY)/sdk-client-test:$(release_version)

# _____ _
34 changes: 14 additions & 20 deletions test/upgrade/Dockerfile
Original file line number Diff line number Diff line change
@@ -12,50 +12,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM gcr.io/cloud-builders/gcloud AS builder
FROM golang:1.22.9-alpine AS builder

RUN apt-get update && \
apt-get install -y curl && \
apt-get clean
# install curl
RUN apk update && \
apk upgrade && \
apk --no-cache add curl

WORKDIR /usr/local

# install kubectl
ENV KUBECTL_VER=1.29.7
ENV KUBECTL_VER=1.30.4
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VER}/bin/linux/amd64/kubectl && \
chmod go+rx ./kubectl && \
mv ./kubectl /usr/local/bin/kubectl

# install Helm package manager
ENV HELM_VER=3.14.3
ENV HELM_VER=3.16.3
ENV HELM_URL=https://get.helm.sh/helm-v${HELM_VER}-linux-amd64.tar.gz
RUN curl -L ${HELM_URL} > /tmp/helm.tar.gz \
&& tar -zxvf /tmp/helm.tar.gz -C /tmp \
&& mv /tmp/linux-amd64/helm /usr/local/bin/helm \
&& chmod go+rx /usr/local/bin/helm \
&& rm /tmp/helm.tar.gz && rm -rf /tmp/linux-amd64

# Build the Go image from source
FROM golang:1.22.6 AS build-stage

# Copy and build the Go application
WORKDIR /agones.dev

COPY *.go ./

COPY test/upgrade/main.go ./
RUN go mod init agones.dev/agones/test/upgrade/testContainer
RUN go mod tidy
RUN go mod download

RUN CGO_ENABLED=0 GOOS=linux go build -o /upgrade-test

# Copy the above binary into a lean image
FROM gcr.io/distroless/static-debian12:nonroot AS build-release-stage

# Copy the dev build Agones Helm chart
WORKDIR /

COPY --from=build-stage /upgrade-test /upgrade-test
COPY --from=builder /usr/local /usr/local

USER nonroot:nonroot
# Use a non-root user for security best practices
RUN adduser -D -g '' adduser
USER adduser
COPY --chown=adduser install/helm/agones /install/helm

ENTRYPOINT ["/upgrade-test"]
16 changes: 9 additions & 7 deletions test/upgrade/Makefile
Original file line number Diff line number Diff line change
@@ -24,22 +24,24 @@
#

REGISTRY ?=
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
project_path := $(dir $(mkfile_path))
root_path = $(realpath $(project_path)/)
dev_version = 1.44.0-dev
server_tag := $(REGISTRY)/upgrade-test-controller:$(dev_version)

base_version = 1.46.0
# Version defaults to the short hash of the latest commit
VERSION ?= $(base_version)-dev-$(shell git rev-parse --short=7 HEAD)
server_tag := $(REGISTRY)/upgrade-test-controller:$(VERSION)
cwd:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
# _____ _
# |_ _|_ _ _ __ __ _ ___| |_ ___
# | |/ _` | '__/ _` |/ _ \ __/ __|
# | | (_| | | | (_| | __/ |_\__ \
# |_|\__,_|_| \__, |\___|\__|___/
# |___/

# Using .ONESHELL allows us to `cd` to the parent directory agones. This gives the Dockerfile the
# context of the agones directory, which allows it to COPY files from any child directory.
.ONESHELL:
# Build a docker image for the server, and tag it
build:
cd $(root_path) && docker build -f $(project_path)Dockerfile --tag=$(server_tag) .
cd "$(cwd)/../.." && DOCKER_BUILDKIT=1 docker build -f $(cwd)/Dockerfile --tag=$(server_tag) .

push: build
docker push $(server_tag)
67 changes: 67 additions & 0 deletions test/upgrade/evictablePods.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright 2024 Google LLC All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Create evictable pods to prevent Autopilot clusters from completely scaling down.
# https://cloud.google.com/kubernetes-engine/docs/how-to/capacity-provisioning
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: low-priority
value: -10
preemptionPolicy: Never
globalDefault: false
description: "Low priority workloads"
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: evictable-pods-deployment
spec:
replicas: 200
selector:
matchLabels:
app: evictable-pods
template:
metadata:
labels:
app: evictable-pods
# Label for use with packed game server pod affinity rules
agones.dev/role: gameserver
spec:
priorityClassName: low-priority
terminationGracePeriodSeconds: 0
containers:
- name: ubuntu
image: ubuntu
imagePullPolicy: IfNotPresent
command: ["sleep"]
args: ["infinity"]
resources:
requests:
memory: 52Mi
cpu: 30m
limits:
memory: 52Mi
cpu: 30m
# Use same affinity as packed game server pods
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm:
labelSelector:
matchLabels:
agones.dev/role: gameserver
topologyKey: kubernetes.io/hostname
weight: 100
11 changes: 9 additions & 2 deletions test/upgrade/gameserverTemplate.yaml
Original file line number Diff line number Diff line change
@@ -51,16 +51,23 @@ data:
metadata:
labels:
agonesVersion: {{ .AgonesVersion }}
app: sdk-client-test
spec:
containers:
- name: sdk-client-test
image: "{{ .Registry }}:{{ .AgonesVersion }}"
imagePullPolicy: Always
env:
- name: SHUTDOWN_DELAY_SECONDS
value: "10"
- name: GRACEFUL_TERMINATION_DELAY_SECONDS
value: "10"
resources:
requests:
memory: 64Mi
memory: 52Mi
cpu: 20m
limits:
memory: 64Mi
memory: 52Mi
cpu: 20m
serviceAccountName: agones-sa
restartPolicy: Never
51 changes: 0 additions & 51 deletions test/upgrade/go.mod

This file was deleted.

208 changes: 163 additions & 45 deletions test/upgrade/main.go
Original file line number Diff line number Diff line change
@@ -28,8 +28,12 @@ import (
"strings"
"time"

agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
"agones.dev/agones/pkg/client/clientset/versioned"
"agones.dev/agones/pkg/client/informers/externalversions"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
@@ -48,17 +52,24 @@ const (
SidecarPullPolicy = "true"
// LogLevel sets the Agones Helm configuration log level
LogLevel = "debug"
// Timeout sets the amount of time to wait for resources to become ready. Should be more than the
// time for an Autopilot cluster to scale up.
Timeout = 10 * time.Minute
// HelmChart is the helm chart for the public Agones releases
HelmChart = "agones/agones"
// TestChart is the registry for Agones Helm chart development builds
TestChart = "./install/helm"
// AgonesRegistry is the public registry for Agones releases
AgonesRegistry = "us-docker.pkg.dev/agones-images/release"
// TestRegistry is the public registry for upgrade test container files
TestRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test"
// TestRegistry is the registry for Agones development builds
TestRegistry = "us-docker.pkg.dev/agones-images/ci"
// ContainerRegistry is the registry for upgrade test container files
ContainerRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test"
)

var (
// Dev is the current development version of Agones
Dev = os.Getenv("Dev")
// DevVersion is the current development version of Agones
DevVersion = os.Getenv("DevVersion")
// ReleaseVersion is the latest released version of Agones (DEV - 1).
ReleaseVersion = os.Getenv("ReleaseVersion")
// PodName the name of the pod this container is running in
@@ -81,8 +92,14 @@ func main() {
log.Fatal("Could not create the kubernetes api clientset", err)
}

agonesClient, err := versioned.NewForConfig(cfg)
if err != nil {
log.Fatal("Could not create the agones api clientset")
}

validConfigs := configTestSetup(ctx, kubeClient)
go watchGameServerPods(kubeClient, make(chan struct{}), make(map[string]podLog), len(validConfigs)*2)
go watchGameServers(agonesClient, len(validConfigs)*2)
go watchGameServerEvents(kubeClient)
addAgonesRepo()
runConfigWalker(ctx, validConfigs)
cleanUpResources()
@@ -111,9 +128,10 @@ type gameServerTemplate struct {
CountsAndLists bool
}

type podLog struct {
type gsLog struct {
SdkVersion string
GameServerVersion string
GameServerState string
}

type helmStatuses []struct {
@@ -136,7 +154,7 @@ func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*c
// Get the mappings of valid Kubernetes, Agones, and Feature Gate versions from the configmap.
err := json.Unmarshal([]byte(VersionMappings), &versionMap)
if err != nil {
log.Fatal("Could not Unmarshal", err)
log.Fatal("Could not Unmarshal ", err)
}

// Find valid Agones versions and feature gates for the current version of Kubernetes.
@@ -148,7 +166,7 @@ func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*c
countsAndLists := containsCountsAndLists(agonesVersion)
ct.agonesVersion = agonesVersion
if agonesVersion == "Dev" {
ct.agonesVersion = Dev
ct.agonesVersion = DevVersion
// Game server container cannot be created at DEV version due to go.mod only able to access
// published Agones versions. Use N-1 for DEV.
ct.gameServerPath = createGameServerFile(ReleaseVersion, countsAndLists)
@@ -285,19 +303,19 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) {
for _, config := range validConfigs {
registry := AgonesRegistry
chart := HelmChart
if config.agonesVersion == Dev {
// TODO: Update to templated value for registry and chart for Dev build
continue
if config.agonesVersion == DevVersion {
registry = TestRegistry
chart = TestChart
}
err := installAgonesRelease(config.agonesVersion, registry, config.featureGates, ImagePullPolicy,
SidecarPullPolicy, LogLevel, chart)
if err != nil {
log.Printf("installAgonesRelease err: %s", err)
log.Fatalf("installAgonesRelease err: %s", err)
}

// Wait for the helm release to install. Waits the same amount of time as the Helm timeout.
var helmStatus string
err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 10*time.Minute, true, func(ctx context.Context) (done bool, err error) {
err = wait.PollUntilContextTimeout(ctx, 10*time.Second, Timeout, true, func(_ context.Context) (done bool, err error) {
helmStatus = checkHelmStatus(config.agonesVersion)
if helmStatus == "deployed" {
return true, nil
@@ -309,7 +327,11 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) {
config.agonesVersion, helmStatus)
}

go createGameServers(cancelCtx, config.gameServerPath)
gsReady := make(chan bool)
go createGameServers(cancelCtx, config.gameServerPath, gsReady)
// Wait for the first game server pod created to become ready
<-gsReady
close(gsReady)
// Allow some soak time at the Agones version before next upgrade
time.Sleep(1 * time.Minute)
}
@@ -332,6 +354,12 @@ func checkHelmStatus(agonesVersion string) string {
log.Fatal("Could not Unmarshal", err)
}

// Remove the commit sha from the DevVersion i.e. from 1.46.0-dev-7168dd3 to 1.46.0-dev
if agonesVersion == DevVersion {
r := regexp.MustCompile(`1\.\d+\.\d+-dev`)
agonesVersion = r.FindString(DevVersion)
}

for _, status := range helmStatus {
if status.AppVersion == agonesVersion {
return status.Status
@@ -342,8 +370,9 @@ func checkHelmStatus(agonesVersion string) string {

// Creates a gameserver yaml file from the mounted gameserver.yaml template. The name of the new
// gameserver yaml is based on the Agones version, i.e. gs1440.yaml for Agones version 1.44.0
// Note: This does not validate the created file.
func createGameServerFile(agonesVersion string, countsAndLists bool) string {
gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists}
gsTmpl := gameServerTemplate{Registry: ContainerRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists}

gsTemplate, err := template.ParseFiles("gameserver.yaml")
if err != nil {
@@ -377,12 +406,16 @@ func createGameServerFile(agonesVersion string, countsAndLists bool) string {
}

// Create a game server every five seconds until the context is cancelled. The game server container
// be the same binary version as the game server file. The SDK version is always the same as the
// is the same binary version as the game server file. The SDK version is always the same as the
// version of the Agones controller that created it. The Game Server shuts itself down after the
// tests have run as part of the `sdk-client-test` logic.
func createGameServers(ctx context.Context, gsPath string) {
func createGameServers(ctx context.Context, gsPath string, gsReady chan bool) {
args := []string{"create", "-f", gsPath}
checkFirstGameServerReady(ctx, gsReady, args...)

ticker := time.NewTicker(5 * time.Second)
retries := 8
retry := 0

for {
select {
@@ -391,39 +424,82 @@ func createGameServers(ctx context.Context, gsPath string) {
return
case <-ticker.C:
_, err := runExecCommand(KubectlCmd, args...)
// TODO: Do not ignore error if unable to create due to something other than cluster scale up
// Ignore failures for ~45s at a time to account for the brief (~30s) during which the
// controller service is unavailable during upgrade.
if err != nil {
log.Printf("Could not create Gameserver %s: %s", gsPath, err)
if retry > retries {
log.Fatalf("Could not create Gameserver %s: %s. Too many successive errors.", gsPath, err)
}
log.Printf("Could not create Gameserver %s: %s. Retries left: %d.", gsPath, err, retries-retry)
retry++
} else {
retry = 0
}
}
}
}

// watchGameServerPods watches all game server pods for CrashLoopBackOff. Errors if the number of
// CrashLoopBackOff backoff pods exceeds the number of acceptedFailures.
func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{}, failedPods map[string]podLog, acceptedFailures int) {
// Filter by label agones.dev/role=gameserver to only game server pods
labelOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) {
opts.LabelSelector = "agones.dev/role=gameserver"
// checkFirstGameServerReady waits for the Game Server Pod to be running. This may take several
// minutes in Autopilot.
func checkFirstGameServerReady(ctx context.Context, gsReady chan bool, args ...string) {
// Sample output: gameserver.agones.dev/sdk-client-test-5zjdn created
output, err := runExecCommand(KubectlCmd, args...)
if err != nil {
log.Fatalf("Could not create Gameserver: %s", err)
}
r := regexp.MustCompile(`sdk-client-test-\S+`)
gsName := r.FindString(string(output))
// Game Server has too many states, so using the pod instead as there are only two healthy states.
// Includes the gs name to make output logs easier to read.
getPodStatus := []string{"get", "pod", gsName, "-o=custom-columns=:.status.phase,:.metadata.name", "--no-headers"}

// Pod is created after Game Server, wait briefly before erroring out on unable to get pod.
retries := 0
err = wait.PollUntilContextTimeout(ctx, 2*time.Second, Timeout, true, func(_ context.Context) (done bool, err error) {
out, err := runExecCommand(KubectlCmd, getPodStatus...)
if err != nil && retries > 2 {
log.Fatalf("Could not get Gameserver %s state: %s", gsName, err)
}
if err != nil {
retries++
return false, nil
}
// Sample output: Running sdk-client-test-bbvx9
podStatus := strings.Split(string(out), " ")
if podStatus[0] == "Running" || podStatus[0] == "Succeeded" {
gsReady <- true
return true, nil
}
return false, nil
})
kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second,
informers.WithNamespace("default"), labelOptions)
podInformer := kubeInformerFactory.Core().V1().Pods().Informer()
if err != nil {
log.Fatalf("PollUntilContextTimeout timed out while wait for first gameserver %s to be Ready", gsName)
}
}

_, err := podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
// watchGameServers watches all game servers for errors. Errors if the number of failed game servers
// exceeds the number of acceptedFailures.
func watchGameServers(agonesClient *versioned.Clientset, acceptedFailures int) {
stopCh := make(chan struct{})
failedGs := make(map[string]gsLog)

agonesInformerFactory := externalversions.NewSharedInformerFactory(agonesClient, 5*time.Second)
gsInformer := agonesInformerFactory.Agones().V1().GameServers().Informer()

_, err := gsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
UpdateFunc: func(_, newObj interface{}) {
newPod := newObj.(*v1.Pod)
for _, cs := range newPod.Status.ContainerStatuses {
if cs.Name != "sdk-client-test" || cs.State.Waiting == nil || cs.State.Waiting.Reason != "CrashLoopBackOff" {
continue
}
gsVersion := newPod.Labels["agonesVersion"]
sdkVersion := newPod.Annotations["agones.dev/sdk-version"]
log.Printf("%s for pod: %s with game server binary version %s, and SDK version %s", cs.State.Waiting.Reason, newPod.Name, gsVersion, sdkVersion)
// Put failed pods into the map until it reaches capacity.
failedPods[newPod.Name] = podLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion}
if len(failedPods) > acceptedFailures {
log.Fatalf("Too many Game Server pods in CrashLoopBackOff: %v", failedPods)
newGs := newObj.(*agonesv1.GameServer)
if newGs.Status.State == "Error" || newGs.Status.State == "Unhealthy" {
gsVersion := newGs.Labels["agonesVersion"]
sdkVersion := newGs.Annotations["agones.dev/sdk-version"]
log.Printf("Game server %s with binary version %s, and SDK version %s in %s state\n",
newGs.Name, gsVersion, sdkVersion, newGs.Status.State)

// Put failed game servers into the map until it reaches capacity.
failedGs[newGs.Name] = gsLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion,
GameServerState: string(newGs.Status.State)}
if len(failedGs) > acceptedFailures {
log.Fatalf("Too many Game Servers in Error or Unhealthy states: %v", failedGs)
}
}
},
@@ -432,9 +508,51 @@ func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{},
log.Fatal("Not able to create AddEventHandler", err)
}

go podInformer.Run(stopCh)
if !cache.WaitForCacheSync(stopCh, podInformer.HasSynced) {
log.Fatal("Timed out waiting for caches to sync")
go gsInformer.Run(stopCh)
if !cache.WaitForCacheSync(stopCh, gsInformer.HasSynced) {
log.Fatal("Timed out waiting for game server informer cache to sync")
}
}

// watchGameServerEvents watches all events on `sdk-client-test` containers for BackOff errors. The
// purpose is to catch ImagePullBackOff errors.
func watchGameServerEvents(kubeClient *kubernetes.Clientset) {
stopCh := make(chan struct{})

// Filter by Game Server `sdk-client-test` containers
containerName := "sdk-client-test"
containerPath := "spec.containers{sdk-client-test}"
fieldSelector := fields.OneTermEqualSelector("involvedObject.fieldPath", containerPath).String()
// First delete previous `sdk-client-test` events, otherwise there will be events from previous runs.
_, err := runExecCommand(KubectlCmd, []string{"delete", "events", "--field-selector", fieldSelector}...)
if err != nil {
log.Fatal("Could not delete `sdk-client-test` events", err)
}

eventOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) {
opts.FieldSelector = fieldSelector
})
kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second,
informers.WithNamespace("default"), eventOptions)
eventInformer := kubeInformerFactory.Core().V1().Events().Informer()

_, err = eventInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
newEvent := obj.(*v1.Event)
gsPodName := newEvent.InvolvedObject.Name
if newEvent.Reason == "Failed" {
log.Fatalf("%s on %s %s has failed. Latest event: message %s", containerName, newEvent.Kind,
gsPodName, newEvent.Message)
}
},
})
if err != nil {
log.Fatal("Not able to create AddEventHandler", err)
}

go eventInformer.Run(stopCh)
if !cache.WaitForCacheSync(stopCh, eventInformer.HasSynced) {
log.Fatal("Timed out waiting for eventInformer cache to sync")
}
}

@@ -455,7 +573,7 @@ func cleanUpResources() {
// Apiservice v1.allocation.agones.dev, which is part of Service agones-system/agones-controller-service,
// does not always get cleaned up on Helm uninstall, and needs to be deleted (if it exists) before
// the agones-system namespace can be removed.
// Ignore the error, because an "error" means Helm already uninstall the apiservice.
// Ignore the error, because an "error" means Helm already uninstalled the apiservice.
args = []string{"delete", "apiservice", "v1.allocation.agones.dev"}
out, err := runExecCommand(KubectlCmd, args...)
if err == nil {
64 changes: 51 additions & 13 deletions test/upgrade/permissions.yaml
Original file line number Diff line number Diff line change
@@ -24,18 +24,18 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
namespace: default
name: pod-reader
name: pod-manager
rules:
- apiGroups: [""] # "" indicates the core API group
resources: ["pods"]
verbs: ["get", "watch", "list"]
resources: ["pods", "events"]
verbs: ["get", "delete", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
# This role binding allows default service account to read all pods in the "default" namespace.
# You need to already have a Role named "pod-reader" in that namespace.
kind: RoleBinding
metadata:
name: read-pods
name: manage-pods
namespace: default
subjects:
- kind: ServiceAccount
@@ -44,7 +44,7 @@ subjects:
roleRef:
# "roleRef" specifies the binding to a Role / ClusterRole
kind: Role # this must be Role or ClusterRole
name: pod-reader # this must match the name of the Role or ClusterRole you wish to bind to
name: pod-manager # this must match the name of the Role or ClusterRole you wish to bind to
apiGroup: rbac.authorization.k8s.io
---
kind: ClusterRole
@@ -202,23 +202,23 @@ roleRef:
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: apiservices-creator
name: apiservices-manager
rules:
- apiGroups: ["apiregistration.k8s.io"]
resources: ["apiservices"]
verbs: ["get", "watch", "list", "create", "patch"]
verbs: ["create", "delete", "get", "list", "patch", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: create-apiservices
name: manage-apiservices
subjects:
- kind: ServiceAccount
name: agones-sa
namespace: default
roleRef:
kind: ClusterRole
name: apiservices-creator
name: apiservices-manager
apiGroup: rbac.authorization.k8s.io
---
# Agones needs to be able to create Agones CustomResourceDefinitions
@@ -249,23 +249,23 @@ roleRef:
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: clusterrole-creator
name: clusterrole-manager
rules:
- apiGroups: ["rbac.authorization.k8s.io"]
resources: ["clusterroles", "clusterrolebindings", "rolebindings"]
verbs: ["get", "watch", "list", "create", "patch"]
verbs: ["create", "delete", "get", "list", "patch", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: create-clusterroles
name: manager-clusterroles
subjects:
- kind: ServiceAccount
name: agones-sa
namespace: default
roleRef:
kind: ClusterRole
name: clusterrole-creator
name: clusterrole-manager
apiGroup: rbac.authorization.k8s.io
---
# Agones needs to be able to create deployments
@@ -498,3 +498,41 @@ roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: sdk
---
# Source: agones/templates/hooks/sa.yaml
# Permissions to grant to helm on helm uninstall.
# NOTE: ClusterRoles are cluster-scoped, so no metadata.namespace is set here
# (a namespace on a ClusterRole is invalid and would be stripped or rejected).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: helm-cleanup
  labels:
    app: agones
rules:
  # Allow removal of all Agones custom resources during uninstall.
  - apiGroups: ["agones.dev", "multicluster.agones.dev", "autoscaling.agones.dev"]
    resources: ["fleets", "fleetautoscalers", "gameservers", "gameserversets", "gameserverallocationpolicies"]
    verbs: ["delete", "get", "list"]
  # Read-only pod access so the cleanup hook can wait for pods to terminate.
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list"]
  # Helm hooks create/delete a configmap to track cleanup state.
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["create", "delete", "get", "list"]
  # The pre-delete hook runs as a batch Job.
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["create", "delete", "get", "list"]
---
# Bind the cleanup permissions to the agones-sa service account
# (which lives in the "default" namespace in this test setup).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: helm-cleanup-access
  labels:
    app: agones
subjects:
  - kind: ServiceAccount
    name: agones-sa
    namespace: default
roleRef:
  kind: ClusterRole
  name: helm-cleanup
  apiGroup: rbac.authorization.k8s.io
3 changes: 1 addition & 2 deletions test/upgrade/upgradeTest.yaml
Original file line number Diff line number Diff line change
@@ -26,8 +26,7 @@ spec:
spec:
containers:
- name: upgrade-test-controller
# TODO: Update image name to use a templated value for current Dev version
image: us-docker.pkg.dev/agones-images/ci/upgrade-test-controller:1.44.0-dev
image: us-docker.pkg.dev/agones-images/ci/upgrade-test-controller:${DevVersion}
imagePullPolicy: Always
env:
- name: PodName
83 changes: 21 additions & 62 deletions test/upgrade/versionMap.yaml
Original file line number Diff line number Diff line change
@@ -18,82 +18,33 @@ kind: ConfigMap
metadata:
name: version-map
data:
Dev: "1.44.0-dev"
ReleaseVersion: "1.43.0"
DevVersion: ${DevVersion}
ReleaseVersion: "1.45.0"
version-mappings.json: |
{
"k8sToAgonesVersions": {
"1.25": [
"1.34.0",
"1.35.0"
],
"1.26": [
"1.34.0",
"1.35.0",
"1.36.0",
"1.37.0",
"1.38.0",
"1.39.0"
],
"1.27": [
"1.34.0",
"1.35.0",
"1.36.0",
"1.37.0",
"1.38.0",
"1.39.0",
"1.40.0",
"1.41.0",
"1.42.0"
],
"1.28": [
"1.36.0",
"1.37.0",
"1.38.0",
"1.39.0",
"1.40.0",
"1.41.0",
"1.42.0",
"1.43.0",
"Dev"
],
"1.29": [
"1.40.0",
"1.41.0",
"1.42.0",
"1.43.0",
"1.44.0",
"1.45.0",
"Dev"
],
"1.30": [
"1.43.0",
"1.44.0",
"1.45.0",
"Dev"
],
"1.31": [
"1.44.0",
"1.45.0",
"Dev"
]
},
"agonesVersionFeatureGates": {
"1.34.0": {
"alphaGates": ["PlayerAllocationFilter", "PlayerTracking"],
"betaGates": []
},
"1.35.0": {
"alphaGates": ["PlayerAllocationFilter", "PlayerTracking"],
"betaGates": []
},
"1.36.0": {
"alphaGates": ["PlayerAllocationFilter", "PlayerTracking"],
"betaGates": []
},
"1.37.0": {
"alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"],
"betaGates": []
},
"1.38.0": {
"alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"],
"betaGates": []
},
"1.39.0": {
"alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"],
"betaGates": []
},
"1.40.0": {
"alphaGates": ["CountsAndLists", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"],
"betaGates": ["DisableResyncOnSDKServer"]
@@ -110,9 +61,17 @@ data:
"alphaGates": ["GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix"],
"betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer"]
},
"1.44.0": {
"alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"],
"betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"]
},
"1.45.0": {
"alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"],
"betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"]
},
"Dev": {
"alphaGates": ["GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"],
"betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer"]
"alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"],
"betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"]
}
}
}