diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 76acf952..fa261da4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -128,6 +128,34 @@ jobs: retention-days: 7 if-no-files-found: ignore + # ---------- Orca Integration Tests ---------- + # Spins up LocalStack and Azurite via testcontainers-go and runs the + # orca in-process integration suite (internal/orca/inttest). Docker + # is preinstalled on GitHub-hosted Ubuntu runners; no extra services: + # block is required. + orca-inttest: + name: Orca Integration Tests + needs: [frontend] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Download frontend dist + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 + with: + name: frontend-dist + path: internal/net/html/dist + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + cache-dependency-path: go.sum + + - name: Run orca-inttest + run: make orca-inttest + # ---------- Build ---------- build: name: Build diff --git a/Makefile b/Makefile index 5be64f18..1c0134c8 100644 --- a/Makefile +++ b/Makefile @@ -74,6 +74,14 @@ STAMP_LDFLAGS=-X github.com/Azure/unbounded/internal/version.Version=$(VERSION) METALMAN_IMAGE=$(CONTAINER_REGISTRY)/metalman:$(VERSION) +# Orca configuration +ORCA_BIN=bin/orca +ORCA_CMD=./cmd/orca +ORCA_IMAGE ?= $(CONTAINER_REGISTRY)/orca:$(VERSION) +ORCA_NAMESPACE ?= unbounded-kube +ORCA_MANIFEST_TEMPLATES_DIR := deploy/orca +ORCA_MANIFEST_RENDERED_DIR := deploy/orca/rendered + # kubectl-unbounded also stamps the metalman image reference. KUBECTL_UNBOUNDED_LDFLAGS=$(STAMP_LDFLAGS) -X github.com/Azure/unbounded/cmd/kubectl-unbounded/app.MetalmanImage=$(METALMAN_IMAGE) @@ -112,6 +120,7 @@ REACT_DEV ?= false .PHONY: all help fmt lint test build vulncheck check-deps kubectl-unbounded kubectl-unbounded-build install-tools install-protoc generate kubectl-unbounded forge unbounded-agent machina machina-build machina-oci machina-oci-push machina-manifests machine-ops-controller machine-ops-controller-build machine-ops-controller-oci machine-ops-controller-oci-push machine-ops-manifests metalman metalman-build metalman-oci metalman-oci-push gomod docs-serve unbounded-net-controller unbounded-net-node unbounded-net-routeplan-debug unping unroute notice notice-check .PHONY: net-frontend net-frontend-clean net-build-ebpf net-manifests release-manifests .PHONY: image-machina-local image-machine-ops-controller-local image-metalman-local image-net-controller-local image-net-node-local images-local +.PHONY: orca orca-build orca-manifests orca-oci orca-oci-push orca-up orca-down orca-reset orca-inttest image-orca-local ##@ General @@ -176,6 +185,8 @@ help: ## Show this help @echo " machina-oci-push Build machina image and push" @echo " machine-ops-controller-oci-push Build machine-ops-controller image and push" @echo " metalman-oci-push Build metalman image and push" + @echo " image-orca-local Build orca image" + @echo " orca-oci-push Build orca image and push" @echo "" @echo "Net Frontend:" @echo " net-frontend Build frontend into \$$(NET_FRONTEND_DIST_DIR) (cached)" @@ -188,10 +199,19 @@ help: ## Show this help @echo " machina-manifests Render machina manifests into deploy/machina/rendered" @echo " machine-ops-manifests Render machine-ops manifests into deploy/machine-ops/rendered" @echo " net-manifests Render net manifests into 
\$$(NET_MANIFEST_RENDERED_DIR)" + @echo " orca-manifests Render orca manifests into deploy/orca/rendered" @echo "" @echo "Net Kubernetes (apply to current kubectl context):" @echo " See \`make -C hack/net help\` for cluster deploy/undeploy targets." @echo "" + @echo "Orca Dev Harness (Kind cluster):" + @echo " orca | orca-build Build orca binary (with/without lint/test)" + @echo " orca-up Bring up Orca dev harness in Kind" + @echo " orca-down Tear down Orca dev harness Kind cluster" + @echo " orca-reset Rebuild image and rollout-restart deployment" + @echo " orca-inttest Run orca integration tests (Docker required)" + @echo " See \`make -C hack/orca help\` for full list." + @echo "" @echo "Documentation:" @echo " docs-serve Start local Hugo dev server" @echo "" @@ -570,6 +590,58 @@ metalman-oci: image-metalman-local ## Alias for image-metalman-local metalman-oci-push: metalman-oci ## Build and push the metalman container image $(CONTAINER_ENGINE) push $(METALMAN_IMAGE) +##@ Orca + +orca-build: ## Build the orca binary (no lint/test) + $(GOBUILD) -ldflags '$(STAMP_LDFLAGS)' -o $(ORCA_BIN) $(ORCA_CMD)/main.go + +orca: test orca-build ## Build the orca binary (implies test) + +orca-manifests: ## Render orca deployment manifests into deploy/orca/rendered + @mkdir -p $(ORCA_MANIFEST_RENDERED_DIR) + @find $(ORCA_MANIFEST_RENDERED_DIR) -mindepth 1 -not -name .gitignore -delete 2>/dev/null || true + $(GOCMD) run ./hack/cmd/render-manifests \ + --templates-dir $(ORCA_MANIFEST_TEMPLATES_DIR) \ + --output-dir $(ORCA_MANIFEST_RENDERED_DIR) \ + --set Namespace=$(ORCA_NAMESPACE) \ + --set Image=$(ORCA_IMAGE) + @echo "Rendered orca manifests into $(ORCA_MANIFEST_RENDERED_DIR) (image: $(ORCA_IMAGE))" + +image-orca-local: ## Build the orca container image locally (single-arch) + $(CONTAINER_ENGINE) build \ + --build-arg VERSION=$(VERSION) \ + --build-arg GIT_COMMIT=$(GIT_COMMIT) \ + --build-arg BUILD_TIME=$(BUILD_TIME) \ + -t orca:$(VERSION) -t $(ORCA_IMAGE) \ + -f ./images/orca/Containerfile . + +orca-oci: image-orca-local ## Alias for image-orca-local + +orca-oci-push: orca-oci ## Build and push the orca container image + $(CONTAINER_ENGINE) push $(ORCA_IMAGE) + +# Dev-cluster proxy targets. The actual implementations live in +# hack/orca/Makefile (see AGENTS.md convention; mirrors hack/net/). +orca-up: ## Bring up the Orca dev harness in a Kind cluster + $(MAKE) -C hack/orca up + +orca-down: ## Tear down the Orca dev harness Kind cluster + $(MAKE) -C hack/orca down + +orca-reset: ## Rebuild orca image and rolling-restart the dev deployment + $(MAKE) -C hack/orca reset + +# orca-inttest mirrors the test/test-race pattern: race detector in CI +# (ubuntu-latest has gcc), no -race locally so developers without a C +# toolchain can still run integration tests. +ifdef CI +orca-inttest: ## Run orca integration tests (LocalStack + Azurite via testcontainers; requires Docker) + $(GOTEST) -tags=integrationtest -race -timeout 15m ./internal/orca/inttest/... +else +orca-inttest: ## Run orca integration tests (LocalStack + Azurite via testcontainers; requires Docker) + $(GOTEST) -tags=integrationtest -timeout 15m ./internal/orca/inttest/... 
+endif + image-net-controller-local: net-frontend resources/cni-plugins-linux-$(HOST_GOARCH)-$(CNI_PLUGINS_VERSION).tgz ## Build the unbounded-net-controller image locally (single-arch) $(CONTAINER_ENGINE) build \ --target controller \ diff --git a/cmd/orca/main.go b/cmd/orca/main.go new file mode 100644 index 00000000..f7ea8484 --- /dev/null +++ b/cmd/orca/main.go @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package main + +import "github.com/Azure/unbounded/cmd/orca/orca" + +func main() { + orca.Run() +} diff --git a/cmd/orca/orca/orca.go b/cmd/orca/orca/orca.go new file mode 100644 index 00000000..a770bdd7 --- /dev/null +++ b/cmd/orca/orca/orca.go @@ -0,0 +1,99 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package orca wires the Orca cache binary together. It is invoked by +// cmd/orca/main.go and is responsible for parsing flags, loading the +// YAML config, and delegating to internal/orca/app for actual runtime +// wiring. +package orca + +import ( + "context" + "fmt" + "log/slog" + "os" + "os/signal" + "syscall" + "time" + + "github.com/spf13/cobra" + + "github.com/Azure/unbounded/internal/orca/app" + "github.com/Azure/unbounded/internal/orca/config" +) + +// Run is the entrypoint invoked by cmd/orca/main.go. +func Run() { + root := &cobra.Command{ + Use: "orca", + Short: "Orca origin cache - S3-compatible read-only cache fronting Azure / S3 origins", + } + root.AddCommand(newServeCmd()) + + if err := root.Execute(); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func newServeCmd() *cobra.Command { + var configPath string + + cmd := &cobra.Command{ + Use: "serve", + Short: "Run the Orca cache server", + RunE: func(cmd *cobra.Command, _ []string) error { + return serve(cmd.Context(), configPath) + }, + } + cmd.Flags().StringVarP(&configPath, "config", "c", "/etc/orca/config.yaml", + "path to YAML config file") + + return cmd +} + +func serve(parent context.Context, configPath string) error { + log := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: slog.LevelInfo, + })) + slog.SetDefault(log) + + log.Info("orca starting", "config_path", configPath) + + cfg, err := config.Load(configPath) + if err != nil { + return fmt.Errorf("load config: %w", err) + } + + log.Info("config loaded", + "origin_id", cfg.Origin.ID, + "replicas_target", cfg.Cluster.TargetReplicas, + "target_global", cfg.Origin.TargetGlobal, + "internal_tls", cfg.Cluster.InternalTLS.Enabled, + "client_auth", cfg.Server.Auth.Enabled, + ) + + ctx, cancel := signal.NotifyContext(parent, os.Interrupt, syscall.SIGTERM) + defer cancel() + + a, err := app.Start(ctx, cfg, app.WithLogger(log)) + if err != nil { + return err + } + + if waitErr := a.Wait(ctx); waitErr != nil { + log.Error("listener exited with error", "err", waitErr) + cancel() + } else { + log.Info("shutdown signal received") + } + + shutdownCtx, shCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer shCancel() + + _ = a.Shutdown(shutdownCtx) //nolint:errcheck // shutdown errors already logged inside App.Shutdown + + log.Info("orca stopped") + + return nil +} diff --git a/deploy/orca/01-namespace.yaml.tmpl b/deploy/orca/01-namespace.yaml.tmpl new file mode 100644 index 00000000..fd353a35 --- /dev/null +++ b/deploy/orca/01-namespace.yaml.tmpl @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca diff 
--git a/deploy/orca/02-rbac.yaml.tmpl b/deploy/orca/02-rbac.yaml.tmpl new file mode 100644 index 00000000..5961196b --- /dev/null +++ b/deploy/orca/02-rbac.yaml.tmpl @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca diff --git a/deploy/orca/03-config.yaml.tmpl b/deploy/orca/03-config.yaml.tmpl new file mode 100644 index 00000000..811e2fb6 --- /dev/null +++ b/deploy/orca/03-config.yaml.tmpl @@ -0,0 +1,71 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: orca-config + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +data: + config.yaml: | + # Orca origin cache configuration. + # Secret values (account keys, S3 access/secret) are sourced from + # environment variables ORCA_AZUREBLOB_ACCOUNT_KEY, + # ORCA_CACHESTORE_S3_ACCESS_KEY, ORCA_CACHESTORE_S3_SECRET_KEY, + # populated by the orca-credentials Secret via envFrom. + + server: + listen: "0.0.0.0:8443" + auth: + # Dev: disabled. Production: enable bearer or mtls. + enabled: {{ default "false" .ServerAuthEnabled }} + + origin: + id: {{ default "azureblob-default" .OriginID | quote }} + driver: {{ default "azureblob" .OriginDriver }} + target_global: {{ default "192" .TargetGlobal }} + queue_timeout: 5s + retry: + attempts: 3 + backoff_initial: 100ms + backoff_max: 2s + max_total_duration: 5s + azureblob: + account: {{ default "" .AzureAccount | quote }} + container: {{ default "" .AzureContainer | quote }} + endpoint: {{ default "" .AzureEndpoint | quote }} + enforce_block_blob_only: true + awss3: + endpoint: {{ default "" .OriginAWSS3Endpoint | quote }} + region: {{ default "us-east-1" .OriginAWSS3Region | quote }} + bucket: {{ default "" .OriginAWSS3Bucket | quote }} + use_path_style: {{ default "false" .OriginAWSS3UsePathStyle }} + + cachestore: + driver: s3 + s3: + endpoint: {{ default "http://localstack.unbounded-kube.svc.cluster.local:4566" .CachestoreEndpoint | quote }} + bucket: {{ default "orca-cache" .CachestoreBucket | quote }} + region: {{ default "us-east-1" .CachestoreRegion | quote }} + use_path_style: true + require_unversioned_bucket: true + + cluster: + service: {{ default "orca-peers.unbounded-kube.svc.cluster.local" .ClusterService | quote }} + membership_refresh: 5s + internal_listen: "0.0.0.0:8444" + target_replicas: {{ default "3" .TargetReplicas }} + internal_tls: + # Dev: disabled (plain HTTP/2 between peers). Production: true. + enabled: {{ default "false" .InternalTLSEnabled }} + + chunk_catalog: + max_entries: 100000 + + metadata: + ttl: 5m + negative_ttl: 60s + max_entries: 10000 + + chunking: + size: 8388608 diff --git a/deploy/orca/04-deployment.yaml.tmpl b/deploy/orca/04-deployment.yaml.tmpl new file mode 100644 index 00000000..44a0eb80 --- /dev/null +++ b/deploy/orca/04-deployment.yaml.tmpl @@ -0,0 +1,76 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + replicas: {{ default "3" .TargetReplicas }} + # Required pod-anti-affinity below pins one Orca pod per node. + # In the dev harness the worker count == replica count, so default + # RollingUpdate can't surge: the new pod has no node to land on. + # maxSurge=0 / maxUnavailable=1 walks the replicas one-at-a-time. 
+ strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: orca + template: + metadata: + labels: + app.kubernetes.io/name: orca + spec: + serviceAccountName: orca + # Required anti-affinity: at most one Orca pod per node so that a + # single node failure does not knock out multiple replicas. The + # dev harness Kind cluster has 3 worker nodes to match the default + # 3 replicas. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: orca + topologyKey: kubernetes.io/hostname + containers: + - name: orca + image: {{ default "ghcr.io/azure/orca:latest" .Image | quote }} + imagePullPolicy: {{ default "IfNotPresent" .ImagePullPolicy }} + args: + - serve + - --config=/etc/orca/config.yaml + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + envFrom: + - secretRef: + name: orca-credentials + ports: + - containerPort: 8443 + name: edge + protocol: TCP + - containerPort: 8444 + name: internal + protocol: TCP + resources: + requests: + cpu: {{ default "200m" .ResourceCPURequest }} + memory: {{ default "256Mi" .ResourceMemoryRequest }} + limits: + cpu: {{ default "2" .ResourceCPULimit }} + memory: {{ default "1Gi" .ResourceMemoryLimit }} + volumeMounts: + - name: config + mountPath: /etc/orca + readOnly: true + volumes: + - name: config + configMap: + name: orca-config diff --git a/deploy/orca/05-service.yaml.tmpl b/deploy/orca/05-service.yaml.tmpl new file mode 100644 index 00000000..36dba4fd --- /dev/null +++ b/deploy/orca/05-service.yaml.tmpl @@ -0,0 +1,43 @@ +--- +# Client-facing Service: standard ClusterIP. Clients of the cache (e.g. +# tools speaking S3 to fetch objects) connect here. Kube-proxy load +# balances across the 3 replicas. +apiVersion: v1 +kind: Service +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: orca + ports: + - name: edge + port: 8443 + targetPort: edge + protocol: TCP + +--- +# Peer-discovery Service: headless (ClusterIP: None). LookupHost on +# orca-peers..svc.cluster.local returns all pod IPs, enabling +# rendezvous-hash coordination among Orca replicas. 
+apiVersion: v1 +kind: Service +metadata: + name: orca-peers + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + type: ClusterIP + clusterIP: None + publishNotReadyAddresses: true + selector: + app.kubernetes.io/name: orca + ports: + - name: internal + port: 8444 + targetPort: internal + protocol: TCP diff --git a/deploy/orca/dev/01-localstack.yaml.tmpl b/deploy/orca/dev/01-localstack.yaml.tmpl new file mode 100644 index 00000000..87dfcc02 --- /dev/null +++ b/deploy/orca/dev/01-localstack.yaml.tmpl @@ -0,0 +1,83 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: localstack + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: localstack + ports: + - name: edge + port: 4566 + targetPort: 4566 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: localstack + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: localstack + template: + metadata: + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev + spec: + containers: + - name: localstack + # 3.8 is community-tier; 'latest' became Pro-only and exits + # with code 55 ("License activation failed"). + image: {{ default "localstack/localstack:3.8" .LocalstackImage | quote }} + imagePullPolicy: IfNotPresent + ports: + - containerPort: 4566 + name: edge + protocol: TCP + env: + - name: SERVICES + value: s3 + - name: DEBUG + value: "0" + - name: PERSISTENCE + value: "0" + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 1 + memory: 1Gi + readinessProbe: + httpGet: + path: /_localstack/health + port: 4566 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + livenessProbe: + httpGet: + path: /_localstack/health + port: 4566 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + volumeMounts: + - name: data + mountPath: /var/lib/localstack + volumes: + - name: data + emptyDir: {} diff --git a/deploy/orca/dev/02-init-job.yaml.tmpl b/deploy/orca/dev/02-init-job.yaml.tmpl new file mode 100644 index 00000000..0eb41832 --- /dev/null +++ b/deploy/orca/dev/02-init-job.yaml.tmpl @@ -0,0 +1,80 @@ +--- +# Init Job: creates the cachestore + origin S3 buckets in LocalStack so +# that Orca can pass the versioningGate boot check and so that reviewers +# have an origin bucket to seed sample objects into. Idempotent: +# CreateBucket returns BucketAlreadyOwnedByYou on rerun, swallowed by +# the script. +# +# Cachestore bucket: versioning left unset (which is what +# require_unversioned_bucket=true expects). +# Origin bucket: no versioning constraint; sample objects live here. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: orca-buckets-init + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev +spec: + backoffLimit: 6 + template: + metadata: + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev + spec: + restartPolicy: OnFailure + containers: + - name: aws-cli + image: {{ default "amazon/aws-cli:latest" .AwsCliImage | quote }} + env: + - name: AWS_ACCESS_KEY_ID + value: test + - name: AWS_SECRET_ACCESS_KEY + value: test + - name: AWS_DEFAULT_REGION + value: us-east-1 + - name: CACHESTORE_BUCKET + value: {{ default "orca-cache" .CachestoreBucket | quote }} + - name: ORIGIN_BUCKET + value: {{ default "orca-origin" .OriginBucket | quote }} + - name: ENDPOINT + value: http://localstack.{{ default "unbounded-kube" .Namespace }}.svc.cluster.local:4566 + command: + - /bin/sh + - -c + - | + set -e + echo "Waiting for LocalStack at $ENDPOINT ..." + for i in $(seq 1 60); do + if aws --endpoint-url "$ENDPOINT" s3api list-buckets >/dev/null 2>&1; then + echo "LocalStack ready." + break + fi + sleep 2 + done + + ensure_bucket() { + bucket="$1" + echo "Ensuring bucket $bucket (idempotent) ..." + if aws --endpoint-url "$ENDPOINT" s3api head-bucket --bucket "$bucket" >/dev/null 2>&1; then + echo "Bucket $bucket already exists." + else + aws --endpoint-url "$ENDPOINT" s3api create-bucket --bucket "$bucket" + echo "Bucket $bucket created." + fi + } + + ensure_bucket "$CACHESTORE_BUCKET" + ensure_bucket "$ORIGIN_BUCKET" + + # Verify cachestore bucket versioning is unset (Orca's + # versioningGate rejects Enabled or Suspended). + status=$(aws --endpoint-url "$ENDPOINT" s3api get-bucket-versioning --bucket "$CACHESTORE_BUCKET" --query Status --output text 2>/dev/null || echo "None") + echo "Cachestore bucket versioning: $status (None means unset, which is required)." + if [ "$status" = "Enabled" ] || [ "$status" = "Suspended" ]; then + echo "ERROR: cachestore bucket versioning is $status; Orca requires unset/None." + exit 1 + fi + echo "Init complete." diff --git a/deploy/orca/dev/03-azurite.yaml.tmpl b/deploy/orca/dev/03-azurite.yaml.tmpl new file mode 100644 index 00000000..4282c248 --- /dev/null +++ b/deploy/orca/dev/03-azurite.yaml.tmpl @@ -0,0 +1,108 @@ +--- +# Azurite is Microsoft's official Azure Storage emulator. We use it as +# an alternative origin in the dev harness so reviewers can exercise +# the azureblob origin driver path without a real Azure account. 
+# +# Well-known dev account/key (documented at +# https://learn.microsoft.com/azure/storage/common/storage-use-azurite): +# AccountName: devstoreaccount1 +# AccountKey: Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== +# BlobURL: http://azurite..svc.cluster.local:10000/devstoreaccount1 +apiVersion: v1 +kind: Service +metadata: + name: azurite + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: azurite + ports: + - name: blob + port: 10000 + targetPort: 10000 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: azurite + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: azurite + template: + metadata: + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev + spec: + containers: + - name: azurite + image: {{ default "mcr.microsoft.com/azure-storage/azurite:3.33.0" .AzuriteImage | quote }} + imagePullPolicy: IfNotPresent + # Bind to 0.0.0.0 so the Service can reach it; default is + # 127.0.0.1. + # --skipApiVersionCheck allows newer Azure SDK clients + # (which advertise API versions Azurite hasn't yet caught up + # with) to talk to it. + # --loose disables strict validation of newer SDK headers. + # --disableProductStyleUrl forces path-style URL parsing. + # Without it, Azurite parses the first DNS label of the Host + # header as the account name (so requests to azurite.... + # would be misinterpreted as account="azurite" rather than + # account="devstoreaccount1"). + # --debug routes Azurite's internal request log to a file; + # tail it via `kubectl exec ... -- cat /tmp/azurite-debug.log` + # when triaging 4xx responses. + args: + - azurite-blob + - --blobHost + - 0.0.0.0 + - --blobPort + - "10000" + - --skipApiVersionCheck + - --loose + - --disableProductStyleUrl + - --debug + - /tmp/azurite-debug.log + - --location + - /data + ports: + - containerPort: 10000 + name: blob + protocol: TCP + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + readinessProbe: + tcpSocket: + port: 10000 + initialDelaySeconds: 3 + periodSeconds: 5 + timeoutSeconds: 3 + livenessProbe: + tcpSocket: + port: 10000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + emptyDir: {} diff --git a/deploy/orca/dev/04-azurite-init.yaml.tmpl b/deploy/orca/dev/04-azurite-init.yaml.tmpl new file mode 100644 index 00000000..8ad9433f --- /dev/null +++ b/deploy/orca/dev/04-azurite-init.yaml.tmpl @@ -0,0 +1,54 @@ +--- +# Init Job: creates the Azure container in Azurite so Orca's azureblob +# origin driver has somewhere to read from. Idempotent: az container +# create with --fail-on-exist false treats existence as success. +# +# Uses the well-known Azurite dev creds (devstoreaccount1 + the +# documented public key); these are baked into Azurite and not +# secrets. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: orca-azurite-container-init + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev +spec: + backoffLimit: 6 + template: + metadata: + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev + spec: + restartPolicy: OnFailure + containers: + - name: az-cli + image: {{ default "mcr.microsoft.com/azure-cli:latest" .AzCliImage | quote }} + env: + - name: AZURE_STORAGE_CONNECTION_STRING + value: "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite.{{ default "unbounded-kube" .Namespace }}.svc.cluster.local:10000/devstoreaccount1;" + - name: CONTAINER + value: {{ default "orca-test" .AzuriteContainer | quote }} + command: + - /bin/sh + - -c + - | + set -e + echo "Waiting for Azurite ..." + for i in $(seq 1 60); do + if az storage container list --output none 2>/dev/null; then + echo "Azurite ready." + break + fi + sleep 2 + done + echo "Ensuring container ${CONTAINER} (idempotent) ..." + if az storage container exists --name "${CONTAINER}" --query exists --output tsv | grep -qi true; then + echo "Container ${CONTAINER} already exists." + else + az storage container create --name "${CONTAINER}" --output none + echo "Container ${CONTAINER} created." + fi + echo "Init complete." \ No newline at end of file diff --git a/deploy/orca/rendered/.gitignore b/deploy/orca/rendered/.gitignore new file mode 100644 index 00000000..f79c394d --- /dev/null +++ b/deploy/orca/rendered/.gitignore @@ -0,0 +1,3 @@ +# rendered manifests are gitignored; produced by `make orca-manifests`. +* +!.gitignore diff --git a/design/orca/brief.md b/design/orca/brief.md new file mode 100644 index 00000000..43940c82 --- /dev/null +++ b/design/orca/brief.md @@ -0,0 +1,368 @@ +# Orca - Origin Cache - Architecture Brief + +A short brief intended for technical leads who need to understand the +shape of the system, the load-bearing decisions, and what is in v1 +without wading through the full design. Drill-down references point at +[design.md](./design.md). + +## 1. Problem and approach + +Cloud blob origins (AWS S3, Azure Blob) are slow and expensive when +read from on-prem at scale. The intended workload is large immutable +artifacts (job inputs, model weights, training shards) read by +thousands of clients with strongly correlated cold starts (job +launches, distributed-training kickoffs), including FUSE-mounted +filesystems where edge clients perform interactive `ls` and +directory navigation. Naive direct access stampedes origin egress +and cost. + +Orca is a read-only S3-compatible HTTP cache deployed inside +the on-prem datacenter as a multi-replica Kubernetes Deployment +fronting AWS S3 and Azure Blob. It serves chunked, ETag-keyed bytes +out of a shared in-DC backing store, dedupes concurrent fills both +within and across replicas, and presents the same `GetObject` / +`HeadObject` / `ListObjectsV2` surface clients already use. + +## 2. Goals and non-goals + +Goals (v1): +- Read-only S3-compatible API at the edge: `GetObject` (with byte-range + `Range`), `HeadObject`, `ListObjectsV2`. +- Multi-PB working set; thousands of concurrent clients. +- Multi-DC deployment; each DC independent (no cross-DC peering). +- Negligible origin stampede under correlated cold-access bursts. +- Low **TTFB** (time to first byte) on both warm and cold paths. 
+- Atomic, durable commit of fetched chunks; safe under concurrent + fills. +- Bounded staleness: `metadata_ttl` (default 5m) on contract violation, + `negative_metadata_ttl` (default 60s) on create-after-404; zero + otherwise. + +Non-goals (v1): +- Write path, multipart upload, object versioning. +- Cross-DC peering. +- SigV4 verification at the edge (bearer / mTLS only). +- Multi-tenant quotas or per-tenant credentials. +- Per-client / per-IP edge rate limiting. +- Mutable-blob invalidation beyond ETag identity. +- Encryption at rest beyond what the backing store provides. + +## 3. System at a glance + +Each request lands on one replica (the **assembler**), which iterates +the requested range chunk by chunk. Hits read directly from the +shared **CacheStore**. Misses route to the chunk's **coordinator** +(selected by rendezvous hashing on pod IP from the headless-Service +membership), which runs a singleflight + tee + spool fill against the +**Origin** and atomically commits to the CacheStore. The coordinator +may be the assembler itself (local fill) or a different replica +(per-chunk internal mTLS fill RPC). + +### Diagram A: System overview + +```mermaid +graph TB + subgraph DC["On-prem datacenter"] + Clients["Edge clients"] + Service["Service (ClusterIP / LB)
client traffic"] + subgraph Replicas["orca Deployment"] + R1["Replica 1"] + R2["Replica 2"] + R3["Replica N"] + end + Headless["Headless Service
peer discovery"] + Internal["Internal listener :8444
per-chunk fill RPC
(mTLS, peer-IP authz)"] + CS[("CacheStore
in-DC S3 / posixfs / localfs")] + end + subgraph Cloud["Cloud origins"] + S3[("AWS S3")] + Azure[("Azure Blob
Block Blobs only")] + end + Clients -- "S3 GET / HEAD / LIST
+ Range" --> Service + Service --> R1 + Service --> R2 + Service --> R3 + R1 -. "DNS refresh
default 5s" .-> Headless + R2 -.-> Headless + R3 -.-> Headless + R1 <--> Internal + R2 <--> Internal + R3 <--> Internal + R1 <--> CS + R2 <--> CS + R3 <--> CS + R1 -- "miss-fill
If-Match: etag" --> S3 + R2 -- "miss-fill
If-Match: etag" --> S3 + R3 -- "miss-fill
If-Match: etag" --> Azure +``` + +## 4. Components + +Named building blocks. The first five (Origin, CacheStore, ChunkCatalog, +Cluster, Spool) are formal Go interfaces in +[design.md s7](./design.md#7-internal-interfaces); the request-edge +components (Server, fetch.Coordinator, Singleflight, Auth) are +process-internal and are described in +[design.md s4](./design.md#4-architecture) and +[s8](./design.md#8-stampede-protection). + +- **Server** - the S3-compatible HTTP edge for clients, plus a + separate internal listener for per-chunk fill RPCs between + replicas. Two listeners with two distinct trust roots. +- **fetch.Coordinator** - orchestrates the per-request fan-out: + per-chunk routing, origin concurrency bounding, internal-RPC + client. The brain of the assembler. +- **Singleflight** - per-`ChunkKey` in-flight dedupe so concurrent + cold misses for the same chunk collapse into one origin GET. + Prevents process-local thundering herds. +- **Spool** - bounded local-disk staging for in-flight fills. + Tees bytes in parallel with the client write (s5.2), giving + slow joiners a uniform fallback across all CacheStore drivers + and serving as the source for the asynchronous CacheStore + commit. +- **ChunkCatalog** - in-memory LRU recording which chunks the + CacheStore holds. Pure hot-path optimization; CacheStore is + source of truth. +- **Origin** - read-only adapter to the upstream cloud blob store + (AWS S3, Azure Blob). Sends `If-Match: ` on every range + read so mid-flight overwrites are detected at the wire. +- **CacheStore** - shared in-DC chunk store, source of truth for + chunk presence. Pluggable: `localfs`, `posixfs`, `s3`. Driver + choice invisible above the cachestore boundary. +- **Cluster** - peer discovery from the headless Service plus + rendezvous hashing on pod IP to pick the coordinator per + `ChunkKey`. Refreshes membership every 5s by default. +- **Auth** - bearer / mTLS on the client edge and mTLS plus + peer-IP authorization on the internal listener. Separate trust + roots. + +## 5. Five load-bearing mechanisms + +### 5.1 Chunking and identity + +The cache works in fixed-size chunks (default 8 MiB, configurable +4-16 MiB). The `ChunkKey` is +`{origin_id, bucket, object_key, etag, chunk_size, chunk_index}` and +is the on-store path for that chunk. ETag is treated as identity, not +freshness: any change of origin bytes (under the contract in s5.5) +produces a new ETag, which deterministically yields a new chunk path. +The cache cannot, by construction, serve old bytes for a new ETag. +See [design.md s5](./design.md#5-chunk-model). + +### 5.2 Singleflight + tee + spool + +Per-`ChunkKey` singleflight on the coordinator collapses concurrent +misses to a single origin GET. Cold-path bytes stream **directly +from origin to client**: bounded **pre-header origin retry** +(default 3 attempts, 5s total budget) handles transient origin +failures invisibly before any HTTP response header is sent; the +commit boundary is the first byte arrival from origin. Once +committed, the leader streams bytes to the client as they arrive. +In parallel, the leader tees bytes into a small in-memory ring +buffer (low-TTFB joiners) and a bounded local-disk **Spool** +(slow joiners that fall behind the ring head, plus uniform +behavior across all CacheStore drivers). The CacheStore commit +happens asynchronously after the response completes. The spool +is NOT on the client TTFB path in v1. 
See +[design.md s8.1](./design.md#81-per-chunkkey-singleflight), +[s8.2](./design.md#82-ttfb-tee--spool), and +[s8.6](./design.md#86-failure-handling-without-re-stampede). + +### 5.3 Per-chunk coordinator (rendezvous hashing) + +Each replica polls a headless Service for peer IPs (default every +5s) and selects the coordinator per `ChunkKey` by rendezvous (Highest +Random Weight) hash on pod IP. The assembler fans out per-chunk fill +RPCs over a separate internal mTLS listener (`:8444`) to coordinators +that are not self. One client request spanning N chunks may use N +different coordinators; this is intentional for highly correlated +cold-access workloads, where any single hot key would otherwise pin +its assembler. Loop prevention is enforced by a header marker plus a +membership self-check (`409 Conflict` fallback to local fill on +disagreement). See [design.md s8.3](./design.md#83-cluster-wide-deduplication-via-per-chunk-fill-rpc) +and [s8.8](./design.md#88-internal-rpc-listener). + +### 5.4 Atomic-commit primitive + +The leader publishes a chunk to the CacheStore in a single no-clobber +operation: the second concurrent commit MUST lose without overwriting +the winner. Two equivalent shapes are picked per driver: object-store +`PutObject + If-None-Match: *` (used by `cachestore/s3`) and POSIX +`link()` (or `renameat2(RENAME_NOREPLACE)`) returning `EEXIST` (used +by `cachestore/localfs` and `cachestore/posixfs`). Both atomic; both +report the loser as `commit_lost`. Each driver runs +`SelfTestAtomicCommit` at boot and refuses to start if the backend +does not honor its primitive. See +[design.md s10.1](./design.md#101-atomic-commit-per-cachestore-driver). + +### 5.5 Bounded staleness contract + +Correctness rests on an **immutable-origin contract** with the +operator: for any given `(origin_id, bucket, key)`, the underlying +bytes are immutable for the life of the key; replacement MUST publish +a new key. Because the +cache key includes ETag (s5.1), as long as the contract holds the +cache cannot serve stale bytes. If the contract is violated by an +in-place overwrite, the cache may serve old bytes for at most one +`metadata_ttl` window (default 5m), bounded by the metadata cache +TTL. This is the load-bearing semantic for correctness and MUST +appear in the consumer-API documentation. Defense in depth: every +`Origin.GetRange` carries `If-Match: `, so a mid-flight +overwrite is caught at fill time and increments +`origin_etag_changed_total`. See +[design.md s11](./design.md#11-bounded-staleness-contract). A +symmetric bound applies to **create-after-404** (a key uploaded after +a client already saw a 404 on it): at most one `negative_metadata_ttl` +window per replica that observed the original 404 (default 60s) +before the cache reflects the upload. See +[design.md s12](./design.md#12-create-after-404-and-negative-cache-lifecycle). +Operators with workloads requiring shorter effective windows on hot +keys can opt into a **bounded-freshness mode** (default off): a +per-replica background loop proactively re-Heads frequently- +accessed keys ahead of `metadata_ttl`, shrinking the effective +window for those keys to `refresh_ahead_ratio * metadata_ttl` +(default 3.5m). See +[design.md s11.2](./design.md#112-bounded-freshness-mode-optional). + +## 6. Backing-store options + +The CacheStore is pluggable; choice is a deployment-time decision and +is invisible above the `cachestore` package boundary. Three drivers +ship in v1: + +- `localfs` - dev only; one POSIX FS per replica; not shared. 
+- `posixfs` - shared POSIX FS mounted on every replica at the same + path. Supported backends: NFSv4.1+ (baseline), Weka native + (`-t wekafs`), CephFS, Lustre, GPFS / IBM Spectrum Scale. Same + `link()` / `EEXIST` primitive as `localfs`. Alluxio FUSE is hard- + refused (no `link(2)`, no atomic no-overwrite rename). +- `s3` - in-DC S3-compatible object store (e.g. VAST). `PutObject` + + `If-None-Match: *`. + +See [design.md s10.1](./design.md#101-atomic-commit-per-cachestore-driver) +for atomic-commit specifics per driver. + +## 7. A request, end-to-end (cold miss with cross-replica fill) + +The diagram below traces a cold miss on replica A where the chunk's +coordinator is replica B. The hot path (cache hit on A) skips +straight from the catalog lookup to a direct CacheStore read; the +local-coordinator path (B == A) skips the internal RPC. Cold-path +bytes stream from origin -> coordinator -> assembler -> client +in parallel with the spool tee on B. Pre-header retry on B handles +transient origin failures invisibly; the CacheStore commit happens +asynchronously after the client has the full chunk. + +### Diagram B: Cold miss, cross-replica coordinator + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant A as Replica A (assembler) + participant B as Replica B (coordinator for k) + participant SF as Singleflight (on B) + participant Sp as Spool (B local disk) + participant O as Origin + participant CS as CacheStore (shared) + C->>A: GET /bucket/key Range + A->>CS: Stat(k) + CS-->>A: ErrNotFound + A->>B: /internal/fill?key=k (mTLS) + B->>SF: Acquire(k) [leader] + SF->>O: GetRange(..., If-Match: etag)
(pre-header retry s8.6) + O-->>SF: first byte + Note over SF: commit boundary - origin healthy + par stream + SF-->>B: bytes as they arrive + B-->>A: stream + A-->>C: 200/206 + headers + body + and tee to spool + SF->>Sp: bytes (in parallel) + end + O-->>SF: remaining bytes + SF->>Sp: Commit (fsync + close) [after stream] + SF-)CS: PutObject (or link()) commit [async] + CS--)SF: 200 (commit_won) or failure +``` + +## 8. Top risks worth your attention + +1. **Immutable-origin contract** - Correctness rests on operators + publishing new keys instead of overwriting. Bounded violation + window is `metadata_ttl` (5m default). Must be visible in + consumer-API documentation. See + [design.md s11](./design.md#11-bounded-staleness-contract). +2. **Commit-after-serve failure** - The CacheStore commit happens + asynchronously after the client response is complete (cold-path + bytes stream origin -> client directly with pre-header retry on + the cache side). If the async commit fails after the client has + the full chunk, the chunk is silently uncached and the next + request refills. Sustained failure is visible only via + `commit_after_serve_total{result="failed"}`; alerting is required. + See [design.md s8.6](./design.md#86-failure-handling-without-re-stampede). +3. **Spool locality** - The Spool MUST live on a local block device + by default (boot-time `statfs(2)` check refuses to start on + NFS / SMB / CephFS / Lustre / GPFS / FUSE). With the v1 streaming + design the spool is no longer on the client TTFB path, so this + contract is defense-in-depth: a network-FS spool would only + degrade joiner-fallback latency, not first byte. Operators with + unusual placements MAY relax via `spool.require_local_fs: false`; + production deployments are expected to keep the default. See + [design.md s10.4](./design.md#104-spool-locality-contract). +4. **Per-replica origin semaphore is approximate** - Origin + concurrency is capped per-replica at + `floor(target_global / cluster.target_replicas)` (default 64 + slots/replica at `target_global=192`, + `cluster.target_replicas=3`). Realized cluster-wide concurrency + tracks `target_global` only when actual replica count matches + `cluster.target_replicas`; scale-out without updating the knob + over-allocates against origin, scale-in under-allocates. + Origin throttling is handled by the leader's pre-header retry + loop (exponential backoff) rather than by a hard coordinated + cap. A coordinated cluster-wide limiter and dynamic recompute + are deferred future work; see + [design.md s15.5](./design.md#155-coordinated-cluster-wide-origin-limiter) + and + [design.md s15.6](./design.md#156-dynamic-per-replica-origin-cap). +5. **POSIX backend hardening** - NFS exports MUST be `sync` (not + `async`); Weka NFS `link()`/`EEXIST` is not docs-confirmed and + is gated by `SelfTestAtomicCommit` at boot; Alluxio FUSE is + hard-refused with a documented workaround + (`cachestore.driver: s3` against the Alluxio S3 gateway). See + [design.md s10.1.2](./design.md#1012-cachestoreposixfs). +6. **Create-after-404 staleness** - A key uploaded after clients + already observed it as `404` will return stale `404` for up to + `negative_metadata_ttl` (default 60s) per replica that observed + the original miss. Round-robin LB can produce alternating `404` + / `200` during the drain. 
No event-driven invalidation or admin- + invalidation in v1 (the immutable-origin contract makes them + unnecessary for the documented workload); operators must wait + the TTL after uploading a previously-missing key. Mitigation: + short default TTL, `metadata_negative_*` metrics. See + [design.md s12](./design.md#12-create-after-404-and-negative-cache-lifecycle). + +## 9. Where to go next + +`design.md` (full mechanism + flow): +- [s2 Decisions](./design.md#2-decisions) - locked design choices. +- [s3 Terminology](./design.md#3-terminology) - full glossary. +- [s4 Architecture and onward](./design.md#4-architecture) - + architecture, request flow, internal interfaces, stampede protection. +- [s8.4 Origin backpressure](./design.md#84-origin-backpressure) - + per-replica static cap and pre-header retry for throttle handling. +- [s10.1 Atomic commit per driver](./design.md#101-atomic-commit-per-cachestore-driver) +- [s11 Bounded staleness](./design.md#11-bounded-staleness-contract) + - [s11.2 Bounded-freshness mode (optional)](./design.md#112-bounded-freshness-mode-optional) +- [s12 Create-after-404 and negative-cache lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle) +- [s13 Eviction and capacity](./design.md#13-eviction-and-capacity) - + passive lifecycle and optional active eviction; ChunkCatalog + size-awareness operational guidance. +- [s15 Deferred optimizations](./design.md#15-deferred-optimizations) - + v1 scope-discipline catalog (edge rate limiting, cluster-wide HEAD + singleflight, cluster-wide LIST coordinator, mid-stream origin + resume, coordinated cluster-wide origin limiter, dynamic per- + replica origin cap). +- 12 inline mermaid diagrams covering hits, misses, cross-replica + fills, atomic commit, create-after-404 timeline, and membership + flux. diff --git a/design/orca/design.md b/design/orca/design.md new file mode 100644 index 00000000..f131fe59 --- /dev/null +++ b/design/orca/design.md @@ -0,0 +1,2711 @@ +# Orca - Origin Cache - Design (mechanism & flow) + +Status: draft for review (round 2 incorporating reviewer feedback) +Owner: TBD + +--- + +## Table of contents + +### Sections + +1. [Overview](#1-overview) +2. [Decisions](#2-decisions) +3. [Terminology](#3-terminology) +4. [Architecture](#4-architecture) +5. [Chunk model](#5-chunk-model) +6. [Request flow](#6-request-flow) + - [6.1 HEAD request flow](#61-head-request-flow) + - [6.2 LIST request flow](#62-list-request-flow) + - [6.3 HTTP error-code mapping](#63-http-error-code-mapping) +7. [Internal interfaces](#7-internal-interfaces) +8. [Stampede protection](#8-stampede-protection) + - [8.1 Per-`ChunkKey` singleflight](#81-per-chunkkey-singleflight) + - [8.2 TTFB tee + spool](#82-ttfb-tee--spool) + - [8.3 Cluster-wide deduplication via per-chunk fill RPC](#83-cluster-wide-deduplication-via-per-chunk-fill-rpc) + - [8.4 Origin backpressure](#84-origin-backpressure) + - [8.5 Cancellation safety](#85-cancellation-safety) + - [8.6 Failure handling without re-stampede](#86-failure-handling-without-re-stampede) + - [8.7 Metadata-layer singleflight](#87-metadata-layer-singleflight) + - [8.8 Internal RPC listener](#88-internal-rpc-listener) +9. [Azure adapter: Block Blob only](#9-azure-adapter-block-blob-only) +10. 
[Concurrency, durability, correctness](#10-concurrency-durability-correctness) + - [10.1 Atomic commit (per CacheStore driver)](#101-atomic-commit-per-cachestore-driver) + - [10.2 Catalog correctness, typed errors, circuit breaker](#102-catalog-correctness-typed-errors-circuit-breaker) + - [10.3 Range, sizes, and edge cases](#103-range-sizes-and-edge-cases) + - [10.4 Spool locality contract](#104-spool-locality-contract) + - [10.5 Readiness probe (`/readyz`)](#105-readiness-probe-readyz) +11. [Bounded staleness contract](#11-bounded-staleness-contract) + - [11.1 The contract and the staleness window](#111-the-contract-and-the-staleness-window) + - [11.2 Bounded-freshness mode (optional)](#112-bounded-freshness-mode-optional) +12. [Create-after-404 and negative-cache lifecycle](#12-create-after-404-and-negative-cache-lifecycle) +13. [Eviction and capacity](#13-eviction-and-capacity) + - [13.1 Passive eviction (lifecycle)](#131-passive-eviction-lifecycle) + - [13.2 Active eviction (opt-in, access-frequency)](#132-active-eviction-opt-in-access-frequency) + - [13.3 ChunkCatalog size awareness](#133-chunkcatalog-size-awareness-load-bearing-operational-note) + - [13.4 Spool capacity](#134-spool-capacity) + - [13.5 `chunk_size` config-change capacity impact](#135-chunk_size-config-change-capacity-impact) + - [13.6 Eviction interactions](#136-eviction-interactions) +14. [Horizontal scale](#14-horizontal-scale) +15. [Deferred optimizations](#15-deferred-optimizations) + - [15.1 Edge rate limiting](#151-edge-rate-limiting) + - [15.2 Cluster-wide HEAD singleflight](#152-cluster-wide-head-singleflight) + - [15.3 Cluster-wide LIST coordinator](#153-cluster-wide-list-coordinator) + - [15.4 Mid-stream origin resume](#154-mid-stream-origin-resume) + - [15.5 Coordinated cluster-wide origin limiter](#155-coordinated-cluster-wide-origin-limiter) + - [15.6 Dynamic per-replica origin cap](#156-dynamic-per-replica-origin-cap) + +### Request scenarios + +Concrete request-flow narratives. Each scenario has a stable letter +identifier reused in the diagram heading. + +- **Scenario A** - warm read (cache hit): [Diagram 3](#diagram-3-scenario-a---warm-read-cache-hit) +- **Scenario B** - cold miss, local coordinator: [Diagram 4](#diagram-4-scenario-b---cold-miss-local-coordinator) +- **Scenario C** - concurrent miss, same-replica joiner: [Diagram 5](#diagram-5-scenario-c---concurrent-miss-same-replica-joiner) +- **Scenario D** - cold miss, remote coordinator (cross-replica fill): [Diagram 6](#diagram-6-scenario-d---cold-miss-remote-coordinator) +- **Scenario E** - range spanning multiple coordinators: [Diagram 7](#diagram-7-scenario-e---range-spanning-multiple-coordinators) +- **Scenario F** - Azure non-BlockBlob rejection: [Diagram 8](#diagram-8-scenario-f---azure-non-blockblob-rejection) +- **Scenario G** - create-after-404 (operator upload after client miss): [Diagram 10](#diagram-10-scenario-g---create-after-404-timeline) +- **Scenario H** - rolling restart membership flux: [Diagram 12](#diagram-12-scenario-h---rolling-restart-membership-flux) + +Other diagrams (D1, D2, D9, D11) depict architecture, math, or +mechanism rather than request scenarios and are reachable from the +Sections list above. + +--- + +## 1. Overview + +Edge devices inside an on-prem datacenter need read access to large files +held in cloud blob storage (S3, Azure Blob). Direct egress per device is +unacceptable (cost, latency, throughput, security boundary). 
Orca is +a read-only caching layer, deployed inside each datacenter, that fronts +cloud blob storage with an S3-compatible API. Clients issue range reads; +Orca serves from a shared in-DC store when present, otherwise +fetches from the cloud origin, stores the chunk, and returns it. + +This document describes the mechanism: decisions, components, request flow, +stampede protection, atomic commit, and horizontal-scale coordination. + +## 2. Decisions + +| Area | Decision | +|---|---| +| Client API | S3-compatible HTTP; `GET` + `HEAD` + `ListObjectsV2`; supports `Range`. | +| Auth (v1) | Network-perimeter trust + bearer / mTLS. No SigV4 verification yet. | +| Origins | S3 + Azure Blob behind a pluggable `Origin` interface. | +| Azure constraint | Block Blobs only. Append/Page Blobs rejected at `Head`. | +| Backing store | Pluggable `CacheStore`; `localfs` for dev, `s3` (VAST or any S3-compatible in-DC object store) **or** `posixfs` (NFSv4.1+, Weka native, CephFS, Lustre, GPFS, or any shared POSIX FS that honors `link()` / `EEXIST` and directory `fsync`) for prod. The CacheStore is the source of truth for chunk presence. Driver choice is a deployment-time decision per replica set; `s3` and `posixfs` are interchangeable from the cache layer's perspective. | +| In-DC S3 vs. cloud S3 | The in-DC S3-compatible store is treated identically to cloud S3 at the protocol level. The only difference is "much faster, in-DC". Both `Origin` and the `cachestore/s3` driver are thin S3-client adapters with no special-casing. The `cachestore/posixfs` driver replaces the S3 protocol with shared-POSIX primitives but presents the same `CacheStore` interface, so nothing above s7 changes. | +| CacheStore atomic-commit primitive | Two equivalent primitives, picked per driver: object-store `PutObject + If-None-Match: *` (used by `cachestore/s3`) and POSIX `link()` / `renameat2(RENAME_NOREPLACE)` returning `EEXIST` (used by `cachestore/localfs` and `cachestore/posixfs`). Both are atomic, no-clobber, and have a "you lost the race" failure mode that maps cleanly onto `commit_lost`. Each driver runs `SelfTestAtomicCommit` at boot and refuses to start on backends that don't honor its primitive. | +| Chunking | Fixed 8 MiB default (configurable 4-16 MiB). `chunk_size` baked into `ChunkKey`. | +| Consistency | **Origin objects are immutable per operator contract**: an `(origin_id, bucket, key)` never has its bytes modified once published; replacement must be a new key. `ETag` is identity, not freshness. `If-Match: ` on every `Origin.GetRange` is defense-in-depth that traps in-flight overwrites only. Bounded staleness uses two TTLs: `metadata_ttl` (default 5m) on positive entries (caps in-place-overwrite contract violations; see [s11](#11-bounded-staleness-contract)) and `negative_metadata_ttl` (default 60s) on negative entries (caps the create-after-404 unavailability window after an operator uploads a previously-missing key; see [s12](#12-create-after-404-and-negative-cache-lifecycle)). | +| Catalog | In-memory `ChunkCatalog` fronting `CacheStore.Stat`. No persistent local index. Per-entry access-frequency tracking (s10.2) feeds the optional active-eviction loop (s13.2). Bounded by `chunk_catalog.max_entries`; size to estimated working-set chunks (s13.3). | +| Eviction | Two-tier. Passive: bounded LRU on the in-memory ChunkCatalog (always on); CacheStore lifecycle (S3 lifecycle / posixfs operator sweep) for storage-side cleanup. 
Active: opt-in access-frequency-driven eviction loop (`chunk_catalog.active_eviction.enabled`, default `false`) that deletes cold chunks from the CacheStore via `CacheStore.Delete`. Operators using `cachestore/posixfs` typically enable active eviction since posixfs has no native lifecycle. See [s13](#13-eviction-and-capacity). | +| Prefetch | Sequential read-ahead by default. Configurable depth, capped concurrency. | +| Cluster | Kubernetes Deployment + headless Service for peer discovery + ClusterIP/LB for client traffic. Rendezvous hashing on pod IP selects the coordinator per `ChunkKey` for miss-fills only; receiving replica is the **assembler** that fans out per-chunk fill RPCs to coordinators (s8.3). All replicas can read all chunks directly from the CacheStore on hits. | +| Inter-replica auth | Separate internal mTLS listener (default `:8444`) chained to an internal CA distinct from the client mTLS CA; authorization = "presenter source IP is in current peer-IP set" (s8.8). | +| Local spool | Every fill writes origin bytes through a local spool (`internal/orca/fetch/spool`) in parallel with streaming to the client; serves as a slow-joiner fallback and as the source for the asynchronous CacheStore commit. The spool is NOT on the client-TTFB path in v1; client bytes flow origin -> client directly (s8.2 / s8.6). | +| Atomic commit | `localfs` and `posixfs` stage inside `/.staging/` with parent-dir fsync, then `link()` no-clobber (returns `EEXIST` to the loser); `s3` uses direct `PutObject` with `If-None-Match: *`. Each driver runs `SelfTestAtomicCommit` at boot: `s3` proves the backend honors `If-None-Match: *`; `posixfs` proves the backend honors `link()` / `EEXIST` and that directory fsync is durable, and additionally enforces `nfs.minimum_version` (default `4.1`, with opt-in `nfs.allow_v3`) and refuses to start on Alluxio FUSE backends. Cold-path bytes stream directly from origin to client; bounded leader-side **pre-header origin retry** (s8.6) handles transient origin failures invisibly before response headers are committed. The spool tees in parallel for joiners (s8.2) and as the CacheStore-commit source. CacheStore commit happens asynchronously after the response completes; commit-after-serve failure becomes `commit_after_serve_total{result="failed"}` rather than a client error (s8.6). | +| Versioned buckets on cachestore/s3 | Not supported. The `cachestore/s3` driver requires the bucket to have versioning **disabled**. AWS S3 honors `If-None-Match: *` on both versioned and unversioned buckets, but VAST Cluster (and likely other S3-compatible backends) only honors it on unversioned buckets ([VAST KB][vast-kb-conditional-writes]). The driver enforces this at boot via an explicit `GetBucketVersioning` versioning gate (s10.1.3); refusing to start on enabled or suspended versioning avoids a class of silent atomic-commit failures. | +| LIST caching | Per-replica TTL'd LIST cache (s6.2 / FW3) in front of `Origin.List`, sized for the FUSE-`ls` workload pattern. Default `list_cache.ttl=60s`, configurable. Cluster-wide LIST coordination is a deferred optimization ([s15.3](#153-cluster-wide-list-coordinator)). | +| Origin concurrency cap | Per-replica token bucket sized `floor(target_global / cluster.target_replicas)`. Default `target_global=192` and `cluster.target_replicas=3`, giving 64 slots per replica. Origin throttling responses (503 / 429) are handled by the leader's pre-header retry loop (s8.6) with exponential backoff. 
A coordinated cluster-wide limiter and dynamic recompute from `len(Cluster.Peers())` are deferred optimizations; see [s15.5](#155-coordinated-cluster-wide-origin-limiter) and [s15.6](#156-dynamic-per-replica-origin-cap). | +| Bounded-freshness mode | Optional, opt-in via `metadata_refresh.enabled` (default `false`). When enabled, a per-replica background loop proactively re-Heads hot keys (`AccessCount >= access_threshold`) ahead of `metadata_ttl` to shrink the effective bounded-staleness window for popular content. See [s11.2](#112-bounded-freshness-mode-optional). | +| Tenancy | Single tenant, single origin credential set in v1. | +| Edge rate limiting | Documented v1 gap; see [s15.1](#151-edge-rate-limiting). v1 has implicit hot-client mitigation via the per-replica origin limiter (s8.4) and singleflight (s8.1); per-client / per-IP / per-credential edge rate limiting is deferred future work. | +| Repo home | This repo. Layout mirrors `machina`. | + +[vast-kb-conditional-writes]: https://kb.vastdata.com/documentation/docs/s3-conditional-writes + +## 3. Terminology + +Terms used throughout this document. Forward-references point at the +section that defines or implements the full mechanism. + +- **Replica** - one running pod of the `orca` Deployment. All + replicas are interchangeable; there is no per-pod state. +- **Client** - external caller using an S3-compatible HTTP API (e.g. + `aws-sdk`, `boto3`). +- **Origin** - upstream cloud blob store (AWS S3 or Azure Blob); read-only + from our perspective. Interface defined in + [s7](#7-internal-interfaces). +- **CacheStore** - the in-DC durable store that holds cached chunk bytes + and is shared by all replicas. Pluggable: `localfs` for dev, `s3` (e.g. + VAST or any S3-compatible in-DC object store) and `posixfs` (shared + POSIX FS - NFSv4.1+, Weka native, CephFS, Lustre, GPFS) for prod; + driver choice is a deployment-time decision and is invisible above the + cachestore boundary. Treated as the source of truth for chunk presence. + Interface in [s7](#7-internal-interfaces); commit semantics in + [s10](#10-concurrency-durability-correctness). +- **Chunk** - a fixed-size byte range of an origin object (default 8 MiB); + the unit of caching and fill. +- **ChunkKey** - the immutable identifier for a chunk: + `{origin_id, bucket, object_key, etag, chunk_size, chunk_index}`. Full + definition in [s5](#5-chunk-model). +- **Headless Service** - Kubernetes `Service` with `clusterIP: None`; its + DNS A-record resolves to the IPs of all Ready pods. We poll it (default + every 5s) to discover the current peer set. +- **Rendezvous hashing** (a.k.a. Highest Random Weight, HRW) - for a given + key, score each peer with `hash(peer_ip || key)` and pick the argmax. + Stable under membership changes that don't add or remove the winning + peer. We use it to pick exactly one coordinator per chunk from the + current peer set. +- **Coordinator** - the replica that rendezvous hashing selects to perform + the miss-fill for a particular chunk. Ownership is **per chunk**, not + per request and not per object: a single client request spanning N + chunks may have N different coordinators. +- **Assembler** - the replica that received the client request. It is + responsible for stitching the client response. For each chunk in the + requested range, the assembler either (a) reads from CacheStore on a + hit, (b) runs a local miss-fill if it is the coordinator for that + chunk, or (c) issues an internal fill RPC to the coordinator otherwise. 
+ See [s8.3](#83-cluster-wide-deduplication-via-per-chunk-fill-rpc). +- **Singleflight** - a per-key in-process deduplication primitive. + Concurrent requests for the same `ChunkKey` share a single in-flight + fill: the first arrival is the **leader** (issues the origin GET); + subsequent arrivals are **joiners** (wait on the leader's stream). Full + mechanism in [s8.1](#81-per-chunkkey-singleflight). +- **Tee** - the leader's origin byte stream is split two ways: into a + small in-memory ring buffer for low-TTFB joiners, and into the Spool + (below) for slow joiners that fall behind the ring head. Joiners + therefore stream through the leader rather than waiting for the full + disk write. Full mechanism in [s8.2](#82-ttfb-tee--spool). +- **Spool** - bounded local-disk staging area for in-flight fills + (`internal/orca/fetch/spool`). Ensures slow joiners always have a + local fallback regardless of CacheStore driver. Detail in + [s8.2](#82-ttfb-tee--spool). +- **Atomic CacheStore commit** - the leader publishes the completed chunk + in a single no-clobber operation: `link()` / + `renameat2(RENAME_NOREPLACE)` for `localfs`; `PutObject` + + `If-None-Match: *` for `s3`. Concurrent commits cannot overwrite each + other; the loser is recorded as `commit_lost`. See + [s10](#10-concurrency-durability-correctness). +- **Per-chunk internal fill RPC** - `GET /internal/fill?key=` over mTLS on the internal listener (default `:8444`). The + assembler calls the coordinator when a chunk is missed and the + coordinator is not self. See [s8.8](#88-internal-rpc-listener). +- **Immutable origin contract** - operator promise that an + `(origin_id, bucket, key)` never has its bytes modified once published; + replacement is always a new key. The cache trusts this contract; on + violation, the bounded staleness window is `metadata_ttl` (default 5m). + Full statement in [s11](#11-bounded-staleness-contract). +- **Pre-header retry** - the leader retries `Origin.GetRange` on + transient errors **before** sending HTTP response headers to the + client, making transient origin failures invisible to the client. + Bounded by `origin.retry.attempts` (default 3) and + `origin.retry.max_total_duration` (default 5s). The "commit + boundary" is the first byte arrival from origin: once received, + the cache sends headers and starts streaming; subsequent origin + failures become mid-stream client aborts (handled by S3 SDK + retry via `Content-Length` mismatch). `OriginETagChangedError` + is non-retryable. Detail in + [s8.6](#86-failure-handling-without-re-stampede). Mid-stream + origin resume is deferred future work + ([s15.4](#154-mid-stream-origin-resume)). +- **CacheStore circuit breaker** - per-process error-rate breaker around + `CacheStore` calls. On sustained `ErrTransient` / `ErrAuth`, the + breaker opens, short-circuits writes, and surfaces via metrics and + `/readyz`. Defaults: 10 errors / 30s window, 30s open, 3 half-open + probes. Detail in [s10.2](#102-catalog-correctness-typed-errors-circuit-breaker). +- **Negative-cache entry** - a metadata-cache entry recording an + authoritative `404` (or unsupported-blob-type rejection) from + origin. Reused for `negative_metadata_ttl` (default 60s) before + re-Heading. Bounds the create-after-404 unavailability window; + see [s12](#12-create-after-404-and-negative-cache-lifecycle). +- **Shared-POSIX CacheStore** - the `cachestore/posixfs` driver: a + `CacheStore` backed by a shared POSIX-style filesystem mounted on every + replica at the same path. 
Concrete supported backends are NFSv4.1+ (the + baseline), Weka native (`-t wekafs`), CephFS (`-t ceph`), Lustre + (`-t lustre`), and IBM Spectrum Scale / GPFS (`-t gpfs`). Disqualified + on purpose: Alluxio FUSE (no `link(2)`, no atomic no-overwrite rename, + no NFS gateway). The driver depends on + `internal/orca/cachestore/internal/posixcommon/` (link-based + commit, dir-fsync, staging-dir helpers, fan-out path layout) which is + also depended on by `cachestore/localfs`. Detail in + [s10.1.2](#1012-cachestoreposixfs). +- **Atomic-commit primitive** - the no-clobber publish step that ends a + fill. Two equivalent shapes: object-store + `PutObject + If-None-Match: *` (used by `cachestore/s3`) and POSIX + `link()` / `renameat2(RENAME_NOREPLACE)` returning `EEXIST` to the + loser (used by `cachestore/localfs` and `cachestore/posixfs`). Both are + atomic, return a "you lost the race" signal that becomes + `commit_lost`, and are validated at boot by `SelfTestAtomicCommit`. + Detail in [s10.1](#101-atomic-commit-per-cachestore-driver). +- **Spool locality contract** - the local Spool (`spool.dir`) MUST live + on a local block device. The cache layer enforces this at boot via + `statfs(2)` against a denylist of network filesystems + (NFS / SMB / Ceph / Lustre / GPFS / FUSE) and refuses to start on + violation. Governed by `spool.require_local_fs` (default `true`). The + rationale and the boot check are in + [s10.4](#104-spool-locality-contract); the spool's role in the + cold-path TTFB barrier is in [s8.2](#82-ttfb-tee--spool). +- **LIST cache** - per-replica TTL'd cache of `Origin.List` responses + keyed on the full query tuple `(origin_id, bucket, prefix, + continuation_token, start_after, delimiter, max_keys)`. Default + `list_cache.ttl=60s`, configurable. Sized for the FUSE-`ls` + workload pattern (s6.2). Cluster-wide LIST coordination is a + deferred optimization ([s15.3](#153-cluster-wide-list-coordinator)). +- **Active eviction** - optional, opt-in background loop in the + cache layer (`chunk_catalog.active_eviction.enabled`, default + `false`) that uses access-frequency tracking on the + `ChunkCatalog` to delete cold chunks from the CacheStore via + `CacheStore.Delete`. Recommended for `cachestore/posixfs` + deployments without external sweep tooling. Detail in + [s13.2](#132-active-eviction-opt-in-access-frequency). +- **Bounded-freshness mode** - optional, opt-in + (`metadata_refresh.enabled`, default `false`) per-replica + background loop that proactively re-Heads hot keys ahead of + `metadata_ttl`. Shrinks the effective bounded-staleness window + for popular content from `metadata_ttl` to + `refresh_ahead_ratio * metadata_ttl` (default 3.5m). Hot-key + detection uses access-frequency counters on the metadata cache + (parallel to the ChunkCatalog tracking from FW8). Detail in + [s11.2](#112-bounded-freshness-mode-optional). +- **S3 versioning gate** - boot-time `GetBucketVersioning` check + by `cachestore/s3` that refuses to start if the bucket has + versioning enabled or suspended. Required because + `If-None-Match: *` is not honored on versioned buckets across + all S3-compatible backends; without this gate the atomic-commit + primitive silently degrades. Detail in + [s10.1.3](#1013-cachestores3). + +## 4. Architecture + +A single binary, `orca`, deployed as a Kubernetes Deployment. +Replicas discover each other through a headless Service and refresh the +peer set on a configurable interval (default 5s). 
A request from a client +lands on one replica - the **assembler** - which iterates the requested +range chunk-by-chunk. For each `ChunkKey`, the assembler reads directly +from the shared CacheStore on a hit; on a miss it routes to the chunk's +**coordinator** (selected by rendezvous hashing on the current peer-IP +set) for a singleflight + tee + spool + atomic-commit fill. The +coordinator may be the assembler itself, in which case the fill runs +locally; otherwise the assembler issues a per-chunk internal fill RPC. +All terms are defined in [s3](#3-terminology). Single tenant. One origin +credential set per deployment. + +### Diagram 1: System overview + +```mermaid +graph TB + subgraph DC["On-prem datacenter"] + Clients["Edge clients"] + Service["Service (ClusterIP / LB)
client traffic"] + subgraph Replicas["orca Deployment"] + R1["Replica 1"] + R2["Replica 2"] + R3["Replica N"] + end + Headless["Headless Service
peer discovery"] + Internal["Internal listener :8444
per-chunk fill RPC
(mTLS, peer-IP authz)"] + CS[("CacheStore
in-DC S3 / posixfs / localfs")] + end + subgraph Cloud["Cloud origins"] + S3[("AWS S3")] + Azure[("Azure Blob
Block Blobs only")] + end + Clients -- "S3 GET / HEAD / LIST
+ Range" --> Service + Service --> R1 + Service --> R2 + Service --> R3 + R1 -. "DNS refresh
default 5s" .-> Headless + R2 -.-> Headless + R3 -.-> Headless + R1 <--> Internal + R2 <--> Internal + R3 <--> Internal + R1 <--> CS + R2 <--> CS + R3 <--> CS + R1 -- "miss-fill
If-Match: etag" --> S3 + R2 -- "miss-fill
If-Match: etag" --> S3 + R3 -- "miss-fill
If-Match: etag" --> Azure
+```
+
+## 5. Chunk model
+
+- `ChunkKey = {origin_id, bucket, object_key, etag, chunk_size, chunk_index}`.
+  - `origin_id` is a deployment-scoped identifier from config (e.g.
+    `aws-us-east-1-prod`, `azure-eastus-research`). Required. Namespaces
+    cache key derivation and the on-store path so two deployments can
+    safely share a CacheStore bucket.
+  - `etag` captures immutability. A new ETag is treated as a new logical
+    object and gets a fresh set of chunks. Old chunks age out via the
+    CacheStore's lifecycle policy.
+  - `chunk_size` is part of the key so a runtime config change does not
+    silently corrupt or shadow existing data.
+- `chunk_index = floor(byte / chunk_size)`.
+- An object metadata cache holds `{origin_id, bucket, key} -> {size, etag,
+  content_type, last_validated, last_status}` with a small TTL. Avoids
+  re-`HEAD`ing origin on every request.
+
+The CacheStore's namespace **is** the chunk index. `ChunkKey`
+deterministically produces a path. Cache key derivation uses canonical
+length-prefixed encoding to remove ambiguity from separators that may
+appear in any field:
+
+```
+LP(s) = LE64(uint64(len(s))) || s
+hashKey = sha256(
+    LP(origin_id) ||
+    LP(bucket) ||
+    LP(key) ||
+    LP(etag) ||
+    LE64(chunk_size)
+  )
+path = "<origin_id>/<hashKey>/<chunk_index>"
+```
+
+`origin_id` appears in the path in the clear (and `chunk_size` is folded
+into the hash, not the path) so operators can run per-origin lifecycle
+policies and target a specific deployment with `aws s3 rm --recursive
+<cachestore-bucket>/<origin_id>/`.
+
+The `cachestore/posixfs` driver inserts a 2-character hex fan-out
+between `<origin_id>` and `<hashKey>` to keep directory sizes
+manageable on multi-PB working sets; that variant and its
+`cachestore.posixfs.fanout_chars` knob are specified in
+[s10.1.2](#1012-cachestoreposixfs). The `s3` and `localfs` drivers use
+the unmodified path above.
+
+**Operational note: changing `chunk_size`.** Because `chunk_size` is a
+field of `ChunkKey` and is folded into the path hash, changing it in
+deployment config never corrupts or shadows existing chunks; old-sized
+chunks remain valid byte ranges of the old logical layout but are no
+longer addressable. Operators should plan for transient storage
+doubling and a cold-period origin-cost spike when changing
+`chunk_size` on a hot working set: the working set is rebuilt at the
+new size on demand while the old set ages out via the CacheStore
+lifecycle policy (or, on `posixfs`, the operator's external sweep -
+see [s13](#13-eviction-and-capacity)).
+
+Whether a chunk is present is answered by `CacheStore.Stat(key)`. An
+in-memory `ChunkCatalog` LRU memoizes recent positive lookups so the hot
+path never touches the CacheStore for metadata. The catalog is purely a
+hot-path optimization; it can be dropped at any time without affecting
+correctness.
+
+For a request `Range: bytes=A-B`:
+
+```
+firstChunk = A / chunk_size
+lastChunk  = B / chunk_size
+for cid := firstChunk; cid <= lastChunk; cid++ {   // streaming iterator
+    fetchOrServe(cid)                              // + sliding prefetch window
+    sliceWithin(cid, max(A, cid*sz), min(B, (cid+1)*sz - 1))
+}
+```
+
+The chunk loop is a **streaming iterator**: at no point is the full
+`[]ChunkKey` for the range materialized into a slice. Prefetch operates on
+a sliding window of `min(prefetch_depth, lastChunk - cid)` ahead of the
+current cursor. A configurable `server.max_response_bytes` cap returns
+`400 RequestSizeExceedsLimit` (with header `x-orca-cap-exceeded: true`)
+before any cache lookup if the computed response size exceeds the cap;
+`416` is reserved for true Range-vs-object-size violations (s6.3).
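+
+To make the derivation concrete, here is a minimal Go sketch of the
+length-prefixed hash and path layout above. It assumes the `ChunkKey`
+field names from this section; the helper names (`lp`, `CachePath`) and
+the example values are illustrative only and do not reflect the actual
+helpers under `internal/orca/`.
+
+```go
+package main
+
+import (
+	"crypto/sha256"
+	"encoding/binary"
+	"encoding/hex"
+	"fmt"
+	"path"
+)
+
+type ChunkKey struct {
+	OriginID   string
+	Bucket     string
+	ObjectKey  string
+	ETag       string
+	ChunkSize  int64
+	ChunkIndex int64
+}
+
+// lp implements LP(s): an 8-byte little-endian length prefix followed by
+// the bytes of s, so field boundaries are unambiguous in the hash input.
+func lp(s string) []byte {
+	b := make([]byte, 8, 8+len(s))
+	binary.LittleEndian.PutUint64(b, uint64(len(s)))
+	return append(b, s...)
+}
+
+// CachePath renders "<origin_id>/<hashKey>/<chunk_index>"; chunk_size is
+// folded into the hash, not the path, exactly as described above.
+func (k ChunkKey) CachePath() string {
+	h := sha256.New()
+	h.Write(lp(k.OriginID))
+	h.Write(lp(k.Bucket))
+	h.Write(lp(k.ObjectKey))
+	h.Write(lp(k.ETag))
+	var sz [8]byte
+	binary.LittleEndian.PutUint64(sz[:], uint64(k.ChunkSize))
+	h.Write(sz[:])
+	hashKey := hex.EncodeToString(h.Sum(nil))
+	return path.Join(k.OriginID, hashKey, fmt.Sprintf("%d", k.ChunkIndex))
+}
+
+func main() {
+	k := ChunkKey{
+		OriginID:   "aws-us-east-1-prod", // example values, not real config
+		Bucket:     "models",
+		ObjectKey:  "weights/shard-00.bin",
+		ETag:       "686897696a7c876b7e",
+		ChunkSize:  8 << 20, // 8 MiB default
+		ChunkIndex: 3,       // floor(25 MiB / 8 MiB)
+	}
+	fmt.Println(k.CachePath())
+}
+```
+
+With the default 8 MiB `chunk_size`, a read at byte offset 25 MiB maps
+to `chunk_index = 3`, matching the `floor(byte / chunk_size)` rule above.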
+ +### Diagram 2: Range request -> chunk index mapping + +```mermaid +flowchart LR + Req["GET /bucket/key
Range: bytes=A-B"] --> Math["chunk_size = 8 MiB
firstChunk = A / chunk_size
lastChunk = B / chunk_size"] + Math --> Iter["streaming iterator
cid := firstChunk..lastChunk
sliding prefetch window"] + Iter --> Keys["per cid: ChunkKey =
{origin_id, bucket, key,
etag, chunk_size, cid}"] + Keys --> Path["path =
origin_id /
hex(sha256(LP(origin_id) || ...)) /
cid"] + Path --> CS[("CacheStore
address")] +``` + +## 6. Request flow + +1. `GET /{bucket}/{key}` arrives with optional `Range`. +2. Auth middleware (bearer / mTLS) validates the caller. +3. `fetch.Coordinator` looks up object metadata in the metadata cache. On + miss, **per-replica** singleflight at the metadata layer issues at most + one `HEAD` per object per replica per metadata-cache window. Cluster-wide + bound is therefore N HEADs per object per window worst case where N is + the current peer-set size; this is acceptable in v1 (a cluster-wide HEAD + singleflight is a deferred optimization; see [s15.2](#152-cluster-wide-head-singleflight)). + Two TTLs apply, asymmetric by design (s12): + **positive entries** (`200` + ETag) are reused for `metadata_ttl` + (default 5m), which also bounds the staleness window if the + immutable-origin contract (s11) is violated. **Negative entries** + (`404`, unsupported-blob-type) are reused for `negative_metadata_ttl` + (default 60s), which bounds the create-after-404 unavailability window + after an operator uploads a previously-missing key. +4. If the request has `Range`, validate against `ObjectInfo.Size`; serve + `416` if unsatisfiable. Compute `firstChunk` and `lastChunk`. If + `server.max_response_bytes > 0` and the computed response size exceeds + it, return `400 RequestSizeExceedsLimit` (S3-style XML error body) + with `x-orca-cap-exceeded: true`. `416` is reserved for true + Range-vs-object-size violations. +5. Iterate the chunk range as a streaming iterator. For each `ChunkKey`: + - **ChunkCatalog hit:** open reader from `CacheStore`. Typed + `CacheStore` errors (s7) are honored: only `ErrNotFound` triggers a + refill; `ErrTransient` surfaces as `503 Slow Down` with `Retry-After`, + `ErrAuth` surfaces as `502 Bad Gateway` and counts toward the + `/readyz` `ErrAuth` threshold (default 3 consecutive -> NotReady). + - **ChunkCatalog miss:** call `CacheStore.Stat(key)`. If present, + record in the catalog and serve from the CacheStore. If absent, take + the miss-fill path (s8), which routes to the coordinator for that + specific chunk via local singleflight or per-chunk internal RPC. +6. **Cold path: stream directly with pre-header retry**. On a chunk + miss, the leader issues `Origin.GetRange` with bounded retry + (s8.6) **before** any HTTP response header is sent to the client. + Transient origin failures (5xx, network errors) on retryable + attempts are invisible to the client: the leader retries up to + `origin.retry.attempts` (default 3) with exponential backoff + capped by `origin.retry.max_total_duration` (default 5s). The + commit boundary is the **first byte arrival from origin**: once + the leader has received any byte, response headers + (`Content-Length`, `Content-Range`, `ETag`, + `Accept-Ranges: bytes`) are sent immediately and the leader + begins streaming bytes to the client as they arrive from origin. + The leader simultaneously tees bytes into the local Spool (s8.2) + for joiner support and for the asynchronous CacheStore commit. + `Content-Length` and `Content-Range` are computable from + `ObjectInfo.Size` and the chunk math, so headers can be sent + before the body completes. Pre-commit failures + (`OriginETagChangedError`, retry budget exhausted, internal RPC + failure, semaphore timeout) return a clean HTTP error before + any byte is sent (typically `502 Bad Gateway` or `503 Slow + Down`). 
The CacheStore commit happens asynchronously after the + client response completes, using whichever atomic primitive the + configured driver advertises (`PutObject + If-None-Match: *` for + `s3`; `link()` / `EEXIST` for `localfs` and `posixfs`). The + assembler is driver-agnostic: it calls `CacheStore.PutChunk` and + treats the typed error the same way regardless of backing store. + Commit-after-serve failure does NOT affect the in-flight client + response; it increments + `orca_commit_after_serve_total{result="failed"}` and the + chunk is **not** recorded in the `ChunkCatalog` (the next + request will refill). +7. **Mid-stream failure**: once any body byte has been written + (i.e., after the commit boundary), no HTTP error status is + possible. Mid-stream failures (origin disconnect after first + byte, or any post-commit error) abort the response (HTTP/2 + `RST_STREAM` with `INTERNAL_ERROR`; HTTP/1.1 `Connection: close` + after the partial write) and increment + `orca_responses_aborted_total{phase="mid_stream",reason}`. + S3 clients (aws-sdk, boto3, etc.) detect this via + `Content-Length` mismatch and retry. Mid-stream origin resume + (re-issue origin GET with `Range: bytes=-` and continue + feeding the client transparently) is deferred future work + ([s15.4](#154-mid-stream-origin-resume)). +8. If sequential prefetch is enabled, the iterator schedules asynchronous + fills for the next N chunks (capped per blob and globally) one chunk + ahead of the cursor. + +### Diagram 3: Scenario A - warm read (cache hit) + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant R as Replica + participant Cat as ChunkCatalog + participant CS as CacheStore + C->>R: GET /bucket/key Range: bytes=A-B + R->>R: chunk math -> streaming iterator + Note over R: defer headers until first chunk in hand + loop each ChunkKey (streaming) + R->>Cat: Lookup(k) + Cat-->>R: hit (ChunkInfo) + R->>CS: GetChunk(k, off, n) + CS-->>R: bytes + opt first chunk + R-->>C: 200/206 + Content-Length, Content-Range, ETag + end + R-->>C: stream slice + end + Note over R,CS: All replicas read directly from shared CacheStore on hit
and no peer is involved on the hit path +``` + +### Diagram 4: Scenario B - cold miss, local coordinator + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant R as Replica (assembler == coordinator) + participant Cat as ChunkCatalog + participant SF as Singleflight + participant Sp as Spool + participant O as Origin + participant CS as CacheStore + C->>R: GET /bucket/key Range + R->>Cat: Lookup(k) + Cat-->>R: miss + R->>CS: Stat(k) + CS-->>R: ErrNotFound + R->>SF: Acquire(k) [leader] + SF->>O: GetRange(bucket, key, etag, off, n)
If-Match: etag
(pre-header retry s8.6) + O-->>SF: first byte + Note over SF: commit boundary - origin healthy + par stream to client + SF-->>R: stream bytes as they arrive from origin + R-->>C: 200/206 + headers + body + and tee to spool + SF->>Sp: write bytes (in parallel) + end + O-->>SF: remaining bytes + SF->>Sp: Commit (fsync + close) [after stream complete] + SF-)CS: PutObject(final, body, If-None-Match: *) [async] + CS--)SF: 200 (commit_won) or failure + alt commit ok + SF->>Cat: Record(k, info) + Note over SF: commit_after_serve_total{result=ok}++ + else commit failed + Note over SF: commit_after_serve_total{result=failed}++
chunk NOT recorded - next request refills + end + SF->>SF: Release(k) + SF->>Sp: release after joiners drain +``` + +### 6.1 HEAD request flow + +`HEAD /{bucket}/{key}` is served entirely from object metadata; no +chunk lookup is performed. + +1. Auth as for GET. +2. `fetch.Coordinator` looks up `ObjectInfo` in the metadata cache. + On miss, the metadata-layer singleflight (s8.7) issues at most one + `Origin.Head` per object per replica per `metadata_ttl` window. +3. On success, return `200 OK` with `Content-Length: + ObjectInfo.Size`, `ETag: "ObjectInfo.ETag"`, `Content-Type: + ObjectInfo.ContentType`, `Accept-Ranges: bytes`. No + `CacheStore.Stat` and no `CacheStore.GetChunk` calls. +4. Negative cases reuse the GET error mapping (s6.3): `404` is + negatively cached for `negative_metadata_ttl` (s12); an unsupported azureblob + blob type (s9) returns `502 OriginUnsupported` with the + `x-orca-reject-reason` header. + +HEAD does NOT validate `If-Match` / `If-None-Match` / `If-Modified-Since` +preconditions against the cache state in v1; conditional HEAD is a +read-only client-side concern that operates on the returned `ETag`. + +### 6.2 LIST request flow + +`GET /{bucket}/?list-type=2&prefix=...` (S3 ListObjectsV2). v1 LIST +serves from a per-replica **LIST cache** (s6.2 introduces it; FW3) +in front of the existing per-replica LIST singleflight. The cache +is sized and tuned for the FUSE-`ls` workload pattern: thousands of +edge clients implementing FUSE filesystems perform interactive +`ls` and directory navigation against the S3 API, generating +prefix-clustered LIST traffic where the same query is repeated +many times within a short window. Per-replica caching is naturally +effective for FUSE clients because they typically pin to one +replica via HTTP/2 keepalive. + +**Cache key**: the full LIST query tuple +`(origin_id, bucket, prefix, continuation_token, start_after, +delimiter, max_keys)`. Pagination tokens are part of the key, so +sequential page-through caches each page independently and does +not collide. + +**TTL**: governed by `list_cache.ttl` (default 60s, configurable +typical range 5s - 30m). The 60s default trades freshness vs. +origin load: a freshly-uploaded key is invisible to LIST clients +for up to 60s. Acceptable for the immutable-artifact workload; +operators with write-and-immediately-list patterns should tune +shorter. + +**Eviction**: bounded LRU on `list_cache.max_entries` (default +1024). Memory math: 1024 entries times ~10 KB typical (1000-key +listing) = ~10 MB worst case. + +**Response-size cap**: very large LIST responses +(>`list_cache.max_response_bytes`, default 1 MiB) bypass the cache +entirely; the response is served to the client but not stored. + +**Steps**: + +0. **Cache lookup**. Compute the cache key from the request + parameters. On hit, serve the cached `ListResult` directly with + header `x-orca-list-cache-age: `. No origin + call. No singleflight acquisition. `list_cache_hit_total{origin_id, + result="hit"}++`. + +1. Auth as for GET. + +2. On cache miss, the request parameters `(prefix, continuation-token + / start-after, max-keys, delimiter)` are forwarded verbatim to + `Origin.List`. The continuation token returned to the client is + the origin's token passed through unchanged. There is no token + rewriting. + +3. **Per-replica LIST singleflight** keyed on the same cache-key + tuple collapses concurrent identical LIST calls on the same + replica during the cache miss. 
There is no cluster-wide LIST + singleflight in v1; cluster-wide bound is up to `N` `Origin.List` + calls per identical query per `list_cache.ttl` window where `N` + is peer-set size. Acceptable at v1 scale; a cluster-wide LIST + coordinator is a deferred optimization + ([s15.3](#153-cluster-wide-list-coordinator)). + +4. **azureblob origin**: when `cachestore.azureblob.list_mode = filter` + (the default), non-BlockBlob entries are stripped while + continuation tokens are preserved (s9). `passthrough` mode + disables filtering and returns the entire listing including + unsupported blob types. + +5. **Cache populate** on successful `Origin.List`. If the serialized + `ListResult` exceeds `list_cache.max_response_bytes`, skip the + populate (serve the response normally) and increment + `list_cache_evict_total{reason="response_too_large"}`. Otherwise + store with TTL = `list_cache.ttl`. Negative responses (errors) + are NOT cached; errors fall through every time. Empty-result + listings ARE cached (an authoritative "this prefix has no keys" + for the TTL window). + +6. LIST does NOT populate the metadata cache for individual entries. + A subsequent GET / HEAD on a listed key still triggers an + `Origin.Head` (subject to its own singleflight and TTL). + Rationale: eager metadata population on large listings would + balloon the metadata cache, and the FUSE workload typically + reads only a fraction of listed entries. + +7. Origin failures during LIST surface as `502 Bad Gateway` + (`ErrTransient` upstream) or the corresponding S3 error code; + LIST does NOT trip the CacheStore circuit breaker because it + never touches the CacheStore. + +**Stale-while-revalidate** is opt-in via +`list_cache.swr_enabled: false` default. When enabled with +`list_cache.swr_threshold_ratio: 0.5` (default), an entry whose +age exceeds half of `list_cache.ttl` is served immediately AND +triggers a background `Origin.List` to refresh; the user-observed +latency stays at cache-hit speed even at TTL boundaries. Adds +small extra origin load (one refresh per entry per TTL window). +Useful for heavy interactive FUSE deployments where `ls` latency +spikes at TTL expiry are user-visible. + +**Toggle**: `list_cache.enabled: true` default. Set `false` to +disable the cache layer for diagnostics; LIST falls through to the +existing pass-through behavior with per-replica singleflight only. + +### 6.3 HTTP error-code mapping + +The complete catalog of HTTP statuses the cache layer can return on +the **client edge**. Internal-listener (`:8444`, s8.8) statuses are +listed inline in s8.3 and are not reproduced here. + +| Status | S3-style code | Reason | Triggered by | Client retry? | +|---|---|---|---|---| +| `200 OK` / `206 Partial Content` | (none) | normal hit or successful fill | hit + range OK; cold-path fill after pre-header-retry commit (s8.6) | n/a | +| `400 RequestSizeExceedsLimit` | `RequestSizeExceedsLimit` | response would exceed `server.max_response_bytes` | range math at request entry; `x-orca-cap-exceeded: true` | no (different range) | +| `416 Requested Range Not Satisfiable` | `InvalidRange` | range vs. 
`ObjectInfo.Size` violation | range math at request entry | no (different range) | +| `502 Bad Gateway` | `OriginUnreachable` | origin error before commit boundary | `Origin.GetRange` 5xx; origin DNS failure; semaphore exhausted past wait | yes, small backoff | +| `502 Bad Gateway` | `OriginRetryExhausted` | leader retry budget exhausted (`origin.retry.attempts` or `origin.retry.max_total_duration`) before any byte from origin (s8.6) | sustained transient origin failures during pre-header retry | yes (origin may recover) | +| `502 Bad Gateway` | `OriginETagChanged` | `OriginETagChangedError` from `Origin.GetRange` (s8.6) | mid-flight overwrite caught by `If-Match`; non-retryable | yes (next request re-Heads) | +| `502 Bad Gateway` | `OriginUnsupported` | non-BlockBlob azureblob (s9) | `Origin.Head` returns unsupported blob type | no | +| `502 Bad Gateway` | `BackendUnavailable` | CacheStore `ErrAuth` | CacheStore credentials rejected | no (operator) | +| `503 Slow Down` | `SlowDown` | CacheStore `ErrTransient` | CacheStore 5xx / timeout / throttle | yes | +| `503 Slow Down` | `SlowDown` | spool full | `spool.max_inflight` exhausted past wait | yes | +| `503 Slow Down` | `SlowDown` | breaker open | per-process CacheStore breaker open (s10.2) | yes | +| `503 Service Unavailable` | (probe) | replica NotReady | `/readyz` failing predicates (s10.5) | n/a (LB drain) | +| (mid-stream abort) | n/a | post-commit-boundary failure | origin disconnect after first byte sent to client; CacheStore commit failure does NOT cause this (commit is post-response) | client SDK detects via `Content-Length` mismatch and retries; mid-stream resume deferred (s15.4) | + +`Retry-After: 1s` is set on every `503 Slow Down`. Pre-first-byte +errors carry an S3-style XML body (`......`). +Mid-stream aborts terminate the response (`HTTP/2 RST_STREAM(INTERNAL_ERROR)` +or `HTTP/1.1 Connection: close`) and increment +`orca_responses_aborted_total{phase="mid_stream",reason}`. + +## 7. Internal interfaces + +The mechanism's named seams. Implementations live under +`internal/orca/`. + +```go +// Origin: read-only view of upstream blob store. GetRange takes the etag +// from the prior Head and uses it as an If-Match precondition; mid-flight +// overwrite returns OriginETagChangedError. +type Origin interface { + Head(ctx context.Context, bucket, key string) (ObjectInfo, error) + GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) + List(ctx context.Context, bucket, prefix, marker string, max int) (ListResult, error) +} + +// OriginETagChangedError is returned by Origin.GetRange when the origin +// rejects the If-Match precondition. The fill is refused and the metadata +// cache entry for {origin_id, bucket, key} is invalidated; the next +// request re-Heads and gets a fresh ChunkKey.etag. +type OriginETagChangedError struct { + Bucket, Key string + Want, Got string // Want = ETag we expected; Got = current ETag if known +} + +// CacheStore: where chunk bytes physically live in the DC. Treated as the +// source of truth for chunk presence; backed by an in-DC S3-like service +// in production and a local directory in dev. PutChunk is atomic and +// no-clobber; the second concurrent PutChunk for the same key returns a +// CommitLost error. Read/Stat methods return typed errors: +// - ErrNotFound: chunk is absent. ONLY this error triggers a refill. +// - ErrTransient: backend hiccup (5xx, timeout, throttle). Surfaced as +// 503 Slow Down + Retry-After. 
Counts toward the
+//   per-process circuit breaker (see s10.2).
+// - ErrAuth: backend rejected credentials (401/403). Surfaced as
+//   502 Bad Gateway. Counts toward the breaker AND toward
+//   the /readyz consecutive-ErrAuth threshold (default 3
+//   -> NotReady).
+//
+// Delete removes a chunk; used by active eviction (s13.2). Idempotent;
+// ErrNotFound on a missing chunk is treated as success by the eviction
+// loop. Delete errors count toward the same circuit breaker as Get / Put.
+type CacheStore interface {
+	GetChunk(ctx context.Context, k ChunkKey, off, n int64) (io.ReadCloser, error)
+	PutChunk(ctx context.Context, k ChunkKey, size int64, r io.Reader) error // atomic, no-clobber
+	Stat(ctx context.Context, k ChunkKey) (ChunkInfo, error)
+	Delete(ctx context.Context, k ChunkKey) error // s13.2 active eviction
+	SelfTestAtomicCommit(ctx context.Context) error // startup probe
+}
+
+// CacheStore typed errors. Wrap with %w so callers use errors.Is.
+var (
+	ErrNotFound  = errors.New("cachestore: not found")
+	ErrTransient = errors.New("cachestore: transient")
+	ErrAuth      = errors.New("cachestore: auth")
+)
+
+// ChunkCatalog: in-memory, best-effort record of chunks known to be
+// present in the CacheStore. Purely a hot-path optimization; the
+// CacheStore is the source of truth. A Lookup miss falls through to
+// CacheStore.Stat; the result is Recorded for subsequent requests.
+//
+// Lookup has a side effect: it increments the matched entry's
+// AccessCount and updates LastAccessed (s10.2). These access counters
+// are consumed by the optional active eviction loop (s13.2). Side
+// effects are atomic; Lookup remains safe for concurrent callers.
+//
+// Forget is invoked when an entry is known to be invalid:
+// - on OriginETagChangedError, the assembler Forgets the now-stale
+//   ChunkKey (its etag has been superseded);
+// - on a CacheStore.GetChunk returning ErrNotFound for a key that
+//   was previously Recorded (lifecycle eviction caught the entry);
+// - by the active eviction loop (s13.2) after a successful
+//   CacheStore.Delete.
+// In v1 there are no other callers.
+type ChunkCatalog interface {
+	Lookup(k ChunkKey) (ChunkInfo, bool)
+	Record(k ChunkKey, info ChunkInfo)
+	Forget(k ChunkKey)
+}
+
+// Cluster: peer discovery + rendezvous hashing. Returns the coordinator
+// peer for a given ChunkKey. self == coordinator means handle locally.
+// InternalDial returns a transport (HTTP/2 over mTLS) for issuing
+// internal RPCs to a non-self peer. ServerName returns the stable SAN
+// (default "orca.<namespace>.svc") used for TLS verification across
+// rolling restarts and pod-IP churn; per-replica internal-listener certs
+// MUST include this SAN.
+type Cluster interface {
+	Coordinator(k ChunkKey) Peer // returns self or remote Peer
+	Self() Peer
+	Peers() []Peer // current membership snapshot
+	InternalDial(ctx context.Context, p Peer) (InternalClient, error)
+	ServerName() string // e.g. "orca.<namespace>.svc"
+}
+
+// Spool: bounded local-disk staging area for in-flight fills. Every fill
+// writes through the spool so slow joiners can fall back from the leader's
+// ring buffer to a local disk reader regardless of CacheStore driver.
+type Spool interface { + Begin(k ChunkKey, size int64) (SpoolWriter, error) + Reader(k ChunkKey, off int64) (io.ReadCloser, error) + Release(k ChunkKey) // drop spool entry once all in-flight readers are done +} + +type SpoolWriter interface { + io.Writer + Commit() error // fsync + close + Abort() error // discard +} + +// --------------------------------------------------------------------- +// Supporting types referenced by the interfaces above. +// --------------------------------------------------------------------- + +// ObjectInfo: result of a successful Origin.Head and the metadata-cache +// entry shape. LastValidated and LastStatus are advisory and used for +// negative-cache TTL accounting (s8.6). +type ObjectInfo struct { + Size int64 + ETag string + ContentType string + LastValidated time.Time + LastStatus int // last HTTP status seen from the origin +} + +// ChunkInfo: result of a successful CacheStore.Stat or +// ChunkCatalog.Lookup. Size is the on-store byte length, which equals +// chunk_size for all chunks except the last chunk of an object (which +// is partial; see s10.3). +// +// AccessCount, LastAccessed, and LastEntered are set by the +// ChunkCatalog as access-frequency tracking for the optional active +// eviction loop (s13.2). They are zero-valued on freshly-Recorded +// entries and are atomically updated by Lookup. +type ChunkInfo struct { + Size int64 + Committed time.Time + AccessCount uint32 // s13.2; saturates at MaxUint32 + LastAccessed time.Time // s13.2; updated on Lookup hit + LastEntered time.Time // s13.2; set on Record; never updated +} + +// ListResult: paginated result from Origin.List. +type ListResult struct { + Entries []ObjectEntry + NextMarker string + IsTruncated bool +} + +// ObjectEntry: one item in a ListResult. BlobType is azureblob-specific +// and lets the cache filter non-BlockBlob entries while preserving +// continuation tokens (s9). +type ObjectEntry struct { + Key string + Size int64 + ETag string + BlobType string // "" for s3 origin; "BlockBlob" / "PageBlob" / "AppendBlob" for azureblob +} + +// Peer: a single replica in the current peer-set snapshot returned by +// Cluster.Peers / Cluster.Coordinator / Cluster.Self. +type Peer struct { + IP string // pod IP from the headless Service A-record + Self bool // true iff this is the current process +} + +// InternalClient: HTTP/2 over mTLS client to a peer's internal listener. +// Returned by Cluster.InternalDial. v1 exposes the per-chunk fill RPC +// only. +type InternalClient interface { + Fill(ctx context.Context, k ChunkKey) (io.ReadCloser, error) +} + +// MetadataCacheEntry: per-entry shape of the metadata cache (s8.7, +// s11.2). Access tracking is set unconditionally on Lookup hit but +// only consumed by the optional bounded-freshness mode (s11.2). +type MetadataCacheEntry struct { + ObjectInfo + AccessCount uint32 // s11.2; saturates at MaxUint32 + LastAccessed time.Time // s11.2; updated on Lookup hit + LastEntered time.Time // s11.2; set on Record; never updated +} +``` + +Implementations: + +- `Origin`: `origin/s3`, `origin/azureblob` (Block Blob only). Both pass + the caller's `etag` as `If-Match` on the underlying GET; both translate + the backend's "precondition failed" status into `OriginETagChangedError`. +- `CacheStore`: `cachestore/localfs` (dev), `cachestore/s3` (in-DC + S3-compatible object store, e.g. VAST), `cachestore/posixfs` (shared + POSIX FS: NFSv4.1+ baseline, plus Weka native, CephFS, Lustre, GPFS). 
+ See [s10.1](#101-atomic-commit-per-cachestore-driver) for atomic-commit + specifics per driver. The two POSIX-shaped drivers (`localfs` and + `posixfs`) share their commit primitives (`link()` no-clobber, dir + fsync, staging-dir layout, optional fan-out) via + `internal/orca/cachestore/internal/posixcommon/`; this is an + internal-to-cachestore package and is not visible to the rest of the + cache layer. +- `ChunkCatalog`: a single in-memory LRU implementation with + optional access-frequency tracking driving the active eviction + loop (s13.2). Bounded by `chunk_catalog.max_entries`. +- `Cluster`: a single implementation that polls the headless Service + (default 5s), computes rendezvous hashes against pod IPs, and exposes + an mTLS HTTP/2 client for the internal listener. +- `Spool`: a single implementation backed by a configured local directory + (`spool.dir`) with a capacity cap (`spool.max_bytes`) and an in-flight + cap (`spool.max_inflight`). + +## 8. Stampede protection + +The single most important hot-path correctness issue. Layered defense. + +### 8.1 Per-`ChunkKey` singleflight + +Process-local map `inflight: map[ChunkKey]*Fill`, guarded by a mutex. Each +`*Fill` has a `done` channel, an error slot, the resulting `ChunkInfo`, a +bounded ring buffer, a `Spool` handle (s8.2), and a refcount. Acquire +path: under the lock, either return the existing entry as a joiner or +insert a new entry and become the leader. Release path: leader removes +the entry from the map after signalling, so any thread arriving while the +entry is mapped joins; any thread arriving after removal records the +chunk in the `ChunkCatalog` (which the leader populated before releasing) +and serves a normal hit. + +### 8.2 TTFB tee + spool + +In v1 the leader streams origin bytes directly to the requesting +client (after pre-header retry confirms a healthy origin +connection, s8.6) AND simultaneously tees the bytes into two +side channels for joiner support and the asynchronous CacheStore +commit: + +1. **Ring buffer** (in-memory, bounded 1-2 MiB by default). Joiners + obtain a `Reader` over this buffer that replays buffered bytes + and blocks on a condition variable for more. Delivers low TTFB + for on-pace joiners. +2. **Spool** (local disk file via the `Spool` interface). The + leader writes every byte to a local spool file in parallel + with the client write and the CacheStore upload. A slow joiner + that falls behind the ring buffer head transparently switches + to a `Spool.Reader(k, off)`. The spool exists because the + production `cachestore/s3` driver streams directly into + `PutObject` and does not produce a readable on-disk tmp file - + without the spool, slow joiners on the s3 path would have no + local fallback. The spool unifies joiner-fallback behavior + across `localfs`, `s3`, and `posixfs` drivers. + +**The spool is NOT on the client TTFB path in v1.** Cold-path +client TTFB is bounded by origin first-byte latency plus a small +amount of pre-header retry overhead (s8.6). The leader does NOT +wait for the chunk to be fully written or fsynced into the spool +before sending bytes to the client. The spool is a parallel +side-channel for joiner support and CacheStore commit; the client +write is independent of and in parallel with the spool write. + +**Spool locality is required (with a documented override).** The +Spool MUST live on a local block device by default. 
At boot, the +cache layer runs `statfs(2)` against `spool.dir` and refuses to +start (exit non-zero) if the filesystem magic matches a network FS +denylist (NFS, SMB / CIFS, CephFS, Lustre, GPFS, FUSE including +Alluxio FUSE), incrementing +`orca_spool_locality_check_total{result="refused"}`. +Governed by `spool.require_local_fs` (default `true`). The +rationale is now defense-in-depth: with the v1 streaming design +the spool no longer gates client TTFB, but joiner-fallback latency +still benefits materially from local NVMe (a remote-FS spool would +convert microsecond-class read-from-spool to milliseconds-class +network-round-trip on every joiner switchover). Operators with +unusual placements (e.g., large RAM-disk) MAY relax the contract +via `spool.require_local_fs: false`; production deployments are +expected to keep the default. See +[s10.4](#104-spool-locality-contract) for the full check. + +**CacheStore commit timing.** After the leader has streamed the +full chunk to the client (and the spool has finished receiving), +the leader performs the CacheStore commit asynchronously +(`PutObject + If-None-Match: *` for `s3`; `link()` for `localfs` +and `posixfs`). Success increments +`commit_after_serve_total{result="ok"}`; failure increments +`commit_after_serve_total{result="failed"}` AND skips +`ChunkCatalog.Record` so the next request refills. The client +response is unaffected either way - by this point the client has +already received the full chunk. + +Capacity: `spool.max_bytes` caps total spool footprint (default 8 +GiB); `spool.max_inflight` caps concurrent fills using the spool. +When the spool is full, new fills wait briefly on the +`spool.max_inflight` semaphore; on timeout they return `503 Slow +Down` to the client. + +After the leader's CacheStore commit succeeds, the spool entry is +retained briefly so any in-flight joiner can finish reading; once +joiner refcount hits zero the spool entry is released. On commit- +after-serve failure the spool entry is released the same way; the +cache layer simply does not record the chunk and the next request +refills. + +### Diagram 5: Scenario C - concurrent miss, same-replica joiner + +```mermaid +sequenceDiagram + autonumber + participant A as Client A (leader request) + participant B as Client B (joiner) + participant R as Replica + participant SF as Singleflight + participant Ring as Ring buffer (1-2 MiB) + participant Sp as Spool (local disk) + participant O as Origin + participant CS as CacheStore + participant Cat as ChunkCatalog + A->>R: GET k + R->>SF: Acquire(k) [leader = A] + SF->>O: GetRange(..., If-Match: etag)
(pre-header retry s8.6) + O-->>SF: first byte + Note over SF: commit boundary - origin healthy + par tee to ring + SF->>Ring: bytes + and tee to spool + SF->>Sp: bytes + and stream to A + SF-->>A: stream bytes as they arrive + end + O-->>SF: remaining bytes + B->>R: GET k (concurrent) + R->>SF: Acquire(k) [joiner = B] + SF-->>B: stream from Ring + Note over B: B falls behind ring head + SF-->>B: switch to Spool.Reader + SF->>Sp: Commit (fsync + close) [after stream complete] + SF-)CS: PutObject(final, body, If-None-Match: *) [async] + CS--)SF: 200 (commit_won) or failure + alt commit ok + SF->>Cat: Record(k, info) + else commit failed + Note over SF: commit_after_serve_total{result=failed}++
chunk NOT recorded + end + SF->>SF: Release(k) + SF->>Sp: Release after joiners drain +``` + +### 8.3 Cluster-wide deduplication via per-chunk fill RPC + +Rendezvous hashing on `ChunkKey` against the current pod-IP set selects +**one coordinator per chunk**. A range request can span N chunks; those +chunks may have N distinct coordinators. The replica that receives the +client request is therefore the **assembler**, not a forwarder of the +whole HTTP request. For each `ChunkKey k` in the requested range: + +- **Hit** (Catalog or `Stat` says present): assembler reads from + `CacheStore` directly. No internal RPC. +- **Miss + `Coordinator(k) == self`**: assembler runs the local + singleflight + tee + spool + commit path (s8.1, s8.2, s10). +- **Miss + `Coordinator(k) != self`**: assembler issues + `GET /internal/fill?key=` to the coordinator on the + coordinator's internal listener (s8.8). The coordinator runs the + singleflight + tee + spool + commit path locally and streams the chunk + bytes back. The assembler stitches the returned bytes into the client + response, slicing the first and last chunk to match the client's `Range`. + +**Loop prevention**: the assembler sets `X-Origincache-Internal: 1` on +internal RPCs. A receiver seeing this header MUST self-check: +`Cluster.Coordinator(k) == Cluster.Self()`. On disagreement (membership +flux), the receiver returns `409 Conflict` with body +`{"reason":"not_coordinator"}`; the assembler falls back to local fill +for that chunk (one duplicate fill possible during flux; observable via +the duplicate-fills metric below). Receivers MUST NOT chain forward +internal RPCs. + +Combined with s8.1, exactly one origin GET per cold chunk per cluster in +steady state. During membership change we accept up to one duplicate fill +per chunk (loser drops on commit collision; observable via +`orca_origin_duplicate_fills_total{result="commit_lost"}`). The +duplicate-fill metric is the leading indicator that this routing is +working: a sustained non-zero `commit_lost` rate signals chronic +membership flux or a bug in the hash distribution. + +### Diagram 6: Scenario D - cold miss, remote coordinator + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant A as Replica A (assembler) + participant B as Replica B (coordinator for k) + participant SF as Singleflight @ B + participant Sp as Spool @ B + participant O as Origin + participant CS as CacheStore + C->>A: GET /bucket/key Range + A->>A: rendezvous(k, peer IPs) = B + Note over A: B != self + A->>B: GET /internal/fill?key=k
X-Origincache-Internal: 1
(mTLS, internal listener :8444) + B->>B: self-check: Coordinator(k) == self? + Note over B: yes, proceed + B->>SF: Acquire(k) [leader] + SF->>O: GetRange(..., If-Match: etag)
(pre-header retry s8.6) + O-->>SF: first byte + Note over SF: commit boundary - origin healthy + par stream to A + SF-->>B: stream bytes as they arrive + B-->>A: chunk bytes (stream) + A-->>C: stream slice + and tee to spool @ B + SF->>Sp: write bytes (in parallel) + end + O-->>SF: remaining bytes + SF->>Sp: Commit (fsync + close) [after stream complete] + SF-)CS: PutObject(final, body, If-None-Match: *) [async] + CS--)SF: 200 (commit_won) or failure + Note over A,B: On membership disagreement at B
B returns 409 and A falls back to local fill + Note over A,B: On hit (chunk in CacheStore)
A reads CacheStore directly with no internal RPC +``` + +### Diagram 7: Scenario E - range spanning multiple coordinators + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant A as Replica A (assembler) + participant CS as CacheStore + participant B as Coordinator(k2) + participant D as Coordinator(k3) + Note over A: Range bytes=X-Y -> chunks {k1, k2, k3} + C->>A: GET /bucket/key Range + A->>A: streaming chunk iterator + Note over A: k1: Stat hit -> read CacheStore + A->>CS: GetChunk(k1) + CS-->>A: bytes + A-->>C: stream slice (first chunk -> headers go out) + Note over A: k2: miss, Coordinator(k2) = B != self + A->>B: GET /internal/fill?key=k2 (mTLS) + B-->>A: chunk bytes + A-->>C: stream slice + Note over A: k3: miss, Coordinator(k3) = D != self + A->>D: GET /internal/fill?key=k3 (mTLS) + D-->>A: chunk bytes + A-->>C: stream slice +``` + +### 8.4 Origin backpressure + +Each replica enforces a **per-replica token bucket** that caps +concurrent `Origin.GetRange` calls. The bucket is sized to a +conservative per-replica fraction of the desired cluster-wide +concurrency: + +``` +target_per_replica = floor(target_global / N_typical) +``` + +where `N_typical` is the expected replica count in steady state +(`cluster.target_replicas`, default 3). Defaults: `target_global=192`, +giving `target_per_replica=64`. + +This is approximate. Realized cluster-wide concurrency depends on +the actual replica count `N_actual`: + +- `N_actual == N_typical`: realized cap is `target_global` exactly. +- `N_actual > N_typical` (scaled out without updating + `cluster.target_replicas`): realized cap exceeds `target_global` + by up to `(N_actual - N_typical) * target_per_replica`. +- `N_actual < N_typical` (scaled in): realized cap falls below + `target_global` by `(N_typical - N_actual) * target_per_replica`. + +Operators MUST update `cluster.target_replicas` after any sustained +scale change. Dynamic recompute of the cap from `len(Cluster.Peers())` +is a deferred optimization; see +[s15.6](#156-dynamic-per-replica-origin-cap). + +Origin throttling responses (HTTP 503 SlowDown, 429, retryable +5xx) are handled by the leader's pre-header retry loop (s8.6 / +Option D), which provides exponential backoff transparent to the +client. If the retry budget exhausts, the leader returns +`502 OriginRetryExhausted`. The system self-regulates without +cluster-wide coordination: an over-loaded origin slows individual +fills via backoff; the per-replica cap bounds inflight per pod; +the singleflight (s8.1) collapses concurrent identical fills. + +When the bucket is saturated, leaders queue with bounded wait +(`origin.queue_timeout`, default 5s); on timeout, the request +returns `503 Slow Down` to the client so clients back off. +Joiners on existing fills do not consume slots. + +The current saturation is exposed as +`orca_origin_inflight{origin}` (per-replica gauge). +Operators can sum across replicas in their monitoring stack to +observe approach to `target_global`. + +A real coordinated cluster-wide limiter (Kubernetes-Lease-elected +authority + slot-lease tokens + RPC-based slot acquisition + +graceful fallback) is a deferred optimization; see +[s15.5](#155-coordinated-cluster-wide-origin-limiter) for the +full design, trigger conditions, and v1 bound. Build only when +measured deployment scale (>10 replicas with steady-state slot +under-utilization) justifies the additional surface area. + +Optional token bucket on origin bytes/sec layered on top of the +slot-based concurrency cap. 
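+
+As a concrete illustration of the per-replica cap, the sketch below
+models the slot bucket as a buffered channel with a bounded wait. The
+identifiers (`originSlots`, `errOriginQueueTimeout`) are hypothetical
+and chosen for this example; the real limiter lives under
+`internal/orca/` and may differ in shape.
+
+```go
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+)
+
+// errOriginQueueTimeout is surfaced to the client as 503 Slow Down.
+var errOriginQueueTimeout = errors.New("origin: timed out waiting for a fill slot")
+
+// originSlots is a per-replica slot bucket capping concurrent
+// Origin.GetRange calls. A buffered channel is the whole mechanism.
+type originSlots struct {
+	ch           chan struct{}
+	queueTimeout time.Duration
+}
+
+// newOriginSlots sizes the bucket to floor(targetGlobal / targetReplicas),
+// e.g. 192 / 3 = 64 slots per replica with the defaults from this section.
+func newOriginSlots(targetGlobal, targetReplicas int, queueTimeout time.Duration) *originSlots {
+	return &originSlots{
+		ch:           make(chan struct{}, targetGlobal/targetReplicas),
+		queueTimeout: queueTimeout,
+	}
+}
+
+// acquire waits up to queueTimeout for a slot. Only fill leaders call
+// this; joiners ride an existing fill and never consume a slot.
+func (s *originSlots) acquire(ctx context.Context) error {
+	t := time.NewTimer(s.queueTimeout)
+	defer t.Stop()
+	select {
+	case s.ch <- struct{}{}:
+		return nil // slot held until release()
+	case <-t.C:
+		return errOriginQueueTimeout
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+}
+
+func (s *originSlots) release() { <-s.ch }
+
+// inflight feeds the orca_origin_inflight per-replica gauge.
+func (s *originSlots) inflight() int { return len(s.ch) }
+
+func main() {
+	slots := newOriginSlots(192, 3, 5*time.Second)
+	if err := slots.acquire(context.Background()); err != nil {
+		fmt.Println("would return 503 Slow Down:", err)
+		return
+	}
+	defer slots.release()
+	fmt.Printf("origin fill running, inflight=%d of %d\n", slots.inflight(), cap(slots.ch))
+}
+```
+
+A leader wraps each `Origin.GetRange` in `acquire` / `release`; with the
+defaults (`target_global=192`, `cluster.target_replicas=3`,
+`origin.queue_timeout=5s`) this yields 64 slots per replica and a
+`503 Slow Down` to the client when the 5s wait expires.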
+ +### 8.5 Cancellation safety + +`Fill.run()` uses an internal long-lived context, not any single client's +context. The fill outlives any single requester. If every joiner cancels +we still finish the fill (cheap insurance; configurable to abort). A +joiner cancelling unblocks only itself. + +### 8.6 Failure handling without re-stampede + +- **Retryable error**: short-lived negative entry in the singleflight map + (cooldown 100 ms - 1 s) so concurrent joiners share the failure rather + than each retrying immediately. +- **`OriginETagChangedError`**: leader (a) invalidates the metadata cache + entry for `{origin_id, bucket, key}`, (b) fails the in-flight fill, (c) + joiners receive the same error and abort their responses (or, if + pre-commit, get a `502 Bad Gateway`). The next request triggers a + fresh `Head` and a new `ChunkKey` with the new ETag. Old chunks under + the old ETag age out via the CacheStore lifecycle. Increments + `orca_origin_etag_changed_total`. +- **Hard 404 / unsupported blob type**: cached in the metadata cache as + a negative entry for `negative_metadata_ttl` (default 60s, + configurable). Per-replica HEAD singleflight (s8.7) caps origin HEAD + load at one HEAD per object per replica per window. The full + negative-cache lifecycle and the create-after-404 case (an operator + uploads `K` after a client has already observed `404` on `K`) are in + [s12](#12-create-after-404-and-negative-cache-lifecycle). +- **Pre-header origin retry (the v1 cold-path retry mechanism)**: + the leader retries `Origin.GetRange` on transient errors **before** + any HTTP response header is sent to the client, making transient + origin failures invisible to the client. The retry budget is + bounded by both attempt count and total wall-clock duration: + - `origin.retry.attempts` (default 3): max attempts. + - `origin.retry.backoff_initial` (default 100ms), + `origin.retry.backoff_max` (default 2s): exponential backoff + cap per attempt. + - `origin.retry.max_total_duration` (default 5s): absolute + wall-clock cap; if exceeded the leader returns `502 Bad Gateway` + even before all attempts complete. + + The **commit boundary** is the first byte arrival from origin: + once received, the leader sends headers + first byte, then + streams. Pre-commit failures return clean HTTP errors (`502 + Bad Gateway` with code `OriginUnreachable` or + `OriginRetryExhausted`); post-commit failures become mid-stream + client aborts (s6 step 7). `OriginETagChangedError` is + non-retryable (the object identity changed; refilling under the + old ETag is the bug we are preventing); the leader returns + `502 OriginETagChanged` immediately. Joiners sit through retries + on the same `Fill`. Outcomes are exposed as + `orca_origin_retry_total{result="success|exhausted_attempts|exhausted_duration|etag_changed"}` + (one increment per request that entered the retry loop) and + `orca_origin_retry_attempts` (histogram of attempt count + per request). + + The retry budget defaults are intentionally smaller than typical + S3 SDK read timeouts (aws-sdk-go: 30s; boto3: 60s) so retries + complete before clients time out. +- **`CommitFailedAfterServe`**: the CacheStore commit happens + asynchronously after the client response is complete (s8.2). A + failure here is NOT visible to the client. The leader increments + `orca_commit_after_serve_total{result="failed"}` and + does NOT call `ChunkCatalog.Record`. 
Joiners on the same fill + that are still draining the Spool finish normally; the next + request for the same `ChunkKey` re-runs the fill (one extra + origin GET worst case). Sustained non-zero `failed` rate is a + CacheStore-health alert, not a per-request error path. +- **Typed `CacheStore` errors during read**: `ErrNotFound` triggers the + miss-fill path; `ErrTransient` surfaces as `503 Slow Down` with + `Retry-After: 1s`; `ErrAuth` surfaces as `502 Bad Gateway`. Sustained + `ErrTransient` / `ErrAuth` trips the per-process **CacheStore circuit + breaker** (s10.2). Sustained `ErrAuth` (default 3 consecutive) flips + `/readyz` to NotReady so load balancers drain the replica. + +### 8.7 Metadata-layer singleflight + +Same pattern at the metadata cache: +`metaInflight: map[ObjectKey]*MetaFill`. Without this, a flood of +distinct cold keys shifts the storm from chunk GETs to chunk HEADs. +Stale-while-revalidate behavior: serve stale within a small margin while +one background refresh runs. The singleflight is **per-replica**: a +cluster-wide cold-fan-out can cause up to N HEADs per object per +`metadata_ttl` window where N is the current peer-set size. This is +acceptable in v1; a cluster-wide HEAD singleflight is a deferred +optimization (see [s15.2](#152-cluster-wide-head-singleflight)). + +**LIST cache singleflight (FW3, s6.2).** A parallel per-replica +singleflight collapses concurrent identical `Origin.List` calls +keyed on the full LIST query tuple. Sits in front of the LIST +cache; reused on cache miss. Cluster-wide bound is up to N origin +LIST per identical query per `list_cache.ttl`; a cluster-wide LIST +coordinator is a deferred optimization (s15.3). + +**Bounded-freshness mode interaction (FW5, s11.2).** When +`metadata_refresh.enabled: true`, background refresh workers are +gated by the same per-replica HEAD singleflight: if both an +on-demand miss-fill and a background refresh fire for the same +object key concurrently, they share one `Origin.Head` and both +consumers receive the result. New entries Recorded on a miss-fill +start with `AccessCount=0` and `LastEntered=now`; the cold-start +protection (`min_age`) prevents these from being immediately +eligible for refresh. + +### 8.8 Internal RPC listener + +Per-chunk fill RPCs (`GET /internal/fill?key=`) are +served on a separate listener bound to a distinct port (default `:8444`, +config `cluster.internal_listen`). This isolates inter-replica traffic +from the client edge. + +- **Transport**: HTTP/2 over mTLS. +- **Server cert**: per-replica cert (e.g. cert-manager-issued) chained to + a configured **internal CA** (`cluster.internal_tls.ca_file`). The + internal CA is **distinct** from the client mTLS CA so a leaked client + cert cannot be used to dial the internal listener. The cert MUST + include the stable SAN `cluster.internal_tls.server_name` (default + `orca..svc`); pod-IP SANs are NOT used because pod IPs + change on rolling restart. +- **Client auth**: peer presents a client cert chained to the internal CA + AND the peer's source IP must be in the current peer-IP set + (`Cluster.Peers()`). The IP-set check guards against a leaked internal + cert being usable from outside the Deployment. +- **TLS verification**: the dialer pins `tls.Config.ServerName` to the + value returned by `Cluster.ServerName()` (the same stable SAN above) + rather than to the destination pod IP. This keeps verification + consistent across rolling restarts and pod-IP churn. 
+- **Authorization scope**: the internal listener serves `GET + /internal/fill?key=` only - the per-chunk + fill RPC (s8.3). No client identity is propagated from the + assembler because chunk content is identity-independent: any + authorized client at the assembler is entitled to the chunk + bytes, and the coordinator is doing the same fill it would do + for a local request. +- **NetworkPolicy**: ingress on `:8444` allowed only from pods with + label `app=orca` in the same namespace. +- **Loop prevention**: receiver enforces `X-Origincache-Internal: 1` -> + self must be coordinator for the requested `ChunkKey`, else + `409 Conflict`. + +Metrics: `orca_cluster_internal_fill_requests_total{direction= +"sent|received|conflict"}`, +`orca_cluster_internal_fill_duration_seconds`. + +## 9. Azure adapter: Block Blob only + +Hardened constraint. + +- Enforced in `internal/orca/origin/azureblob.Head`. Block type is + immutable on an existing blob (you have to delete and recreate to change + it, which produces a new ETag), so checking once per `(container, blob, + etag)` is sufficient. +- Detection via `Get Blob Properties` -> `BlobType` field. Reject anything + other than `BlockBlob` with a typed error `UnsupportedBlobTypeError` + exported from `internal/orca/origin`. +- Surfaced to clients as HTTP `502 Bad Gateway` with S3 error code + `OriginUnsupported`, body containing reason, plus + `x-orca-reject-reason: azure-blob-type=` header. +- Negatively cached in the metadata cache for `negative_metadata_ttl` + (default 60s; see [s12](#12-create-after-404-and-negative-cache-lifecycle)) + and + singleflighted at the metadata layer to prevent re-probing. +- `ListObjectsV2` defaults to `filter` mode: non-Block Blob entries are + skipped while preserving continuation tokens. `passthrough` mode is + available for debugging. +- Config schema reserves `enforce_block_blob_only: true`. Setting it to + false is rejected at startup. +- `Origin.GetRange` on the azureblob adapter uses `If-Match: ` on + the underlying Get Blob; `412 Precondition Failed` is translated to + `OriginETagChangedError` (s8.6). +- Prometheus counter: + `orca_origin_rejected_total{origin="azureblob",reason="non_block_blob",blob_type=...}`. + +### Diagram 8: Scenario F - Azure non-BlockBlob rejection + +```mermaid +flowchart TD + Req["client GET /bucket/key
(azureblob origin)"] --> Meta["Metadata cache lookup"] + Meta -- "hit: BlockBlob" --> OkPath["proceed: chunk path
(GetRange uses If-Match: etag)"] + Meta -- "hit: rejected" --> Reject1["502 OriginUnsupported
(neg cache TTL)"] + Meta -- "miss" --> Head["Origin Get Blob Properties
(metadata-layer singleflight)"] + Head --> Type{"BlobType?"} + Type -- "BlockBlob" --> CacheOk["metadata cache:
BlockBlob
(default TTL)"] + Type -- "PageBlob | AppendBlob" --> CacheReject["metadata cache:
UnsupportedBlobTypeError
(negative_metadata_ttl)
+ rejected_total++"] + CacheOk --> OkPath + CacheReject --> Reject2["502 OriginUnsupported
x-orca-reject-reason:
azure-blob-type=type"] + LR["ListObjectsV2
(list_mode=filter)"] --> Filter["skip non-BlockBlob entries,
preserve continuation tokens"] +``` + +## 10. Concurrency, durability, correctness + +### 10.1 Atomic commit (per CacheStore driver) + +The leader publishes a chunk to the CacheStore atomically and +no-clobber: the second concurrent commit for the same key MUST lose +without overwriting the winner. Cold-path commit happens +asynchronously **after** the client response is complete (s8.2 / s6 +step 6), so a commit failure here does NOT affect the +in-flight client response; it only increments +`orca_commit_after_serve_total{result="failed"}` and skips +`ChunkCatalog.Record` (next request refills). + +Three drivers ship in v1, mapped onto two equivalent atomic-commit +primitives. `localfs` and `posixfs` both use POSIX `link()` (or +`renameat2(RENAME_NOREPLACE)` on Linux) returning `EEXIST` to the +loser, and share their helpers via +`internal/orca/cachestore/internal/posixcommon/`. `s3` uses +`PutObject + If-None-Match: *` returning `412` to the loser. All three +drivers run `SelfTestAtomicCommit` at boot. + +Commit outcomes are recorded as label values on the metric +`orca_origin_duplicate_fills_total{result="commit_won|commit_lost"}` +(s8.3). Throughout this section "increment commit_won" / "increment +commit_lost" is shorthand for "increment that counter with the +matching label value". + +#### 10.1.1 cachestore/localfs + +1. Leader stages the chunk inside `/.staging/` (a fixed + subdirectory of the CacheStore root, NOT `/tmp` and NOT the spool + directory). Staging inside the root keeps the file on the same + filesystem as the destination, which is required for `link()` to + succeed; the spool MAY be on a different filesystem and so cannot + also serve as the staging area. +2. After write, `fsync()` then `fsync()`. +3. Commit: `link(/.staging/, )`. POSIX `link()` is + atomic and returns `EEXIST` if the destination exists. On `EEXIST`, + the leader treats the existing `` as the source of truth, + `unlink(/.staging/)`, `fsync(/.staging/)`, and + increments commit_lost. On success, `unlink(/.staging/)`, + `fsync(/.staging/)`, `fsync()`, and + increment commit_won. +4. On Linux, `renameat2(RENAME_NOREPLACE)` is preferred when available + (single syscall) with the same parent-dir fsync sequencing; the + `link` + `unlink` form is the portable fallback (also works on + macOS dev environments). Plain `rename()` is **never** used because + it overwrites the destination on POSIX. +5. Crash recovery: a periodic background sweep (default every 1 hour) + unlinks `/.staging/` entries older than + `cachestore.localfs.staging_max_age` (default 1h), with a + `fsync(/.staging/)` after the batch. Nothing breaks if a + staging file lingers briefly. Each sweep increments + `orca_localfs_dir_fsync_total{result}`. + +#### 10.1.2 cachestore/posixfs + +`posixfs` runs the same `link()` no-clobber primitive as `localfs`, but +against a shared POSIX-style filesystem mounted on every replica at the +same mount point and the same ``. All replicas race the same +`link()` syscall against the same destination inode; the kernel (NFS +server, Weka, CephFS MDS, Lustre MDS, GPFS, etc.) is the arbiter, and +exactly one wins. + +1. Backend selection and detection. At boot the driver inspects the + filesystem under `` via `statfs(2)` (`f_type`) and + `/proc/mounts` and emits an info gauge + `orca_posixfs_backend{type,version,major,minor}` (e.g. + `type="nfs",version="4.1"`, `type="wekafs"`, `type="ceph"`, + `type="lustre"`, `type="gpfs"`). 
Operators MAY override the detected + `type` via `cachestore.posixfs.backend_type` for backends with + ambiguous magic numbers; the override is logged loudly. Detected + `type="fuse"` triggers an extra check: if `/proc/mounts` source + matches `alluxio` (case-insensitive), the driver increments + `orca_posixfs_alluxio_refusal_total` and exits non-zero with + `cachestore/posixfs: Alluxio FUSE is unsupported (no link(2), no + atomic no-overwrite rename, no NFS gateway); use cachestore.driver: + s3 against the Alluxio S3 gateway instead`. +2. NFS minimum version. If `type="nfs"`, the driver reads the + negotiated NFS version from `/proc/mounts` (the `vers=` option). If + the version is below `cachestore.posixfs.nfs.minimum_version` + (default `4.1`), the driver refuses to start. NFSv3 is opt-in only + via `cachestore.posixfs.nfs.allow_v3: true`, which logs a loud + warning and increments + `orca_posixfs_nfs_v3_optin_total`. Rationale: NFSv3 has weak + retransmit semantics; NFSv4.0 has atomic CREATE EXCLUSIVE but no + session idempotency; NFSv4.1+ provides session-based idempotency + that makes `link()` / `EEXIST` safe under client retries. +3. Path layout adds a 2-character hex fan-out to keep directory sizes + manageable on multi-PB working sets: + `////` where `hash` + is the existing s5 hex hash. Fan-out width is governed by + `cachestore.posixfs.fanout_chars` (default `2`, 0 disables). The + `localfs` driver does NOT add fan-out by default (small dev working + sets), but the `posixcommon` helper supports it on both drivers. +4. Stage + commit + recovery: identical to `localfs` (steps 1-5 above) + with the fan-out parent dirs created lazily and `fsync`ed on first + use, and `cachestore.posixfs.staging_max_age` (default 1h) governing + the sweep. +5. **Startup self-test** (`SelfTestAtomicCommit`): on driver init the + `posixfs` driver creates a staging file, links it to a probe final, + then attempts a second `link()` to the same probe final and asserts + `EEXIST`. It then writes a known-size payload to the linked file via + a separate handle and asserts the size is observable to a re-`stat` + after `fsync()`. If `EEXIST` is not returned (the + second `link()` succeeds, or returns a different error), or if the + size verification fails, the driver exits non-zero with + `cachestore/posixfs: backend does not honor link()/EEXIST or + directory fsync; refusing to start`. Governed by + `cachestore.posixfs.require_atomic_link_self_test` (default `true`; + never disabled in production). On success, the driver records + `orca_posixfs_selftest_last_success_timestamp`. +6. NFS export hardening. `posixfs` documents (and the operator runbook + enforces) that NFS exports MUST use `sync` (not `async`); an `async` + export weakens the dir-fsync guarantee that the commit primitive + depends on. The driver cannot detect server-side `async` directly; + the runbook is the contract, and the boot self-test catches the most + common misconfigurations by re-`stat`ing through the negotiated + client cache. + +#### 10.1.3 cachestore/s3 + +1. Leader streams origin bytes (via the Spool, s8.2) into a single + `PutObject(final_key, body, If-None-Match: "*")`. There is no tmp + key and no copy hop. +2. `200 OK` -> commit_won. `412 Precondition Failed` -> commit_lost + (treat the existing object as the source of truth; no cleanup + needed because no tmp object was created). +3. 
**Startup self-test** (`SelfTestAtomicCommit`): on driver init the + `cachestore/s3` driver writes a probe key, then attempts a second + `PutObject(probe_key, ..., If-None-Match: "*")` and asserts a + `412` response. If the backend returns `200` instead (silently + overwrites), the driver fails to start with `cachestore/s3: + backend does not honor If-None-Match: *; refusing to start`. This + prevents silent double-writes on backends that don't implement the + precondition. Verified backends as of v1: AWS S3 (since 2024-08), + MinIO, VAST Cluster (**non-versioned buckets only**). VAST + documents that `If-None-Match: *` is honored on `PutObject` and + `CompleteMultipartUpload` against unversioned buckets but is NOT + supported on versioned buckets ([VAST KB: S3 Conditional + Writes][vast-kb-conditional-writes], 2026-01-26). +4. **Startup versioning gate**: to prevent silent atomic-commit + failures the driver also issues `GetBucketVersioning(bucket)` at + boot. If the response indicates `Status: Enabled` OR + `Status: Suspended` (suspended also disables `If-None-Match`- + based atomic writes on AWS S3), the driver exits non-zero with + `cachestore/s3: bucket has versioning enabled or + suspended; If-None-Match: * is not honored on versioned buckets + and the atomic-commit primitive cannot guarantee no-clobber. + Disable bucket versioning to use cachestore/s3.` Governed by + `cachestore.s3.require_unversioned_bucket` (default `true`; + never disabled in production). The gate emits + `orca_s3_versioning_check_total{result="ok|refused"}` once + per boot. + +[vast-kb-conditional-writes]: https://kb.vastdata.com/documentation/docs/s3-conditional-writes + +### 10.2 Catalog correctness, typed errors, circuit breaker + +The CacheStore is the source of truth. The `ChunkCatalog` is purely an +optimization and may be dropped at any time without affecting correctness; +a `Lookup` miss falls through to `CacheStore.Stat` and refills the +catalog. Catalog entries that point at a now-absent chunk (e.g. evicted +by lifecycle) result in a `CacheStore.GetChunk` returning `ErrNotFound`, +which is the only error treated as a miss and refilled. + +`CacheStore` returns three typed error classes (s7); the cache layer +honors them distinctly: + +- **`ErrNotFound`** (chunk absent): triggers the miss-fill path. Normal + cold-path behavior; not an error from the operator's perspective. +- **`ErrTransient`** (5xx, timeout, throttle): surfaced to the client as + `503 Slow Down` with `Retry-After: 1s`. Counts toward the breaker. + Does NOT trigger refill (would amplify load against an already-degraded + backend). +- **`ErrAuth`** (401/403): surfaced as `502 Bad Gateway`. Counts toward + the breaker. Counts toward the `/readyz` consecutive-`ErrAuth` + threshold (default 3); on threshold the replica reports NotReady and + load balancers drain it. A single non-`ErrAuth` success resets the + counter. + +To prevent amplifying degradation under sustained backend failure, a +**per-process CacheStore circuit breaker** wraps every `CacheStore` +call. 
Defaults (configurable): + +- `error_window: 30s` +- `error_threshold: 10` (`ErrTransient` + `ErrAuth` count; `ErrNotFound` + does not) +- `open_duration: 30s` +- `half_open_probes: 3` + +State machine: **closed** (normal pass-through) -> **open** (immediately +short-circuits CacheStore writes with `ErrTransient`; reads still attempt +once per `open_duration / 10` for liveness probing) -> **half-open** +(allows up to `half_open_probes` test calls; on all-success returns to +closed; on any failure returns to open). Transitions are exposed as +`orca_cachestore_breaker_transitions_total{from,to}` and the +current state as `orca_cachestore_breaker_state` (0=closed, +1=open, 2=half_open). + +**Access-frequency tracking on `Lookup`.** Per FW8 (s13.2), each +`ChunkCatalog.Lookup` hit has a side effect: it increments the +matched entry's `AccessCount` and updates `LastAccessed`. This data +is consumed by the optional active-eviction loop (s13.2). The side +effect is correctness-irrelevant: catalog `Lookup` continues to be +safe to call from any goroutine; access counters are stored +atomically. New entries Recorded by `ChunkCatalog.Record` start with +`AccessCount=0` and `LastEntered=now`. + +**`CacheStore.Delete` breaker integration.** Active eviction +(s13.2) calls `CacheStore.Delete` in the background. `Delete` +errors count toward the same breaker as `Get` / `Put` errors: +sustained `ErrTransient` or `ErrAuth` from `Delete` opens the +breaker, which short-circuits subsequent writes (including the +eviction loop's deletes). The eviction loop checks breaker state +at run start and skips entirely if the breaker is open +(`active_eviction_runs_total{result="breaker_open"}++`). This +prevents the eviction loop from amplifying load against a +degraded backend. + +### 10.3 Range, sizes, and edge cases + +- Partial last chunk of a blob stored at its actual size; `ChunkInfo.Size` + records it; range math respects it. +- `416 Requested Range Not Satisfiable` is returned by the server before + any cache lookup, using object metadata, **only** for true Range vs. + object-size violations. +- `server.max_response_bytes` overflow returns + `400 RequestSizeExceedsLimit` (S3-style XML error body) with + `x-orca-cap-exceeded: true` (s6). It is reported as `400` and + not `416` because the cap is a server policy, not a property of the + object: clients cannot fix it by re-requesting a different Range past + EOF. +- Origin failure during fill never commits the staging file or makes a + final PutObject. Pre-commit (before first byte from origin): the + pre-header retry loop (s8.6) handles transient cases; if the retry + budget exhausts, the leader returns `502 Bad Gateway` to the client + and records a transient negative singleflight entry. Post-commit + (after first byte sent to client): the response aborts mid-stream + (s6 step 7); any CacheStore commit failure is invisible to the + client and recorded as `commit_after_serve_total{result="failed"}` + (s8.6). Mid-stream origin resume is deferred future work + (s15.4). + +### Diagram 9: Atomic commit (localfs vs posixfs vs s3 CacheStore) + +```mermaid +flowchart TB + Leader["Singleflight leader
finishes origin read
(via Spool tee; client response
already complete)"] --> Driver{"CacheStore
driver"} + Driver -- "localfs" --> L1["stage in <root>/.staging/<uuid>
fsync(file) + fsync(staging dir)"] + L1 --> L2["link(staging, final)
or renameat2(RENAME_NOREPLACE)"] + L2 -- "EEXIST" --> Llost["unlink staging
fsync(staging dir)
commit_lost++
treat existing final as truth"] + L2 -- "ok" --> Lwon["unlink staging
fsync(staging dir) + fsync(final parent dir)
commit_won++"] + Driver -- "posixfs" --> P1["stage in <root>/.staging/<uuid>
fsync(file) + fsync(staging dir)
(shared FS - same primitive as localfs)"] + P1 --> P2["link(staging, final)
across NFSv4.1+ / Weka / CephFS / Lustre / GPFS"] + P2 -- "EEXIST" --> Plost["unlink staging
fsync(staging dir)
commit_lost++
treat existing final as truth"] + P2 -- "ok" --> Pwon["unlink staging
fsync(staging dir) + fsync(final parent dir)
commit_won++"] + Driver -- "s3" --> S1["PutObject(final, body,
If-None-Match: *)"] + S1 -- "200" --> Swon["commit_won++"] + S1 -- "412" --> Slost["commit_lost++
treat existing object as truth"] + Lwon --> Pub["ChunkCatalog.Record(k, info)"] + Llost --> Pub + Pwon --> Pub + Plost --> Pub + Swon --> Pub + Slost --> Pub + Pub --> Done["chunk visible to all replicas"] + Sweep["periodic sweep cleans
stale <root>/.staging/<uuid>
older than staging_max_age"] -.-> L1 + Sweep -.-> P1 + SelfTestS3["startup SelfTestAtomicCommit (s3)
refuse to start if
If-None-Match not honored"] -.-> S1 + SelfTestPosix["startup SelfTestAtomicCommit (posixfs)
link EEXIST + dir-fsync + size verify
refuse on Alluxio FUSE
refuse if NFS < minimum_version
(opt-in via nfs.allow_v3)"] -.-> P1 + Failed["any commit failure
after client response complete"] -.-> CASF["commit_after_serve_total{failed}++
skip Catalog.Record"] +``` + +### 10.4 Spool locality contract + +The local Spool (s8.2) is no longer on the cold-path client-TTFB +path in v1: bytes stream origin -> client directly (s6 step 6 / +s8.6 pre-header retry). The spool is a parallel side-channel that +serves joiner-fallback reads and feeds the asynchronous CacheStore +commit. + +Even so, the spool benefits materially from a local block device. +A joiner that falls behind the in-memory ring buffer head +transparently switches to a `Spool.Reader(k, off)`. Local NVMe +serves these reads in microsecond-class latency; a network +filesystem (NFS, CephFS, Lustre, GPFS, FUSE) instead pays a +network round-trip on every read, which is tens of milliseconds +at best and seconds during congestion. That converts smooth +joiner-fallback into multi-second TTFB stalls for slow joiners. +Network-FS spools also weaken the durability semantics that the +asynchronous CacheStore commit relies on. + +To prevent foot-gun deployments, the cache layer enforces a +**boot-time locality check** before any client traffic is +accepted, governed by `spool.require_local_fs` (default `true`): + +1. Resolve `spool.dir` to an absolute path; resolve symlinks. +2. Call `statfs(2)` on the resolved path. Read `f_type`. +3. Compare `f_type` against a denylist (these magic numbers indicate a + network or virtual FS that violates the locality contract): + - `NFS_SUPER_MAGIC` (`0x6969`) - any NFS version, including + NFSv4.1+. + - `SMB2_MAGIC_NUMBER` (`0xfe534d42`), `CIFS_MAGIC_NUMBER` + (`0xff534d42`) - SMB / CIFS. + - `CEPH_SUPER_MAGIC` (`0x00c36400`) - CephFS kernel client. + - `LUSTRE_SUPER_MAGIC` (`0x0bd00bd0`) - Lustre. + - `GPFS_SUPER_MAGIC` (`0x47504653`) - IBM Spectrum Scale. + - `FUSE_SUPER_MAGIC` (`0x65735546`) - any FUSE mount, including + Alluxio FUSE. +4. On match: increment + `orca_spool_locality_check_total{result="refused",fs_type=""}`, + log `spool: is on a network filesystem (); + joiner-fallback latency would be unbounded. Refusing to start. + Set spool.dir to a local-NVMe-backed path or, for unusual + placements (e.g., RAM-disk), set spool.require_local_fs=false`, + and exit non-zero. +5. On no match: increment + `orca_spool_locality_check_total{result="ok",fs_type=""}` + and proceed. + +**Relaxation**. `spool.require_local_fs: false` allows operators +with unusual placements (RAM-disk, tmpfs, exotic local FS not on +the denylist) to bypass the check. The override is supported but +not recommended for production: with the v1 streaming design the +spool no longer gates client TTFB, but joiner-fallback latency +still benefits materially from local block storage. The metric +label `result="bypassed"` distinguishes overridden runs from +clean ones, and the boot log carries a loud `WARN +spool.require_local_fs is disabled; joiner-fallback latency is +best-effort` line. + +The check is in `internal/orca/fetch/spool/` and runs from +`cmd/orca/orca/main.go` before the HTTP listener binds. +It runs before any CacheStore self-test so a misconfigured spool +fails fast even on backends that would otherwise pass their own +self-test. + +### 10.5 Readiness probe (`/readyz`) + +The HTTP `/readyz` endpoint reports whether the replica should +receive client traffic. It is checked by the Kubernetes readiness +probe and by front-of-cluster load balancers. Distinct from +`/livez`, which is a process-liveness check only. + +**Response shape.** + +- `200 OK`, body `{"ready": true}`, when **all** of the following + predicates hold: + 1. 
boot self-tests have passed (`SelfTestAtomicCommit` for the + configured CacheStore driver; spool locality check, s10.4); + 2. the per-process CacheStore circuit breaker (s10.2) is `closed` + or `half_open`; + 3. consecutive `ErrAuth` count from the CacheStore is below + `readyz.errauth_consecutive_threshold` (default 3); + 4. peer discovery (s14) has completed at least one successful DNS + refresh since boot (the empty-peer fallback in s14 keeps the + replica functional, but `/readyz` still requires one + successful refresh so a totally broken DNS path does not stay + silently masked); + 5. the local Spool has free capacity below `spool.max_bytes`. + +- `503 Service Unavailable`, body + `{"ready": false, "reasons": ["..."]}`, when any predicate above + fails. The `reasons` array names the failing predicates by stable + string keys (`selftest_pending`, `selftest_failed`, + `breaker_open`, `errauth_threshold`, `peer_discovery_pending`, + `spool_full`) so operators can triage from a probe response + alone. + +**NotReady -> Ready transitions.** The endpoint is stateless apart +from reading the underlying components. Predicates clear themselves +as the system recovers: + +- breaker `open` -> `closed` after `half_open_probes` successful + probes (s10.2); +- `ErrAuth` consecutive counter resets on any non-`ErrAuth` success; +- spool fullness clears as in-flight fills drain; +- peer discovery flips to "completed" on the first successful + refresh and stays sticky for the lifetime of the process. + +**`/livez`.** A liveness-only check that returns `200 OK` if the +process is running and the HTTP listener is bound; it does NOT +consider any of the predicates above and is intentionally trivial. +This separation lets the readiness probe drain a misconfigured +replica without restarting it (so operators can inspect logs). + +`/readyz` and `/livez` are bound to the same client listener as the +S3 API; they are NOT served on the internal listener (`:8444`, +s8.8) because the internal listener's authorization scope is +restricted to the `/internal/fill` per-chunk fill RPC. + +## 11. Bounded staleness contract + +Orca trusts an **operator contract** for correctness, and bounds +the consequences of contract violation by configuration. + +### 11.1 The contract and the staleness window + +**The contract.** For a given `(origin_id, bucket, object_key)`, the +underlying bytes are immutable for the life of the key. If the data +changes, operators MUST publish it under a new key. Replacement in place +is a contract violation. + +**Why we trust it.** Cache key derivation includes the origin `ETag` +(s5), and a new ETag deterministically yields a new `ChunkKey` and a +fresh chunk path on the CacheStore. As long as the contract holds, the +cache cannot serve stale bytes: every change of identity is a change of +key. + +**What happens if the contract is violated.** The cache may serve the +old bytes for up to one **`metadata_ttl`** window (default 5m, +configurable). Mechanism: + +- Object metadata (`size`, `etag`, `content_type`) is cached for + `metadata_ttl` to avoid re-`HEAD`ing on every request. +- During that window, requests resolve to the old `etag`, derive the + same `ChunkKey`, and serve from cached chunks. +- After the window expires, the next request triggers a fresh `Head`, + observes the new ETag, derives a new `ChunkKey`, and refills. + +**Why this is acceptable for v1.** The intended workload is large +immutable artifacts (job inputs, model weights, training shards). 
The +contract matches how those are produced. The 5m window is a tunable +upper bound, not a typical case: a flood of distinct cold keys reads the +correct ETag on first contact with the cache. + +**Defense in depth.** `If-Match: ` is sent on every +`Origin.GetRange` (s8.6). If an in-flight fill races with an in-place +overwrite, the origin returns `412 Precondition Failed` and the leader +fails the fill, invalidates the metadata cache entry for +`{origin_id, bucket, key}`, and increments +`orca_origin_etag_changed_total`. This catches the narrow window +where a violation happens between the cache's `Head` and its `GetRange`. +It does NOT catch a violation that happens between two complete +request lifecycles within the same `metadata_ttl` window; the +`metadata_ttl` cap is what bounds that case. + +### 11.2 Bounded-freshness mode (optional) + +The default v1 posture is "trust the contract, cap the window". Some +workloads benefit from shorter effective staleness windows on hot keys +(typically: deployments where contract violations are operationally +possible, or where TTL-boundary cold-miss latency on popular content +is unacceptable). For those workloads, FW5 adds an opt-in +**bounded-freshness mode** that proactively re-Heads hot keys ahead +of `metadata_ttl`. + +**Opt-in via config**: `metadata_refresh.enabled: false` (default). +When `false`, no background activity; the cache behaves exactly as +described in s11.1. + +**Hot-key tracking**. Bounded-freshness mode requires per-entry access +tracking on the metadata cache, parallel to the chunk-catalog access +tracking from FW8 (s13.2). Each `MetadataCacheEntry` gains: +- `AccessCount` (uint32, increments on Lookup hit) +- `LastAccessed` (updated on Lookup hit) +- `LastEntered` (set on Record; never updated) + +This tracking is independent of the chunk-catalog tracking; metadata +hotness can diverge from chunk hotness (e.g., random-range reads +access many chunks of one object). + +**Eligibility**. An entry is eligible for proactive refresh when ALL +of: +- `AccessCount >= access_threshold` (default 5; "hot" key) +- `now - LastEntered >= refresh_ahead_ratio * metadata_ttl` (default + 0.7 * 5m = 3.5m; approaching TTL) +- `now - LastEntered < metadata_ttl` (still valid) +- `now - LastEntered >= min_age` (default `metadata_ttl/4` = 75s; + cold-start protection) +- no in-flight refresh for this key (per-replica HEAD singleflight, + s8.7, gates this) + +**Negative entries** (404, unsupported blob type) are NOT refreshed. +Refreshing them would generate HEAD load to confirm a known-missing +key; `negative_metadata_ttl` (default 60s, s12) handles the +create-after-404 recovery instead. 
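+
+The refresh loop below gates candidates on an `eligible(e)` check. As a
+minimal sketch only (illustrative type and config names, not the shipped
+code), the ALL-of conditions above translate to:
+
+```go
+package metarefresh // hypothetical placement, for illustration only
+
+import "time"
+
+// Minimal mirror of the MetadataCacheEntry fields named above; the real
+// entry also carries the cached object metadata.
+type MetadataCacheEntry struct {
+    AccessCount  uint32
+    LastAccessed time.Time
+    LastEntered  time.Time
+}
+
+type refreshConfig struct {
+    MetadataTTL       time.Duration // metadata_ttl, default 5m
+    AccessThreshold   uint32        // access_threshold, default 5
+    RefreshAheadRatio float64       // refresh_ahead_ratio, default 0.7
+    MinAge            time.Duration // min_age, default MetadataTTL/4
+}
+
+// eligible applies the ALL-of conditions from this section. The "no
+// in-flight refresh" condition is intentionally absent: the per-replica
+// HEAD singleflight (s8.7) enforces it outside this predicate.
+func eligible(e MetadataCacheEntry, cfg refreshConfig, now time.Time) bool {
+    age := now.Sub(e.LastEntered)
+    refreshAhead := time.Duration(cfg.RefreshAheadRatio * float64(cfg.MetadataTTL))
+    return e.AccessCount >= cfg.AccessThreshold && // hot key
+        age >= refreshAhead && // approaching metadata_ttl
+        age < cfg.MetadataTTL && // entry still valid
+        age >= cfg.MinAge // cold-start protection
+}
+```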
+ +**Refresh loop**: + +``` +every metadata_refresh.interval: # default 1m + candidates = [] + scan metadata cache: + for each entry e: + if eligible(e): + candidates.append(e) + sort candidates: + primary: highest AccessCount first + secondary: oldest LastEntered first + refresh_count = min(len(candidates), max_refreshes_per_run) # 100 + spawn refresh workers (concurrency: refresh_concurrency, default 8) + for first refresh_count entries: + result = Origin.Head(e.bucket, e.key) + case result of: + ok with same ETag: + metadata_cache.RefreshTTL(e.key) # extend TTL + metric: metadata_refresh_total{result="ok"}++ + ok with new ETag: + metadata_cache.Update(e.key, result) + metric: metadata_refresh_total{result="etag_changed"}++ + metric: origin_etag_changed_total++ # existing metric + # old chunks orphaned; lifecycle / active eviction (s13) + # cleans up + err: + # don't extend TTL; entry expires naturally + metric: metadata_refresh_total{result="error"}++ +``` + +**Origin HEAD load bound**. Per-replica per cycle: at most +`max_refreshes_per_run` HEADs (default 100). Per minute (default +interval): 100 HEADs. At 3 replicas: 300 HEADs/min. Negligible +against documented S3 / Azure HEAD rate limits. + +The refresh workers compete for the existing **origin limiter** +(s8.4) so they cannot starve on-demand fills. If the limiter is +saturated, refresh requests queue with bounded wait and skip past +timeout (`metric: metadata_refresh_total{result="skipped_limiter_busy"}`). + +**Effective staleness window** with bounded-freshness enabled: +`refresh_ahead_ratio * metadata_ttl` for hot keys (default 3.5m). +Cold keys still bounded by full `metadata_ttl` (default 5m). Negative +entries bounded by `negative_metadata_ttl` (default 60s). + +**Cluster-wide HEAD bound** with bounded-freshness enabled: each +replica refreshes its own metadata cache independently. With N +replicas and H hot keys, refresh load is up to N*H HEADs per refresh +cycle. The cluster-wide HEAD coordinator (deferred future work, see +s15.2) would naturally absorb this load if N grows large enough to +matter. + +**Failure modes**: +- `Origin.Head` error during refresh: don't extend TTL; entry expires + naturally at `metadata_ttl`; on-demand miss re-Heads. Log + metric. +- Origin limiter saturated: refresh worker times out; entry expires + naturally. +- Loop hangs / crashes: metadata cache continues to age; entries + expire at `metadata_ttl`. Detected via + `metadata_refresh_runs_total` not advancing. +- Refresh detects ETag change: metadata updated; old chunks orphaned; + active eviction (FW8 / s13.2) or CacheStore lifecycle handles + cleanup. + +**When to enable**: +- Workload has identifiable hot keys with sub-`metadata_ttl` + staleness sensitivity. +- Operators want shorter effective windows on popular content. +- Origin can absorb the additional HEAD load (typically small for + bounded hot-key sets). + +**When to leave disabled (default)**: +- Strict immutable-contract workload where `metadata_ttl` staleness + is acceptable. +- Origin HEAD rate is constrained. +- Hot-key set is unbounded (every key appears hot - refresh load + matches request load, defeating the purpose). 
+ +Cross-references: [s2 Decisions / Consistency](#2-decisions), +[s8.6 Failure handling](#86-failure-handling-without-re-stampede), +[s8.7 Metadata-layer singleflight](#87-metadata-layer-singleflight), +[s10.2 Catalog correctness](#102-catalog-correctness-typed-errors-circuit-breaker), +[s12 Create-after-404 and negative-cache lifecycle](#12-create-after-404-and-negative-cache-lifecycle), +[s13.2 Active eviction](#132-active-eviction-opt-in-access-frequency). + +## 12. Create-after-404 and negative-cache lifecycle + +### 12.1 The scenario + +A client GETs a key `K` before the operator has uploaded it to +origin. The cache observes `404` from `Origin.Head(K)`, records a +negative metadata-cache entry, and returns `404` to the client. The +operator then uploads `K`. Subsequent client requests still see +`404` until the negative entry expires - the "we forgot to upload +that" case. + +This is operationally indistinguishable from a contract violation +(s11): from the client's perspective, the bytes for `K` changed +without the cache being told. Event-driven origin invalidation is +intentionally not in v1 scope (the immutable-origin contract makes +it unnecessary for the documented workload); the cache can only +bound how long it serves the stale `404`. + +### 12.2 Two TTLs (positive vs negative) + +The metadata cache uses two TTLs: + +| TTL | Default | Bounds | Rationale | +|---|---|---|---| +| `metadata_ttl` | 5m | positive entry (`200` + ETag) reuse without re-Head | immutable-origin contract (s11); long TTL keeps HEAD load low | +| `negative_metadata_ttl` | 60s | negative entry (`404` / unsupported blob type) reuse without re-Head | operator "oops upload" recovery should be fast | + +Asymmetric defaults reflect asymmetric operational reality: +positive-entry staleness only matters on contract violation; +negative-entry staleness matters every time an operator uploads a +previously-missing key, which is a normal operational event. + +Per-replica HEAD singleflight (s8.7) caps the HEAD load that a short +negative TTL would otherwise create: a flood of distinct missing +keys generates at most one HEAD per object per replica per +`negative_metadata_ttl` window. At default settings (60s, 3 +replicas) origin sees at most 3 HEADs per missing key per minute, +well under any S3 / Azure HEAD rate limit. + +### 12.3 Worst-case unavailability window + +After an operator uploads a previously-missing key: + +- A replica that observed the original `404` keeps serving `404` + for up to `negative_metadata_ttl` from its OWN observation time, + regardless of when the upload happened. The TTL is + observation-anchored, not upload-anchored, because the cache + cannot know about the upload. +- A replica that did NOT observe the `404` will Head fresh on the + first request after the upload and serve `200` immediately. +- Worst case across replicas: `negative_metadata_ttl` after the + LATEST replica's observation of the old `404`. Under round-robin + load balancing, clients can see alternating `404` / `200` + responses during the drain window (Diagram 10). + +There is no active invalidation in v1: neither event-driven +invalidation (origin-pushed) nor an admin-invalidation RPC is in +v1 scope. Operator workaround: wait `negative_metadata_ttl` after +upload before announcing the key. + +### 12.4 Defense-in-depth and observability + +`If-Match: ` (s8.6) does NOT defend against this case: there +is no in-flight fill for a `404`'d key, so no precondition exists +to trip on. The TTL is the only bound. 
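+
+As a minimal sketch only (hypothetical names, not the shipped code), the
+observation-anchored bound from s12.3 reduces to a per-replica check
+against the replica's own observation time:
+
+```go
+package negcache // hypothetical placement, for illustration only
+
+import "time"
+
+// negativeEntry stands in for the negative metadata-cache entry; the
+// real entry also records why the key was negative (404 vs. unsupported
+// blob type, s9).
+type negativeEntry struct {
+    ObservedAt time.Time // when THIS replica observed the 404/rejection
+}
+
+// expired reports whether the replica must re-Head instead of serving
+// the cached 404. The window is anchored to ObservedAt, not to the
+// operator's upload time, because the cache never learns about the
+// upload; worst-case staleness is therefore negative_metadata_ttl after
+// the latest replica's observation.
+func (e negativeEntry) expired(now time.Time, negativeMetadataTTL time.Duration) bool {
+    return now.Sub(e.ObservedAt) >= negativeMetadataTTL // default 60s
+}
+```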
+ +Negative-cache metrics let operators observe drain progress after +an upload: + +- `orca_metadata_negative_entries` (gauge) - current count + of negative entries. +- `orca_metadata_negative_hit_total{origin_id}` (counter) - + returns served from a negative entry. A spike after a known + upload signals ongoing drain. +- `orca_metadata_negative_age_seconds{origin_id}` + (histogram) - age of negative entries at hit time. Use + upper-bound percentiles to size `negative_metadata_ttl`. + +Cross-references: [s2 Decisions / Consistency](#2-decisions), +[s6 Request flow](#6-request-flow), +[s8.6 Failure handling](#86-failure-handling-without-re-stampede), +[s8.7 Metadata-layer singleflight](#87-metadata-layer-singleflight), +[s11 Bounded staleness contract](#11-bounded-staleness-contract). + +### Diagram 10: Scenario G - create-after-404 timeline + +```mermaid +sequenceDiagram + autonumber + participant Op as Operator + participant C as Client + participant A as Replica A + participant B as Replica B + participant O as Origin + Note over A,B: t=0 K not yet uploaded + C->>A: GET /bucket/K + A->>O: Head(K) + O-->>A: 404 + Note over A: cache K -> 404
TTL = negative_metadata_ttl (60s) + A-->>C: 404 + Note over Op,O: t=30s operator uploads K + Op->>O: PUT /bucket/K + Note over A,B: t=45s drain period + C->>B: GET /bucket/K (LB routes to B) + B->>O: Head(K) + O-->>B: 200 + ETag + B->>O: GetRange (fill path) + O-->>B: bytes + B-->>C: 200 + bytes + Note over A,B: inconsistent results across replicas during drain + C->>A: GET /bucket/K (LB routes to A again) + Note over A: negative entry still valid
age 45s less than 60s + A-->>C: 404 STALE + Note over A: t=60s+ negative entry expires + C->>A: GET /bucket/K (t=70s) + A->>O: Head(K) + O-->>A: 200 + ETag + A->>O: GetRange (fill path) + O-->>A: bytes + A-->>C: 200 + bytes + Note over A,B: drain complete - all replicas consistent +``` + +## 13. Eviction and capacity + +Two complementary mechanisms govern CacheStore footprint in v1: +**passive lifecycle eviction** (always on, driver-dependent) and +**optional active eviction** by the cache layer itself (opt-in, +access-frequency-driven). Operators choose one, the other, or both +depending on CacheStore driver and workload. + +### 13.1 Passive eviction (lifecycle) + +Eviction is delegated to the CacheStore's storage system in the +default v1 configuration. Recommended baseline is age-based +expiration on the chunk prefix with a TTL chosen to fit the +deployment's working set in the available capacity. Operators tune +the TTL based on `orca_origin_bytes_total` and capacity +utilization metrics exposed by the CacheStore. Because the +on-store path is namespaced by `origin_id` (s5), per-origin +lifecycle policies can be configured independently on the same +CacheStore bucket. + +**`cachestore/s3` deployments**: AWS S3, MinIO, and VAST all +support bucket lifecycle policies for age-based expiration. +Configure the lifecycle directly on the bucket (or delegate to the +in-DC object store's tooling). + +**`cachestore/posixfs` deployments**: shared POSIX filesystems +(NFSv4.1+, Weka native, CephFS, Lustre, GPFS) do not provide +native object-lifecycle policies. Two options for posixfs: +- **External sweep**: schedule an age-based sweep against + `//` from cron or a Kubernetes `CronJob` (e.g. + `find / -type f -atime + -delete`). The + sweep runs out-of-band; `CacheStore.GetChunk` on a swept entry + returns `ErrNotFound` and re-enters the miss-fill path. + Operators SHOULD NOT sweep the staging subdirectory + `/.staging/` - that is managed by the driver's own + background sweep (`cachestore.posixfs.staging_max_age`, default + 1h, s10.1.2). +- **Active eviction** (s13.2): enable the cache layer's + access-frequency-driven eviction loop. This is the recommended + posixfs path when external sweep tooling is impractical. + +### 13.2 Active eviction (opt-in, access-frequency) + +When `chunk_catalog.active_eviction.enabled: true` (default +`false`), each replica runs a background eviction loop that +deletes cold chunks from BOTH the in-memory `ChunkCatalog` AND +the CacheStore. The decision uses **access-frequency tracking** +recorded in the catalog on every `Lookup` hit. 
+ +**Per-entry tracking** added by FW8 to each `ChunkCatalogEntry`: + +```go +type ChunkCatalogEntry struct { + ChunkInfo + AccessCount uint32 // increments on each Lookup hit; + // saturates at MaxUint32 (practically + // unreachable) + LastAccessed time.Time // updated on each Lookup hit + LastEntered time.Time // set on Record; never updated +} +``` + +**Eviction policy**: a chunk is eligible for active eviction when +ALL of: +- `now - LastAccessed > inactive_threshold` (default 24h) +- `AccessCount < access_threshold` (default 5) +- `now - LastEntered >= min_age` (default 5m, cold-start protection + preventing newly-recorded entries from being evicted before they + accumulate hits) + +**Score** for ordering candidates (lowest first = most evictable): +- primary: `AccessCount` +- tiebreak: oldest `LastAccessed` + +**Loop**: every `eviction_interval` (default 10m), scan the +catalog, identify eligible candidates, sort by score, evict up to +`max_evictions_per_run` (default 1000) per cycle. For each +evicted entry: call `CacheStore.Delete(k)`, then +`ChunkCatalog.Forget(k)` on success. Bounded per-run cost +prevents pathological delete-storms on a large catalog; the next +cycle catches the remainder. + +**Failure handling**: +- `Delete` returns `ErrNotFound` (already gone) - treat as success + and Forget. +- `Delete` returns `ErrTransient` - do NOT Forget; retry next + cycle. Counter feeds the existing per-process circuit breaker + (s10.2). +- `Delete` returns `ErrAuth` - stop the entire run; do NOT + Forget; metric increments. Circuit breaker integrates as usual. +- Circuit breaker open - skip the eviction run entirely + (`active_eviction_runs_total{result="breaker_open"}++`) to + avoid amplifying load against a degraded backend. + +**Counter saturation, no decay in v1**: AccessCount is `uint32` +and saturates at ~4 billion (practically unreachable). New entries +start at 0 and must compete with old popular entries once past +`min_age`. The cold-start protection covers this; for steady-state +workloads the relative ordering remains correct. + +### 13.3 ChunkCatalog size awareness (load-bearing operational note) + +The ChunkCatalog is the active-eviction policy's window into +chunk activity. Its size relative to the CacheStore working set +determines eviction quality: + +- **catalog == working set**: full visibility; eviction policy + considers every chunk; quality is optimal. +- **catalog < working set**: many chunks live in the CacheStore + but are NOT tracked by the catalog. They cannot be considered + for active eviction; they live indefinitely until external + lifecycle (if any) cleans them up. Active eviction has + incomplete visibility; effective behavior is "evict from the + visible subset only". +- **catalog > working set**: wasted RAM but no correctness or + eviction-quality cost. 
+ +**Sizing guidance for operators**: + +``` +target_catalog_entries = 1.2 * estimated_active_working_set_chunks + (where chunk = chunk_size, default 8 MiB) + +memory_estimate = target_catalog_entries * ~120 bytes/entry +``` + +| Active working set | Chunks at 8 MiB | Catalog entries | RAM (~120 B/entry) | +|---|---|---|---| +| 100 GiB | ~13K | 16K | ~2 MB | +| 1 TiB | ~130K | 160K | ~20 MB | +| 10 TiB | ~1.3M | 1.6M | ~190 MB | +| 100 TiB | ~13M | 16M | ~1.9 GB | + +For very large working sets (>1 PiB at 8 MiB chunks), operators +should consider one of: +- larger `chunk_size` (e.g., 16 MiB) to reduce catalog entry count + by half (note: changing `chunk_size` orphans the existing chunk + set, see s5); +- disabling active eviction and relying on CacheStore lifecycle + exclusively (the default v1 posture); +- a future external/persistent catalog (deferred future work, + not in v1). + +**Metrics for detecting undersizing**: +- `orca_chunk_catalog_hit_rate` (derived from `_hit_total`): + sustained < 0.7 suggests undersizing. +- `orca_chunk_catalog_evict_total{reason="size"}`: high + rate means LRU eviction is fighting the access-frequency policy; + catalog is too small. +- `orca_chunk_catalog_entries`: pinned at `max_entries` + may indicate undersizing. + +### 13.4 Spool capacity + +The local **spool** (s8.2) is bounded by `spool.max_bytes`; +full-spool conditions block new fills briefly, then return `503 +Slow Down` to clients. Spool entries are released as soon as +in-flight readers drain. Spool capacity is independent of the +ChunkCatalog and CacheStore footprint. + +### 13.5 `chunk_size` config-change capacity impact + +See the operational note in [s5](#5-chunk-model): changing +`chunk_size` orphans the existing chunk set under the old size; +storage transiently doubles and the working set is rebuilt at the +new size on demand. The CacheStore lifecycle policy (or, on +posixfs with active eviction enabled, the access-frequency loop +detecting the orphans as cold) ages the orphaned chunks out. + +### 13.6 Eviction interactions + +Operators using BOTH passive lifecycle AND active eviction need +to understand the interaction: +- Lifecycle deletes a chunk -> active eviction sees `ErrNotFound` + on `Delete`; treats as success. No conflict. +- Active eviction deletes a chunk -> lifecycle sees it gone. No + conflict. +- Both aggressive on the same chunk -> "double eviction" with no + correctness impact, but the chunk is gone slightly faster than + either policy alone would have removed it. Operators should + pick one as the primary mechanism and configure the other as + defense-in-depth (e.g., long lifecycle TTL + short active + eviction `inactive_threshold`). + +## 14. Horizontal scale + +Cluster membership comes from the headless Service: an A-record lookup +returns the IPs of all Ready pods backing the Service. Cluster code +consumes that list, refreshes it on a configurable interval (default 5s), +and rendezvous-hashes `ChunkKey` against pod IPs to select a coordinator +**per chunk**. The replica that received the client request acts as the +**assembler** (s8.3): for each chunk in the requested range, it serves +from CacheStore on hit, performs a local singleflight + tee + spool + +commit if it is the coordinator, or issues a per-chunk +`GET /internal/fill?key=` to the coordinator on the coordinator's +internal mTLS listener (s8.8). The assembler stitches returned bytes into +the client response, slicing the first and last chunk to match the +client `Range`. 
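+
+As a minimal sketch only (the hash function, string-typed keys/IPs, and
+names are illustrative assumptions, not the shipped implementation), the
+per-chunk coordinator selection from Diagram 11 is plain rendezvous /
+highest-random-weight hashing over the current peer-IP set:
+
+```go
+package cluster // hypothetical placement, for illustration only
+
+import "hash/fnv"
+
+// coordinator picks the owner of one chunk key: w(ip, k) = hash(ip || k),
+// take the argmax. FNV-1a is a stand-in hash for the sketch.
+func coordinator(peerIPs []string, selfIP, chunkKey string) string {
+    if len(peerIPs) == 0 {
+        // Empty/unavailable peer set (see the fallback below): treat
+        // self as the only peer so every fill runs locally.
+        return selfIP
+    }
+    var bestIP string
+    var bestW uint64
+    for _, ip := range peerIPs {
+        h := fnv.New64a()
+        h.Write([]byte(ip))
+        h.Write([]byte(chunkKey))
+        if w := h.Sum64(); bestIP == "" || w > bestW {
+            bestIP, bestW = ip, w
+        }
+    }
+    return bestIP
+}
+```
+
+Because every replica evaluates the same weights over the same peer set,
+any replica can answer "who coordinates chunk k" without coordination,
+and a membership change only remaps the keys whose argmax actually
+changes.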
+ +Pod names are not stable under a Deployment; we never address peers by +name, only by the IPs the headless Service publishes. + +We accept up to one duplicate fill per chunk during membership flux (e.g. +rolling restarts when a pod's IP changes); the duplicate-fill metric +makes that visible. + +Replication factor = 1 in v1 (cache loss is recoverable from origin). +Every replica sees the entire CacheStore. No replica owns bytes; +replica loss never strands data. + +**Empty / unavailable peer set.** If `Cluster.Peers()` returns an +empty set (the headless Service has no Ready endpoints, the DNS +record returns NXDOMAIN, or the kube-dns / CoreDNS path is broken), +the replica treats itself as the only peer: rendezvous hashing +returns self for every `ChunkKey` and all fills run locally. The +replica does NOT refuse to serve; cluster-wide deduplication +(s8.3) degrades to per-replica deduplication for the duration. A +subsequent successful DNS refresh re-introduces peers without +process restart. + +DNS-refresh outcomes are exposed as +`orca_cluster_dns_refresh_total{result="ok|fail|empty"}` and +the current peer-set size as `orca_cluster_peers` (gauge). +Boot-time failure is logged at WARN; sustained empty-peer state is +trivially observable from the gauge. The `/readyz` predicate +(s10.5) requires that **at least one** DNS refresh has succeeded +since boot; a totally broken DNS path therefore keeps the replica +NotReady and load balancers drain it, even though the empty-peer +local-fill fallback would otherwise let it serve. + +### Diagram 11: Membership & rendezvous hash + +```mermaid +flowchart LR + DNS["headless Service
A-record lookup
(every 5s)"] --> IPs["pod IP set:
[10.0.1.5,
10.0.1.6,
10.0.1.7]"] + Req["incoming request
ChunkKey k"] --> Hash["for each IP:
w(IP, k) = hash(IP || k)
argmax(w)"] + IPs --> Hash + Hash --> Coord["coordinator IP
(e.g. 10.0.1.6)"] + Coord --> Decide{"== self?"} + Decide -- "yes" --> Local["local fill path
(singleflight + tee + spool + commit)"] + Decide -- "no" --> Forward["GET /internal/fill?key=k
(mTLS, internal listener)"] +``` + +### Diagram 12: Scenario H - rolling restart membership flux + +```mermaid +sequenceDiagram + autonumber + participant A as Replica A + participant DNS as headless Service DNS + participant B as Replica B (old IP) + participant Bp as Replica B' (new IP) + participant CS as CacheStore + Note over A,B: t=0 peers (A's view) = {A, B}
chunk k owned by B + A->>DNS: refresh + DNS-->>A: [ip(A), ip(B)] + Note over B,Bp: t=5s rolling restart: B terminates,
B' starts with a new IP + Note over A: A's cached membership still {A, B}
until next refresh + A->>A: rendezvous(k, {A,B}) = B (stale) + A->>B: /internal/fill (connection refused) + A->>A: fallback: fill locally + A->>CS: PutObject(final, ..., If-None-Match: *) + Note over Bp: B' bootstraps, refreshes DNS
peers (B's view) = {A, B'} + Bp->>Bp: rendezvous(k, {A,B'}) = B' + Bp->>CS: PutObject(final, ..., If-None-Match: *) + CS-->>A: 200 commit_won + CS-->>Bp: 412 commit_lost + Note over A,Bp: duplicate_fills_total{commit_lost} += 1 + Note over A,DNS: t=10s A refreshes DNS
peers converge to {A, B'}
steady state restored +``` + +## 15. Deferred optimizations + +This section catalogs concerns that are intentionally NOT in v1. Each +entry names what is deferred, why v1 ships without it, what operational +evidence would justify building it, and a sketch of how it would fit +into the existing surface area. None of these items require breaking +changes to v1 interfaces. + +### 15.1 Edge rate limiting + +**What**: Per-client / per-IP / per-credential token-bucket rate +limiting at the S3 edge; '429 Too Many Requests' on exhaustion; +identity from auth subject (mTLS cert subject or bearer-token claim) +with source-IP fallback when no auth identity is established. + +**Why deferred**: v1 has implicit hot-client mitigation - the per- +replica origin semaphore (s8.4) and singleflight (s8.1) +coalesce concurrent identical work and cap cold-fill concurrency +regardless of caller. No measured noisy-neighbor evidence at v1 +scale; cost of building edge rate limiting (token-bucket per +identity, identity extraction, new HTTP error path, new metric) +outweighs the speculative benefit. + +**Trigger**: Operator reports a single client / credential is +measurably monopolizing TTFB or driving disproportionate origin +load past internal mechanisms. + +**Sketch (if built)**: Token bucket per identity in +`internal/orca/server/edgelimit/`; refill rate per identity +configurable; per-replica enforcement (no cluster-wide +coordination); returns `429 Too Many Requests` with +`Retry-After: 1s`. New metric +`orca_edge_ratelimit_total{identity,result}`. + +**Known v1 limitation**: documented gap. Multi-tenant deployments +worried about single-client monopolization should layer rate +limiting at an upstream proxy or LB until this lands. + +### 15.2 Cluster-wide HEAD singleflight + +**What**: A second coordinator role parallel to the chunk fill +coordinator (s8.3): rendezvous-hash on `(origin_id, bucket, key)` +to pick exactly one HEAD coordinator per object per cluster. New +`/internal/head` RPC. After: exactly one `Origin.Head` per object +per `metadata_ttl` window cluster-wide. + +**Why deferred**: Per-replica HEAD singleflight (s8.7) caps +cluster-wide HEAD load at `N * (objects / metadata_ttl)`. At +documented v1 scale (3-5 replicas, 5m TTL), this is well under +documented S3 / Azure HEAD rate limits. Savings only become +material at much larger scale. + +**Trigger**: any of: +- peer-set size exceeds ~10 replicas, AND keys cluster under + shared prefixes approaching per-prefix rate limits (5500/sec on + AWS S3); +- `metadata_ttl` configured short enough that HEAD storms repeat + frequently; +- operator measures HEAD throttling on origin. + +**Sketch (if built)**: New `ObjectKey = {origin_id, bucket, +object_key}` type. New `Cluster.HeadCoordinator(ObjectKey) Peer` +parallel to `Coordinator(ChunkKey) Peer`. New +`InternalClient.Head(ctx, ObjectKey) (ObjectInfo, error)`. New +endpoint `GET /internal/head?origin_id=...&bucket=...&key=...` on +existing internal listener (s8.8); reuses mTLS + peer-IP authz. +Same `409 Conflict` membership-flux fallback as chunk fill. +Coordinator-unreachable degrades to local `Origin.Head`. New +`cluster_internal_head_*` metrics. The bounded-freshness mode +(s11.2) would naturally route its background HEADs through this +same coordinator pattern. + +**Known v1 bound**: at N replicas and `metadata_ttl=5m`, cold +popular-key fan-out generates **N HEADs per object per 5 minutes +cluster-wide**. Documented and acceptable at v1 scale. 
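+
+For concreteness, the interface additions named in the sketch above
+might take the following shape. This is a hypothetical outline of
+deferred work, not v1 code; the placeholder type declarations exist only
+to make the sketch self-contained:
+
+```go
+package cluster // hypothetical placement, for illustration only
+
+// Placeholder stand-ins for types that live in internal/orca (s7).
+type (
+    Peer     struct{ IP string }
+    ChunkKey struct{ Canonical string }
+)
+
+// ObjectKey is the identity the deferred HEAD coordinator would
+// rendezvous-hash on, parallel to ChunkKey for chunk fills.
+type ObjectKey struct {
+    OriginID string
+    Bucket   string
+    Key      string
+}
+
+// Cluster would gain HeadCoordinator alongside the existing
+// Coordinator; both use the same rendezvous hash over the peer set.
+type Cluster interface {
+    Coordinator(k ChunkKey) Peer      // existing v1 surface
+    HeadCoordinator(k ObjectKey) Peer // deferred (this entry)
+}
+```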
+ +### 15.3 Cluster-wide LIST coordinator + +**What**: Extend FW2's coordinator pattern to LIST: rendezvous- +hash on the full LIST query tuple `(origin_id, bucket, prefix, +continuation_token, start_after, delimiter, max_keys)` to pick +one coordinator per query per cluster. New `/internal/list` RPC. +Coordinator's per-replica LIST cache (s6.2) becomes the de facto +cluster cache. After: exactly one `Origin.List` per identical +query per `list_cache.ttl` cluster-wide. + +**Why deferred**: v1 ships with per-replica LIST cache (s6.2, +default 60s TTL). For the documented FUSE-`ls` workload, FUSE +clients are typically pinned to one replica via HTTP/2 keepalive, +making per-replica caching naturally effective for any single +client. Across many clients sharing prefixes, per-replica caching +holds origin LIST load to N per popular prefix per +`list_cache.ttl` window - well under any documented rate limit +at v1 scale. + +**Trigger**: any of: +- peer-set size exceeds ~10 replicas, AND +- highly-shared FUSE prefixes, AND +- tight `ls` latency budgets (so the additional 5-20ms internal- + RPC hop is acceptable in trade for reduced origin load); +- OR operator measures sustained LIST throttling on origin. + +**Sketch (if built)**: Symmetric to s15.2. New +`Cluster.ListCoordinator(ListKey) Peer`. New +`InternalClient.List` RPC. Coordinator runs the LIST cache and +the existing per-replica LIST singleflight; non-coordinators +route to it on cache miss. Same `409 Conflict` membership-flux +fallback. Coordinator-unreachable degrades to local +`Origin.List`. The internal-RPC latency overhead matters more +for FUSE-`ls` than chunk fills, so caching at the coordinator +must be aggressive (TTL >= 60s). + +**Known v1 bound**: cluster-wide LIST load is up to N origin LIST +calls per identical query per `list_cache.ttl` window where N is +peer count. Acceptable at v1 scale. + +### 15.4 Mid-stream origin resume + +**What**: After the commit boundary (s8.6 / s6 step 6) the v1 cache +streams origin bytes directly to the client. If the origin +connection breaks mid-chunk, the response aborts (HTTP/2 +`RST_STREAM` or HTTP/1.1 `Connection: close`); the S3 SDK detects +the `Content-Length` mismatch and retries. Mid-stream origin +resume would replace the abort with a transparent re-issue: the +leader tracks bytes sent to client; on origin disconnect, it +re-issues `Origin.GetRange` with `Range: bytes=-` (and +the same `If-Match: `) and continues feeding the client +without ever showing an error. + +**Why deferred**: v1 relies on the SDK retry behavior (every +mainstream S3 client handles this case correctly) which is +acceptable for the documented workload. Mid-stream resume +requires non-trivial state tracking (bytes-sent counter, retry +budget for the resume itself, interaction with the singleflight +joiner state), and the abort case is handled by the SDK so the +operational impact is small. + +**Trigger**: any of: +- mid-stream client aborts measurably impact tail TTFB on the + documented workload (visible via + `responses_aborted_total{phase="mid_stream"}` rate); +- workload uses non-S3-compatible clients without robust retry + (uncommon); +- post-commit origin failures are systematically more frequent + than pre-commit (e.g., long-tail origin connections that + succeed initially then drop). + +**Sketch (if built)**: extend `fetch.Coordinator` to track +`bytesSent` per fill. 
On `Origin.GetRange` error after the commit +boundary, retry origin with `Range: bytes=-` (within +the requested chunk's range; bounded by a separate +`origin.resume.attempts` budget, e.g. 1-2 attempts). Joiners reading +through the leader's tee transparently see the gap closed. The +spool tee continues unaffected; the resumed bytes flow through +the same ring buffer + spool. New metric: +`orca_origin_resume_total{result="success|exhausted|error"}`. + +**Known v1 bound**: post-commit origin failures abort the client +response; client SDK retries from scratch +(`responses_aborted_total{phase="mid_stream"}` increments). +Acceptable for the documented workload at v1 scale. + +### 15.5 Coordinated cluster-wide origin limiter + +**What**: Replace the per-replica static cap (s8.4) with a true +cluster-wide cap on concurrent `Origin.GetRange` calls. Mechanism: +Kubernetes-Lease-elected **limiter authority** + in-memory +counting semaphore at the elected leader + slot-lease tokens +(batched) issued over an internal RPC + per-peer local bucket +that auto-refills + graceful fallback to the v1 per-replica +static cap when the authority is unreachable. + +**Why deferred**: at documented v1 scale (3-5 replicas), the +per-replica static cap (s8.4) is approximate but acceptable; +cluster-wide concurrency tracks `target_global` within a small +margin during steady state, and the pre-header retry loop (s8.6) +handles origin throttling responses (`503 SlowDown` / `429`) +self-correctingly. The K8s Lease design adds substantial surface +area (election machinery, slot-lease tokens, batching, fallback +mode, RBAC, ~12 metrics, ~10 tests, an additional `Limiter` +interface plus `LimiterToken` type, three new internal RPC +endpoints) that is not justified at v1 scale. Reviewer feedback +flagged the cumulative complexity as not earning its keep. + +**Trigger**: any of: +- peer-set size grows past ~10 replicas, AND measured steady- + state slot under-utilization (one replica saturated while + others are idle for the same hot work) is causing + `503 Slow Down` to clients; +- operator requires a hard cluster-wide cap (e.g., dedicated + origin pipe sized for X concurrent connections; cost-sensitive + deployment cannot tolerate the static cap's worst-case + overshoot); +- origin imposes an account-wide rate limit (rather than + per-prefix) that the static cap would routinely exceed. + +**Sketch (if built)**: + +- **Election**: standard `client-go/tools/leaderelection` against + a single `coordination.k8s.io/v1.Lease` resource named e.g. + `orca-limiter` in the deployment's namespace. RBAC: + `get / list / watch / create / update / patch` on the named + Lease, scoped to the deployment's namespace. Steady-state K8s + API load: ~6-30 writes/min/deployment (the elected leader + renews; non-leaders do not write). + +- **Authority**: holds an in-memory counting semaphore of + `cluster.limiter.target_global` slots (default 192). Serves + three RPCs over the existing internal listener (s8.8): + `POST /internal/limiter/acquire` (issues a lease token holding + N batched slots; default `batch.size=8`, configurable; + `token.ttl=30s` wall-clock expiry); `POST /internal/limiter/extend` + (bumps an existing token's expiry; returns `unknown_token` or + `expired` if reclaimed); `POST /internal/limiter/release` + (returns slots; idempotent). Background sweep every 5s reclaims + expired tokens. 
+ +- **Peer**: each non-authority replica holds a small local bucket + of slots acquired in batches; auto-refill triggers when remaining + slots fall to or below `cluster.limiter.batch.refill_threshold` + (default 2). Tokens auto-extend when their age exceeds + `cluster.limiter.token.extend_at_ratio * token.ttl` (default + 0.5 * 30s = 15s). When the local bucket empties, the replica + releases the old token and acquires a fresh one. + +- **Authority changeover**: when the K8s Lease holder changes, + the new authority starts with an empty slot table while old + lease tokens at peers continue draining. Cluster-wide inflight + may transiently exceed `target_global` by up to one full set + of tokens; drains within `lease.duration + token.ttl` = + 45s worst case with defaults. Acceptable because the limiter + is a soft cap; correctness is unaffected. + +- **Fallback mode**: peer cannot reach authority -> activates the + v1 per-replica static cap (the same `floor(target_global / N)` + semaphore from s8.4). Transparent to the client. Reconnects + automatically on `cluster.limiter.fallback.check_interval` + (default 5s). Limiter authority unreachability is intentionally + NOT a `/readyz` predicate: replicas in fallback are still + serving correctly. + +- **Disable toggle**: `cluster.limiter.enabled: false` returns + the v1 per-replica static cap permanently. No K8s API access; + no Lease object created. Useful for deployments without RBAC + for the Lease resource, or for isolated debugging. + +- **New metrics**: `orca_limiter_state{role="authority|peer|fallback"}`, + `orca_limiter_target_global`, + `orca_limiter_slots_available` (authority-only), + `orca_limiter_slots_granted` (authority-only), + `orca_limiter_slots_local` (per-peer), + `orca_limiter_acquire_total{result}`, + `orca_limiter_acquire_duration_seconds`, + `orca_limiter_extend_total{result}`, + `orca_limiter_release_total`, + `orca_limiter_election_total{result}`, + `orca_limiter_lease_expired_total`, + `orca_limiter_fallback_active`. + +- **New interfaces in s7**: `Limiter` (`Acquire(ctx) (Slot, error)`, + `State() LimiterState`); `Slot` (`Release()`); `LimiterToken` + struct (`ID`, `Slots`, `ExpiresAt`); `InternalClient` gains + `LimiterAcquire`, `LimiterExtend`, `LimiterRelease`. + +- **Composition with [s15.6](#156-dynamic-per-replica-origin-cap)**: + the coordinated authority (this entry) and dynamic per-replica + recompute (s15.6) are orthogonal mechanisms. If both ever + ship, dynamic per-replica is the uncoordinated baseline that + coordination tightens further. + +**Known v1 limitation**: per-replica static cap; cluster-wide +concurrency tracks `target_global` only when `N_actual == +cluster.target_replicas`. Documented and acceptable at v1 +documented scale. + +### 15.6 Dynamic per-replica origin cap + +**What**: Derive `target_per_replica` at runtime from +`len(Cluster.Peers())` rather than from the static +`cluster.target_replicas` config knob. The per-replica origin +semaphore is resized on each membership-refresh, keeping +realized cluster-wide concurrency close to `target_global` +regardless of actual replica count. + +**Why deferred**: v1 ships with `cluster.target_replicas` as a +static config knob (s8.4). Static is simpler, deterministic, +and matches the operator's mental model when the deployment has +a stable replica count (the documented v1 target of 3-5 +replicas without HPA). 
Dynamic adds: + +- a resizable-semaphore primitive (the Go standard library and + `golang.org/x/sync/semaphore` both fix capacity at + construction; a custom wrapper is required, ~30-40 lines); +- a peer-change notification channel on the `Cluster` interface + (`PeersChanges() <-chan []Peer` or equivalent); +- a watcher goroutine that recomputes the cap on each membership + change; +- edge-case handling (empty peer set, current inflight exceeding + the new cap, rapid peer-set churn). + +Roughly 60-80 lines of code plus ~5 new tests. Modest in +isolation but composes with the broader complaint that the v1 +design has too many moving parts. + +**Trigger**: any of: + +- HPA-driven autoscaling produces frequent replica-count + changes; +- operators routinely scale the deployment without updating + `cluster.target_replicas`, leaving the realized cap + mis-sized; +- operator measures sustained over- or under-allocation against + `target_global` (sum of per-replica `origin_inflight` gauges + diverging persistently from `target_global`). + +**Sketch (if built)**: + +- `internal/orca/origin/semaphore.go`: resizable semaphore + wrapper with `Acquire(ctx)`, `Release()`, `SetCapacity(n)`. +- `Cluster` interface gains a peer-change notification surface + (channel or callback). +- Watcher goroutine recomputes on each membership change: + `target_per_replica = floor(target_global / max(1, len(peers)))`. + The `max(1, ...)` matches the empty-peer fallback (s14): a + lone replica gets `target_global` slots, which is correct for + the last-replica-standing case. +- Edge cases: current inflight exceeds new cap (existing holders + complete naturally; new acquires queue against the new cap); + rapid peer-set churn (optional debouncing or rate-limiting on + `SetCapacity` calls). +- Composes naturally with [s15.5](#155-coordinated-cluster-wide-origin-limiter): + the coordinated authority (s15.5) and per-replica dynamic cap + (this entry) are orthogonal mechanisms; if both ever ship, + dynamic is the uncoordinated baseline that coordination + tightens further. + +**Known v1 limitation**: the static cap is approximate. Realized +cluster-wide concurrency depends on `N_actual`: + +- `N_actual > N_typical`: realized cap exceeds `target_global` by + up to `(N_actual - N_typical) * target_per_replica`. +- `N_actual < N_typical`: realized cap falls below `target_global` + by `(N_typical - N_actual) * target_per_replica`. + +Over-allocation may stress origin; under-allocation wastes +capacity. Operators MUST update `cluster.target_replicas` after +any sustained scale change. diff --git a/design/orca/plan.md b/design/orca/plan.md new file mode 100644 index 00000000..e1ef33d3 --- /dev/null +++ b/design/orca/plan.md @@ -0,0 +1,1554 @@ +# Orca - Origin Cache - Implementation & Operations Plan + +Status: draft for review (round 2 incorporating reviewer feedback) +Owner: TBD +Targets: Phase 0 walking skeleton in this repo, growing to multi-PB multi-replica cluster + +> Mechanism, decisions, internal interfaces, and flow diagrams: see [design.md](./design.md). +> Terminology and component glossary: see [design.md#3-terminology](./design.md#3-terminology). + +--- + +## 1. Goal + +Ship a read-only S3-compatible blob caching layer ("Orca") inside an +on-prem datacenter, fronting cloud blob storage (AWS S3 + Azure Blob). +Clients issue range reads against Orca; Orca serves from a +shared in-DC store when present, otherwise fetches from the cloud origin, +stores the chunk, and returns it. There is no client-initiated write path. 
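+
+For orientation, a client-side range read against Orca might look like
+the following (the endpoint, the path-style `/bucket/key` layout, and
+the `Authorization: Bearer` header are assumptions for illustration;
+the client API surface is specified in section 2 and auth in section 4):
+
+```go
+// Sketch of a ranged GetObject issued directly over HTTP against the
+// Orca client edge; the response is the same whether the bytes came
+// from the in-DC CacheStore (hit) or were just filled from the origin.
+package main
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+)
+
+func main() {
+	req, err := http.NewRequest(http.MethodGet,
+		"https://orca.dc.example.internal:8443/example-data/datasets/train-000.bin", nil)
+	if err != nil {
+		panic(err)
+	}
+	req.Header.Set("Range", "bytes=0-8388607") // first 8 MiB
+	req.Header.Set("Authorization", "Bearer "+os.Getenv("ORCA_TOKEN"))
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		panic(err)
+	}
+	defer resp.Body.Close()
+
+	fmt.Println(resp.Status, resp.Header.Get("Content-Range")) // 206 Partial Content
+	n, _ := io.Copy(io.Discard, resp.Body)
+	fmt.Println("read", n, "bytes")
+}
+```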
+ +This document covers deliverable scope, repo layout, configuration, auth, +observability, phasing, testing, risks, and the approval checklist. The +mechanism that delivers this behavior is described in +[design.md](./design.md). + +## 2. Scope + +In scope (v1): + +- Read-only S3-compatible client API: `GetObject` (with `Range`), + `HeadObject`, `ListObjectsV2`. +- Origin adapters for AWS S3 and Azure Blob (Block Blobs only - see + [design.md#9-azure-adapter-block-blob-only](./design.md#9-azure-adapter-block-blob-only)). +- Pluggable backing store ("CacheStore"): local filesystem for development; + in-DC S3-compatible store (e.g. VAST) for production. +- Fixed-size chunking with stampede protection (singleflight + tee + + spool). +- ETag-based immutable-blob model with strict `If-Match` enforcement on + every origin range read - see + [design.md#8-stampede-protection](./design.md#8-stampede-protection). +- Sequential read-ahead. +- Single-tenant deployment, network-perimeter trust (bearer / mTLS) on the + client edge, separate internal mTLS listener for inter-replica RPCs, no + SigV4 verification in v1. +- Multi-replica Kubernetes Deployment from day one. All replicas share a + single in-DC CacheStore; rendezvous hashing on `ChunkKey` selects the + coordinator for miss-fills; the receiving replica is the assembler that + fans out per-chunk fill RPCs. +- Observable (Prometheus), operable (health probes, manifests, container + image), testable in CI against `minio` and `azurite`. + +Out of scope (v1): + +- Writes, multipart uploads, object versioning. +- Cross-DC cache peering. +- S3 SigV4 verification on the client edge. +- Multi-tenant quotas and per-tenant credentials. +- Mutable-blob invalidation / origin event subscriptions. +- Encryption at rest beyond what the underlying CacheStore provides. + +## 3. Repo layout (mirrors `machina`) + +``` +cmd/orca/ + main.go # thin wrapper -> orca.Run() + orca/ + orca.go # cobra root, config load, wiring + server/ # S3-compatible HTTP handlers (client edge) + internal/ # internal listener handlers + # GET /internal/fill?key= +internal/orca/ + types.go # ChunkKey, ObjectInfo, ChunkInfo, Config + chunker/ # range <-> chunk math (streaming iterator) + fetch/ # Coordinator: meta + chunk SF, semaphore, + # assembler fan-out, internal RPC client + spool/ # bounded local-disk staging area for in-flight + # fills; slow-joiner fallback regardless of + # CacheStore driver + chunkcatalog/ # in-memory LRU fronting CacheStore.Stat + cachestore/ + localfs/ # dev; link()/renameat2(RENAME_NOREPLACE); + # uses internal/posixcommon for staging, + # link-commit, dir-fsync helpers + posixfs/ # prod; shared POSIX FS (NFSv4.1+ baseline, + # plus Weka native, CephFS, Lustre, GPFS); + # same primitive as localfs via posixcommon; + # adds backend detection, NFS minimum-version + # gate, Alluxio-FUSE refusal, fan-out path + # layout, SelfTestAtomicCommit at startup + s3/ # VAST and other in-DC S3-like stores; + # PutObject + If-None-Match: *; + # SelfTestAtomicCommit at startup + internal/ + posixcommon/ # shared link()/EEXIST commit primitive, + # staging-dir layout, dir-fsync, optional + # 2-char hex fan-out; consumed by + # cachestore/localfs and cachestore/posixfs + # only; not visible above the cachestore + # package boundary + origin/ + types.go # Origin interface, error types incl. 
+ # OriginETagChangedError, UnsupportedBlobTypeError + s3/ # If-Match: on every GetRange + azureblob/ # Block Blob only; If-Match on Get Blob + singleflight/ # per-key in-flight dedupe + tee + cluster/ # membership refresh from headless Service + # DNS (default 5s); rendezvous hashing on + # pod IP; per-chunk internal fill RPC + # client + server helpers + auth/ # bearer / mTLS verification (client edge); + # internal-listener mTLS + peer-IP authz + metrics/ # Prometheus collectors +deploy/orca/ + 01-namespace.yaml.tmpl + 02-rbac.yaml.tmpl + 03-config.yaml.tmpl + 04-deployment.yaml.tmpl # exposes container ports 8443 (client), + # 8444 (internal), 9090 (metrics) + 05-service.yaml.tmpl # headless service for membership + 06-service-clientvip.yaml.tmpl # ClusterIP for client traffic + 07-networkpolicy.yaml.tmpl # restricts ingress on :8444 to pods + # labelled app=orca in-namespace; + # rendered only when + # networkpolicy.enabled=true (omit in dev) + # 08-storage-pvc.yaml.tmpl - RESERVED for Phase 2 cachestore/posixfs + # deployments that wire the shared FS in via + # a PVC + CSI driver rather than a kubelet + # mount or hostPath; content deferred + dev/ # dev-only manifests overlay + 01-localstack-deployment.yaml # LocalStack pod (ephemeral; no PVC); + # pinned to localstack/localstack:3.8 + # (community) + 02-localstack-service.yaml # ClusterIP exposing :4566 + 03-localstack-init-job.yaml # Job that creates the chunks bucket + # via awslocal at bring-up + embed.go + rendered/ # gitignored, produced by render-manifests +images/orca/ + Containerfile +design/orca/ + plan.md # this file + design.md # mechanism + flow diagrams + brief.md # stakeholder-facing brief +hack/orca/ + Makefile # dev-cluster targets: up, down, reset, + # render, port-forward, status, logs, + # seed-azure (real Azure only). + # Top-level Makefile may add `orca-` + # prefixed proxies that invoke + # `make -C hack/orca ` + # (matches the hack/net/ convention). + dev-harness.md # how to use the dev harness in Kind + # (LocalStack as cachestore/s3, real Azure + # as origin) + inttest.md # integration test guide for + # internal/orca/inttest/ + up.sh # kind create + image build + load + render + # manifests + apply + wait-for-ready + down.sh # kind delete cluster + reset.sh # rebuild image + kind load + rollout + # restart + clear-cache.sh # delete LocalStack pod (recreated; cache + # state wiped without rebuilding the + # cluster) + seed-azure.sh # generate small/medium/large blobs and + # upload to the configured Azure account + port-forward.sh # kubectl port-forward orca client + # service to localhost + sample-get.sh, sample-list.sh # example S3 client invocations + logs.sh # tail logs across replicas + .env.example # AZURE_STORAGE_ACCOUNT, AZURE_STORAGE_KEY, + # AZURE_CONTAINER, ORCA_REPLICAS, + # ORCA_IMAGE_TAG + kind-config.yaml # 1 control + 3 worker nodes (one Orca + # replica per worker via required + # anti-affinity) +``` + +`Makefile` additions: `orca`, `orca-build`, `orca-image`, +`orca-manifests`. `make` continues to build everything. + +## 4. Auth (v1) + +Two listeners with two distinct trust roots. + +### 4.1 Client edge listener (default `:8443`) + +- Bearer token middleware: HMAC token validated against a shared secret in + a Kubernetes Secret. +- Optional mTLS: client cert validated against a configured **client CA + bundle** (`server.tls.client_ca_file`). +- Pluggable so SigV4 verification can land later without rewriting the + request pipeline. 
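+
+A minimal sketch of that pluggable seam, assuming an `Authenticator`
+interface, an `Authorization: Bearer` header, and a 403 on failure
+(all illustrative; the plan does not fix the middleware's API):
+
+```go
+// Sketch: pluggable client-edge authenticator so SigV4 verification can
+// slot in later without rewriting the request pipeline.
+package auth
+
+import (
+	"crypto/subtle"
+	"errors"
+	"net/http"
+	"strings"
+)
+
+var errUnauthorized = errors.New("orca: unauthorized")
+
+// Authenticator is the hypothetical seam: bearer today, SigV4 later.
+type Authenticator interface {
+	Authenticate(r *http.Request) error
+}
+
+// Bearer checks a shared-secret token loaded at startup from
+// server.auth.bearer_secret_file (mounted from a Kubernetes Secret).
+type Bearer struct {
+	secret []byte
+}
+
+func NewBearer(secret []byte) *Bearer { return &Bearer{secret: secret} }
+
+func (b *Bearer) Authenticate(r *http.Request) error {
+	const prefix = "Bearer "
+	h := r.Header.Get("Authorization")
+	if !strings.HasPrefix(h, prefix) {
+		return errUnauthorized
+	}
+	// Constant-time compare; whether the token is the raw shared secret or
+	// an HMAC derived from it is an implementation choice.
+	if subtle.ConstantTimeCompare([]byte(strings.TrimPrefix(h, prefix)), b.secret) != 1 {
+		return errUnauthorized
+	}
+	return nil
+}
+
+// Middleware guards the S3-compatible handler chain on the client edge.
+func Middleware(a Authenticator, next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if err := a.Authenticate(r); err != nil {
+			// S3-style clients generally expect 403 AccessDenied; XML body omitted.
+			http.Error(w, "AccessDenied", http.StatusForbidden)
+			return
+		}
+		next.ServeHTTP(w, r)
+	})
+}
+```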
+ +### 4.2 Internal listener (default `:8444`) + +Serves `GET /internal/fill?key=` for per-chunk fill RPCs +between replicas. Implementation follows +[design.md#88-internal-rpc-listener](./design.md#88-internal-rpc-listener). + +- Transport: HTTP/2 over mTLS. +- Server cert: per-replica cert (e.g. cert-manager-issued) chained to a + configured **internal CA** (`cluster.internal_tls.ca_file`). The + internal CA is **distinct** from the client mTLS CA so a leaked client + cert cannot be used to dial the internal listener. +- Client auth: peer presents a client cert chained to the internal CA AND + the peer's source IP must be in the current peer-IP set + (`Cluster.Peers()`). +- NetworkPolicy (`07-networkpolicy.yaml.tmpl`) restricts ingress on `:8444` + to pods with label `app=orca` in the same namespace. +- Loop prevention: receiver enforces `X-Origincache-Internal: 1` and + self-checks `Cluster.Coordinator(k) == Self()`; on disagreement returns + `409 Conflict` and the assembler falls back to local fill (one duplicate + fill possible during membership flux, observable via + `orca_origin_duplicate_fills_total{result="commit_lost"}`). + +## 5. Configuration shape + +```yaml +server: + listen: 0.0.0.0:8443 + max_response_bytes: 0 # 0 = no cap; >0 returns + # 400 RequestSizeExceedsLimit + # (S3-style XML) with header + # x-orca-cap-exceeded: true + # before any cache lookup. + # 416 is reserved for true + # Range vs. object-size violations. + tls: + cert_file: /etc/orca/tls/tls.crt + key_file: /etc/orca/tls/tls.key + client_ca_file: /etc/orca/tls/client-ca.crt # optional, enables mTLS + auth: + enabled: true # production: true. Dev: + # set false to disable client + # auth entirely (no token / + # cert required). NOT a + # dev_mode flag - just an + # auth-on/off knob. + mode: bearer # bearer | mtls | both + # (only meaningful when + # enabled=true) + bearer_secret_file: /etc/orca/secret/token + +readyz: + errauth_consecutive_threshold: 3 # mark NotReady after this many + # consecutive CacheStore ErrAuth; + # one non-ErrAuth success resets + +metadata_ttl: 5m # bounded-staleness window + # (design.md#11-bounded-staleness-contract); + # default 5m. Upper bound on + # serving stale ETag if the + # immutable-origin contract + # is violated by an operator. + +negative_metadata_ttl: 60s # negative-cache window + # (design.md#12-create-after-404-and-negative-cache-lifecycle); + # default 60s. Upper bound on + # serving stale 404 / unsupported- + # blob-type after the operator + # uploads a previously-missing + # key. Independent of metadata_ttl; + # short by design so create-after-404 + # recovery is fast. 
+ +chunking: + size: 8MiB # 4-16 MiB + prefetch: + enabled: true + depth: 4 + max_inflight_per_blob: 8 + max_inflight_global: 256 + +list_cache: # per-replica TTL'd cache + # of Origin.List responses; + # sized for FUSE-`ls` workload + # (design.md s6.2 / FW3) + enabled: true # default true; toggle off + # for diagnostics + ttl: 60s # default 60s; configurable + # 5s - 30m typical range + max_entries: 1024 # bounded LRU + max_response_bytes: 1MiB # responses larger than this + # bypass the cache entirely + swr_enabled: false # stale-while-revalidate; + # off by default + swr_threshold_ratio: 0.5 # background refresh trigger + # when entry age > ratio * ttl; + # only meaningful when + # swr_enabled=true + +chunk_catalog: # in-memory chunk presence + # cache + access tracking + # (design.md s10.2 / s13.2) + max_entries: 100000 # default 100K (~12 MB at + # ~120B/entry); SIZE TO + # WORKING SET per s13.3 + active_eviction: + enabled: false # default false; opt-in + # (preserves v1 lifecycle- + # only behavior); enable + # for posixfs deployments + # without external sweep + interval: 10m # eviction loop period + inactive_threshold: 24h # entry must be older than + # this since last access + access_threshold: 5 # evict only if AccessCount + # < threshold + min_age: 5m # cold-start protection; + # never evict entries + # younger than this + max_evictions_per_run: 1000 # bound per-cycle work + +metadata_refresh: # opt-in bounded-freshness + # mode (design.md s11.2 / + # FW5); proactively re-Heads + # hot keys ahead of + # metadata_ttl + enabled: false # default false; preserves + # "trust the contract" + # posture + interval: 1m # refresh-loop period + refresh_ahead_ratio: 0.7 # eligible when entry age + # >= ratio * metadata_ttl + # (default 0.7 * 5m = 3.5m) + access_threshold: 5 # only refresh hot keys + # (AccessCount >= threshold) + min_age: 75s # cold-start protection; + # never refresh entries + # younger than this + # (default = metadata_ttl/4) + max_refreshes_per_run: 100 # bound per-cycle work + refresh_concurrency: 8 # parallel refresh workers + +spool: + dir: /var/lib/orca/spool # bounded local-disk staging + max_bytes: 8GiB # full-spool -> 503 Slow Down + max_inflight: 64 # concurrent fills using spool + tmp_max_age: 1h # crash-recovery sweep age + require_local_fs: true # boot statfs(2) check; refuse + # to start if spool.dir is on + # NFS/SMB/CephFS/Lustre/GPFS/ + # FUSE. Defense-in-depth: the + # spool is no longer on the + # client TTFB path in v1, but + # joiner-fallback latency + # benefits materially from + # local block storage. + # Operators with unusual + # placements MAY relax to + # false; production deploys + # are expected to keep the + # default. + # See design.md#104-spool-locality-contract. + +origin: # leader-side pre-header + # retry budget; transient + # origin failures retry + # invisibly to the client + # before HTTP response + # headers are committed + # (design.md s8.6 / Option D) + retry: + attempts: 3 # max attempts before giving + # up and returning 502 + # OriginRetryExhausted + backoff_initial: 100ms # initial backoff + backoff_max: 2s # capped backoff per attempt + max_total_duration: 5s # absolute wall-clock cap; + # 502 if exhausted regardless + # of attempt count. Bounded + # well below typical S3 SDK + # read timeouts (aws-sdk-go + # 30s; boto3 60s) so retries + # complete before clients + # time out. 
+ +cachestore: + driver: localfs # localfs | posixfs | s3 + localfs: + root: /var/lib/orca/chunks + staging_max_age: 1h # sweep /.staging/ + # entries older than this; staging + # MUST live inside to keep + # link()/renameat2 atomic on the + # same filesystem + posixfs: # shared POSIX FS backend; same + # link()/EEXIST primitive as + # localfs but mounted on every + # replica at the same path + root: /mnt/orca/chunks # mount point + base dir; MUST + # be the same on every replica + staging_max_age: 1h # sweep /.staging/ + # entries older than this + fanout_chars: 2 # 2-char hex fan-out under + # / to bound dir + # sizes; 0 disables. localfs + # does NOT enable this by + # default; posixfs does. + backend_type: "" # "" = auto-detect via + # statfs(2) f_type + /proc/mounts + # (nfs|wekafs|ceph|lustre|gpfs|...); + # operator override allowed for + # backends with ambiguous magic + # numbers, logged loudly. + nfs: + minimum_version: "4.1" # refuse to start if mount + # negotiates a lower NFS version; + # see design.md#1012-cachestoreposixfs + allow_v3: false # opt-in NFSv3 with loud warning + # and posixfs_nfs_v3_optin_total++; + # NEVER set true in production + mount_check: true # parse /proc/mounts at boot to + # confirm vers= and sync export + # options; warn (not refuse) on + # async export + require_atomic_link_self_test: true # SelfTestAtomicCommit at startup; + # refuse to start if backend + # does not honor link()/EEXIST, + # directory fsync, or size verify + # via re-stat. Never disabled in + # production. + s3: + endpoint: https://vast.dc.example.internal + bucket: orca-chunks + region: us-east-1 + credentials_file: /etc/orca/cachestore-creds + atomic_commit_self_test: true # SelfTestAtomicCommit at + # startup; refuse to start if + # backend silently overwrites + # despite If-None-Match: * + require_unversioned_bucket: true # boot-time GetBucketVersioning + # check (design.md s10.1.3); + # refuse to start if Status: + # Enabled or Suspended; + # required because + # If-None-Match: * is not + # honored on versioned buckets + # across all S3-compatible + # backends (notably VAST) + circuit_breaker: # per-process breaker around all + # CacheStore calls; trips on + # sustained ErrTransient/ErrAuth + # to prevent amplifying degradation + enabled: true + error_window: 30s + error_threshold: 10 # ErrTransient + ErrAuth count; + # ErrNotFound does NOT + open_duration: 30s + half_open_probes: 3 + +chunkcatalog: + max_entries: 1_000_000 # ~128 MiB at ~128 B/entry + +origin: + id: aws-us-east-1-prod # deployment-scoped origin + # identifier; required; + # baked into ChunkKey and the + # on-store path so two + # deployments can safely share + # one CacheStore bucket + target_global: 192 # desired cluster-wide cap + # on concurrent + # Origin.GetRange (design.md + # s8.4). Per-replica cap is + # floor(target_global / + # cluster.target_replicas). + # Realized cluster-wide cap + # tracks target_global only + # when actual replica count + # equals + # cluster.target_replicas. + # Coordinated cluster-wide + # limiter is deferred future + # work (design.md s15.5). 
+ queue_timeout: 5s # bounded wait when the + # per-replica bucket is + # saturated; on timeout the + # request returns 503 Slow + # Down so clients back off + driver: s3 # s3 | azureblob + s3: + region: us-east-1 + bucket: example-data + credentials: env # env | irsa | file + azureblob: + account: exampleacct + container: data + auth: managed-identity # managed-identity | sas | key + enforce_block_blob_only: true # locked true; setting false + # is rejected at startup + list_mode: filter # filter | passthrough + metadata_ttl: 5m + rejection_ttl: 5m + +cluster: + enabled: true + service: orca.orca.svc.cluster.local + port: 8443 # client edge port on peers + # (used only as a discovery + # convention; internal RPCs + # use internal_listen below) + membership_refresh: 5s # headless Service DNS poll + internal_listen: 0.0.0.0:8444 # per-chunk fill RPC listener + internal_tls: + enabled: true # production: true (mTLS). + # Dev: set false to listen + # plain HTTP/2; binary logs + # WARN at startup. NOT a + # dev_mode flag - just a + # security knob. + cert_file: /etc/orca/internal-tls/tls.crt + key_file: /etc/orca/internal-tls/tls.key + ca_file: /etc/orca/internal-tls/ca.crt # internal CA, distinct + # from client CA + server_name: orca..svc # stable SAN; pinned as + # tls.Config.ServerName by + # internal-RPC dialers + # (NOT pod IPs); per-replica + # certs MUST include this SAN + target_replicas: 3 # expected replica count; + # used to compute the + # per-replica origin + # concurrency cap + # (target_per_replica = + # floor(origin.target_global / + # cluster.target_replicas)) + # (design.md s8.4). + # MUST be updated after + # any sustained scale + # change. Dynamic recompute + # is deferred future work + # (design.md s15.6). +``` + +CacheStore eviction (TTL / lifecycle) is configured separately on the +underlying storage system and is not a cache-layer concern. See +`operations.md` for recommended baselines. + +## 6. Observability + +- Prometheus collectors: + - `orca_requests_total{op,status}` + - `orca_request_duration_seconds{op}` (histogram) + - `orca_responses_aborted_total{phase,reason}` -- mid-stream + aborts after first byte sent (HTTP/2 `RST_STREAM` or HTTP/1.1 + `Connection: close`); `phase` in `pre_first_byte|mid_stream` + - `orca_chunk_hits_total`, `orca_chunk_misses_total` + - `orca_chunkcatalog_hits_total`, `orca_chunkcatalog_misses_total` + - `orca_chunkcatalog_entries` + - `orca_cachestore_stat_total{result="present|absent|error"}` + - `orca_cachestore_stat_duration_seconds` (histogram) + - `orca_origin_requests_total{origin,op,status}` + - `orca_origin_bytes_total{origin}` + - `orca_origin_request_duration_seconds{origin,op}` (histogram) + - `orca_origin_rejected_total{origin,reason,blob_type}` + - `orca_origin_etag_changed_total{origin}` -- count of `412 + Precondition Failed` responses to `If-Match: ` GETs; + leading indicator of mid-flight overwrite or stale metadata cache + - `orca_origin_retry_total{result="success|exhausted_attempts|exhausted_duration|etag_changed"}` + -- one increment per request that entered the pre-header retry + loop ([design.md s8.6](./design.md#86-failure-handling-without-re-stampede)). + `success` = origin returned a first byte after some attempts; + `exhausted_attempts` = ran out of attempts within the time + budget -> 502 OriginRetryExhausted; + `exhausted_duration` = exceeded `origin.retry.max_total_duration` + -> 502 OriginRetryExhausted; + `etag_changed` = OriginETagChangedError (non-retryable) -> 502 + OriginETagChanged. 
Sustained non-zero `exhausted_*` rates + indicate origin health issues. + - `orca_origin_retry_attempts` -- histogram of attempt + count per request that entered the retry loop. p50 should be + 1 (first attempt succeeds); a long tail toward + `origin.retry.attempts` indicates degraded origin. + - `orca_responses_aborted_total{phase="pre_commit|mid_stream",reason}` + -- response abort counters. `pre_commit` covers errors before + response headers are sent (mostly diagnostic; the request + typically returns a clean HTTP error). `mid_stream` covers + aborts after the commit boundary (origin disconnect after + first byte) and is the metric to watch for the cost paid by + the v1 streaming design. Sustained non-zero `mid_stream` rate + is the trigger for considering mid-stream origin resume + ([design.md s15.4](./design.md#154-mid-stream-origin-resume)). + - `orca_origin_duplicate_fills_total{result="commit_won|commit_lost"}` + - increments at every CacheStore commit attempt. The `commit_lost` rate + quantifies cross-replica fill duplication that escaped coordinator + routing (e.g. during membership flux during rolling restart). See + [design.md#8-stampede-protection](./design.md#8-stampede-protection) + and [design.md#14-horizontal-scale](./design.md#14-horizontal-scale). + - `orca_inflight_fills` + - `orca_singleflight_joiners_total` + - `orca_spool_bytes` -- current spool footprint + - `orca_spool_evictions_total{reason="committed|aborted|full"}` + - `orca_cluster_internal_fill_requests_total{direction="sent|received|conflict"}` + -- `conflict` increments whenever the receiver returns `409 Conflict` + because of a coordinator-membership disagreement + - `orca_cluster_internal_fill_duration_seconds` (histogram) + - `orca_cluster_membership_size` + - `orca_cluster_membership_refresh_duration_seconds` (histogram) + - `orca_cachestore_self_test_total{result="ok|failed"}` -- + incremented once per process start by `SelfTestAtomicCommit` + - `orca_cachestore_errors_total{kind="not_found|transient|auth"}` + -- typed CacheStore error counts (see + [design.md#102-catalog-correctness-typed-errors-circuit-breaker](./design.md#102-catalog-correctness-typed-errors-circuit-breaker)); + `not_found` is normal cold-path traffic, `transient` and `auth` + feed the breaker and (for `auth`) the `/readyz` threshold + - `orca_cachestore_breaker_state` -- 0=closed, 1=open, + 2=half_open + - `orca_cachestore_breaker_transitions_total{from,to}` -- + breaker state-transition counter + - `orca_origin_inflight{origin}` -- per-replica gauge of + in-flight `Origin.GetRange` calls; cap is + `floor(target_global / N_replicas)` per + [design.md#84-origin-backpressure](./design.md#84-origin-backpressure) + - `orca_metadata_origin_heads_total{origin,result}` -- + per-replica HEAD calls that actually reached the origin (not + served from the metadata cache); cluster-wide bound is N per + object per `metadata_ttl` window in v1 + - `orca_metadata_negative_entries` -- gauge of negative + metadata-cache entries (404 / unsupported-blob-type) currently + held by this replica. Drains as entries expire after + `negative_metadata_ttl`. See + [design.md#12-create-after-404-and-negative-cache-lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle). + - `orca_metadata_negative_hit_total{origin_id}` -- counter + of requests served from a negative entry. A spike following a + known operator upload signals create-after-404 drain in + progress. 
+ - `orca_metadata_negative_age_seconds{origin_id}` -- + histogram of negative-entry age at hit time. Upper-bound + percentiles inform `negative_metadata_ttl` tuning. + - `orca_list_cache_entries` -- gauge of LIST cache size + (current LRU population). Approaches `list_cache.max_entries` + indicate undersizing for the workload. See + [design.md s6.2](./design.md#62-list-request-flow). + - `orca_list_cache_hit_total{origin_id,result="hit|miss"}` + -- LIST cache hit rate; `result="hit"` increments on cache + serve, `result="miss"` on origin pass-through. Hit rate is the + primary indicator of LIST cache effectiveness for the FUSE + workload. + - `orca_list_cache_evict_total{reason="size|ttl|response_too_large"}` + -- LIST cache evictions by trigger. `size` = LRU bound; + `ttl` = lazy expiration on lookup; `response_too_large` = + response exceeded `list_cache.max_response_bytes` and bypassed + cache. + - `orca_list_cache_origin_calls_total{origin_id,result}` + -- LIST calls that actually reached origin (cache miss + + singleflight collapse). With per-replica caching, cluster-wide + bound is N origin LIST per identical query per + `list_cache.ttl`. + - `orca_list_cache_swr_refresh_total{origin_id,result}` + -- background stale-while-revalidate refreshes. Only emitted + when `list_cache.swr_enabled=true`. + - `orca_chunk_catalog_entries` -- gauge of in-memory + ChunkCatalog size. Pinned at `chunk_catalog.max_entries` + suggests undersizing relative to the working set + ([design.md s13.3](./design.md#133-chunkcatalog-size-awareness-load-bearing-operational-note)). + - `orca_chunk_catalog_hit_total{result="hit|miss"}` -- + catalog Lookup outcomes. Sustained hit_rate < 0.7 suggests + undersizing. + - `orca_chunk_catalog_evict_total{reason="size|active|forget"}` + -- catalog evictions by trigger. `size` = LRU bound (passive); + `active` = active eviction loop deleted from CacheStore; + `forget` = explicit Forget (ETag changed, GetChunk ErrNotFound). + - `orca_chunk_catalog_active_eviction_runs_total{result="ok|breaker_open|aborted"}` + -- active eviction loop completions. `breaker_open` means the + loop skipped this cycle because the CacheStore breaker is + open. Only emitted when + `chunk_catalog.active_eviction.enabled=true`. + - `orca_chunk_catalog_active_eviction_candidates` -- + histogram of per-run candidate count. Visibility into + eligible-but-not-yet-evicted entries. + - `orca_cachestore_delete_total{result="ok|not_found|transient|auth"}` + -- `CacheStore.Delete` outcomes (called by active eviction). + `not_found` is treated as success by the eviction loop + (idempotent). `transient` and `auth` count toward the + CacheStore circuit breaker. + - `orca_metadata_refresh_runs_total{result="ok|aborted|breaker_open"}` + -- bounded-freshness mode (FW5) per-loop completions. Only + emitted when `metadata_refresh.enabled=true`. See + [design.md s11.2](./design.md#112-bounded-freshness-mode-optional). + - `orca_metadata_refresh_total{result="ok|etag_changed|error|skipped_limiter_busy"}` + -- per-key refresh outcomes. `etag_changed` indicates an + immutable-contract violation detected proactively (the metric + `orca_origin_etag_changed_total` also increments). + - `orca_metadata_refresh_candidates` -- histogram of + eligible candidates per refresh-loop run. Visibility into the + hot-key set size. + - `orca_metadata_refresh_lag_seconds` -- histogram of + `(now - LastEntered)` at refresh time; should cluster around + `metadata_refresh.refresh_ahead_ratio * metadata_ttl`. 
+ - `orca_s3_versioning_check_total{result="ok|refused"}` -- + once-per-boot emission from the `cachestore/s3` versioning + gate ([design.md s10.1.3](./design.md#1013-cachestores3)). + `refused` indicates the bucket has versioning enabled or + suspended; the process exits non-zero immediately after. + - `orca_commit_after_serve_total{result="ok|failed"}` -- + asynchronous CacheStore commits that run after the client + response is complete; `failed` means the + client response succeeded but the chunk was NOT recorded in the + `ChunkCatalog` (next request refills); see + [design.md#86-failure-handling-without-re-stampede](./design.md#86-failure-handling-without-re-stampede) + - `orca_localfs_dir_fsync_total{result="ok|failed"}` -- + `fsync()` of the `/.staging/` and final-parent directories + on every commit, sweep, and orphaned-staging cleanup + - `orca_posixfs_link_total{result="commit_won|commit_lost|error"}` -- + every `link()` no-clobber commit attempt by `cachestore/posixfs`; + the loser of a race is `commit_lost` (returned `EEXIST`); other + failures are `error` and feed the breaker. See + [design.md#1012-cachestoreposixfs](./design.md#1012-cachestoreposixfs). + - `orca_posixfs_dir_fsync_total{result="ok|failed"}` -- + `fsync()` of `/.staging/` and `` directories + by `cachestore/posixfs`; rate matters because a network FS may + silently degrade dir-fsync semantics under an `async` export. + - `orca_posixfs_backend{type,version,major,minor}` -- info + gauge (value=1) labelled with the auto-detected (or + operator-overridden) backend at boot, e.g. + `type="nfs",version="4.1"`; `type="wekafs"`; `type="ceph"`; + `type="lustre"`; `type="gpfs"`. Used to tag every other posixfs + metric in dashboards via `group_left`. + - `orca_posixfs_selftest_last_success_timestamp` -- unix + seconds of the last successful `SelfTestAtomicCommit`; absent if + the driver never reached a green self-test. + - `orca_posixfs_nfs_v3_optin_total` -- count of boot-time + NFSv3 opt-in events (operator set + `cachestore.posixfs.nfs.allow_v3: true`); should be `0` in + production. + - `orca_posixfs_alluxio_refusal_total` -- count of boot + refusals because the detected backend was Alluxio FUSE; should be + `0`. Operators MUST switch to `cachestore.driver: s3` against the + Alluxio S3 gateway. + - `orca_spool_locality_check_total{result="ok|refused|bypassed",fs_type}` -- + boot `statfs(2)` outcome for `spool.dir`; `refused` means the FS + is on the network-FS denylist and the process exited non-zero; + `bypassed` means `spool.require_local_fs=false` (test-only). + See [design.md#104-spool-locality-contract](./design.md#104-spool-locality-contract). + - `orca_readyz_errauth_consecutive` -- current count of + consecutive `ErrAuth` responses from CacheStore; flips `/readyz` + to NotReady at `readyz.errauth_consecutive_threshold` (default 3) +- Structured logs with request IDs propagated to origin SDKs. +- `/healthz` and `/readyz`. Ready when the CacheStore is reachable, the + CacheStore startup self-test has succeeded (s10 of design.md), the + internal listener is bound, and origin credentials are valid. There is + no persistent local state to load. +- Admin endpoints (gated by separate listener / auth): + dump cluster topology, lookup chunk, force-`Forget` a catalog entry, + dump current spool inventory. +- `kubectl unbounded orca` subcommand for inspection (later phase). + +## 7. 
Phased delivery + +| Phase | Scope | Definition of done | +|---|---|---| +| **0 - skeleton** | `cmd/orca` boilerplate; `Origin` and `CacheStore` interfaces; `origin/s3`; `cachestore/localfs`; in-memory `chunkcatalog`; single-process Range GET; streaming chunk iterator; `make` integration; basic unit tests | One process serves a Range GET against a real S3 bucket and re-serves it from `localfs` | +| **1 - prod basics** | `fetch.Coordinator` with chunk + meta singleflight + tee; `chunkcatalog` LRU + Stat-on-miss path with **per-entry access-frequency tracking** (FW8) and bounded by `chunk_catalog.max_entries` with size-awareness operational guidance ([design.md s13.3](./design.md#133-chunkcatalog-size-awareness-load-bearing-operational-note)); atomic CacheStore writes (`localfs` `link`/`renameat2(RENAME_NOREPLACE)` with **staging inside `/.staging/` + parent-dir fsync**); metadata cache with `metadata_ttl=5m` and **`negative_metadata_ttl=60s`** (asymmetric defaults; bounds the create-after-404 unavailability window per [design.md s12](./design.md#12-create-after-404-and-negative-cache-lifecycle)) including `metadata_negative_entries` / `metadata_negative_hit_total` / `metadata_negative_age_seconds` metrics; **per-replica LIST cache** (FW3) with default `list_cache.ttl=60s`, `max_entries=1024`, sized for FUSE-`ls` workload ([design.md s6.2](./design.md#62-list-request-flow)); **active eviction** (FW8) opt-in via `chunk_catalog.active_eviction.enabled` (default off; recommended on for posixfs deployments without external sweep) including `CacheStore.Delete` interface method; **bounded-freshness mode** (FW5) opt-in via `metadata_refresh.enabled` (default off) with hot-key detection via metadata-cache access counters ([design.md s11.2](./design.md#112-bounded-freshness-mode-optional)); **distributed origin limiter** is deferred future work (see [design.md s15.5](./design.md#155-coordinated-cluster-wide-origin-limiter)); v1 ships with a per-replica token bucket sized `floor(origin.target_global / cluster.target_replicas)` (default 64 slots/replica at `target_global=192`, `target_replicas=3`), with origin throttling responses handled by the leader's pre-header retry loop ([design.md s8.4](./design.md#84-origin-backpressure)); **bounded staleness contract documented**; **strict `If-Match: ` on every `Origin.GetRange` plus `OriginETagChangedError` handling**; **typed `CacheStore` errors (`ErrNotFound|ErrTransient|ErrAuth`)** with only `ErrNotFound` triggering refill; **per-replica HEAD singleflight wording** in metadata layer; **pre-header origin retry** (`origin.retry.attempts=3`, `origin.retry.max_total_duration=5s` defaults) as the cold-path commit boundary - cold-path bytes stream origin -> client directly with bounded leader-side retry handling transient origin failures invisibly before HTTP response headers are committed; spool tees in parallel for joiner support and as the asynchronous CacheStore-commit source ([design.md s8.6](./design.md#86-failure-handling-without-re-stampede)); **mid-stream abort** on post-first-byte failure (`RST_STREAM` / `Connection: close`); **`server.max_response_bytes` cap returns `400 RequestSizeExceedsLimit`** (S3-style XML; 416 reserved for Range vs. 
EOF); `HeadObject`; `ListObjectsV2`; `origin/azureblob` (Block Blob only); **`cachestore/s3` versioning gate** ([design.md s10.1.3](./design.md#1013-cachestores3)) refusing to start on versioned buckets; Prometheus; structured logging; health / readiness | One replica deployed in a dev K8s cluster serving traffic against both S3 and Azure (multi-replica clustering lands in Phase 3) | +| **2 - prod backend & ops** | `cachestore/s3` for VAST with `PutObject` + `If-None-Match: *` and **`SelfTestAtomicCommit` at startup** (refuse to start if backend silently overwrites); **`cachestore/posixfs` for shared POSIX FS deployments** (NFSv4.1+ baseline, plus Weka native, CephFS, Lustre, GPFS) sharing `link()`/`EEXIST` + dir-fsync helpers with `cachestore/localfs` via `internal/orca/cachestore/internal/posixcommon/`, with **`SelfTestAtomicCommit` at startup** (refuse to start on Alluxio FUSE, on NFS below `nfs.minimum_version=4.1` unless `nfs.allow_v3` is set, or on any backend that fails the link-EEXIST + dir-fsync + size-verify self-test) and 2-char hex fan-out under `/`; **`internal/orca/fetch/spool` layer** (slow-joiner fallback regardless of CacheStore driver) **with mandatory boot `statfs(2)` locality check** that refuses to start when `spool.dir` is on a network FS (NFS / SMB / CephFS / Lustre / GPFS / FUSE); **`commit_after_serve_total{ok|failed}` async-commit metric path**; **per-process CacheStore circuit breaker** (`enabled,error_window=30s,error_threshold=10,open_duration=30s,half_open_probes=3`); **per-replica origin semaphore documented** with formula `floor(target_global / N_replicas)` + `origin_inflight` gauge; **`localfs` `staging_max_age=1h` orphaned-staging sweeper** (and equivalent `posixfs.staging_max_age=1h`); **`/readyz` ErrAuth threshold (default 3 consecutive -> NotReady)**; sequential read-ahead; bearer / mTLS auth on the client edge; `deploy/orca/` manifests (incl. `07-networkpolicy.yaml.tmpl`); `images/orca/` Containerfile; `hack/orca/` published with CacheStore lifecycle policy guidance and POSIX-backend support matrix | Production-shaped service running against VAST in a real DC with the self-test green, AND a parallel green run against at least one shared-POSIX backend (NFSv4.1+ baseline) | +| **3 - cluster** | `cluster/` peer discovery from headless Service DNS; rendezvous hashing on pod IP; **per-chunk internal fill RPC** (assembler fan-out); **internal mTLS listener on `:8444`** with internal CA + peer-IP authz + **stable `ServerName=orca..svc`** pinned by dialers (per-replica certs MUST include this SAN) + `X-Origincache-Internal` loop prevention + `409 Conflict` on coordinator disagreement; NetworkPolicy applied; `kubectl unbounded orca` inspection subcommand | Multi-replica Deployment sustaining target throughput; `commit_lost` rate near zero in steady state | +| **4 - optional** | NVMe / HDD tiering; S3 SigV4 verification; adaptive prefetch; deferred optimizations catalogued in [design.md s15](./design.md#15-deferred-optimizations) (edge rate limiting, cluster-wide HEAD singleflight, cluster-wide LIST coordinator) if measured to be needed | As needed | + +Estimated calendar: Phase 0 + 1 ~= 3-4 focused weeks. Phase 2 + 3 another +4-6 weeks depending on ops depth. + +## 8. Test strategy + +- `chunker` and `singleflight`: table-driven + fuzz (`go test -fuzz`). + Iterator must never materialize the full `[]ChunkKey` for a range; + test with `lastChunk - firstChunk = 1_000_000` and assert bounded + allocation. 
+- `chunkcatalog`: LRU eviction behavior, concurrent `Lookup` / + `Record` / `Forget`, bounded entry count. +- `cachestore/localfs`: temp-dir integration tests including: + - crash simulation (kill mid-write, verify `*.tmp.*` cleanup and + recovery via the periodic sweep); + - **two-leader race**: two goroutines both call `PutChunk(k, ..)` with + distinct payloads; assert exactly one wins (`commit_won`), the other + sees `EEXIST` and reports `commit_lost`, and the on-disk content + matches the winner. +- `cachestore/s3`: integration tests against `minio` covering: + - direct `PutObject(final, body, If-None-Match: "*")` commit; + - **`SelfTestAtomicCommit` pass** (real `minio` returns `412` on the + second probe write); + - **`SelfTestAtomicCommit` fail** (mock S3 server that always returns + `200`; assert process exits with the documented error); + - **412 commit_lost path**: two concurrent leaders, distinct payloads; + assert exactly one `commit_won` and one `commit_lost`, and the stored + object equals the winner's bytes; + - idempotent re-PUT (committed key + repeated PutObject yields 412 + without data loss). +- `origin/s3`: contract tests against `minio` in CI, including: + - **`If-Match: ` header is sent on every `GetRange`** (assert via + request capture); + - **412 -> `OriginETagChangedError`**: overwrite the object mid-test, + issue `GetRange` with the old etag, assert typed error and that the + metadata cache entry for `{origin_id, bucket, key}` is invalidated. +- `origin/azureblob`: contract tests against `azurite` in CI, including: + - One Block Blob, one Page Blob, one Append Blob. + - GETs against Page / Append return `502 OriginUnsupported` and + increment `orca_origin_rejected_total`. + - `ListObjectsV2` in `filter` mode returns only the Block Blob and + preserves continuation tokens across pages. + - 1000 concurrent requests for the same Page Blob produce exactly one + upstream `HEAD`. + - `If-Match: ` sent on every Get Blob; 412 -> `OriginETagChangedError`. +- `fetch.Coordinator` stampede tests: + - 1000 goroutines requesting the same `ChunkKey`; mock origin called + exactly once; all readers receive identical bytes. + - Same as above but origin returns an error after N bytes; all + pre-first-byte joiners get a `502`; mid-stream joiners get an aborted + response (`RST_STREAM` or `Connection: close`); a follow-up request + triggers exactly one new origin call. + - All joiners cancel mid-fill; chunk still lands in cache. + - **Mid-fill `OriginETagChangedError`**: after N bytes, mock origin + returns 412 on `If-Match`; assert (a) leader fails the fill with + `OriginETagChangedError`, (b) metadata cache entry invalidated, (c) + `orca_origin_etag_changed_total` increments, (d) pre-first-byte + joiners receive `502`, mid-stream joiners are aborted, (e) the next + request issues a fresh `Head`, gets a new etag, derives a new + `ChunkKey`, and successfully fills. + - **Slow-joiner spool fallback**: leader streams from origin via + spool + ring buffer; one joiner is artificially slowed beyond the + ring buffer head; assert the joiner transparently switches to + `Spool.Reader` and receives identical bytes; spool entry is released + after refcount hits zero. + - **Spool exhaustion**: fill `spool.max_bytes` with held-open joiners; + assert subsequent fill requests time out on `spool.max_inflight` and + return `503 Slow Down` to the client. 
+- Cold-start: a freshly started replica receives a request for a chunk + already present in the CacheStore; assert exactly one + `CacheStore.Stat`, no origin call, chunk served from CacheStore, + `ChunkCatalog` populated; subsequent request hits the catalog. +- Cluster: + - in-process 3-replica test for assembler fan-out and per-chunk + coordinator routing against a shared CacheStore; assert + `orca_origin_duplicate_fills_total{result="commit_lost"}` = 0 + under steady-state membership; + - **internal-listener authz**: peer with valid internal cert but source + IP outside `Cluster.Peers()` is rejected; client cert chained only to + the *client* CA is rejected; + - **loop prevention**: replica A forwards `/internal/fill` to replica B + with `X-Origincache-Internal: 1`; B's view of `Coordinator(k)` is C; + assert B returns `409 Conflict` and A falls back to local fill; + - **1000-chunk fan-out**: client requests a `Range` spanning 1000 + distinct cold chunks across 3 replicas; assert the assembler issues + fan-out fill RPCs concurrently up to the configured cap, response + body is byte-identical to a direct origin read, and total origin + GETs equal exactly 1000. +- End-to-end: docker-compose with `minio` (origin) + a second `minio` + (CacheStore) + a single `orca` process; scripted range-read + scenarios incl. mid-test object overwrite to exercise the `If-Match` + path end-to-end. +- Load test: `vegeta` / `k6` against a process backed by a mock origin with + injected latency. Confirm origin RPS stays at exactly 1 per cold chunk + and at most semaphore-limited overall, while client RPS scales linearly. +- **T-1a metadata_ttl bound** (`metadata` package): seed metadata cache + with `etag=v1` at t=0; at t=`metadata_ttl - jitter`, assert reads + still see `v1` without a new HEAD; at t=`metadata_ttl + jitter`, + overwrite origin to `etag=v2`, assert next request triggers HEAD, + observes `v2`, and derives a new `ChunkKey`. Asserts the staleness + cap from + [design.md#11-bounded-staleness-contract](./design.md#11-bounded-staleness-contract). +- **T-create-after-404a stale window** + (`metadata` + `fetch.Coordinator`): origin returns `404` for key `K` + at t=0; assert the cache returns `404` to the client and records a + negative metadata entry. Operator-side mock uploads `K` to origin at + t=`negative_metadata_ttl / 2`. At t=`negative_metadata_ttl - jitter`, + re-issue the client GET against the same replica; assert `404` is + still returned (negative entry still valid) and that + `metadata_negative_hit_total` was incremented. Asserts the bound in + [design.md#12-create-after-404-and-negative-cache-lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle). +- **T-create-after-404b recovery** + (`metadata` + `fetch.Coordinator`): same setup as 404a, but at + t=`negative_metadata_ttl + jitter` re-issue the GET against the same + replica; assert the cache re-Heads, observes `200`, and serves the + newly-uploaded bytes via the normal fill path. +- **T-create-after-404c per-replica fan-out** (multi-replica integration): + in a 2-replica deployment, route the original `404` GET to replica A + only; upload `K` to origin; route a follow-up GET to replica B and + assert it serves `200` immediately (replica B never observed the + 404, so its metadata cache is fresh); route another follow-up to + replica A and assert it still returns `404` until its own + `negative_metadata_ttl` window expires. 
+- **T-list-cache-hit** (`metadata` + `fetch.Coordinator`): identical + LIST queries within `list_cache.ttl` -> first triggers + `Origin.List`, second served from cache; assert + `list_cache_hit_total{result="hit"}` increments and origin LIST + count = 1. +- **T-list-cache-ttl-expiry**: identical LIST query at `t=0` and + `t=list_cache.ttl + jitter` -> two `Origin.List` calls; assert + cache expired correctly. +- **T-list-cache-response-too-large**: mock `Origin.List` returning + a response that exceeds `list_cache.max_response_bytes` -> response + served to client but cache not populated; assert + `list_cache_evict_total{reason="response_too_large"}` incremented. +- **T-list-cache-error-passthrough**: `Origin.List` returns 503 -> + error passed to client; subsequent retry calls origin again (no + negative caching). +- **T-list-cache-pagination**: continuation tokens are part of the + cache key -> different tokens cache independently; sequential + page-through doesn't collide. +- **T-list-cache-swr-trigger**: with `list_cache.swr_enabled=true`, + query at `t=0`, query at `t=ttl*ratio + jitter` -> assert + immediate cached response AND background refresh fires; assert + origin LIST count = 2 over the window. +- **T-list-cache-fuse-pattern**: simulate FUSE `ls` workload (1 query + / 5s for 5 minutes against same prefix at `list_cache.ttl=60s`) -> + assert origin LIST count == 5 (one per minute); assert all client- + observed latencies are sub-millisecond except the 5 cache-miss + instances. +- **T-catalog-access-tracking** (`chunkcatalog`): Lookup hits + increment `AccessCount`; `LastAccessed` updates; cold entries + score lower than warm entries by the eviction ordering. +- **T-catalog-cold-start-protection**: entry created at t=0 not + eligible for active eviction at `t < min_age` regardless of + `AccessCount`. +- **T-active-eviction-cold-chunk** (`chunkcatalog` + `cachestore`): + chunk in CacheStore + catalog entry with `AccessCount=0`, + `LastEntered=t-25h`, `chunk_catalog.active_eviction.enabled=true`. + Run eviction loop. Assert `CacheStore.Delete` called; catalog + Forgets the entry; metric + `cachestore_delete_total{result="ok"}` increments. +- **T-active-eviction-popular-chunk**: chunk with `AccessCount=10`. + Run eviction loop. Assert NOT deleted. +- **T-active-eviction-bounded-run**: 5000 eligible candidates, + `max_evictions_per_run=1000`. Assert exactly 1000 deleted, 4000 + remain (next cycle catches them). +- **T-active-eviction-breaker-open**: simulate `CacheStore.Delete` + returning `ErrTransient` repeatedly until breaker opens. Assert + subsequent eviction runs skip with + `active_eviction_runs_total{result="breaker_open"}`. +- **T-catalog-size-undersized**: `chunk_catalog.max_entries=10`, + working set=100 entries. Assert hit rate < 0.7; assert + `chunk_catalog_evict_total{reason="size"}` increments steadily. +- **T-metadata-refresh-hot-key** (`metadata`): hot entry + (`AccessCount=10`) at age `0.7 * metadata_ttl` is refreshed by the + bounded-freshness loop; `LastEntered` updates; client sees no + observable change. Requires `metadata_refresh.enabled=true`. +- **T-metadata-refresh-cold-key-skipped**: cold entry + (`AccessCount=2`) NOT refreshed even when eligible by age. +- **T-metadata-refresh-cold-start-protected**: entry created at t=0, + hot, NOT refreshed at `t < min_age`. 
+- **T-metadata-refresh-etag-changed**: background refresh detects + new ETag; metadata cache updates; old `ChunkKey`s are orphaned; + next chunk request derives new `ChunkKey`s; metric + `metadata_refresh_total{result="etag_changed"}` increments; + `origin_etag_changed_total` also increments. +- **T-metadata-refresh-bounded**: 500 eligible candidates, + `max_refreshes_per_run=100` -> exactly 100 refreshed per cycle; + remaining catch up on subsequent cycles. +- **T-metadata-refresh-disabled**: `enabled=false` -> no background + activity; behaves like v1. +- **T-metadata-refresh-singleflight-race**: on-demand HEAD and + background refresh fire concurrently for the same key; per-replica + HEAD singleflight collapses to one origin HEAD; both consumers + get the result. +- **T-metadata-refresh-negative-entries-not-refreshed**: negative + entry (404) under `negative_metadata_ttl` is NOT refreshed; + expires naturally. +- **T-origin-per-replica-cap** (`origin` + mock origin): with + `cluster.target_replicas=3` and `origin.target_global=192` + (giving per-replica cap = 64), launch 100 concurrent + `Origin.GetRange` calls on a single replica. Assert at most 64 + hit origin concurrently; the remainder queue up to + `origin.queue_timeout` (5s) before returning `503 Slow Down` to + the client. Validates the simple per-replica token bucket + (design.md s8.4). +- **T-origin-throttle-handled-by-retry** (`origin` + + `fetch.Coordinator` + mock origin): origin returns `503 SlowDown` + on the first attempt and `200` on the second. Assert client sees + a clean 200 response; assert + `origin_retry_total{result="success"}=1`. Validates that origin + throttling does NOT require a coordinated cluster-wide cap; + pre-header retry handles it. +- **T-s3-versioned-bucket-refusal** (`cachestore/s3`): configure + `cachestore/s3` against a bucket with versioning enabled; assert + process exits non-zero with the documented error message and + metric `s3_versioning_check_total{result="refused"}=1`. +- **T-s3-unversioned-bucket-ok** (`cachestore/s3`): configure + `cachestore/s3` against an unversioned bucket; assert + `GetBucketVersioning` returns `Status: Disabled`; gate passes; + metric `s3_versioning_check_total{result="ok"}=1`; driver proceeds + to `SelfTestAtomicCommit`. +- **T-pre-header-retry-success** (`fetch.Coordinator` + mock origin): + origin returns transient 503 on attempt 1, 200 + bytes on attempt 2; + assert client sees clean 200 response with no observable abort; + assert `origin_retry_total{result="success"}=1`; assert + `origin_retry_attempts` records 2 attempts. +- **T-pre-header-retry-exhausted-attempts**: origin returns 503 on + every attempt within the duration budget; assert client receives + clean `502 Bad Gateway` with code `OriginRetryExhausted` after + `origin.retry.attempts` exhaust; assert + `origin_retry_total{result="exhausted_attempts"}=1`. +- **T-pre-header-retry-exhausted-duration**: origin slow-503 with + hangs that push total wall-clock past + `origin.retry.max_total_duration`; assert client receives `502` + before all attempts complete; assert + `origin_retry_total{result="exhausted_duration"}=1`. +- **T-pre-header-retry-etag-changed-non-retryable**: origin returns + `OriginETagChangedError` on attempt 1; assert NO retry happens; + assert `502` with code `OriginETagChanged`; assert + `origin_retry_total{result="etag_changed"}=1`; assert metadata + cache invalidated. 
+- **T-pre-header-retry-cold-path-ttfb** (`fetch` + mock origin): + with origin returning bytes after 10ms first-byte latency, + assert client TTFB < 50ms (sum of origin first-byte + small + pre-header retry overhead); assert NO chunk-download wait on + the TTFB path. Validates Option D's TTFB claim + ([design.md s8.6](./design.md#86-failure-handling-without-re-stampede)). +- **T-mid-stream-abort-first-chunk-after-commit** (`fetch` + + `spool` + mock origin): origin succeeds for first byte; cache + commits headers + first byte; origin disconnects at 50% of + chunk; assert client connection aborts (HTTP/2 RST_STREAM or + HTTP/1.1 Connection: close); assert + `responses_aborted_total{phase="mid_stream"}=1`; client SDK + retries (validated separately via real aws-sdk-go integration + test). +- **T-spool-tee-joiner-during-streaming** (`fetch` + `spool`): + leader streams 8 MiB chunk to client A; joiner B arrives at + 50% point through the singleflight; B reads from ring buffer + while on-pace; B falls behind; B switches to spool reader; both + finish with full chunk byte-for-byte. Confirms the spool tee + works in parallel with client streaming and joiner-fallback is + unaffected by the drop of the spool-fsync gate. +- **T-commit-after-serve failure** (`fetch` + `spool` + `cachestore`): + inject CacheStore commit error after the client response is + complete; assert the client response completes successfully + byte-for-byte; assert + `orca_commit_after_serve_total{result="failed"}` == 1; + assert `ChunkCatalog.Lookup(k)` is still a miss; assert a + follow-up request triggers exactly one new origin GET. +- **T-3 typed CacheStore errors** (`cachestore` + `fetch`): inject each + of `ErrNotFound|ErrTransient|ErrAuth` from `CacheStore.GetChunk`: + - `ErrNotFound` -> miss-fill path runs, eventual 200/206 to client; + - `ErrTransient` -> client receives `503 Slow Down` with + `Retry-After: 1s` and `cachestore_errors_total{kind="transient"}` + increments; no refill attempted; + - `ErrAuth` -> client receives `502 Bad Gateway`, + `cachestore_errors_total{kind="auth"}` increments, + `readyz_errauth_consecutive` increments. +- **T-3 circuit breaker** (`cachestore`): inject 10 `ErrTransient` over + 30s; assert breaker opens (`breaker_state=1`, + `breaker_transitions_total{from="closed",to="open"}` == 1); subsequent + calls short-circuit; after 30s, the next 3 probes are allowed (half-open + state); on all-success, breaker closes; on any failure during half-open, + breaker re-opens. +- **T-4a per-replica origin semaphore** (`fetch`): set semaphore to 4; + drive 16 concurrent cold misses across 16 distinct chunks; assert + in-flight `Origin.GetRange` never exceeds 4; assert + `orca_origin_inflight{origin}` saturates at 4; remaining 12 + fills queue and complete in 4-wide batches. +- **T-6a localfs staging-inside-root** (`cachestore/localfs`): assert + every commit writes to `/.staging/` (NOT `/tmp` and NOT + the spool dir); assert `link()` to final and `unlink()` of staging + both happen on the same filesystem; inject orphaned staging entries + older than `staging_max_age=1h`, run sweep, assert they are removed + and `localfs_dir_fsync_total` increments. Verify parent-dir fsync is + invoked by intercepting the syscall via a test seam (no strace + required). 
+- **T-posixfs-nfs link-EEXIST race** (`cachestore/posixfs`): two + goroutines on two simulated replicas (two open mount handles to a + loopback `nfsd` v4.1 export in CI) call `PutChunk(k, ..)` with + distinct payloads; assert exactly one wins (`commit_won`, + `posixfs_link_total{result="commit_won"}` == 1), the other observes + `EEXIST` and reports `commit_lost` + (`posixfs_link_total{result="commit_lost"}` == 1), and the on-disk + content visible from a third reader matches the winner. Repeat + against `tmpfs` (treated as local) as a control. +- **T-posixfs-nfs SelfTestAtomicCommit success** (`cachestore/posixfs`): + boot the driver against a CI loopback `nfsd` v4.1 export with `sync`; + assert `posixfs_selftest_last_success_timestamp` is set and the + process accepts traffic. Repeat against an `async` export and assert + the runbook warning is logged (note: detecting server-side `async` + is best-effort; the size-verify step still runs and may pass even + with `async` because the kernel client cache is consistent within a + process). +- **T-posixfs-nfs SelfTestAtomicCommit failure** (`cachestore/posixfs`): + boot against a mock POSIX backend (FUSE shim) that + (a) returns `0` instead of `EEXIST` from a second `link()`, OR + (b) silently drops the size-verify check; assert the process exits + non-zero with the documented `cachestore/posixfs: backend does not + honor link()/EEXIST or directory fsync; refusing to start` message. +- **T-posixfs-nfs version gate** (`cachestore/posixfs`): boot against + a loopback NFSv3 export with `cachestore.posixfs.nfs.allow_v3: + false` (default); assert the process exits non-zero. Then set + `allow_v3: true` and reboot; assert the process starts with a loud + WARN log line and `posixfs_nfs_v3_optin_total` == 1. Boot against + NFSv4.0 with the default config; assert exit non-zero (4.0 < 4.1 + minimum and 4.0 is not v3-opt-in eligible). +- **T-posixfs-nfs Alluxio refusal** (`cachestore/posixfs`): boot + against a FUSE mount whose `/proc/mounts` source string contains + `alluxio` (case-insensitive); assert the process exits non-zero + with the `cachestore/posixfs: Alluxio FUSE is unsupported` message + and `posixfs_alluxio_refusal_total` == 1. Repeat with a non-Alluxio + FUSE mount (e.g. a test FUSE shim) and assert the process still + refuses (because FUSE_SUPER_MAGIC also fails the spool-locality + check when `spool.dir` is on the same FS, AND `cachestore/posixfs` + treats a generic FUSE backend as unverified). +- **T-posixfs-fanout** (`cachestore/posixfs`): with + `fanout_chars: 2`, assert chunk paths under + `////`; with + `fanout_chars: 0`, assert paths under + `///`; assert `localfs` default + (`fanout_chars: 0` for localfs) produces the flat layout. Verify + the same `posixcommon` package powers both code paths via a unit + test on the helper. +- **T-spool-locality refusal** (`spool` + `cmd/orca`): boot + with `spool.dir` on a tmpfs-backed loopback NFS mount (CI helper); + assert the process exits non-zero with the `spool: ... is on a + network filesystem (nfs); ... Refusing to start` message and + `orca_spool_locality_check_total{result="refused",fs_type="nfs"}` + == 1. Repeat with `spool.require_local_fs: false`; assert the + process starts, `result="bypassed"` is emitted, and the boot log + carries the `WARN spool.require_local_fs is disabled` line. + Separately assert a clean local-FS run emits `result="ok"`. 
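+
+  A minimal Go sketch of the `statfs(2)` locality gate this test covers
+  (Linux-only sketch; the magic numbers are values from `linux/magic.h`,
+  the list is deliberately truncated, and the package/function names are
+  hypothetical):
+
+  ```go
+  package spoolsketch
+
+  import (
+  	"fmt"
+
+  	"golang.org/x/sys/unix"
+  )
+
+  // Subset of filesystem magic numbers (linux/magic.h); the real check
+  // would cover the full NFS/SMB/CephFS/Lustre/GPFS/FUSE list.
+  var networkFS = map[int64]string{
+  	0x6969:     "nfs",
+  	0xFF534D42: "smb", // CIFS
+  	0x65735546: "fuse",
+  }
+
+  // checkSpoolLocality refuses a spool.dir that sits on a known network
+  // filesystem when requireLocal (spool.require_local_fs) is true;
+  // otherwise the caller logs the documented WARN and continues.
+  func checkSpoolLocality(dir string, requireLocal bool) error {
+  	var st unix.Statfs_t
+  	if err := unix.Statfs(dir, &st); err != nil {
+  		return err
+  	}
+  	if fsType, isNetwork := networkFS[int64(st.Type)]; isNetwork && requireLocal {
+  		return fmt.Errorf("spool: %s is on a network filesystem (%s); refusing to start", dir, fsType)
+  	}
+  	return nil
+  }
+  ```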
+- **T-D3 internal mTLS ServerName** (`cluster`): boot 3 replicas with + per-replica certs whose only SAN is `orca..svc`; + rolling-restart one pod so its IP changes; assert the dialer pins + `tls.Config.ServerName = orca..svc` and the handshake + succeeds against the new pod IP without cert reissuance. +- **T-D4 readyz on ErrAuth** (`cachestore` + `server`): inject 1 + `ErrAuth` -> `/readyz` still 200; inject 3 consecutive `ErrAuth` -> + `/readyz` returns 503 NotReady and + `readyz_errauth_consecutive` == 3; interleave a non-auth `ErrNotFound` + between failures and assert it does NOT reset the counter (only a + successful CacheStore call resets); inject success after the + threshold trips, assert counter resets to 0 and `/readyz` returns + 200 again. +- **T-edge cap-exceeded 400** (`server`): set `max_response_bytes=1MiB`; + request `Range: bytes=0-2097151` (2 MiB); assert response is + `400 RequestSizeExceedsLimit` (S3-style XML body) with + `x-orca-cap-exceeded: true`; separately, request a Range past + EOF and assert response is `416 Requested Range Not Satisfiable` + (cap-exceeded MUST NOT be reported as 416). + +## 9. Out of scope for v1 (explicit) + +Re-stated to prevent drift: + +- No write path, multipart upload, or object versioning. +- No cross-DC peering. +- No SigV4 verification. +- No multi-tenant quotas or per-tenant credentials. +- No mutable-blob invalidation. ETag change is the only signal we honor, + and it is enforced at the origin via `If-Match` on every GET (no + opt-out). +- No encryption at rest beyond what the underlying CacheStore provides. + +## 10. Open questions / risks + +- **Origin immutability is an operator contract**: Orca trusts + that an `(origin_id, bucket, object_key)` is immutable for the life + of the key (replacement must use a new key); the bounded violation + window is `metadata_ttl` (default 5m). `If-Match: ` on every + `Origin.GetRange` is defense-in-depth that catches in-flight + overwrites only. Operators MUST surface this contract in the consumer + API documentation. See + [design.md#11-bounded-staleness-contract](./design.md#11-bounded-staleness-contract). +- **Commit-after-serve failure** (decision 2b): with v1 Option D + the cold-path bytes stream origin -> client directly; the + CacheStore commit is async and happens after the client response + is complete. A failure there leaves the client successful but + the chunk uncached. Repeated + failures are visible only via + `orca_commit_after_serve_total{result="failed"}` and the + CacheStore circuit breaker; operators MUST alert on a sustained + non-zero rate (it indicates CacheStore degradation, not request + errors). +- **Per-replica origin semaphore is approximate**: each replica + enforces `floor(origin.target_global / cluster.target_replicas)` + (default 64 slots/replica at `target_global=192`, + `target_replicas=3`). Realized cluster-wide concurrency tracks + `target_global` only when `N_actual == cluster.target_replicas`; + scale-out without updating the knob over-allocates against + origin (cluster-wide cap exceeds `target_global` by + `(N_actual - target_replicas) * target_per_replica`); scale-in + under-allocates. Mitigations: operators MUST update + `cluster.target_replicas` after sustained scale changes; a + coordinated cluster-wide limiter (s15.5) and dynamic recompute + from `len(Cluster.Peers())` (s15.6) are deferred future work. 
+ Origin throttling responses (`503 SlowDown` / `429`) are handled + by the leader's pre-header retry loop (s8.6) with exponential + backoff regardless; origin self-protects against the static-cap + overshoot. +- **VAST `If-None-Match: *` requires unversioned bucket**: the + `cachestore/s3` driver relies on the backend honoring + `If-None-Match: *` to enforce no-clobber atomic commit. AWS S3 + (since 2024-08), MinIO, and VAST Cluster (non-versioned buckets + only) are verified. The driver runs a boot-time `GetBucketVersioning` + versioning gate ([design.md s10.1.3](./design.md#1013-cachestores3)) + and refuses to start on enabled or suspended versioning. VAST KB + citation is in design.md. The `SelfTestAtomicCommit` probe is the + defense-in-depth backstop if any future S3-compatible backend + reports versioning correctly but silently overwrites anyway. +- **LocalStack community-tier image must be pinned**: the + dev harness uses LocalStack as the `cachestore/s3` backend + (`hack/orca/dev-harness.md`). The `localstack/localstack:latest` + tag now requires a Pro auth token and exits with code 55 on the + free tier. Dev manifests pin to `localstack/localstack:3.8`, the + last known-stable community-tier release whose S3 implementation + honors `PutObject + If-None-Match: *` (verified locally; both the + `SelfTestAtomicCommit` and the `GetBucketVersioning` versioning + gate pass). Future LocalStack releases may diverge; if the dev + harness fails to start, the first action is to verify `If-None-Match: *` + + `GetBucketVersioning` against the pinned image. +- **NFS export `async` weakens dir-fsync**: `cachestore/posixfs` + depends on directory `fsync()` being durable on the server, which + requires the NFS export to be `sync` (not `async`). The driver + cannot reliably detect server-side `async` from the client; Phase 2 + ships an operator runbook entry that mandates `sync` exports and a + best-effort warning if `/proc/mounts` reveals an `async` client mount + option. Mitigation: the boot self-test re-`stat`s through the kernel + client cache and catches the most common misconfigurations; persistent + silent corruption requires both server `async` AND a + power-loss-window-sized failure, which is outside v1's correctness + envelope. Document this loudly in `operations.md`. +- **Weka NFS `link()` / `EEXIST` semantics not docs-confirmed**: Weka's + NFS share (`-t nfs4` to a Weka cluster) is verified up to NFSv4.1 + (`NFS4_CREATE_SESSION`, `ATOMIC_FILEOPEN`) but the `link()` no-clobber + return of `EEXIST` is not explicitly documented. The driver treats + this as a "must pass `SelfTestAtomicCommit` to start" case: if Weka + NFS fails the self-test, operators MUST switch to Weka native + (`-t wekafs`), which is a true POSIX FS and a separately-detected + backend. This is not a code change, only a configuration / mount-time + decision; document the matrix in `operations.md`. +- **Alluxio FUSE is a tempting misconfiguration**: Alluxio markets a + shared filesystem mount but provides no `link(2)` and no atomic + no-overwrite rename, which makes it unsafe for `cachestore/posixfs`. + The driver detects Alluxio FUSE explicitly (FUSE_SUPER_MAGIC + + `/proc/mounts` source matches `alluxio`) and refuses to start. The + documented workaround is `cachestore.driver: s3` against the + Alluxio S3 gateway, which is a normal in-DC S3 backend from the + cache layer's perspective. Operators MUST be steered to this in the + runbook to prevent Phase-2 deployments from getting stuck.
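+
+  A minimal Go sketch of the `/proc/mounts` scan behind that refusal (a
+  sketch only: the FUSE_SUPER_MAGIC cross-check is omitted, the
+  mountpoint matching is a simplified prefix test, and the names are
+  hypothetical):
+
+  ```go
+  package posixfssketch
+
+  import (
+  	"bufio"
+  	"os"
+  	"strings"
+  )
+
+  // isAlluxioMount reports whether the mount covering cacheRoot looks
+  // like an Alluxio FUSE mount, judged by the source field of
+  // /proc/mounts containing "alluxio" (case-insensitive). The caller
+  // refuses to start with the documented error message.
+  func isAlluxioMount(procMounts, cacheRoot string) (bool, error) {
+  	f, err := os.Open(procMounts) // normally "/proc/mounts"
+  	if err != nil {
+  		return false, err
+  	}
+  	defer f.Close()
+
+  	alluxio := false
+  	best := -1 // longest matching mountpoint prefix wins
+  	sc := bufio.NewScanner(f)
+  	for sc.Scan() {
+  		// /proc/mounts fields: source mountpoint fstype options dump pass
+  		fields := strings.Fields(sc.Text())
+  		if len(fields) < 3 {
+  			continue
+  		}
+  		source, mountpoint := fields[0], fields[1]
+  		if !strings.HasPrefix(cacheRoot, mountpoint) || len(mountpoint) <= best {
+  			continue
+  		}
+  		best = len(mountpoint)
+  		alluxio = strings.Contains(strings.ToLower(source), "alluxio")
+  	}
+  	return alluxio, sc.Err()
+  }
+  ```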
+- **Spool on a network filesystem degrades joiner-fallback latency**: + with the v1 streaming design (Option D) the spool is no longer on + the client TTFB path, but joiner-fallback reads still benefit + materially from local block storage. A spool placed on NFS / + SMB / CephFS / Lustre / GPFS / FUSE pays a network round-trip + per joiner-fallback read, converting microsecond-class + switchover into milliseconds-class. The cache layer enforces + local placement at boot via `statfs(2)` and refuses to start by + default (`spool.require_local_fs=true`; see + [design.md#104-spool-locality-contract](./design.md#104-spool-locality-contract)). + Operators with unusual placements (e.g., RAM-disk) MAY relax to + `spool.require_local_fs=false`; production deployments are + expected to keep the default. Operators should also pin + `spool.dir` to a hostPath / local-PV pointing at NVMe and avoid + generic-default-storage-class PVCs that may bind to network volumes. +- **Spool exhaustion under sustained burst**: `spool.max_bytes` (default + 8 GiB) and `spool.max_inflight` (default 64) bound the local staging + area. A correlated cold-access burst that exceeds these returns `503 + Slow Down` to clients, which is the intended backpressure but visible + as user-facing errors. Operators should monitor `orca_spool_bytes` + and `orca_spool_evictions_total{reason="full"}` and tune the caps + per node disk capacity. +- **Internal cert rotation**: the internal listener uses per-replica certs + chained to an internal CA. Rotation is delegated to the issuing system + (e.g. cert-manager). The server hot-reloads `cluster.internal_tls.cert_file` + / `key_file` on file change (inotify / periodic stat); the CA bundle is + reloaded the same way. CA rotation requires both old and new CAs to + appear in the bundle for at least one full rolling-restart window; + document this in `operations.md`. Misconfiguration risk: dropping the + old CA too early breaks inter-replica RPCs cluster-wide. +- **Cluster membership during rolling restart**: rendezvous hashing + tolerates membership flux, but a pod restart with a new IP looks like a + new member for up to one refresh interval (default 5s), shifting + ownership for ~1/N keys until the next DNS refresh. Back-to-back + restarts can cause repeated duplicate fills. The + `orca_origin_duplicate_fills_total{result="commit_lost"}` metric + makes this visible. We accept this in v1 and revisit if it proves + material. See + [design.md#14-horizontal-scale](./design.md#14-horizontal-scale). +- **Create-after-404 unavailability window**: clients that hit a missing + key before the operator uploads it will continue to see `404` for up + to `negative_metadata_ttl` per replica that observed the original + `404` (default 60s). Worst case across replicas: round-robin LB can + alternate `404` / `200` during the drain. There is no event-driven + invalidation or admin-invalidation in v1 (the immutable-origin + contract makes them unnecessary). + Mitigations: short default `negative_metadata_ttl=60s`, + `metadata_negative_*` metrics expose drain progress, runbook + instructs operators to wait `negative_metadata_ttl` after uploading + a previously-missing key before announcing it. See + [design.md#12-create-after-404-and-negative-cache-lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle). +- **ChunkCatalog undersizing degrades active eviction quality**: + the optional active eviction loop (s13.2) bases decisions on + per-entry access counters in the ChunkCatalog. 
If + `chunk_catalog.max_entries` is much smaller than the working set, + many chunks live in the CacheStore but are not tracked; they + cannot be considered for active eviction; they live indefinitely + until external lifecycle (if any) cleans them up. Operators MUST + size the catalog to roughly 1.2x the estimated working-set chunk + count + ([design.md s13.3](./design.md#133-chunkcatalog-size-awareness-load-bearing-operational-note)); + metrics `chunk_catalog_hit_rate` and + `chunk_catalog_evict_total{reason="size"}` make undersizing + visible. +- **LIST cache staleness in write-and-immediately-list workloads**: + the per-replica LIST cache (s6.2) defaults to 60s TTL. A key + uploaded mid-window will not appear in `Origin.List` results + served from cache until the entry expires (up to 60s). + Acceptable for the documented FUSE-`ls` read-mostly workload; + operators with write-and-immediately-list patterns should tune + `list_cache.ttl` shorter or disable the cache via + `list_cache.enabled: false`. +- **Mid-stream client aborts on post-commit origin failure**: + the v1 streaming design (Option D) sends response headers and + begins streaming as soon as origin returns a first byte. If the + origin connection breaks mid-chunk after the cache has committed, + the response aborts (HTTP/2 `RST_STREAM` or HTTP/1.1 + `Connection: close`). S3 SDKs handle this via `Content-Length` + mismatch retry; the operational impact is small for the + documented workload but visible in + `responses_aborted_total{phase="mid_stream"}`. Sustained non- + zero rates indicate origin tail-latency issues; the trigger for + considering mid-stream origin resume + ([design.md s15.4](./design.md#154-mid-stream-origin-resume)) + is sustained mid-stream abort rate measurably impacting + end-to-end client latency. +- **Cold-start Stat storm**: a freshly started replica receiving a wide + fan-out of distinct cold keys does one `CacheStore.Stat` per `ChunkKey`. + At in-DC latencies this is cheap but not free. If a deployment routinely + sees wide-fan-out cold starts we may add a bulk-stat path or warm the + `ChunkCatalog` from a CacheStore listing on startup. Defer until + measured. +- **CacheStore lifecycle eviction of hot chunks**: age-based expiration may + evict a chunk that is still hot, forcing a re-fetch from origin. + Operators should tune TTL against `orca_origin_bytes_total`. Phase + 4 may add an in-`chunkcatalog` access-tracking layer if this proves + material. +- **Origin egress cost spikes**: cold-start fan-out can be expensive even + with singleflight if many distinct keys are touched simultaneously. + Origin semaphore + 503 backpressure protects us, but operators should + monitor `orca_origin_bytes_total` and set DC-side egress budgets. +- **Prefetch-induced waste**: sequential read-ahead can fetch chunks the + client never reads. Default depth (4) is conservative; we expose the knob + and the metric. +- **Mid-stream abort detection by clients**: post-first-byte failures abort + the response; standard S3 SDKs (aws-sdk, boto3) detect via + `Content-Length` mismatch and retry. Non-standard or hand-rolled HTTP + clients may silently truncate. Document this in `operations.md`. + +## 11. 
Approval checklist + +Before starting Phase 0 implementation, please confirm: + +- [ ] Repo layout under `cmd/orca/`, `internal/orca/`, + `deploy/orca/`, `images/orca/`, + `design/orca/`, `hack/orca/` is acceptable, + including `internal/orca/fetch/spool/`, + `cmd/orca/orca/server/internal/`, and + `deploy/orca/07-networkpolicy.yaml.tmpl`. +- [ ] Default chunk size of 8 MiB is acceptable. +- [ ] Bearer / mTLS auth on the client edge in v1 is acceptable; SigV4 + is deferred future work. +- [ ] **Separate internal mTLS listener (`:8444`) with an internal CA + distinct from the client mTLS CA, peer-IP-set authorization, + and a NetworkPolicy restricting ingress to `app=orca` pods, + is acceptable.** +- [ ] Azure constraint to Block Blobs only, surfaced as + `502 OriginUnsupported`, is acceptable. +- [ ] No persistent local index in v1; in-memory `ChunkCatalog` + + `CacheStore.Stat` on miss is sufficient. +- [ ] CacheStore lifecycle / TTL is the eviction mechanism in v1; cache + layer ships no eviction code. +- [ ] **Strict `If-Match: ` on every `Origin.GetRange` (no opt-out), + with `412` translated to `OriginETagChangedError`, metadata cache + invalidation, and a non-retryable fill failure, is acceptable.** +- [ ] **Local Spool layer (default 8 GiB) as the universal slow-joiner + fallback, with `503 Slow Down` on exhaustion, is acceptable.** +- [ ] **Atomic-commit model is acceptable: `localfs` uses + `link()` / `renameat2(RENAME_NOREPLACE)` (no plain `rename()`); + `cachestore/s3` uses `PutObject` + `If-None-Match: *` with no + tmp key and no copy hop; `SelfTestAtomicCommit` at startup refuses + to start if the backend doesn't honor the precondition.** +- [ ] **Deferred response headers until first chunk in hand, plus + mid-stream abort (HTTP/2 `RST_STREAM` / HTTP/1.1 `Connection: close`) + on post-first-byte failure, is acceptable.** +- [ ] **Assembler-per-request + per-chunk coordinator routing via + internal fill RPC (rather than whole-request reverse-proxy) is the + right v1 mechanism for strongly correlated cold-access workloads.** +- [ ] Deployment (not StatefulSet) is acceptable for v1 given no per-pod + state, faster rolling updates, and parity with other stateless + components in this repo. +- [ ] Phase 0 deliverable definition (one process serving a Range GET + against real S3 and re-serving from `localfs`) is the right starting + milestone. +- [ ] No cross-cmd imports; shared code lives under `internal/orca/` + per the project's coding standards. +- [ ] **Bounded staleness contract published in design.md s11 with + `metadata_ttl=5m` default; operators are expected to honor the + immutable-origin contract.** +- [ ] **Pre-header origin retry (Option D) ships in Phase 1: the + leader retries `Origin.GetRange` up to + `origin.retry.attempts` (default 3) with exponential backoff + capped by `origin.retry.max_total_duration` (default 5s) + BEFORE response headers are sent to the client; transparent + to the client. The commit boundary is the first byte arrival + from origin: post-commit, bytes stream origin -> client + directly; spool tees in parallel for joiner support and as + the asynchronous CacheStore-commit source. Pre-commit + failures (retry budget exhausted, `OriginETagChangedError`) + return clean HTTP errors; post-commit failures become + mid-stream client aborts (handled by SDK retry). + `origin_retry_total` and `origin_retry_attempts` metrics + exposed; T-pre-header-retry-* test group in Phase 1. 
+ Mid-stream origin resume is deferred future work + ([design.md s15.4](./design.md#154-mid-stream-origin-resume)). + CacheStore commit runs asynchronously after the client + response completes; commit-after-serve failures are reported + as `commit_after_serve_total{result="failed"}` and do NOT + affect client responses.** +- [ ] **`CacheStore` returns typed errors `ErrNotFound|ErrTransient|ErrAuth`; + only `ErrNotFound` triggers refill; `ErrTransient` -> `503 Slow Down` + with `Retry-After`; `ErrAuth` -> `502 Bad Gateway`.** +- [ ] **Per-process CacheStore circuit breaker with defaults + `error_window=30s, error_threshold=10, open_duration=30s, + half_open_probes=3`; state and transitions exported as metrics.** +- [ ] **Origin backpressure is per-replica static cap: + `target_per_replica = floor(origin.target_global / + cluster.target_replicas)` (default 64 slots/replica at + `target_global=192`, `target_replicas=3`); origin throttling + responses (`503 SlowDown` / `429`) are handled by the + pre-header retry loop (`origin.retry.*`); `origin_inflight` + gauge exposes per-replica saturation. Coordinated + cluster-wide limiter and dynamic per-replica recompute are + deferred future work, see + [design.md s15.5](./design.md#155-coordinated-cluster-wide-origin-limiter) + and + [design.md s15.6](./design.md#156-dynamic-per-replica-origin-cap). + Operators MUST update `cluster.target_replicas` after any + sustained scale change.** +- [ ] **`cachestore/localfs` stages inside `/.staging/` (NOT + `/tmp` and NOT spool dir); parent-dir fsync after every link/unlink; + `staging_max_age=1h` orphaned-staging sweeper.** +- [ ] **Internal mTLS dialer pins `tls.Config.ServerName` to the stable + SAN `orca..svc`; per-replica certs MUST include this + SAN; pod-IP SANs are NOT used.** +- [ ] **`/readyz` flips to NotReady after `readyz.errauth_consecutive_threshold=3` + consecutive `ErrAuth` from CacheStore; one non-`ErrAuth` success + resets the counter.** +- [ ] **`server.max_response_bytes` overflow returns + `400 RequestSizeExceedsLimit` (S3-style XML body); `416` is + reserved for true Range vs. object-size violations.** +- [ ] **`cachestore/posixfs` ships in Phase 2 alongside `cachestore/s3`, + sharing `link()`/`EEXIST` + dir-fsync helpers with + `cachestore/localfs` via + `internal/orca/cachestore/internal/posixcommon/`. Supported + backends: NFSv4.1+ (baseline), Weka native (`-t wekafs`), CephFS, + Lustre, GPFS / IBM Spectrum Scale.** +- [ ] **`cachestore/posixfs` runs `SelfTestAtomicCommit` at startup + (link()/`EEXIST` + dir-fsync + size verify); refuses to start on + any failure. 
Never disabled in production + (`require_atomic_link_self_test: true`).** +- [ ] **NFS minimum version is `4.1` + (`cachestore.posixfs.nfs.minimum_version: "4.1"`); NFSv3 is opt-in + only (`cachestore.posixfs.nfs.allow_v3: true`) with a loud WARN + log and `posixfs_nfs_v3_optin_total++`; `allow_v3` MUST stay + `false` in production manifests.** +- [ ] **Backend auto-detection via `statfs(2)` `f_type` + `/proc/mounts` + emits `posixfs_backend{type,version}` info gauge; operator + override allowed via `cachestore.posixfs.backend_type` for + ambiguous magic numbers; override is logged loudly.** +- [ ] **Alluxio FUSE is unsupported: `cachestore/posixfs` detects it + (FUSE_SUPER_MAGIC + `/proc/mounts` source matches `alluxio`) and + refuses to start with a message pointing operators to + `cachestore.driver: s3` against the Alluxio S3 gateway; + `posixfs_alluxio_refusal_total` exposes accidental + misconfigurations.** +- [ ] **`cachestore/posixfs` paths use a 2-character hex fan-out under + `////` by default + (`fanout_chars: 2`); `cachestore/localfs` keeps the flat layout + (`fanout_chars: 0` default) but the helper is shared.** +- [ ] **NFS export hardening is operator-runbook material: exports MUST + be `sync` (not `async`); the driver issues a best-effort warning + from `/proc/mounts` client-side options but does not refuse on + `async` (it cannot reliably detect server-side `async`); document + this in `operations.md`.** +- [ ] **Spool locality is enforced at boot: `spool.require_local_fs: + true` (default) runs `statfs(2)` on `spool.dir` and refuses to + start when the FS magic matches NFS / SMB / CephFS / Lustre / + GPFS / FUSE. With Option D the spool is no longer on the + client TTFB path, so the contract is defense-in-depth for + joiner-fallback latency; operators with unusual placements + (e.g., RAM-disk) MAY relax via `spool.require_local_fs: false` + with the documented operational warning. Production deploys + are expected to keep the default. See + [design.md#104-spool-locality-contract](./design.md#104-spool-locality-contract).** +- [ ] **Negative-cache TTL is independent: `negative_metadata_ttl: 60s` + (default) is distinct from `metadata_ttl: 5m`; bounds the + create-after-404 unavailability window. The + `metadata_negative_entries` / `metadata_negative_hit_total` / + `metadata_negative_age_seconds` metrics are exposed; the + `T-create-after-404a/b/c` test group is in Phase 1. + Event-driven invalidation and admin-invalidation RPC are + out of v1 scope (the immutable-origin contract makes them + unnecessary). See + [design.md#12-create-after-404-and-negative-cache-lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle).** +- [ ] **Per-replica LIST cache (FW3) ships in Phase 1 sized for + the FUSE-`ls` workload pattern: default `list_cache.ttl=60s`, + `max_entries=1024`, `max_response_bytes=1MiB`, no negative + caching, optional stale-while-revalidate (`swr_enabled: false` + default); `list_cache_*` metrics exposed; T-list-cache-* test + group in Phase 1; cluster-wide LIST coordinator is a + deferred optimization + ([design.md s15.3](./design.md#153-cluster-wide-list-coordinator)).** +- [ ] **ChunkCatalog access-frequency tracking (FW8) added in + Phase 1: per-entry `AccessCount`, `LastAccessed`, + `LastEntered`. Optional active eviction loop opt-in via + `chunk_catalog.active_eviction.enabled` (default `false`) + with `inactive_threshold=24h`, `access_threshold=5`, + `min_age=5m`, `max_evictions_per_run=1000`. 
New + `CacheStore.Delete` method on the interface; + `cachestore_delete_total` and `chunk_catalog_*` metrics + exposed. Operators MUST size `chunk_catalog.max_entries` to + ~1.2x estimated working-set chunks per the load-bearing + operational note in + [design.md s13.3](./design.md#133-chunkcatalog-size-awareness-load-bearing-operational-note). + `T-active-eviction-*` and `T-catalog-*` test groups in Phase 1.** +- [ ] **Bounded-freshness mode (FW5) opt-in via + `metadata_refresh.enabled` (default `false`) with hot-key + detection via metadata-cache access counters (parallel to + ChunkCatalog tracking from FW8). Defaults: `interval=1m`, + `refresh_ahead_ratio=0.7`, `access_threshold=5`, + `min_age=metadata_ttl/4=75s`, `max_refreshes_per_run=100`, + `refresh_concurrency=8`. Negative entries are NOT refreshed. + `metadata_refresh_*` metrics exposed; `T-metadata-refresh-*` + test group in Phase 1. See + [design.md s11.2](./design.md#112-bounded-freshness-mode-optional).** +- [ ] **`cachestore/s3` versioning gate enforced at boot: drives + `GetBucketVersioning` and refuses to start on `Status: Enabled` + or `Status: Suspended`. Governed by + `cachestore.s3.require_unversioned_bucket: true` (default; + never disabled in production). Required because + `If-None-Match: *` is not honored on versioned buckets across + all S3-compatible backends (notably VAST). Metric + `s3_versioning_check_total{result="ok|refused"}` emitted once + per boot. `T-s3-versioned-bucket-refusal` and + `T-s3-unversioned-bucket-ok` tests in Phase 1. See + [design.md s10.1.3](./design.md#1013-cachestores3) and the + VAST KB citation therein.** +- [ ] **Edge rate limiting documented as v1 gap in + [design.md s15.1](./design.md#151-edge-rate-limiting). Multi- + tenant deployments worried about single-client monopolization + should layer rate limiting at an upstream proxy or LB until + this lands as a future deliverable.** +- [ ] **Dev harness brings up cleanly with `make -C hack/orca up` + against LocalStack (cachestore/s3) and a real Azure storage + account (origin) inside a Kind cluster. End-to-end flow + verified: cold miss -> Azure -> LocalStack -> client; warm + hit served from LocalStack without origin call; 50 parallel + GETs across 3 replicas dedupe to 1 origin GET (cluster-wide + via `/internal/fill`). LocalStack pinned to a community-tier + image; dev disables `cluster.internal_tls.enabled` and + `server.auth.enabled`. NetworkPolicy not applied in dev. 
See + [hack/orca/dev-harness.md](../../hack/orca/dev-harness.md).** diff --git a/go.mod b/go.mod index 9fdc87a3..49794bf4 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,11 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4 github.com/Masterminds/semver/v3 v3.4.0 github.com/Masterminds/sprig/v3 v3.3.0 + github.com/aws/aws-sdk-go-v2 v1.41.7 + github.com/aws/aws-sdk-go-v2/config v1.32.17 + github.com/aws/aws-sdk-go-v2/credentials v1.19.16 + github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 + github.com/aws/smithy-go v1.25.1 github.com/bougou/go-ipmi v0.8.3 github.com/cilium/ebpf v0.21.0 github.com/coder/websocket v1.8.14 @@ -49,6 +54,7 @@ require ( github.com/spf13/cobra v1.10.2 github.com/spf13/pflag v1.0.10 github.com/stretchr/testify v1.11.1 + github.com/testcontainers/testcontainers-go v0.42.0 github.com/vishvananda/netlink v1.3.1 golang.org/x/crypto v0.50.0 golang.org/x/mod v0.35.0 @@ -73,27 +79,51 @@ require ( ) require ( - dario.cat/mergo v1.0.1 // indirect - github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 // indirect + dario.cat/mergo v1.0.2 // indirect + github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect - github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect + github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect + github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apex/log v1.9.0 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 // indirect + github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/containerd/errdefs v1.0.0 // indirect + github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/log v0.1.0 // indirect + github.com/cpuguy83/dockercfg v0.3.2 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/cyphar/filepath-securejoin v0.5.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/go-connections v0.6.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect + github.com/ebitengine/purego v0.10.0 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/felixge/httpsnoop 
v1.0.4 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-errors/errors v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect @@ -110,12 +140,14 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/josharian/native v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.18.0 // indirect + github.com/klauspost/compress v1.18.5 // indirect github.com/klauspost/pgzip v1.2.6 // indirect github.com/kr/pretty v0.3.1 // indirect github.com/kr/text v0.2.0 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/magiconair/properties v1.8.10 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -125,10 +157,16 @@ require ( github.com/mdlayher/socket v0.5.1 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/moby/go-archive v0.2.0 // indirect + github.com/moby/moby/api v1.54.1 // indirect + github.com/moby/moby/client v0.4.0 // indirect + github.com/moby/patternmatcher v0.6.1 // indirect github.com/moby/spdystream v0.5.1 // indirect + github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect github.com/moby/sys/userns v0.1.0 // indirect - github.com/moby/term v0.5.0 // indirect + github.com/moby/term v0.5.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect @@ -145,6 +183,7 @@ require ( github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect @@ -153,10 +192,13 @@ require ( github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/shirou/gopsutil/v4 v4.26.3 // indirect github.com/shopspring/decimal v1.4.0 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect + github.com/sirupsen/logrus v1.9.4 // indirect github.com/sony/gobreaker/v2 v2.4.0 // indirect github.com/spf13/cast v1.7.0 // indirect + github.com/tklauser/go-sysconf v0.3.16 // indirect + github.com/tklauser/numcpus v0.11.0 // indirect github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect github.com/urfave/cli v1.22.12 // indirect github.com/vbatts/go-mtree v0.6.1-0.20250911112631-8307d76bc1b9 // indirect @@ -164,6 +206,12 @@ require ( github.com/x448/float16 v0.8.4 // indirect github.com/xlab/treeprint v1.2.0 // indirect 
github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect + go.opentelemetry.io/otel v1.41.0 // indirect + go.opentelemetry.io/otel/metric v1.41.0 // indirect + go.opentelemetry.io/otel/trace v1.41.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect @@ -172,7 +220,7 @@ require ( golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/telemetry v0.0.0-20260409153401-be6f6cb8b1fa // indirect golang.org/x/text v0.36.0 // indirect - golang.org/x/time v0.9.0 // indirect + golang.org/x/time v0.11.0 // indirect golang.org/x/tools v0.44.0 // indirect golang.org/x/vuln v1.2.0 // indirect golang.zx2c4.com/wireguard v0.0.0-20231211153847-12269c276173 // indirect diff --git a/go.sum b/go.sum index 91bab086..3cf29662 100644 --- a/go.sum +++ b/go.sum @@ -1,9 +1,9 @@ cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= -dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= -dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= -github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 h1:EKPd1INOIyr5hWOWhvpmQpY6tKjeG0hT1s3AMC/9fic= -github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1/go.mod h1:VzwV+t+dZ9j/H867F1M2ziD+yLHtB46oM35FxxMJ4d0= +dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= +dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 h1:jHb/wfvRikGdxMXYV3QG/SzUOPYN9KEUUuC0Yd0/vC0= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1/go.mod h1:pzBXCYn05zvYIrwLgtK8Ap8QcjRg+0i76tMQdWN6wOk= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= @@ -46,8 +46,8 @@ github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1/go.mod h1:Ng3urmn6dYe8gnbCMoHHVl5APYz2txho3koEkV2o2HA= github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4 h1:jWQK1GI+LeGGUKBADtcH2rRqPxYB1Ljwms5gFA2LqrM= github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4/go.mod h1:8mwH4klAm9DUgR2EEHyEEAQlRDvLPyg5fQry3y+cDew= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 
h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= @@ -59,6 +59,8 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs= github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0= +github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= +github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/apex/log v1.9.0 h1:FHtw/xuaM8AgmvDDTI9fiwoAL25Sq2cxojnZICUU8l0= @@ -69,6 +71,42 @@ github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3st github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= +github.com/aws/aws-sdk-go-v2 v1.41.7 h1:DWpAJt66FmnnaRIOT/8ASTucrvuDPZASqhhLey6tLY8= +github.com/aws/aws-sdk-go-v2 v1.41.7/go.mod h1:4LAfZOPHNVNQEckOACQx60Y8pSRjIkNZQz1w92xpMJc= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 h1:gx1AwW1Iyk9Z9dD9F4akX5gnN3QZwUB20GGKH/I+Rho= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10/go.mod h1:qqY157uZoqm5OXq/amuaBJyC9hgBCBQnsaWnPe905GY= +github.com/aws/aws-sdk-go-v2/config v1.32.17 h1:FpL4/758/diKwqbytU0prpuiu60fgXKUWCpDJtApclU= +github.com/aws/aws-sdk-go-v2/config v1.32.17/go.mod h1:OXqUMzgXytfoF9JaKkhrOYsyh72t9G+MJH8mMRaexOE= +github.com/aws/aws-sdk-go-v2/credentials v1.19.16 h1:r3RJBuU7X9ibt8RHbMjWE6y60QbKBiII6wSrXnapxSU= +github.com/aws/aws-sdk-go-v2/credentials v1.19.16/go.mod h1:6cx7zqDENJDbBIIWX6P8s0h6hqHC8Avbjh9Dseo27ug= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 h1:UuSfcORqNSz/ey3VPRS8TcVH2Ikf0/sC+Hdj400QI6U= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23/go.mod h1:+G/OSGiOFnSOkYloKj/9M35s74LgVAdJBSD5lsFfqKg= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 h1:GpT/TrnBYuE5gan2cZbTtvP+JlHsutdmlV2YfEyNde0= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23/go.mod h1:xYWD6BS9ywC5bS3sz9Xh04whO/hzK2plt2Zkyrp4JuA= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 h1:bpd8vxhlQi2r1hiueOw02f/duEPTMK59Q4QMAoTTtTo= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23/go.mod h1:15DfR2nw+CRHIk0tqNyifu3G1YdAOy68RftkhMDDwYk= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 h1:OQqn11BtaYv1WLUowvcA30MpzIu8Ti4pcLPIIyoKZrA= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24/go.mod h1:X5ZJyfwVrWA96GzPmUCWFQaEARPR7gCrpq2E92PJwAE= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 h1:FLudkZLt5ci0ozzgkVo8BJGwvqNaZbTWb3UcucAateA= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9/go.mod h1:w7wZ/s9qK7c8g4al+UyoF1Sp/Z45UwMGcqIzLWVQHWk= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 h1:ieLCO1JxUWuxTZ1cRd0GAaeX7O6cIxnwk7tc1LsQhC4= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15/go.mod h1:e3IzZvQ3kAWNykvE0Tr0RDZCMFInMvhku3qNpcIQXhM= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 h1:pbrxO/kuIwgEsOPLkaHu0O+m4fNgLU8B3vxQ+72jTPw= 
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23/go.mod h1:/CMNUqoj46HpS3MNRDEDIwcgEnrtZlKRaHNaHxIFpNA= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 h1:03xatSQO4+AM1lTAbnRg5OK528EUg744nW7F73U8DKw= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23/go.mod h1:M8l3mwgx5ToK7wot2sBBce/ojzgnPzZXUV445gTSyE8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 h1:etqBTKY581iwLL/H/S2sVgk3C9lAsTJFeXWFDsDcWOU= +github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0/go.mod h1:L2dcoOgS2VSgbPLvpak2NyUPsO1TBN7M45Z4H7DlRc4= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 h1:TdJ+HdzOBhU8+iVAOGUTU63VXopcumCOF1paFulHWZc= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.11/go.mod h1:R82ZRExE/nheo0N+T8zHPcLRTcH8MGsnR3BiVGX0TwI= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 h1:7byT8HUWrgoRp6sXjxtZwgOKfhss5fW6SkLBtqzgRoE= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.17/go.mod h1:xNWknVi4Ezm1vg1QsB/5EWpAJURq22uqd38U8qKvOJc= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 h1:+1Kl1zx6bWi4X7cKi3VYh29h8BvsCoHQEQ6ST9X8w7w= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21/go.mod h1:4vIRDq+CJB2xFAXZ+YgGUTiEft7oAQlhIs71xcSeuVg= +github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 h1:F/M5Y9I3nwr2IEpshZgh1GeHpOItExNM9L1euNuh/fk= +github.com/aws/aws-sdk-go-v2/service/sts v1.42.1/go.mod h1:mTNxImtovCOEEuD65mKW7DCsL+2gjEH+RPEAexAzAio= +github.com/aws/smithy-go v1.25.1 h1:J8ERsGSU7d+aCmdQur5Txg6bVoYelvQJgtZehD12GkI= +github.com/aws/smithy-go v1.25.1/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -84,29 +122,41 @@ github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k= github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ= github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= +github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= +github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= +github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= +github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= github.com/coreos/go-iptables v0.8.0 h1:MPc2P89IhuVpLI7ETL/2tx3XZ61VeICZjYqDEgNsPRc= github.com/coreos/go-iptables v0.8.0/go.mod h1:Qe8Bv2Xik5FyTXwgIbLAnv2sWSBmvWdFETJConOQ//Q= +github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA= +github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= 
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= -github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw= github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= +github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= @@ -128,12 +178,15 @@ github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj2 github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 
h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= @@ -166,6 +219,7 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O github.com/google/go-cmdtest v0.4.1-0.20220921163831-55ab3332a786 h1:rcv+Ippz6RAtvaGgKxc+8FQIpxHgsF+HBzPyYL2cyVU= github.com/google/go-cmdtest v0.4.1-0.20220921163831-55ab3332a786/go.mod h1:apVn/GCasLZUVpAJ6oWAuyP7Ne7CEsQbTnc0plM3m+o= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-configfs-tsm v0.3.3-0.20240919001351-b4b5b84fdcbc h1:SG12DWUUM5igxm+//YX5Yq4vhdoRnOG9HkCodkOn+YU= @@ -220,8 +274,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= -github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= -github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= +github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -239,6 +293,10 @@ github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE= +github.com/magiconair/properties v1.8.10/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= @@ -266,14 +324,26 @@ github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa1 github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec 
v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= +github.com/moby/go-archive v0.2.0 h1:zg5QDUM2mi0JIM9fdQZWC7U8+2ZfixfTYoHL7rWUcP8= +github.com/moby/go-archive v0.2.0/go.mod h1:mNeivT14o8xU+5q1YnNrkQVpK+dnNe/K6fHqnTg4qPU= +github.com/moby/moby/api v1.54.1 h1:TqVzuJkOLsgLDDwNLmYqACUuTehOHRGKiPhvH8V3Nn4= +github.com/moby/moby/api v1.54.1/go.mod h1:+RQ6wluLwtYaTd1WnPLykIDPekkuyD/ROWQClE83pzs= +github.com/moby/moby/client v0.4.0 h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjIw= +github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g= +github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= +github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= +github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= +github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= github.com/moby/sys/user v0.4.0/go.mod h1:bG+tYYYJgaMtRKgEmuueC0hJEAZWwtIbZTB+85uoHjs= github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= +github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -331,6 +401,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= @@ -354,10 +426,12 @@ github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/shirou/gopsutil/v4 v4.26.3 h1:2ESdQt90yU3oXF/CdOlRCJxrP+Am1aBYubTMTfxJ1qc= 
+github.com/shirou/gopsutil/v4 v4.26.3/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM= github.com/smartystreets/gunit v1.0.0/go.mod h1:qwPWnhz6pn0NnRBP++URONOVyNkPyr4SauJk4cUOwJs= @@ -375,8 +449,8 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= -github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4= +github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -385,6 +459,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/testcontainers/testcontainers-go v0.42.0 h1:He3IhTzTZOygSXLJPMX7n44XtK+qhjat1nI9cneBbUY= +github.com/testcontainers/testcontainers-go v0.42.0/go.mod h1:vZjdY1YmUA1qEForxOIOazfsrdyORJAbhi0bp8plN30= github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0= github.com/tj/assert v0.0.3 h1:Df/BlaZ20mq6kuai7f5z2TvPFiwC3xaWJSDQNiIS3Rk= github.com/tj/assert v0.0.3/go.mod h1:Ne6X72Q+TB1AteidzQncjw9PabbMp4PBMZ1k+vd1Pvk= @@ -392,6 +468,10 @@ github.com/tj/go-buffer v1.1.0/go.mod h1:iyiJpfFcR2B9sXu7KvjbT9fpM4mOelRSDTbntVj github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0= github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao= github.com/tj/go-spin v1.1.0/go.mod h1:Mg1mzmePZm4dva8Qz60H2lHwmJ2loum4VIrLgVnKwh4= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= github.com/u-root/uio 
v0.0.0-20230220225925-ffce2a382923 h1:tHNk7XK9GkmKUR6Gh8gVBKXc2MVSZ4G/NnWLtzw4gNA= github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923/go.mod h1:eLL9Nub3yfAho7qB0MzZizFhTU2QkLeoVsWdHtDW264= github.com/urfave/cli v1.22.12 h1:igJgVw1JdKH+trcLWLeLwZjU9fEfPesQ+9/e4MQ44S8= @@ -408,6 +488,8 @@ github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= @@ -460,9 +542,10 @@ golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220622161953-175b2fd9d664/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -477,8 +560,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= -golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= +golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= @@ -526,6 +609,8 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0-20200605160147-a5ece683394c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= +gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA= k8s.io/api v0.35.4 h1:P7nFYKl5vo9AGUp1Z+Pmd3p2tA7bX2wbFWCvDeRv988= k8s.io/api v0.35.4/go.mod h1:yl4lqySWOgYJJf9RERXKUwE9g2y+CkuwG+xmcOK8wXU= k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= @@ -582,6 +667,8 @@ modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= +pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= +pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= diff --git a/hack/cmd/render-manifests/main.go b/hack/cmd/render-manifests/main.go index 475c7129..187676fa 100644 --- a/hack/cmd/render-manifests/main.go +++ b/hack/cmd/render-manifests/main.go @@ -10,19 +10,19 @@ // evaluate to empty strings (text/template's missingkey=zero behaviour for map // data), which lets templates rely on sprig's `default` function to supply // documented fallbacks. +// +// The actual rendering logic lives in the render sub-package so it can be +// invoked programmatically from tests. package main import ( - "bytes" "flag" "fmt" "os" - "path/filepath" "sort" "strings" - "text/template" - "github.com/Masterminds/sprig/v3" + "github.com/Azure/unbounded/hack/cmd/render-manifests/render" ) // setFlags implements flag.Value for repeatable --set key=value arguments. 
@@ -75,60 +75,11 @@ func main() { exitWithError("--output-dir is required") } - if err := renderTemplates(templatesDir, outputDir, data); err != nil { + if err := render.Render(templatesDir, outputDir, data); err != nil { exitWithError(err.Error()) } } -func renderTemplates(templatesDir, outputDir string, data setFlags) error { - return filepath.WalkDir(templatesDir, func(path string, d os.DirEntry, err error) error { - if err != nil { - return err - } - - if d.IsDir() { - return nil - } - - if !strings.HasSuffix(path, ".yaml.tmpl") { - return nil - } - - relPath, err := filepath.Rel(templatesDir, path) - if err != nil { - return err - } - - outputRelPath := strings.TrimSuffix(relPath, ".tmpl") - outputPath := filepath.Join(outputDir, outputRelPath) - - templateBytes, err := os.ReadFile(path) - if err != nil { - return fmt.Errorf("read template %q: %w", path, err) - } - - tmpl, err := template.New(relPath).Funcs(sprig.TxtFuncMap()).Option("missingkey=zero").Parse(string(templateBytes)) - if err != nil { - return fmt.Errorf("parse template %q: %w", path, err) - } - - if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil { - return fmt.Errorf("create output dir for %q: %w", outputPath, err) - } - - var rendered bytes.Buffer - if err := tmpl.Execute(&rendered, map[string]string(data)); err != nil { - return fmt.Errorf("execute template %q: %w", path, err) - } - - if err := os.WriteFile(outputPath, rendered.Bytes(), 0o644); err != nil { - return fmt.Errorf("write rendered manifest %q: %w", outputPath, err) - } - - return nil - }) -} - func exitWithError(message string) { fmt.Fprintln(os.Stderr, message) os.Exit(1) diff --git a/hack/cmd/render-manifests/render/render.go b/hack/cmd/render-manifests/render/render.go new file mode 100644 index 00000000..13d3dce5 --- /dev/null +++ b/hack/cmd/render-manifests/render/render.go @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package render implements the manifest template renderer used by +// the render-manifests CLI. Exposed as a package so tests in other +// packages (e.g. internal/orca/manifests) can render the orca +// templates programmatically without shelling out to `go run`. +package render + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "strings" + "text/template" + + "github.com/Masterminds/sprig/v3" +) + +// Render walks templatesDir for *.yaml.tmpl files, executes each with +// Go's text/template (plus the sprig function library), and writes +// the rendered output under outputDir mirroring the source tree. +// +// Template data is supplied via the data map. Missing keys evaluate +// to empty strings (text/template's missingkey=zero), which lets +// templates rely on sprig's `default` function for fallbacks. 
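// Illustrative usage from a test (a sketch, not part of this change; the
// template directory, data keys, and values below are examples only):
//
//	outDir := t.TempDir()
//	data := map[string]string{"Namespace": "unbounded-kube", "Image": "orca:dev"}
//	if err := render.Render("deploy/orca", outDir, data); err != nil {
//		t.Fatalf("render manifests: %v", err)
//	}
//	// outDir now mirrors deploy/orca, with every *.yaml.tmpl rendered to *.yaml.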
+func Render(templatesDir, outputDir string, data map[string]string) error { + return filepath.WalkDir(templatesDir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + + if d.IsDir() { + return nil + } + + if !strings.HasSuffix(path, ".yaml.tmpl") { + return nil + } + + relPath, err := filepath.Rel(templatesDir, path) + if err != nil { + return err + } + + outputRelPath := strings.TrimSuffix(relPath, ".tmpl") + outputPath := filepath.Join(outputDir, outputRelPath) + + templateBytes, err := os.ReadFile(path) + if err != nil { + return fmt.Errorf("read template %q: %w", path, err) + } + + tmpl, err := template.New(relPath).Funcs(sprig.TxtFuncMap()).Option("missingkey=zero").Parse(string(templateBytes)) + if err != nil { + return fmt.Errorf("parse template %q: %w", path, err) + } + + if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil { + return fmt.Errorf("create output dir for %q: %w", outputPath, err) + } + + var rendered bytes.Buffer + if err := tmpl.Execute(&rendered, data); err != nil { + return fmt.Errorf("execute template %q: %w", path, err) + } + + if err := os.WriteFile(outputPath, rendered.Bytes(), 0o644); err != nil { + return fmt.Errorf("write rendered manifest %q: %w", outputPath, err) + } + + return nil + }) +} diff --git a/images/orca/Containerfile b/images/orca/Containerfile new file mode 100644 index 00000000..6a987546 --- /dev/null +++ b/images/orca/Containerfile @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Build stage +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.26.2-trixie AS builder + +RUN apt-get update && apt-get install -y \ + build-essential \ + make \ + gcc \ + git \ + ca-certificates \ + && apt-get clean + +ENV CGO_ENABLED=0 +ENV GOPATH=/go +ENV GOTOOLCHAIN=auto +ENV PATH=$PATH:/go/bin + +WORKDIR /src + +COPY go.mod go.sum ./ +RUN go mod download + +COPY ../../ . + +ARG TARGETOS +ARG TARGETARCH +ARG VERSION=dev +ARG GIT_COMMIT= +ARG BUILD_TIME= +RUN GOOS=${TARGETOS} GOARCH=${TARGETARCH} \ + make orca-build VERSION=${VERSION} ${GIT_COMMIT:+GIT_COMMIT=${GIT_COMMIT}} ${BUILD_TIME:+BUILD_TIME=${BUILD_TIME}} + +# Runtime stage +FROM ubuntu:noble + +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + ca-certificates \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN mkdir -p /unbounded/bin + +COPY --from=builder /src/bin/orca /unbounded/bin/orca + +ENV PATH="/unbounded/bin:${PATH}" + +WORKDIR /unbounded + +ENTRYPOINT ["/unbounded/bin/orca"] diff --git a/internal/orca/app/app.go b/internal/orca/app/app.go new file mode 100644 index 00000000..12a1d7db --- /dev/null +++ b/internal/orca/app/app.go @@ -0,0 +1,374 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package app wires the Orca runtime: origin + cachestore + cluster + +// fetch coordinator + edge / internal HTTP listeners. +// +// Production callers (cmd/orca/orca/orca.go) drive this from a YAML +// config; integration tests (internal/orca/inttest) drive it from a +// programmatic *config.Config plus options that inject in-memory or +// counting decorators around the origin / cachestore. 
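// Illustrative in-process test wiring (a sketch, not part of this change;
// countingStore and the omitted config sections stand in for whatever the
// test harness supplies):
//
//	cfg := &config.Config{
//		Server: config.Server{Listen: "127.0.0.1:0"},
//		// Origin / Cachestore / Cluster / Chunking filled in by the harness.
//	}
//	a, err := app.Start(ctx, cfg, app.WithCacheStore(countingStore))
//	if err != nil {
//		t.Fatal(err)
//	}
//	defer a.Shutdown(context.Background())
//	// Drive HTTP requests against a.EdgeAddr; inspect a.Cluster for peer state.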
+package app + +import ( + "context" + "errors" + "fmt" + "log/slog" + "net" + "net/http" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + cachestores3 "github.com/Azure/unbounded/internal/orca/cachestore/s3" + "github.com/Azure/unbounded/internal/orca/chunkcatalog" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/fetch" + "github.com/Azure/unbounded/internal/orca/metadata" + "github.com/Azure/unbounded/internal/orca/origin" + "github.com/Azure/unbounded/internal/orca/origin/awss3" + "github.com/Azure/unbounded/internal/orca/origin/azureblob" + "github.com/Azure/unbounded/internal/orca/server" +) + +// App is a running Orca instance. +// +// Construct with Start; tear down with Shutdown. Start is non-blocking: +// the returned App's listeners are accepting connections (via +// net.Listen) before Start returns, so EdgeAddr / InternalAddr are +// resolved (including any :0 ports) by the time the caller sees them. +type App struct { + // EdgeAddr is the resolved client-edge listen address (host:port). + // When the config requested ":0" the port is the OS-assigned one. + EdgeAddr string + + // InternalAddr is the resolved peer-RPC listen address (host:port). + InternalAddr string + + // Cluster is exposed so tests can inspect peer state and call + // Coordinator/Self for assertions. Production callers should treat + // this as read-only. + Cluster *cluster.Cluster + + log *slog.Logger + edgeSrv *http.Server + internalSrv *http.Server + wg sync.WaitGroup + errCh chan error +} + +type options struct { + log *slog.Logger + clusterOpts []cluster.Option + origin origin.Origin + cacheStore cachestore.CacheStore + skipCacheSelfTst bool + internalHandlerWrap func(http.Handler) http.Handler + edgeListener net.Listener + internalListener net.Listener +} + +// Option configures Start. +type Option func(*options) + +// WithLogger overrides the slog.Logger used for the App's output. If +// not provided, a JSON handler writing to stdout at LevelInfo is used. +func WithLogger(log *slog.Logger) Option { + return func(o *options) { o.log = log } +} + +// WithResolver overrides only the DNS resolver inside the default +// peer source. Convenient for tests that want to keep the production +// DNS-discovery shape but substitute the resolver itself. +func WithResolver(r cluster.Resolver) Option { + return func(o *options) { + o.clusterOpts = append(o.clusterOpts, cluster.WithResolver(r)) + } +} + +// WithPeerSource replaces the cluster's entire peer-discovery +// mechanism. Intended for integration tests that need full control +// (e.g. per-replica peer sets with explicit ports). +func WithPeerSource(s cluster.PeerSource) Option { + return func(o *options) { + o.clusterOpts = append(o.clusterOpts, cluster.WithPeerSource(s)) + } +} + +// WithOrigin replaces the origin driver constructed from cfg. Tests use +// this to wire counting / fault-injecting decorators around a real +// awss3 or azureblob client. +func WithOrigin(or origin.Origin) Option { + return func(o *options) { o.origin = or } +} + +// WithCacheStore replaces the cachestore driver constructed from cfg. +// Tests use this to wire a counting / fault-injecting decorator around +// a real s3 client (or to use an in-memory implementation). +func WithCacheStore(cs cachestore.CacheStore) Option { + return func(o *options) { o.cacheStore = cs } +} + +// WithSkipCachestoreSelfTest disables the boot-time atomic-commit +// self-test. 
Useful only in tests that wire a cachestore decorator +// already known to honor If-None-Match: *. +func WithSkipCachestoreSelfTest() Option { + return func(o *options) { o.skipCacheSelfTst = true } +} + +// WithInternalHandlerWrap installs a decorator around the internal +// peer-RPC handler. The wrap function receives the production handler +// and returns one that the http.Server actually serves. Production +// passes nothing -> identity. Tests use this to count 409 responses +// per source IP for the not-coordinator fallback assertion. +func WithInternalHandlerWrap(wrap func(http.Handler) http.Handler) Option { + return func(o *options) { o.internalHandlerWrap = wrap } +} + +// WithEdgeListener supplies a pre-bound listener for the client-edge +// HTTP server, bypassing app.Start's own net.Listen call. Intended +// for integration tests that need to allocate a port before starting +// the app (so peer sets can advertise the captured port from t=0 +// without a close/re-bind race window). +func WithEdgeListener(ln net.Listener) Option { + return func(o *options) { o.edgeListener = ln } +} + +// WithInternalListener supplies a pre-bound listener for the peer-RPC +// internal HTTP server. See WithEdgeListener for rationale. +func WithInternalListener(ln net.Listener) Option { + return func(o *options) { o.internalListener = ln } +} + +// Start wires every dependency and begins serving on the configured +// listeners. It returns once both listeners are accepting connections +// (or returns the error that prevented startup). +// +// The returned App must be Shutdown by the caller; Start does not own +// the parent context's lifetime. +func Start(ctx context.Context, cfg *config.Config, opts ...Option) (*App, error) { + o := options{} + for _, opt := range opts { + opt(&o) + } + + log := o.log + if log == nil { + log = slog.Default() + } + + or, err := buildOrigin(ctx, cfg, o.origin) + if err != nil { + return nil, err + } + + cs, err := buildCacheStore(ctx, cfg, o.cacheStore) + if err != nil { + return nil, err + } + + if !o.skipCacheSelfTst { + if err := cs.SelfTestAtomicCommit(ctx); err != nil { + return nil, fmt.Errorf("cachestore self-test failed: %w", err) + } + + log.Info("cachestore self-test passed") + } + + cl, err := cluster.New(ctx, cfg.Cluster, o.clusterOpts...) 
+ if err != nil { + return nil, fmt.Errorf("init cluster: %w", err) + } + + cat := chunkcatalog.New(cfg.ChunkCatalog.MaxEntries) + mc := metadata.NewCache(cfg.Metadata) + fc := fetch.NewCoordinator(or, cs, cl, cat, mc, cfg) + + edgeHandler := server.NewEdgeHandler(fc, cfg, log) + + var internalHandler http.Handler = server.NewInternalHandler(fc, cl, log) + if o.internalHandlerWrap != nil { + internalHandler = o.internalHandlerWrap(internalHandler) + } + + edgeLn := o.edgeListener + if edgeLn == nil { + ln, err := net.Listen("tcp", cfg.Server.Listen) + if err != nil { + cl.Close() + return nil, fmt.Errorf("edge listener bind %q: %w", cfg.Server.Listen, err) + } + + edgeLn = ln + } + + internalLn := o.internalListener + if internalLn == nil { + ln, err := net.Listen("tcp", cfg.Cluster.InternalListen) + if err != nil { + _ = edgeLn.Close() //nolint:errcheck // best-effort close on bind failure + + cl.Close() + + return nil, fmt.Errorf("internal listener bind %q: %w", cfg.Cluster.InternalListen, err) + } + + internalLn = ln + } + + a := &App{ + EdgeAddr: edgeLn.Addr().String(), + InternalAddr: internalLn.Addr().String(), + Cluster: cl, + log: log, + edgeSrv: &http.Server{ + Handler: edgeHandler, + ReadHeaderTimeout: 10 * time.Second, + }, + internalSrv: &http.Server{ + Handler: internalHandler, + ReadHeaderTimeout: 10 * time.Second, + }, + errCh: make(chan error, 2), + } + + a.wg.Add(1) + + go func() { + defer a.wg.Done() + + log.Info("edge listener", "addr", a.EdgeAddr) + + if err := a.edgeSrv.Serve(edgeLn); err != nil && !errors.Is(err, http.ErrServerClosed) { + a.errCh <- fmt.Errorf("edge listener: %w", err) + } + }() + + a.wg.Add(1) + + go func() { + defer a.wg.Done() + + log.Info("internal listener", + "addr", a.InternalAddr, + "tls_enabled", cfg.Cluster.InternalTLS.Enabled, + ) + + var lerr error + if cfg.Cluster.InternalTLS.Enabled { + lerr = a.internalSrv.ServeTLS(internalLn, + cfg.Cluster.InternalTLS.CertFile, + cfg.Cluster.InternalTLS.KeyFile, + ) + } else { + log.Warn("internal listener TLS DISABLED - unsafe for production", + "addr", a.InternalAddr) + + lerr = a.internalSrv.Serve(internalLn) + } + + if lerr != nil && !errors.Is(lerr, http.ErrServerClosed) { + a.errCh <- fmt.Errorf("internal listener: %w", lerr) + } + }() + + return a, nil +} + +// Wait blocks until either the parent context is canceled or one of +// the listeners exits unexpectedly. It returns the listener error (if +// any) or nil if ctx was canceled. Wait is intended for the production +// "serve until SIGTERM" path; tests typically call Shutdown directly. +func (a *App) Wait(ctx context.Context) error { + select { + case <-ctx.Done(): + return nil + case err := <-a.errCh: + return err + } +} + +// Shutdown gracefully stops both listeners and the cluster goroutine. +// It is safe to call multiple times; subsequent calls are no-ops. 
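// Production-style lifecycle (a sketch; the exact cmd/orca wiring and the
// 10-second shutdown budget are assumptions, not part of this change):
//
//	a, err := app.Start(ctx, cfg)
//	if err != nil {
//		return err
//	}
//	waitErr := a.Wait(ctx) // returns on SIGTERM-driven cancel or listener failure
//	sctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
//	defer cancel()
//	if err := a.Shutdown(sctx); err != nil {
//		return err
//	}
//	return waitErr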
+func (a *App) Shutdown(ctx context.Context) error { + var firstErr error + + if err := a.edgeSrv.Shutdown(ctx); err != nil { + a.log.Warn("edge listener shutdown failed", "err", err) + + firstErr = err + } + + if err := a.internalSrv.Shutdown(ctx); err != nil { + a.log.Warn("internal listener shutdown failed", "err", err) + + if firstErr == nil { + firstErr = err + } + } + + a.Cluster.Close() + a.wg.Wait() + + return firstErr +} + +func buildOrigin(ctx context.Context, cfg *config.Config, override origin.Origin) (origin.Origin, error) { + if override != nil { + return override, nil + } + + switch cfg.Origin.Driver { + case "azureblob": + or, err := azureblob.New(cfg.Origin.Azureblob) + if err != nil { + return nil, fmt.Errorf("init origin/azureblob: %w", err) + } + + return or, nil + case "awss3": + or, err := awss3.New(ctx, awss3.Config{ + Endpoint: cfg.Origin.AWSS3.Endpoint, + Region: cfg.Origin.AWSS3.Region, + Bucket: cfg.Origin.AWSS3.Bucket, + AccessKey: cfg.Origin.AWSS3.AccessKey, + SecretKey: cfg.Origin.AWSS3.SecretKey, + UsePathStyle: cfg.Origin.AWSS3.UsePathStyle, + }) + if err != nil { + return nil, fmt.Errorf("init origin/awss3: %w", err) + } + + return or, nil + default: + return nil, fmt.Errorf("unsupported origin driver: %q", cfg.Origin.Driver) + } +} + +func buildCacheStore(ctx context.Context, cfg *config.Config, override cachestore.CacheStore) (cachestore.CacheStore, error) { + if override != nil { + return override, nil + } + + switch cfg.Cachestore.Driver { + case "s3": + cs, err := cachestores3.New(ctx, cachestores3.Config{ + Endpoint: cfg.Cachestore.S3.Endpoint, + Bucket: cfg.Cachestore.S3.Bucket, + Region: cfg.Cachestore.S3.Region, + AccessKey: cfg.Cachestore.S3.AccessKey, + SecretKey: cfg.Cachestore.S3.SecretKey, + UsePathStyle: cfg.Cachestore.S3.UsePathStyle, + RequireUnversionedBucket: cfg.Cachestore.S3.RequireUnversionedBucket, + }) + if err != nil { + return nil, fmt.Errorf("init cachestore/s3: %w", err) + } + + return cs, nil + default: + return nil, fmt.Errorf("unsupported cachestore driver: %q", cfg.Cachestore.Driver) + } +} diff --git a/internal/orca/cachestore/cachestore.go b/internal/orca/cachestore/cachestore.go new file mode 100644 index 00000000..f51e664f --- /dev/null +++ b/internal/orca/cachestore/cachestore.go @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package cachestore defines the in-DC chunk store interface and shared +// types. Concrete drivers live under cachestore//. +// +// See design/orca/design.md s7 for the full interface and s10.1 for the +// atomic-commit contract. +package cachestore + +import ( + "context" + "errors" + "io" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// CacheStore is where chunk bytes physically live. Source of truth for +// chunk presence; backed by an in-DC S3-like store in production and +// LocalStack in dev (Scope A+B). +type CacheStore interface { + GetChunk(ctx context.Context, k chunk.Key, off, n int64) (io.ReadCloser, error) + PutChunk(ctx context.Context, k chunk.Key, size int64, r io.Reader) error + Stat(ctx context.Context, k chunk.Key) (Info, error) + Delete(ctx context.Context, k chunk.Key) error + SelfTestAtomicCommit(ctx context.Context) error +} + +// Info is the result of a successful Stat. +type Info struct { + Size int64 + Committed time.Time +} + +// Sentinel errors. Wrap with %w so callers use errors.Is. 
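// For example, a caller separating a cache miss from a hard failure
// (sketch; cs is any CacheStore implementation):
//
//	if _, err := cs.Stat(ctx, key); errors.Is(err, cachestore.ErrNotFound) {
//		// miss: fall back to the origin fetch path
//	} else if err != nil {
//		return err
//	}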
+var ( + ErrNotFound = errors.New("cachestore: not found") + ErrTransient = errors.New("cachestore: transient") + ErrAuth = errors.New("cachestore: auth") + ErrCommitLost = errors.New("cachestore: commit lost (no-clobber denied)") +) diff --git a/internal/orca/cachestore/s3/s3.go b/internal/orca/cachestore/s3/s3.go new file mode 100644 index 00000000..fc915642 --- /dev/null +++ b/internal/orca/cachestore/s3/s3.go @@ -0,0 +1,354 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package s3 is the cachestore driver for in-DC S3-compatible stores. +// In production this targets VAST or another S3-compatible object +// store; in dev it targets LocalStack. +// +// Atomic commit is implemented via PutObject + If-None-Match: * (s3 +// conditional writes). The boot SelfTestAtomicCommit verifies the +// backend honors the precondition; the boot versioning gate verifies +// the bucket is not versioned (since If-None-Match is not honored on +// versioned buckets). +// +// See design/orca/design.md s10.1.3. +package s3 + +import ( + "bytes" + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "io" + "net/http" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/aws/smithy-go" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// Driver implements cachestore.CacheStore against an S3-compatible +// endpoint. +type Driver struct { + client *s3.Client + bucket string + + requireUnversionedBucket bool +} + +// Config is the s3-driver configuration. Mirrors config.CachestoreS3 +// but kept package-local so the driver can be unit-tested without +// importing the whole config package. +type Config struct { + Endpoint string + Bucket string + Region string + AccessKey string + SecretKey string + UsePathStyle bool + RequireUnversionedBucket bool +} + +// New constructs a Driver. The boot versioning gate is run here. +// +// SelfTestAtomicCommit is a separate step (called by main after New) +// to keep the constructor side-effect-light. +func New(ctx context.Context, cfg Config) (*Driver, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("cachestore/s3: bucket required") + } + + if cfg.Endpoint == "" { + return nil, fmt.Errorf("cachestore/s3: endpoint required") + } + + awsCfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(cfg.Region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + cfg.AccessKey, cfg.SecretKey, "", + )), + // Opt out of CRC64NVME default introduced in aws-sdk-go-v2 + // 1.32. LocalStack 3.8 returns InvalidRequest for unknown + // algorithms; real AWS S3 still works either way. 
+ awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + return nil, fmt.Errorf("cachestore/s3: aws config: %w", err) + } + + client := s3.NewFromConfig(awsCfg, func(o *s3.Options) { + o.BaseEndpoint = aws.String(cfg.Endpoint) + o.UsePathStyle = cfg.UsePathStyle + }) + + d := &Driver{ + client: client, + bucket: cfg.Bucket, + requireUnversionedBucket: cfg.RequireUnversionedBucket, + } + + if d.requireUnversionedBucket { + if err := d.versioningGate(ctx); err != nil { + return nil, err + } + } + + return d, nil +} + +// versioningGate refuses to start if the bucket has versioning enabled +// or suspended. design.md s10.1.3. +func (d *Driver) versioningGate(ctx context.Context) error { + out, err := d.client.GetBucketVersioning(ctx, &s3.GetBucketVersioningInput{ + Bucket: aws.String(d.bucket), + }) + if err != nil { + return fmt.Errorf("cachestore/s3: GetBucketVersioning failed: %w", err) + } + + return validateBucketVersioning(d.bucket, out.Status) +} + +// validateBucketVersioning returns an error if the bucket's versioning +// status is incompatible with cachestore/s3's atomic-commit primitive. +// Extracted as a pure function so unit tests can cover all branches +// (empty / Enabled / Suspended) without round-tripping to a real or +// emulated S3 backend. +func validateBucketVersioning(bucket string, status s3types.BucketVersioningStatus) error { + switch status { + case s3types.BucketVersioningStatusEnabled, s3types.BucketVersioningStatusSuspended: + return fmt.Errorf( + "cachestore/s3: bucket %s has versioning %s; If-None-Match: * is not "+ + "honored on versioned buckets and the atomic-commit primitive cannot "+ + "guarantee no-clobber; disable bucket versioning to use cachestore/s3", + bucket, status) + } + + return nil +} + +// SelfTestAtomicCommit verifies the backend honors PutObject + +// If-None-Match: *. +func (d *Driver) SelfTestAtomicCommit(ctx context.Context) error { + probeKey := fmt.Sprintf("_orca-selftest/%s", randHex(16)) + body := []byte("orca-selftest") + + // First put: must succeed. + _, err := d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + Body: bytes.NewReader(body), + IfNoneMatch: aws.String("*"), + }) + if err != nil { + return fmt.Errorf("cachestore/s3 self-test: first put failed: %w", err) + } + + // Second put: must fail with 412. + _, err = d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + Body: bytes.NewReader(body), + IfNoneMatch: aws.String("*"), + }) + if err == nil { + // Clean up before returning the failure. + _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return fmt.Errorf( + "cachestore/s3: backend does not honor If-None-Match: *; refusing to start " + + "(second concurrent put returned 200 instead of 412)") + } + + if !isPreconditionFailed(err) { + _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return fmt.Errorf("cachestore/s3 self-test: second put returned unexpected error "+ + "(want 412 PreconditionFailed): %w", err) + } + + // Cleanup probe key. 
+ _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return nil +} + +// GetChunk fetches [off, off+n) of the chunk path from the bucket. +func (d *Driver) GetChunk(ctx context.Context, k chunk.Key, off, n int64) (io.ReadCloser, error) { + rng := fmt.Sprintf("bytes=%d-%d", off, off+n-1) + + out, err := d.client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + Range: aws.String(rng), + }) + if err != nil { + return nil, mapErr(err) + } + + return out.Body, nil +} + +// PutChunk uploads the chunk via PutObject + If-None-Match: *. On +// 412 returns ErrCommitLost (loser of an atomic-commit race). +func (d *Driver) PutChunk(ctx context.Context, k chunk.Key, size int64, r io.Reader) error { + // AWS SDK v2 needs an io.ReadSeeker for unsigned-payload uploads. + // For prototype simplicity we buffer the chunk in memory (chunks + // are 8 MiB by default). + buf, err := io.ReadAll(r) + if err != nil { + return fmt.Errorf("cachestore/s3 put: read body: %w", err) + } + + if int64(len(buf)) != size && size > 0 { + return fmt.Errorf("cachestore/s3 put: short body (got %d want %d)", len(buf), size) + } + + _, err = d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + Body: bytes.NewReader(buf), + ContentLength: aws.Int64(int64(len(buf))), + IfNoneMatch: aws.String("*"), + }) + if err != nil { + if isPreconditionFailed(err) { + return cachestore.ErrCommitLost + } + + return mapErr(err) + } + + return nil +} + +// Stat checks for chunk presence. +func (d *Driver) Stat(ctx context.Context, k chunk.Key) (cachestore.Info, error) { + out, err := d.client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + }) + if err != nil { + return cachestore.Info{}, mapErr(err) + } + + info := cachestore.Info{} + if out.ContentLength != nil { + info.Size = *out.ContentLength + } + + if out.LastModified != nil { + info.Committed = *out.LastModified + } + + return info, nil +} + +// Delete removes the chunk; idempotent. +func (d *Driver) Delete(ctx context.Context, k chunk.Key) error { + _, err := d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + }) + if err != nil { + if isNotFound(err) { + return nil + } + + return mapErr(err) + } + + return nil +} + +func randHex(n int) string { + b := make([]byte, n) + if _, err := rand.Read(b); err != nil { + // Fallback: time-based; only used for boot-test probe key. 
+ return fmt.Sprintf("ts%d", time.Now().UnixNano()) + } + + return hex.EncodeToString(b) +} + +func isPreconditionFailed(err error) bool { + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + code := apiErr.ErrorCode() + if code == "PreconditionFailed" || code == "InvalidArgument" || code == "ConditionalRequestConflict" { + return true + } + } + + return strings.Contains(err.Error(), "PreconditionFailed") || + strings.Contains(err.Error(), "412") +} + +func isNotFound(err error) bool { + var nsk *s3types.NoSuchKey + if errors.As(err, &nsk) { + return true + } + + var nsb *s3types.NoSuchBucket + if errors.As(err, &nsb) { + return true + } + + var notFound *s3types.NotFound + if errors.As(err, ¬Found) { + return true + } + + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "NoSuchKey", "NotFound", "404": + return true + } + } + + return false +} + +func mapErr(err error) error { + if isNotFound(err) { + return cachestore.ErrNotFound + } + + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "AccessDenied", "Unauthorized", "Forbidden", "InvalidAccessKeyId", "SignatureDoesNotMatch": + return cachestore.ErrAuth + } + } + // Treat HTTP 5xx as transient. + if strings.Contains(err.Error(), "StatusCode: 5") { + return cachestore.ErrTransient + } + + _ = http.StatusOK // keep net/http import if not needed otherwise + + return err +} diff --git a/internal/orca/cachestore/s3/s3_test.go b/internal/orca/cachestore/s3/s3_test.go new file mode 100644 index 00000000..b8d28735 --- /dev/null +++ b/internal/orca/cachestore/s3/s3_test.go @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package s3 + +import ( + "strings" + "testing" + + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" +) + +// TestValidateBucketVersioning covers every BucketVersioningStatus +// branch the gate cares about. The integration suite only exercises +// the Enabled case end-to-end; this unit test fills in the empty +// (never-enabled) and Suspended cases. +func TestValidateBucketVersioning(t *testing.T) { + tests := []struct { + name string + status s3types.BucketVersioningStatus + wantErr bool + }{ + {"empty (never enabled)", "", false}, + {"enabled", s3types.BucketVersioningStatusEnabled, true}, + {"suspended", s3types.BucketVersioningStatusSuspended, true}, + } + + const bucket = "test-bucket" + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateBucketVersioning(bucket, tt.status) + + if (err != nil) != tt.wantErr { + t.Fatalf("err=%v, wantErr=%v", err, tt.wantErr) + } + + if !tt.wantErr { + return + } + + if !strings.Contains(err.Error(), bucket) { + t.Errorf("error %q does not include bucket name %q", err, bucket) + } + + if !strings.Contains(err.Error(), string(tt.status)) { + t.Errorf("error %q does not include status %q", err, tt.status) + } + }) + } +} diff --git a/internal/orca/chunk/chunk.go b/internal/orca/chunk/chunk.go new file mode 100644 index 00000000..1a520c87 --- /dev/null +++ b/internal/orca/chunk/chunk.go @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package chunk implements the chunk model: ChunkKey, deterministic +// path encoding, and the range -> chunk-index iterator. +// +// See design/orca/design.md s5 for the full chunk model spec. This +// implementation is a faithful subset. 
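// Concretely, the rendered path has the shape (derived from Key.Path below;
// the digest placeholder is illustrative, not a real value):
//
//	<OriginID>/<64 lowercase hex chars of the sha256 hashKey>/<Index>
//
// e.g. a Key with OriginID "o1" and Index 3 becomes "o1/<hex digest>/3".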
+package chunk + +import ( + "crypto/sha256" + "encoding/binary" + "encoding/hex" + "fmt" + "hash" +) + +// Key is the immutable identifier for a chunk. +// +// Path encoding (design.md s5): +// +// LP(s) = LE64(uint64(len(s))) || s +// hashKey = sha256( +// LP(origin_id) || +// LP(bucket) || +// LP(key) || +// LP(etag) || +// LE64(chunk_size) +// ) +// path = "//" +type Key struct { + OriginID string + Bucket string + ObjectKey string + ETag string + ChunkSize int64 + Index int64 +} + +// Path returns the canonical on-store path for this ChunkKey. +func (k Key) Path() string { + h := sha256.New() + writeLP(h, k.OriginID) + writeLP(h, k.Bucket) + writeLP(h, k.ObjectKey) + writeLP(h, k.ETag) + + var sizeBuf [8]byte + binary.LittleEndian.PutUint64(sizeBuf[:], uint64(k.ChunkSize)) + h.Write(sizeBuf[:]) + sum := h.Sum(nil) + + return fmt.Sprintf("%s/%s/%d", k.OriginID, hex.EncodeToString(sum), k.Index) +} + +// Range returns the byte range [Off, Off+Len) within the origin +// object that this chunk corresponds to. +func (k Key) Range() (off, length int64) { + off = k.Index * k.ChunkSize + length = k.ChunkSize + + return off, length +} + +// String renders the key compactly for logging. +func (k Key) String() string { + if len(k.ETag) > 8 { + return fmt.Sprintf("ChunkKey{%s/%s/%s..@%d#%d}", + k.OriginID, k.Bucket, k.ObjectKey, k.Index, len(k.ETag)) + } + + return fmt.Sprintf("ChunkKey{%s/%s/%s@%d}", k.OriginID, k.Bucket, k.ObjectKey, k.Index) +} + +func writeLP(h hash.Hash, s string) { + var lenBuf [8]byte + binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(s))) + h.Write(lenBuf[:]) + h.Write([]byte(s)) +} + +// IndexRange returns the inclusive [first, last] chunk indices that +// cover the byte range [start, end] of an object whose total size is +// objectSize. +// +// Caller is responsible for clamping start / end against objectSize +// before invoking; if end >= objectSize, end is clamped here. +func IndexRange(start, end, chunkSize, objectSize int64) (first, last int64) { + if end >= objectSize { + end = objectSize - 1 + } + + first = start / chunkSize + last = end / chunkSize + + return first, last +} + +// ChunkSlice returns the [off, len) within a single chunk that +// satisfies the original client byte range [start, end]. +// +// chunkIdx is the chunk index. chunkSize is the configured chunk size. +// objectSize is the total origin-object size (used to clamp the last +// chunk if it is partial). +func ChunkSlice(chunkIdx, chunkSize, start, end, objectSize int64) (off, length int64) { + chunkStart := chunkIdx * chunkSize + + chunkEnd := chunkStart + chunkSize - 1 + if chunkEnd >= objectSize { + chunkEnd = objectSize - 1 + } + + if start > chunkStart { + off = start - chunkStart + } + + sliceEnd := chunkEnd + if end < chunkEnd { + sliceEnd = end + } + + length = sliceEnd - chunkStart - off + 1 + + return off, length +} diff --git a/internal/orca/chunk/chunk_test.go b/internal/orca/chunk/chunk_test.go new file mode 100644 index 00000000..bc53c795 --- /dev/null +++ b/internal/orca/chunk/chunk_test.go @@ -0,0 +1,231 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package chunk + +import ( + "strings" + "testing" +) + +// TestKey_Path_Deterministic verifies that the same inputs always +// produce the same path and that meaningful input differences +// (OriginID, Bucket, ObjectKey, ETag, ChunkSize, Index) produce +// distinct paths. The path encoding is part of orca's design +// contract (design.md s5). 
+func TestKey_Path_Deterministic(t *testing.T) { + t.Parallel() + + base := Key{ + OriginID: "origin-a", + Bucket: "bucket", + ObjectKey: "key", + ETag: "etag1", + ChunkSize: 1024, + Index: 0, + } + // Same inputs -> same path. Compare two equally-constructed Keys + // (calling Path() on the same receiver tautologically passes). + dup := base + if base.Path() != dup.Path() { + t.Fatalf("Path() not deterministic for identical key") + } + + other := base + otherPath := other.Path() + + mutations := []struct { + name string + mut func(k *Key) + }{ + {"different origin", func(k *Key) { k.OriginID = "origin-b" }}, + {"different bucket", func(k *Key) { k.Bucket = "other-bucket" }}, + {"different key", func(k *Key) { k.ObjectKey = "other-key" }}, + {"different etag", func(k *Key) { k.ETag = "etag2" }}, + {"different chunk size", func(k *Key) { k.ChunkSize = 2048 }}, + {"different index", func(k *Key) { k.Index = 1 }}, + } + + for _, m := range mutations { + t.Run(m.name, func(t *testing.T) { + mutated := base + m.mut(&mutated) + + got := mutated.Path() + if got == otherPath { + t.Errorf("path collision after %s mutation: %q", m.name, got) + } + }) + } +} + +// TestKey_Path_Format asserts the documented path shape: +// "//". +func TestKey_Path_Format(t *testing.T) { + t.Parallel() + + k := Key{ + OriginID: "origin-a", + Bucket: "b", + ObjectKey: "k", + ETag: "e", + ChunkSize: 1024, + Index: 7, + } + + path := k.Path() + + parts := strings.Split(path, "/") + if len(parts) != 3 { + t.Fatalf("path %q has %d segments, want 3", path, len(parts)) + } + + if parts[0] != "origin-a" { + t.Errorf("origin segment=%q want %q", parts[0], "origin-a") + } + + if len(parts[1]) != 64 { + t.Errorf("hex segment len=%d want 64 (sha256)", len(parts[1])) + } + + for _, c := range parts[1] { + isDigit := c >= '0' && c <= '9' + isLowerHex := c >= 'a' && c <= 'f' + + if !isDigit && !isLowerHex { + t.Errorf("hex segment contains non-hex char %q", c) + break + } + } + + if parts[2] != "7" { + t.Errorf("index segment=%q want %q", parts[2], "7") + } +} + +// TestKey_Range verifies (off, length) = (Index*ChunkSize, ChunkSize). +func TestKey_Range(t *testing.T) { + t.Parallel() + + k := Key{ChunkSize: 1 << 20, Index: 3} + + off, length := k.Range() + if off != 3<<20 { + t.Errorf("off=%d want %d", off, 3<<20) + } + + if length != 1<<20 { + t.Errorf("length=%d want %d", length, 1<<20) + } +} + +// TestIndexRange covers the chunk-index span computed from a byte +// range plus the end clamping to objectSize. +func TestIndexRange(t *testing.T) { + t.Parallel() + + const chunkSize = int64(1024) + + tests := []struct { + name string + start, end int64 + objectSize int64 + wantFirst int64 + wantLast int64 + }{ + {"aligned full chunk", 0, 1023, 1024, 0, 0}, + {"aligned two chunks", 0, 2047, 4096, 0, 1}, + {"start mid-chunk, end mid-chunk same", 100, 500, 1024, 0, 0}, + {"start mid-chunk, end mid-next-chunk", 100, 1500, 4096, 0, 1}, + {"end clamped to objectSize", 0, 9999, 2048, 0, 1}, + {"single byte", 5, 5, 1024, 0, 0}, + {"last partial chunk", 1024, 1500, 1500, 1, 1}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + first, last := IndexRange(tt.start, tt.end, chunkSize, tt.objectSize) + if first != tt.wantFirst { + t.Errorf("first=%d want %d", first, tt.wantFirst) + } + + if last != tt.wantLast { + t.Errorf("last=%d want %d", last, tt.wantLast) + } + }) + } +} + +// TestChunkSlice covers the (off, length) within a single chunk that +// satisfies the original byte range. 
Critical for cross-chunk +// streamSlice copies. +func TestChunkSlice(t *testing.T) { + t.Parallel() + + const chunkSize = int64(1024) + + tests := []struct { + name string + chunkIdx int64 + start int64 + end int64 + objectSize int64 + wantOff int64 + wantLen int64 + }{ + {"entirely within chunk 0", 0, 100, 199, 4096, 100, 100}, + {"start at chunk 0 boundary", 0, 0, 99, 4096, 0, 100}, + {"end at chunk 0 boundary", 0, 0, 1023, 4096, 0, 1024}, + {"chunk 1, range covers full chunk", 1, 1024, 2047, 4096, 0, 1024}, + {"chunk spans range start", 1, 500, 1500, 4096, 0, 477}, // [1024..1500] + {"chunk spans range end", 1, 1500, 2500, 4096, 476, 548}, + {"last partial chunk", 3, 3000, 3500, 3500, 0, 428}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + off, length := ChunkSlice(tt.chunkIdx, chunkSize, tt.start, tt.end, tt.objectSize) + if off != tt.wantOff { + t.Errorf("off=%d want %d", off, tt.wantOff) + } + + if length != tt.wantLen { + t.Errorf("length=%d want %d", length, tt.wantLen) + } + }) + } +} + +// TestKey_String covers both formatting branches (short ETag + long +// ETag). +func TestKey_String(t *testing.T) { + t.Parallel() + + short := Key{ + OriginID: "o", + Bucket: "b", + ObjectKey: "k", + ETag: "abc", + Index: 5, + } + if s := short.String(); !strings.Contains(s, "@5") { + t.Errorf("short ETag string=%q does not contain @5", s) + } + + long := Key{ + OriginID: "o", + Bucket: "b", + ObjectKey: "k", + ETag: "abcdefghi", // 9 chars > 8 + Index: 5, + } + + s := long.String() + if !strings.Contains(s, "..@") { + t.Errorf("long ETag string=%q does not contain truncation marker '..@'", s) + } + + if !strings.Contains(s, "#9") { + t.Errorf("long ETag string=%q does not contain length suffix '#9'", s) + } +} diff --git a/internal/orca/chunkcatalog/chunkcatalog.go b/internal/orca/chunkcatalog/chunkcatalog.go new file mode 100644 index 00000000..453c8ed8 --- /dev/null +++ b/internal/orca/chunkcatalog/chunkcatalog.go @@ -0,0 +1,130 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package chunkcatalog implements a bounded LRU recording chunks known +// to be present in the CacheStore. Pure hot-path optimization; +// CacheStore is the source of truth. +package chunkcatalog + +import ( + "container/list" + "fmt" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// Catalog is a bounded LRU keyed on chunk.Key.Path(). +type Catalog struct { + mu sync.Mutex + maxEntries int + ll *list.List + idx map[string]*list.Element +} + +type entry struct { + path string + info cachestore.Info + at time.Time +} + +// New constructs a Catalog. +func New(maxEntries int) *Catalog { + if maxEntries <= 0 { + maxEntries = 100_000 + } + + return &Catalog{ + maxEntries: maxEntries, + ll: list.New(), + idx: make(map[string]*list.Element, maxEntries), + } +} + +// Lookup returns the cached Info if present and bumps the LRU position. +func (c *Catalog) Lookup(k chunk.Key) (cachestore.Info, bool, error) { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + el, ok := c.idx[path] + if !ok { + return cachestore.Info{}, false, nil + } + + c.ll.MoveToFront(el) + + e, ok := el.Value.(*entry) + if !ok { + return cachestore.Info{}, false, fmt.Errorf("chunkcatalog: list element is not *entry") + } + + return e.info, true, nil +} + +// Record inserts or updates the entry. 
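// Inserting a new path may evict entries from the LRU tail until the catalog
// is back under maxEntries (see the trim loop below).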
+func (c *Catalog) Record(k chunk.Key, info cachestore.Info) error { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[path]; ok { + c.ll.MoveToFront(el) + + e, ok := el.Value.(*entry) + if !ok { + return fmt.Errorf("chunkcatalog: list element is not *entry") + } + + e.info = info + e.at = time.Now() + + return nil + } + + el := c.ll.PushFront(&entry{path: path, info: info, at: time.Now()}) + + c.idx[path] = el + for c.ll.Len() > c.maxEntries { + oldest := c.ll.Back() + if oldest == nil { + break + } + + c.ll.Remove(oldest) + + oldEntry, ok := oldest.Value.(*entry) + if !ok { + return fmt.Errorf("chunkcatalog: list element is not *entry") + } + + delete(c.idx, oldEntry.path) + } + + return nil +} + +// Forget removes the entry if present. +func (c *Catalog) Forget(k chunk.Key) { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[path]; ok { + c.ll.Remove(el) + delete(c.idx, path) + } +} + +// Len returns the current entry count (test helper). +func (c *Catalog) Len() int { + c.mu.Lock() + defer c.mu.Unlock() + + return c.ll.Len() +} diff --git a/internal/orca/cluster/cluster.go b/internal/orca/cluster/cluster.go new file mode 100644 index 00000000..d3c178c5 --- /dev/null +++ b/internal/orca/cluster/cluster.go @@ -0,0 +1,449 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package cluster handles peer discovery and rendezvous-hash +// coordinator selection. +// +// Peer discovery: the headless Kubernetes Service backing the Orca +// Deployment publishes Pod IPs in its A-record. We poll DNS at +// cluster.membership_refresh interval (default 5s) and snapshot the +// peer set. +// +// Coordinator selection: rendezvous hashing on (peer_ip, ChunkKey) +// picks one coordinator per chunk across the cluster. See +// design.md s8.3. +// +// Internal RPC: each replica runs an HTTP/2 client to dial peers' +// internal listeners (mTLS in production, plain in dev). The +// listener side is in the server/internal handler. +// +// # Test seams +// +// Production constructs a DNS-backed PeerSource implicitly from +// cfg.Cluster.Service + net.DefaultResolver. Tests can substitute the +// entire mechanism with WithPeerSource (typically a mutable +// StaticPeerSource per replica) or just swap the underlying DNS +// resolver with WithResolver. +package cluster + +import ( + "context" + "crypto/sha256" + "encoding/binary" + "fmt" + "io" + "net" + "net/http" + "net/url" + "strconv" + "sync" + "sync/atomic" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/config" +) + +// Peer represents one replica in the current peer-set snapshot. +// +// In production every Peer has Port == 0 because pod IPs are +// addressed on the same internal-listener port across the +// Deployment. Integration tests with multiple replicas sharing +// 127.0.0.1 set Port to the per-replica OS-assigned port; in that +// mode FillFromPeer dials peer.IP:peer.Port instead of falling back +// to cfg.Cluster.InternalListen's port. +type Peer struct { + IP string + Port int // 0 = use cfg.Cluster.InternalListen's port (production) + Self bool // true when this Peer entry represents the local replica +} + +// Cluster manages peer discovery, rendezvous hashing, and the +// internal-RPC client. 
+type Cluster struct { + cfg config.Cluster + + peers atomic.Pointer[[]Peer] + + httpClient *http.Client + source PeerSource + + cancelFn context.CancelFunc + done chan struct{} +} + +// Resolver looks up the host names that back the headless Service. +// Production uses net.DefaultResolver; tests can swap it with +// WithResolver to substitute only the DNS layer while keeping the +// rest of the DNS-based PeerSource behavior. +type Resolver interface { + LookupHost(ctx context.Context, host string) ([]string, error) +} + +// PeerSource produces the current peer-set snapshot. The DNS-backed +// implementation queries the headless Service's A-record. Tests +// substitute a StaticPeerSource that returns a mutable list of peers +// with explicit Port values (so multiple replicas can share an IP). +// +// Each returned Peer.Self must be authoritatively set by the source +// (the source knows the calling replica's identity at construction +// time, so it is the only place that can stamp Self correctly when +// peers share an IP). +type PeerSource interface { + Peers(ctx context.Context) ([]Peer, error) +} + +// Option configures a Cluster at construction time. +type Option func(*Cluster) + +// WithPeerSource replaces the entire peer-discovery mechanism. This +// is the primary test seam; production code constructs the default +// DNS-backed source implicitly from cfg.Cluster.Service. +func WithPeerSource(s PeerSource) Option { + return func(c *Cluster) { c.source = s } +} + +// WithResolver replaces only the DNS resolver inside the default +// DNS-backed PeerSource. Has no effect when WithPeerSource is also +// provided. Useful if production wants a custom resolver (e.g. a +// proxy resolver) without otherwise changing discovery semantics. +func WithResolver(r Resolver) Option { + return func(c *Cluster) { + c.source = newDNSPeerSource(c.cfg.Service, c.cfg.SelfPodIP, r) + } +} + +// NewDNSPeerSource is the production peer source: it polls the +// headless Service via the given resolver. If resolver is nil, it +// uses net.DefaultResolver. Returned peers have Port=0; FillFromPeer +// falls back to cfg.Cluster.InternalListen's port when dialing. +func NewDNSPeerSource(service, selfIP string, resolver Resolver) PeerSource { + return newDNSPeerSource(service, selfIP, resolver) +} + +func newDNSPeerSource(service, selfIP string, resolver Resolver) PeerSource { + if resolver == nil { + resolver = net.DefaultResolver + } + + return &dnsPeerSource{ + service: service, + selfIP: selfIP, + resolver: resolver, + } +} + +type dnsPeerSource struct { + service string + selfIP string + resolver Resolver +} + +func (s *dnsPeerSource) Peers(ctx context.Context) ([]Peer, error) { + rctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + + ips, err := s.resolver.LookupHost(rctx, s.service) + if err != nil { + return nil, err + } + + peers := make([]Peer, 0, len(ips)) + for _, ip := range ips { + peers = append(peers, Peer{IP: ip, Self: ip == s.selfIP}) + } + + return peers, nil +} + +// New returns a Cluster and starts the membership-refresh goroutine. 
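// For example, a test can pin the peer set instead of relying on DNS
// (sketch; peerSourceFunc is a hypothetical adapter for the PeerSource
// interface, not something this package defines):
//
//	src := peerSourceFunc(func(ctx context.Context) ([]cluster.Peer, error) {
//		return []cluster.Peer{
//			{IP: "127.0.0.1", Port: 18444, Self: true},
//			{IP: "127.0.0.1", Port: 18445},
//		}, nil
//	})
//	cl, err := cluster.New(ctx, cfg, cluster.WithPeerSource(src))
//	if err != nil {
//		t.Fatal(err)
//	}
//	defer cl.Close()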
+func New(parent context.Context, cfg config.Cluster, opts ...Option) (*Cluster, error) { + if cfg.Service == "" { + return nil, fmt.Errorf("cluster: service required (headless Service FQDN)") + } + + if cfg.SelfPodIP == "" { + return nil, fmt.Errorf("cluster: self_pod_ip required (set POD_IP env)") + } + + ctx, cancel := context.WithCancel(parent) + c := &Cluster{ + cfg: cfg, + httpClient: newHTTPClient(cfg), + source: newDNSPeerSource(cfg.Service, cfg.SelfPodIP, nil), + cancelFn: cancel, + done: make(chan struct{}), + } + + for _, opt := range opts { + opt(c) + } + // Initial refresh; failure is non-fatal (empty peer-set fallback). + c.refresh(ctx) + + go c.refreshLoop(ctx) + + return c, nil +} + +// Close stops the refresh goroutine and waits for it to exit. +func (c *Cluster) Close() { + c.cancelFn() + <-c.done +} + +// Peers returns the current peer-set snapshot. +func (c *Cluster) Peers() []Peer { + p := c.peers.Load() + if p == nil { + return []Peer{{IP: c.cfg.SelfPodIP, Self: true}} + } + + return *p +} + +// Self returns the Peer for this replica. +func (c *Cluster) Self() Peer { + return Peer{IP: c.cfg.SelfPodIP, Self: true} +} + +// Coordinator selects the rendezvous-hashed coordinator for a chunk. +// +// Returns the Peer with the highest hash(peer || chunk_path) score. +// On empty peer set returns Self (last-replica-standing fallback). +func (c *Cluster) Coordinator(k chunk.Key) Peer { + peers := c.Peers() + if len(peers) == 0 { + return c.Self() + } + + path := []byte(k.Path()) + + var ( + best Peer + bestScore uint64 + ) + + for i, p := range peers { + score := rendezvousScore(p, path) + if i == 0 || score > bestScore { + bestScore = score + best = p + } + } + + return best +} + +// IsCoordinator reports whether this replica is the coordinator for k. +func (c *Cluster) IsCoordinator(k chunk.Key) bool { + coord := c.Coordinator(k) + if coord.Self { + return true + } + // In production peers are addressed by IP only and Self is set + // from cfg.SelfPodIP, so the IP comparison below is the same as + // the Self check above. Tests with shared IPs rely on the Self + // flag being set authoritatively by the PeerSource. + return coord.IP == c.cfg.SelfPodIP && coord.Port == 0 +} + +// FillFromPeer issues GET /internal/fill against the named peer and +// returns the streaming chunk body. Caller closes the returned reader. 
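// A dialed request looks roughly like (sketch; the peer address, port, and
// key values are illustrative):
//
//	GET http://10.0.0.7:8444/internal/fill?bucket=b&chunk_size=8388608&etag=e&index=3&key=obj&origin_id=o1
//	X-Orca-Internal: 1
//
// A 409 from the peer surfaces as ErrPeerNotCoordinator; any other non-2xx
// status is returned as a plain error.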
+func (c *Cluster) FillFromPeer(ctx context.Context, p Peer, k chunk.Key) (io.ReadCloser, error) { + if p.Self { + return nil, fmt.Errorf("cluster: refusing to FillFromPeer for self") + } + + scheme := "http" + if c.cfg.InternalTLS.Enabled { + scheme = "https" + } + + port := strconv.Itoa(p.Port) + if p.Port == 0 { + _, defaultPort, err := net.SplitHostPort(c.cfg.InternalListen) + if err != nil { + defaultPort = "8444" + } + + port = defaultPort + } + + target := url.URL{ + Scheme: scheme, + Host: net.JoinHostPort(p.IP, port), + Path: "/internal/fill", + RawQuery: encodeChunkKey(k), + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil) + if err != nil { + return nil, fmt.Errorf("cluster: build internal-fill request: %w", err) + } + + req.Header.Set("X-Orca-Internal", "1") + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("cluster: internal-fill RPC: %w", err) + } + + if resp.StatusCode == http.StatusConflict { + _ = resp.Body.Close() //nolint:errcheck // best-effort close on error path + return nil, ErrPeerNotCoordinator + } + + if resp.StatusCode/100 != 2 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) //nolint:errcheck // best-effort error body read + _ = resp.Body.Close() //nolint:errcheck // best-effort close on error path + + return nil, fmt.Errorf("cluster: internal-fill RPC returned %d: %s", + resp.StatusCode, string(body)) + } + + return resp.Body, nil +} + +// ErrPeerNotCoordinator is returned by FillFromPeer when the peer +// reports it is not the coordinator (membership disagreement). +var ErrPeerNotCoordinator = fmt.Errorf("cluster: peer is not the coordinator (409 Conflict)") + +func (c *Cluster) refreshLoop(ctx context.Context) { + defer close(c.done) + + t := time.NewTicker(c.cfg.MembershipRefresh) + defer t.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-t.C: + c.refresh(ctx) + } + } +} + +func (c *Cluster) refresh(ctx context.Context) { + peers, err := c.source.Peers(ctx) + if err != nil || len(peers) == 0 { + // Empty-peer-set fallback: treat self as only peer. + self := []Peer{{IP: c.cfg.SelfPodIP, Self: true}} + c.peers.Store(&self) + + return + } + // Ensure self is always in the set even if discovery hasn't + // caught up yet. + hasSelf := false + + for _, p := range peers { + if p.Self { + hasSelf = true + break + } + } + + if !hasSelf { + peers = append(peers, Peer{IP: c.cfg.SelfPodIP, Self: true}) + } + + c.peers.Store(&peers) +} + +func newHTTPClient(cfg config.Cluster) *http.Client { + tr := &http.Transport{ + MaxIdleConns: 16, + MaxIdleConnsPerHost: 4, + IdleConnTimeout: 30 * time.Second, + ForceAttemptHTTP2: true, + } + // TLS configuration deliberately omitted for prototype dev mode + // (cluster.internal_tls.enabled=false). Production will populate + // tr.TLSClientConfig from cfg.InternalTLS. + _ = cfg + + return &http.Client{ + Transport: tr, + Timeout: 60 * time.Second, + } +} + +// Score returns the rendezvous-hash score for (peer, key). Exposed so +// integration tests can craft phantom peers that deterministically +// win or lose against a real peer for a given key (used to induce +// membership disagreement scenarios). 
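+//
+// For example, a test can probe ports until a phantom peer out-scores
+// a real peer for a given key, as TestPeerNotCoordinatorFallback does
+// (sketch):
+//
+//	phantom := Peer{IP: "203.0.113.1"} // unreachable TEST-NET-3 address
+//	for port := 1; port < 65536; port++ {
+//		phantom.Port = port
+//		if Score(phantom, key) > Score(real, key) {
+//			break
+//		}
+//	}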
+func Score(p Peer, key []byte) uint64 { + return rendezvousScore(p, key) +} + +func rendezvousScore(p Peer, key []byte) uint64 { + h := sha256.New() + h.Write([]byte(p.IP)) + h.Write([]byte{0}) + + if p.Port != 0 { + // In production every peer has Port=0 so this branch never + // fires and the score is identical to historical behavior + // (sha256(ip || 0 || key)). Tests with multiple peers sharing + // 127.0.0.1 set distinct Ports so the score differentiates + // replicas. + var pb [4]byte + binary.BigEndian.PutUint32(pb[:], uint32(p.Port)) + h.Write(pb[:]) + h.Write([]byte{0}) + } + + h.Write(key) + sum := h.Sum(nil) + + return binary.BigEndian.Uint64(sum[:8]) +} + +func encodeChunkKey(k chunk.Key) string { + v := url.Values{} + v.Set("origin_id", k.OriginID) + v.Set("bucket", k.Bucket) + v.Set("key", k.ObjectKey) + v.Set("etag", k.ETag) + v.Set("chunk_size", strconv.FormatInt(k.ChunkSize, 10)) + v.Set("index", strconv.FormatInt(k.Index, 10)) + + return v.Encode() +} + +// DecodeChunkKey parses query params into a Key. Used by the internal +// listener (server/internal/fill). +func DecodeChunkKey(values url.Values) (chunk.Key, error) { + chunkSize, err := strconv.ParseInt(values.Get("chunk_size"), 10, 64) + if err != nil { + return chunk.Key{}, fmt.Errorf("invalid chunk_size: %w", err) + } + + idx, err := strconv.ParseInt(values.Get("index"), 10, 64) + if err != nil { + return chunk.Key{}, fmt.Errorf("invalid index: %w", err) + } + + originID := values.Get("origin_id") + bucket := values.Get("bucket") + key := values.Get("key") + etag := values.Get("etag") + + if originID == "" || key == "" { + return chunk.Key{}, fmt.Errorf("missing required key fields") + } + + return chunk.Key{ + OriginID: originID, + Bucket: bucket, + ObjectKey: key, + ETag: etag, + ChunkSize: chunkSize, + Index: idx, + }, nil +} + +// Mu guards external mutation in tests. +var Mu sync.Mutex diff --git a/internal/orca/config/config.go b/internal/orca/config/config.go new file mode 100644 index 00000000..e524611e --- /dev/null +++ b/internal/orca/config/config.go @@ -0,0 +1,364 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package config defines Orca's YAML configuration shape and loading +// helpers. +// +// Only the subset of design.md s5 needed for the prototype (Scope A+B) +// is represented here. The schema is intentionally a subset: extending +// it later is a matter of adding fields and keeping zero-values +// backward-compatible. +package config + +import ( + "fmt" + "os" + "time" + + "gopkg.in/yaml.v3" +) + +// Config is the top-level Orca configuration. +type Config struct { + Server Server `yaml:"server"` + Origin Origin `yaml:"origin"` + Cachestore Cachestore `yaml:"cachestore"` + Cluster Cluster `yaml:"cluster"` + ChunkCatalog ChunkCatalog `yaml:"chunk_catalog"` + Metadata Metadata `yaml:"metadata"` + Chunking Chunking `yaml:"chunking"` +} + +// Server holds the client-edge listener configuration. +type Server struct { + Listen string `yaml:"listen"` + Auth ServerAuth `yaml:"auth"` +} + +// ServerAuth governs the client-edge authentication path. +// +// Production: enabled=true with mode=bearer or mode=mtls. +// Dev: enabled=false disables authentication entirely (no token +// or client cert required). This is a single security knob, not a +// dev_mode flag. 
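+//
+// A dev-mode YAML fragment (illustrative values; keys map to the yaml
+// tags on Server above and ServerAuth below):
+//
+//	server:
+//	  listen: 0.0.0.0:8443
+//	  auth:
+//	    enabled: false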
+type ServerAuth struct {
+	Enabled bool `yaml:"enabled"`
+	Mode string `yaml:"mode"`
+	BearerSecretFile string `yaml:"bearer_secret_file"`
+}
+
+// Origin describes the upstream origin (Azure Blob or AWS S3 in v1).
+type Origin struct {
+	ID string `yaml:"id"`
+	Driver string `yaml:"driver"` // "azureblob" or "awss3"
+	TargetGlobal int `yaml:"target_global"`
+	QueueTimeout time.Duration `yaml:"queue_timeout"`
+	Retry OriginRetry `yaml:"retry"`
+	Azureblob Azureblob `yaml:"azureblob"`
+	AWSS3 AWSS3 `yaml:"awss3"`
+}
+
+// OriginRetry captures the leader-side pre-header retry budget.
+type OriginRetry struct {
+	Attempts int `yaml:"attempts"`
+	BackoffInitial time.Duration `yaml:"backoff_initial"`
+	BackoffMax time.Duration `yaml:"backoff_max"`
+	MaxTotalDuration time.Duration `yaml:"max_total_duration"`
+}
+
+// Azureblob is the azureblob origin adapter configuration.
+type Azureblob struct {
+	Account string `yaml:"account"`
+	AccountKey string `yaml:"account_key"`
+	Container string `yaml:"container"`
+	EnforceBlockBlobOnly bool `yaml:"enforce_block_blob_only"`
+
+	// Endpoint, when set, overrides the default Azure Blob service URL
+	// (https://<account>.blob.core.windows.net/). Used in dev to point
+	// at Azurite (http://azurite:10000/devstoreaccount1) so the
+	// azureblob driver path can be exercised without a real Azure
+	// account.
+	Endpoint string `yaml:"endpoint"`
+}
+
+// AWSS3 is the awss3 origin adapter configuration. In dev this points
+// at LocalStack alongside the cachestore (different bucket); in
+// production it points at real AWS S3 with no Endpoint override.
+type AWSS3 struct {
+	Endpoint string `yaml:"endpoint"` // empty for real AWS S3
+	Region string `yaml:"region"`
+	Bucket string `yaml:"bucket"`
+	AccessKey string `yaml:"access_key"`
+	SecretKey string `yaml:"secret_key"`
+	UsePathStyle bool `yaml:"use_path_style"` // true for LocalStack
+}
+
+// Cachestore is the in-DC chunk store configuration.
+type Cachestore struct {
+	Driver string `yaml:"driver"` // "s3" in v1
+	S3 CachestoreS3 `yaml:"s3"`
+}
+
+// CachestoreS3 is the s3 driver configuration. In dev this points at
+// LocalStack; in production at VAST or another in-DC S3-compatible
+// store.
+type CachestoreS3 struct {
+	Endpoint string `yaml:"endpoint"`
+	Bucket string `yaml:"bucket"`
+	Region string `yaml:"region"`
+	AccessKey string `yaml:"access_key"`
+	SecretKey string `yaml:"secret_key"`
+	UsePathStyle bool `yaml:"use_path_style"` // true for LocalStack
+	RequireUnversionedBucket bool `yaml:"require_unversioned_bucket"`
+}
+
+// Cluster captures peer discovery + internal-listener configuration.
+type Cluster struct {
+	Service string `yaml:"service"` // headless Service FQDN
+	MembershipRefresh time.Duration `yaml:"membership_refresh"` // DNS poll interval
+	InternalListen string `yaml:"internal_listen"`
+	InternalTLS InternalTLS `yaml:"internal_tls"`
+	TargetReplicas int `yaml:"target_replicas"`
+	SelfPodIP string `yaml:"self_pod_ip"` // resolved from POD_IP env
+}
+
+// InternalTLS governs the internal-listener mTLS posture.
+//
+// Production: enabled=true (mTLS required).
+// Dev: enabled=false (plain HTTP/2). The binary logs WARN at startup.
+type InternalTLS struct {
+	Enabled bool `yaml:"enabled"`
+	CertFile string `yaml:"cert_file"`
+	KeyFile string `yaml:"key_file"`
+	CAFile string `yaml:"ca_file"`
+	ServerName string `yaml:"server_name"`
+}
+
+// ChunkCatalog is the in-memory chunk-presence cache configuration.
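+// A zero MaxEntries is replaced by applyDefaults with 100_000.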
+type ChunkCatalog struct { + MaxEntries int `yaml:"max_entries"` +} + +// Metadata is the object-metadata cache configuration. +type Metadata struct { + TTL time.Duration `yaml:"ttl"` + NegativeTTL time.Duration `yaml:"negative_ttl"` + MaxEntries int `yaml:"max_entries"` +} + +// Chunking governs chunk size and prefetch. +type Chunking struct { + Size int64 `yaml:"size"` // bytes per chunk; default 8 MiB +} + +// Load reads the YAML config file at path and returns a populated +// Config. Defaults are applied for fields left at zero-value. +func Load(path string) (*Config, error) { + raw, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read %s: %w", path, err) + } + + cfg := &Config{} + if err := yaml.Unmarshal(raw, cfg); err != nil { + return nil, fmt.Errorf("yaml unmarshal: %w", err) + } + + cfg.applyDefaults() + + if err := cfg.validate(); err != nil { + return nil, fmt.Errorf("config invalid: %w", err) + } + + return cfg, nil +} + +func (c *Config) applyDefaults() { + // Server. + if c.Server.Listen == "" { + c.Server.Listen = "0.0.0.0:8443" + } + // Origin. + if c.Origin.Driver == "" { + c.Origin.Driver = "azureblob" + } + + if c.Origin.TargetGlobal == 0 { + c.Origin.TargetGlobal = 192 + } + + if c.Origin.QueueTimeout == 0 { + c.Origin.QueueTimeout = 5 * time.Second + } + + if c.Origin.Retry.Attempts == 0 { + c.Origin.Retry.Attempts = 3 + } + + if c.Origin.Retry.BackoffInitial == 0 { + c.Origin.Retry.BackoffInitial = 100 * time.Millisecond + } + + if c.Origin.Retry.BackoffMax == 0 { + c.Origin.Retry.BackoffMax = 2 * time.Second + } + + if c.Origin.Retry.MaxTotalDuration == 0 { + c.Origin.Retry.MaxTotalDuration = 5 * time.Second + } + + if !c.Origin.Azureblob.EnforceBlockBlobOnly { + // design.md s9 states this is locked-true. + c.Origin.Azureblob.EnforceBlockBlobOnly = true + } + // Cachestore. + if c.Cachestore.Driver == "" { + c.Cachestore.Driver = "s3" + } + + if c.Cachestore.S3.Region == "" { + c.Cachestore.S3.Region = "us-east-1" + } + + if !c.Cachestore.S3.RequireUnversionedBucket { + c.Cachestore.S3.RequireUnversionedBucket = true + } + // Cluster. + if c.Cluster.MembershipRefresh == 0 { + c.Cluster.MembershipRefresh = 5 * time.Second + } + + if c.Cluster.InternalListen == "" { + c.Cluster.InternalListen = "0.0.0.0:8444" + } + + if c.Cluster.TargetReplicas == 0 { + c.Cluster.TargetReplicas = 3 + } + + if c.Cluster.InternalTLS.ServerName == "" { + c.Cluster.InternalTLS.ServerName = "orca..svc" + } + // Resolve self pod IP from env if not set in YAML. + if c.Cluster.SelfPodIP == "" { + c.Cluster.SelfPodIP = os.Getenv("POD_IP") + } + // Resolve credentials from env if not set in YAML. This lets the + // non-secret config live in a ConfigMap while credentials come from + // a Kubernetes Secret mounted as env vars (envFrom: secretRef). + if c.Origin.Azureblob.AccountKey == "" { + c.Origin.Azureblob.AccountKey = os.Getenv("ORCA_AZUREBLOB_ACCOUNT_KEY") + } + + if c.Origin.AWSS3.AccessKey == "" { + c.Origin.AWSS3.AccessKey = os.Getenv("ORCA_AWSS3_ACCESS_KEY") + } + + if c.Origin.AWSS3.SecretKey == "" { + c.Origin.AWSS3.SecretKey = os.Getenv("ORCA_AWSS3_SECRET_KEY") + } + + if c.Cachestore.S3.AccessKey == "" { + c.Cachestore.S3.AccessKey = os.Getenv("ORCA_CACHESTORE_S3_ACCESS_KEY") + } + + if c.Cachestore.S3.SecretKey == "" { + c.Cachestore.S3.SecretKey = os.Getenv("ORCA_CACHESTORE_S3_SECRET_KEY") + } + // awss3 region default. + if c.Origin.AWSS3.Region == "" { + c.Origin.AWSS3.Region = "us-east-1" + } + // Chunk catalog. 
+ if c.ChunkCatalog.MaxEntries == 0 { + c.ChunkCatalog.MaxEntries = 100_000 + } + // Metadata. + if c.Metadata.TTL == 0 { + c.Metadata.TTL = 5 * time.Minute + } + + if c.Metadata.NegativeTTL == 0 { + c.Metadata.NegativeTTL = 60 * time.Second + } + + if c.Metadata.MaxEntries == 0 { + c.Metadata.MaxEntries = 10_000 + } + // Chunking. + if c.Chunking.Size == 0 { + c.Chunking.Size = 8 * 1024 * 1024 + } +} + +func (c *Config) validate() error { + if c.Origin.ID == "" { + return fmt.Errorf("origin.id is required") + } + + switch c.Origin.Driver { + case "azureblob": + if c.Origin.Azureblob.Account == "" { + return fmt.Errorf("origin.azureblob.account is required") + } + + if c.Origin.Azureblob.Container == "" { + return fmt.Errorf("origin.azureblob.container is required") + } + case "awss3": + if c.Origin.AWSS3.Bucket == "" { + return fmt.Errorf("origin.awss3.bucket is required") + } + default: + return fmt.Errorf("origin.driver %q unsupported; supported: azureblob, awss3", + c.Origin.Driver) + } + + if c.Cachestore.Driver != "s3" { + return fmt.Errorf("cachestore.driver %q unsupported; only s3 in v1", c.Cachestore.Driver) + } + + if c.Cachestore.S3.Endpoint == "" { + return fmt.Errorf("cachestore.s3.endpoint is required") + } + + if c.Cachestore.S3.Bucket == "" { + return fmt.Errorf("cachestore.s3.bucket is required") + } + + if c.Cluster.Service == "" { + return fmt.Errorf("cluster.service is required (headless Service FQDN)") + } + + if c.Cluster.SelfPodIP == "" { + return fmt.Errorf("cluster.self_pod_ip is required (typically resolved from POD_IP env)") + } + + if c.Cluster.TargetReplicas < 1 { + return fmt.Errorf("cluster.target_replicas must be >= 1") + } + + if c.Origin.TargetGlobal < c.Cluster.TargetReplicas { + return fmt.Errorf( + "origin.target_global=%d must be >= cluster.target_replicas=%d", + c.Origin.TargetGlobal, c.Cluster.TargetReplicas, + ) + } + + if c.Chunking.Size < 1024*1024 { + return fmt.Errorf("chunking.size %d too small; minimum 1 MiB", c.Chunking.Size) + } + + return nil +} + +// TargetPerReplica returns the per-replica origin concurrency cap derived +// from origin.target_global and cluster.target_replicas +// (design.md s8.4). +func (c *Config) TargetPerReplica() int { + if c.Cluster.TargetReplicas <= 0 { + return c.Origin.TargetGlobal + } + + return c.Origin.TargetGlobal / c.Cluster.TargetReplicas +} diff --git a/internal/orca/config/config_test.go b/internal/orca/config/config_test.go new file mode 100644 index 00000000..28a734bf --- /dev/null +++ b/internal/orca/config/config_test.go @@ -0,0 +1,339 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +// TestApplyDefaults_EnvFallback verifies that applyDefaults populates +// credential / pod-identity fields from environment variables when +// the YAML omits them. This is the path used in production where the +// Kubernetes Secret is mounted via envFrom and the ConfigMap holds +// only the non-secret config. +// +// Each subtest sets one env var and checks that: +// - env-set, yaml-empty -> field populated from env. +// - env-unset, yaml-set -> field keeps yaml value. +// - env-set, yaml-set -> field keeps yaml value (yaml wins). +// - env-unset, yaml-empty -> field stays empty. 
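+//
+// In a deployment this corresponds to wiring along the lines of the
+// snippet below (illustrative; the Secret name is hypothetical and the
+// real manifests live under deploy/orca):
+//
+//	envFrom:
+//	  - secretRef:
+//	      name: orca-credentials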
+func TestApplyDefaults_EnvFallback(t *testing.T) { + tests := []struct { + envVar string + setVal func(c *Config, v string) + getVal func(c *Config) string + }{ + { + envVar: "POD_IP", + setVal: func(c *Config, v string) { c.Cluster.SelfPodIP = v }, + getVal: func(c *Config) string { return c.Cluster.SelfPodIP }, + }, + { + envVar: "ORCA_AZUREBLOB_ACCOUNT_KEY", + setVal: func(c *Config, v string) { c.Origin.Azureblob.AccountKey = v }, + getVal: func(c *Config) string { return c.Origin.Azureblob.AccountKey }, + }, + { + envVar: "ORCA_AWSS3_ACCESS_KEY", + setVal: func(c *Config, v string) { c.Origin.AWSS3.AccessKey = v }, + getVal: func(c *Config) string { return c.Origin.AWSS3.AccessKey }, + }, + { + envVar: "ORCA_AWSS3_SECRET_KEY", + setVal: func(c *Config, v string) { c.Origin.AWSS3.SecretKey = v }, + getVal: func(c *Config) string { return c.Origin.AWSS3.SecretKey }, + }, + { + envVar: "ORCA_CACHESTORE_S3_ACCESS_KEY", + setVal: func(c *Config, v string) { c.Cachestore.S3.AccessKey = v }, + getVal: func(c *Config) string { return c.Cachestore.S3.AccessKey }, + }, + { + envVar: "ORCA_CACHESTORE_S3_SECRET_KEY", + setVal: func(c *Config, v string) { c.Cachestore.S3.SecretKey = v }, + getVal: func(c *Config) string { return c.Cachestore.S3.SecretKey }, + }, + } + + for _, tt := range tests { + t.Run(tt.envVar, func(t *testing.T) { + t.Run("env_set/yaml_empty", func(t *testing.T) { + t.Setenv(tt.envVar, "from-env") + + c := &Config{} + c.applyDefaults() + + if got := tt.getVal(c); got != "from-env" { + t.Errorf("got %q want %q", got, "from-env") + } + }) + + t.Run("env_unset/yaml_set", func(t *testing.T) { + _ = os.Unsetenv(tt.envVar) //nolint:errcheck // best-effort + + c := &Config{} + tt.setVal(c, "from-yaml") + c.applyDefaults() + + if got := tt.getVal(c); got != "from-yaml" { + t.Errorf("got %q want %q", got, "from-yaml") + } + }) + + t.Run("env_set/yaml_set_yaml_wins", func(t *testing.T) { + t.Setenv(tt.envVar, "from-env") + + c := &Config{} + tt.setVal(c, "from-yaml") + c.applyDefaults() + + if got := tt.getVal(c); got != "from-yaml" { + t.Errorf("got %q want %q (yaml should win)", got, "from-yaml") + } + }) + + t.Run("env_unset/yaml_empty", func(t *testing.T) { + _ = os.Unsetenv(tt.envVar) //nolint:errcheck // best-effort + + c := &Config{} + c.applyDefaults() + + if got := tt.getVal(c); got != "" { + t.Errorf("got %q want empty", got) + } + }) + }) + } +} + +// TestApplyDefaults_FieldDefaults verifies that the hard-coded +// fallback values fire for every field whose zero value is replaced. 
+func TestApplyDefaults_FieldDefaults(t *testing.T) { + t.Parallel() + + c := &Config{} + c.applyDefaults() + + checks := []struct { + name string + got any + want any + }{ + {"server.listen", c.Server.Listen, "0.0.0.0:8443"}, + {"origin.driver", c.Origin.Driver, "azureblob"}, + {"origin.target_global", c.Origin.TargetGlobal, 192}, + {"origin.queue_timeout", c.Origin.QueueTimeout, 5 * time.Second}, + {"origin.retry.attempts", c.Origin.Retry.Attempts, 3}, + {"origin.retry.backoff_initial", c.Origin.Retry.BackoffInitial, 100 * time.Millisecond}, + {"origin.retry.backoff_max", c.Origin.Retry.BackoffMax, 2 * time.Second}, + {"origin.retry.max_total_duration", c.Origin.Retry.MaxTotalDuration, 5 * time.Second}, + {"origin.azureblob.enforce_block_blob_only", c.Origin.Azureblob.EnforceBlockBlobOnly, true}, + {"cachestore.driver", c.Cachestore.Driver, "s3"}, + {"cachestore.s3.region", c.Cachestore.S3.Region, "us-east-1"}, + {"cachestore.s3.require_unversioned_bucket", c.Cachestore.S3.RequireUnversionedBucket, true}, + {"cluster.membership_refresh", c.Cluster.MembershipRefresh, 5 * time.Second}, + {"cluster.internal_listen", c.Cluster.InternalListen, "0.0.0.0:8444"}, + {"cluster.target_replicas", c.Cluster.TargetReplicas, 3}, + {"cluster.internal_tls.server_name", c.Cluster.InternalTLS.ServerName, "orca..svc"}, + {"chunk_catalog.max_entries", c.ChunkCatalog.MaxEntries, 100_000}, + {"metadata.ttl", c.Metadata.TTL, 5 * time.Minute}, + {"metadata.negative_ttl", c.Metadata.NegativeTTL, 60 * time.Second}, + {"metadata.max_entries", c.Metadata.MaxEntries, 10_000}, + {"chunking.size", c.Chunking.Size, int64(8 * 1024 * 1024)}, + {"origin.awss3.region", c.Origin.AWSS3.Region, "us-east-1"}, + } + + for _, ch := range checks { + if ch.got != ch.want { + t.Errorf("%s: got %v want %v", ch.name, ch.got, ch.want) + } + } +} + +// TestApplyDefaults_PreservesExplicitValues verifies that explicit +// non-zero values are not overwritten by applyDefaults. 
+func TestApplyDefaults_PreservesExplicitValues(t *testing.T) { + t.Parallel() + + c := &Config{ + Server: Server{Listen: "1.2.3.4:9000"}, + Origin: Origin{ + Driver: "awss3", + TargetGlobal: 64, + }, + Cachestore: Cachestore{S3: CachestoreS3{Region: "eu-west-1"}}, + Cluster: Cluster{TargetReplicas: 7, MembershipRefresh: 10 * time.Second}, + ChunkCatalog: ChunkCatalog{MaxEntries: 50}, + Metadata: Metadata{TTL: time.Hour, MaxEntries: 99}, + Chunking: Chunking{Size: 16 << 20}, + } + + c.applyDefaults() + + if c.Server.Listen != "1.2.3.4:9000" { + t.Errorf("Server.Listen overwritten: %q", c.Server.Listen) + } + + if c.Origin.Driver != "awss3" { + t.Errorf("Origin.Driver overwritten: %q", c.Origin.Driver) + } + + if c.Origin.TargetGlobal != 64 { + t.Errorf("Origin.TargetGlobal overwritten: %d", c.Origin.TargetGlobal) + } + + if c.Cachestore.S3.Region != "eu-west-1" { + t.Errorf("Cachestore.S3.Region overwritten: %q", c.Cachestore.S3.Region) + } + + if c.Cluster.TargetReplicas != 7 { + t.Errorf("Cluster.TargetReplicas overwritten: %d", c.Cluster.TargetReplicas) + } + + if c.Cluster.MembershipRefresh != 10*time.Second { + t.Errorf("Cluster.MembershipRefresh overwritten: %v", c.Cluster.MembershipRefresh) + } + + if c.ChunkCatalog.MaxEntries != 50 { + t.Errorf("ChunkCatalog.MaxEntries overwritten: %d", c.ChunkCatalog.MaxEntries) + } + + if c.Metadata.TTL != time.Hour { + t.Errorf("Metadata.TTL overwritten: %v", c.Metadata.TTL) + } + + if c.Chunking.Size != 16<<20 { + t.Errorf("Chunking.Size overwritten: %d", c.Chunking.Size) + } +} + +// TestLoad_Validate covers the validate() error paths. +func TestLoad_Validate(t *testing.T) { + // No t.Parallel: subtests use t.Setenv to neutralize POD_IP. + tests := []struct { + name string + yaml string + wantErr string + wantOK bool + }{ + { + name: "valid awss3 config", + yaml: validAwss3YAML, + wantOK: true, + }, + { + name: "missing origin.id", + yaml: strings.ReplaceAll(validAwss3YAML, "id: test-origin", "id: \"\""), + wantErr: "origin.id is required", + }, + { + name: "unsupported driver", + yaml: strings.ReplaceAll(validAwss3YAML, "driver: awss3", "driver: ftp"), + wantErr: "origin.driver", + }, + { + name: "missing awss3 bucket", + yaml: strings.ReplaceAll(validAwss3YAML, "bucket: orca-origin", "bucket: \"\""), + wantErr: "origin.awss3.bucket is required", + }, + { + name: "missing cachestore endpoint", + yaml: strings.ReplaceAll(validAwss3YAML, "endpoint: http://localstack:4566", "endpoint: \"\""), + wantErr: "cachestore.s3.endpoint is required", + }, + { + name: "missing cluster service", + yaml: strings.ReplaceAll(validAwss3YAML, "service: orca-peers.svc", "service: \"\""), + wantErr: "cluster.service is required", + }, + { + name: "missing self_pod_ip when POD_IP unset", + yaml: strings.ReplaceAll(validAwss3YAML, "self_pod_ip: 10.0.0.1", "self_pod_ip: \"\""), + wantErr: "self_pod_ip is required", + }, + { + name: "target_replicas negative", + yaml: strings.ReplaceAll(validAwss3YAML, "target_replicas: 3", "target_replicas: -1"), + wantErr: "target_replicas", + }, + { + name: "chunking size below minimum", + yaml: strings.ReplaceAll(validAwss3YAML, "size: 8388608", "size: 4096"), + wantErr: "chunking.size", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Ensure no leakage of POD_IP from the test process env. 
+ t.Setenv("POD_IP", "") + + path := writeTempYAML(t, tt.yaml) + + _, err := Load(path) + if tt.wantOK { + if err != nil { + t.Fatalf("expected nil error, got %v", err) + } + + return + } + + if err == nil { + t.Fatalf("expected error containing %q, got nil", tt.wantErr) + } + + if !strings.Contains(err.Error(), tt.wantErr) { + t.Errorf("error %q does not contain %q", err.Error(), tt.wantErr) + } + }) + } +} + +func writeTempYAML(t *testing.T, content string) string { + t.Helper() + + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + + if err := os.WriteFile(path, []byte(content), 0o600); err != nil { + t.Fatalf("write temp yaml: %v", err) + } + + return path +} + +const validAwss3YAML = ` +server: + listen: 0.0.0.0:8443 +origin: + id: test-origin + driver: awss3 + awss3: + endpoint: http://localstack:4566 + region: us-east-1 + bucket: orca-origin + access_key: test + secret_key: test + use_path_style: true +cachestore: + driver: s3 + s3: + endpoint: http://localstack:4566 + bucket: orca-cache + region: us-east-1 + access_key: test + secret_key: test + use_path_style: true +cluster: + service: orca-peers.svc + self_pod_ip: 10.0.0.1 + target_replicas: 3 +chunking: + size: 8388608 +` diff --git a/internal/orca/fetch/fetch.go b/internal/orca/fetch/fetch.go new file mode 100644 index 00000000..8bc2cd51 --- /dev/null +++ b/internal/orca/fetch/fetch.go @@ -0,0 +1,333 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package fetch is the per-replica fill orchestrator: per-ChunkKey +// singleflight, pre-header origin retry (Option D), per-replica origin +// concurrency cap, and cross-replica fill via the cluster's internal +// RPC (s8.3). +// +// Scope A+B per the design: per-replica singleflight + cluster-wide +// dedup via rendezvous-hashed coordinator. No disk spool; joiner +// streams from the leader's in-memory ring buffer. +package fetch + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "log/slog" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/chunkcatalog" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/metadata" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Coordinator orchestrates per-replica chunk fills. +type Coordinator struct { + or origin.Origin + cs cachestore.CacheStore + cl *cluster.Cluster + cat *chunkcatalog.Catalog + mc *metadata.Cache + cfg *config.Config + + // Per-replica origin concurrency cap (s8.4 simplified). + originSem chan struct{} + + // Per-ChunkKey singleflight (s8.1). + mu sync.Mutex + inflight map[string]*fill +} + +type fill struct { + done chan struct{} + bodyBuf *bytes.Buffer // buffered chunk after fetch (in-memory, bounded by chunk size) + err error +} + +// NewCoordinator wires up the fetch coordinator. +func NewCoordinator( + or origin.Origin, + cs cachestore.CacheStore, + cl *cluster.Cluster, + cat *chunkcatalog.Catalog, + mc *metadata.Cache, + cfg *config.Config, +) *Coordinator { + tpr := cfg.TargetPerReplica() + if tpr < 1 { + tpr = 1 + } + + return &Coordinator{ + or: or, + cs: cs, + cl: cl, + cat: cat, + mc: mc, + cfg: cfg, + originSem: make(chan struct{}, tpr), + inflight: make(map[string]*fill), + } +} + +// Origin returns the underlying origin (used by the LIST passthrough). 
+func (c *Coordinator) Origin() origin.Origin { return c.or } + +// HeadObject returns object metadata, satisfying client HEAD requests. +func (c *Coordinator) HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + return c.mc.LookupOrFetch(ctx, c.cfg.Origin.ID, bucket, key, + func(ctx context.Context) (origin.ObjectInfo, error) { + return c.or.Head(ctx, bucket, key) + }) +} + +// GetChunk returns a reader over the chunk's bytes, fulfilling either +// from CacheStore (hit) or by orchestrating a cluster-wide +// dedup'd fill (miss). +// +// On miss: +// - If self is the coordinator: run local fill (origin GET via retry, +// atomic commit to CacheStore, populate buffer for joiners). +// - If a peer is the coordinator: send /internal/fill to that peer; +// stream from peer's response. On 409 Conflict, fall back to local +// fill. +func (c *Coordinator) GetChunk(ctx context.Context, k chunk.Key) (io.ReadCloser, error) { + // Hot path: catalog hit -> direct CacheStore read. + _, ok, err := c.cat.Lookup(k) + if err != nil { + return nil, fmt.Errorf("chunkcatalog lookup: %w", err) + } + + if ok { + rc, err := c.cs.GetChunk(ctx, k, 0, k.ChunkSize) + if err == nil { + return rc, nil + } + + if errors.Is(err, cachestore.ErrNotFound) { + c.cat.Forget(k) + // fall through to miss path + } else { + return nil, err + } + } + + // Stat to confirm presence. + if info, err := c.cs.Stat(ctx, k); err == nil { + if recErr := c.cat.Record(k, info); recErr != nil { + return nil, fmt.Errorf("chunkcatalog record: %w", recErr) + } + + return c.cs.GetChunk(ctx, k, 0, info.Size) + } else if !errors.Is(err, cachestore.ErrNotFound) { + return nil, err + } + + // Cluster-wide dedup: route to coordinator. + coord := c.cl.Coordinator(k) + if !coord.Self { + rc, err := c.cl.FillFromPeer(ctx, coord, k) + if err == nil { + return rc, nil + } + + if errors.Is(err, cluster.ErrPeerNotCoordinator) { + slog.Default().Warn("peer reported not-coordinator; falling back to local fill", + "chunk", k.String(), "peer", coord.IP) + // fall through to local fill + } else { + slog.Default().Warn("internal-fill RPC failed; falling back to local fill", + "chunk", k.String(), "peer", coord.IP, "err", err) + } + } + + return c.fillLocal(ctx, k) +} + +// FillForPeer is the path taken by the /internal/fill handler. +// +// The receiver becomes the leader for this fill (or joins an in-flight +// fill for the same key). Returns a streaming body of the entire chunk. +func (c *Coordinator) FillForPeer(ctx context.Context, k chunk.Key) (io.ReadCloser, error) { + // Hot path: catalog hit -> direct read. The catalog can be stale + // (e.g. cachestore pruned out-of-band, or operator clear-cache); + // on ErrNotFound we forget and fall through to a fresh fill. + _, ok, err := c.cat.Lookup(k) + if err != nil { + return nil, fmt.Errorf("chunkcatalog lookup: %w", err) + } + + if ok { + rc, err := c.cs.GetChunk(ctx, k, 0, k.ChunkSize) + if err == nil { + return rc, nil + } + + if errors.Is(err, cachestore.ErrNotFound) { + c.cat.Forget(k) + } else { + return nil, err + } + } + + if info, err := c.cs.Stat(ctx, k); err == nil { + if recErr := c.cat.Record(k, info); recErr != nil { + return nil, fmt.Errorf("chunkcatalog record: %w", recErr) + } + + return c.cs.GetChunk(ctx, k, 0, info.Size) + } else if !errors.Is(err, cachestore.ErrNotFound) { + return nil, err + } + + return c.fillLocal(ctx, k) +} + +// fillLocal runs (or joins) the singleflight for k on this replica. 
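+// The first caller for a key becomes the leader and runs runFill;
+// later callers join by blocking on f.done and then reading the fully
+// buffered chunk, so every caller gets an independent reader over the
+// same bytes.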
+func (c *Coordinator) fillLocal(ctx context.Context, k chunk.Key) (io.ReadCloser, error) { + path := k.Path() + + c.mu.Lock() + + f, ok := c.inflight[path] + if !ok { + f = &fill{done: make(chan struct{})} + c.inflight[path] = f + c.mu.Unlock() + + go c.runFill(k, f) + } else { + c.mu.Unlock() + } + + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-f.done: + } + + if f.err != nil { + return nil, f.err + } + + return io.NopCloser(bytes.NewReader(f.bodyBuf.Bytes())), nil +} + +func (c *Coordinator) runFill(k chunk.Key, f *fill) { + // Use a fill-scoped context to outlive any single requester. + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + defer func() { + close(f.done) + c.mu.Lock() + delete(c.inflight, k.Path()) + c.mu.Unlock() + }() + + // Acquire per-replica origin slot. + queueCtx, queueCancel := context.WithTimeout(ctx, c.cfg.Origin.QueueTimeout) + defer queueCancel() + + select { + case c.originSem <- struct{}{}: + case <-queueCtx.Done(): + f.err = fmt.Errorf("origin: queue timeout (cap=%d)", cap(c.originSem)) + return + } + + defer func() { <-c.originSem }() + + // Pre-header retry loop. + off, length := k.Range() + + body, err := c.fetchWithRetry(ctx, k, off, length) + if err != nil { + f.err = err + return + } + defer body.Close() //nolint:errcheck // origin body close best-effort + + buf := &bytes.Buffer{} + if _, err := io.Copy(buf, body); err != nil { + f.err = fmt.Errorf("fill copy: %w", err) + return + } + + f.bodyBuf = buf + + // Atomic commit to CacheStore. + commitErr := c.cs.PutChunk(ctx, k, int64(buf.Len()), bytes.NewReader(buf.Bytes())) + if commitErr == nil { + if recErr := c.cat.Record(k, cachestore.Info{Size: int64(buf.Len()), Committed: time.Now()}); recErr != nil { + slog.Default().Warn("chunkcatalog record failed", + "chunk", k.String(), "err", recErr) + } + } else if errors.Is(commitErr, cachestore.ErrCommitLost) { + // Another replica won; treat existing CacheStore entry as truth. + if info, err := c.cs.Stat(ctx, k); err == nil { + if recErr := c.cat.Record(k, info); recErr != nil { + slog.Default().Warn("chunkcatalog record failed", + "chunk", k.String(), "err", recErr) + } + } + } else { + slog.Default().Warn("commit-after-serve failed", + "chunk", k.String(), "err", commitErr) + // Don't record in catalog; next request refills. + } +} + +func (c *Coordinator) fetchWithRetry(ctx context.Context, k chunk.Key, off, length int64) (io.ReadCloser, error) { + deadline := time.Now().Add(c.cfg.Origin.Retry.MaxTotalDuration) + backoff := c.cfg.Origin.Retry.BackoffInitial + + var lastErr error + + for attempt := 1; attempt <= c.cfg.Origin.Retry.Attempts; attempt++ { + if time.Now().After(deadline) { + return nil, fmt.Errorf("origin retry exhausted (duration); last err: %w", lastErr) + } + + body, err := c.or.GetRange(ctx, k.Bucket, k.ObjectKey, k.ETag, off, length) + if err == nil { + return body, nil + } + + lastErr = err + // Non-retryable: ETag changed. + var etagChanged *origin.OriginETagChangedError + if errors.As(err, &etagChanged) { + c.mc.Invalidate(c.cfg.Origin.ID, k.Bucket, k.ObjectKey) + return nil, err + } + // Non-retryable: not found. + if errors.Is(err, origin.ErrNotFound) { + return nil, err + } + // Backoff. 
+ if attempt < c.cfg.Origin.Retry.Attempts { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(backoff): + } + + backoff *= 2 + if backoff > c.cfg.Origin.Retry.BackoffMax { + backoff = c.cfg.Origin.Retry.BackoffMax + } + } + } + + return nil, fmt.Errorf("origin retry exhausted (attempts); last err: %w", lastErr) +} diff --git a/internal/orca/inttest/azure_test.go b/internal/orca/inttest/azure_test.go new file mode 100644 index 00000000..5c9ab1dd --- /dev/null +++ b/internal/orca/inttest/azure_test.go @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "net/http" + "testing" + "time" +) + +// TestAzureBlobOrigin_ColdGet verifies the azureblob origin driver +// works against Azurite end-to-end on a 3-replica cluster. The +// MediumBlob spans 2 chunks so rendezvous-hashed routing typically +// exercises both fillLocal and FillFromPeer in a single run. +func TestAzureBlobOrigin_ColdGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + ctr := pkgAzurite.NewContainer(ctx, t, "orca-origin") + blob := MediumBlob() + SeedAzure(ctx, t, pkgAzurite, ctr, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + Azurite: pkgAzurite, + OriginDriver: "azureblob", + AzureContainer: ctr, + }) + + resp := cl.Get(1).HTTP.Get(ctx, t, ctr, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes want %d", len(resp.Body), len(blob.Data)) + } +} diff --git a/internal/orca/inttest/azurite.go b/internal/orca/inttest/azurite.go new file mode 100644 index 00000000..e80134ab --- /dev/null +++ b/internal/orca/inttest/azurite.go @@ -0,0 +1,167 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "testing" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/pageblob" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" +) + +// Azurite is a running Azurite container with helper accessors for +// constructing azblob clients pointed at the well-known dev account. +type Azurite struct { + container testcontainers.Container + endpoint string // http://host:port/devstoreaccount1 +} + +// Endpoint returns the Azurite blob-service URL including the +// devstoreaccount1 path segment. +func (az *Azurite) Endpoint() string { return az.endpoint } + +// AccountName returns the well-known Azurite dev account name. +func (az *Azurite) AccountName() string { return azuriteAccountName } + +// AccountKey returns the well-known Azurite dev account key. +func (az *Azurite) AccountKey() string { return azuriteAccountKey } + +// StartAzurite launches an Azurite container and returns once the +// blob-service port is reachable. Caller terminates via Terminate or +// t.Cleanup. 
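+//
+// Typical package-level wiring, mirroring the TestMain described in
+// doc.go (sketch; error handling elided):
+//
+//	az, _ := StartAzurite(ctx)
+//	defer az.Terminate(ctx)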
+func StartAzurite(ctx context.Context) (*Azurite, error) { + req := testcontainers.ContainerRequest{ + Image: azuriteImage, + ExposedPorts: []string{azuritePort + "/tcp"}, + // `azurite-blob` listens on 0.0.0.0 by default; --skipApiVersionCheck + // keeps the SDK happy for newer client versions. + Cmd: []string{"azurite-blob", "--blobHost", "0.0.0.0", "--skipApiVersionCheck"}, + WaitingFor: wait.ForListeningPort(azuritePort + "/tcp"), + } + + c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: req, + Started: true, + }) + if err != nil { + return nil, fmt.Errorf("start azurite: %w", err) + } + + host, err := c.Host(ctx) + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("azurite host: %w", err) + } + + port, err := c.MappedPort(ctx, azuritePort+"/tcp") + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("azurite port: %w", err) + } + + endpoint := fmt.Sprintf("http://%s:%s/%s", host, port.Port(), azuriteAccountName) + + return &Azurite{ + container: c, + endpoint: endpoint, + }, nil +} + +// Terminate stops and removes the Azurite container. +func (az *Azurite) Terminate(ctx context.Context) error { + return az.container.Terminate(ctx) +} + +// NewServiceClient returns an azblob.Client authenticated with the +// well-known Azurite dev creds. +func (az *Azurite) NewServiceClient(t *testing.T) *azblob.Client { + t.Helper() + + cred, err := azblob.NewSharedKeyCredential(az.AccountName(), az.AccountKey()) + if err != nil { + t.Fatalf("azurite shared key cred: %v", err) + } + + cli, err := azblob.NewClientWithSharedKeyCredential(az.endpoint, cred, nil) + if err != nil { + t.Fatalf("azurite client: %v", err) + } + + return cli +} + +// NewContainer creates a fresh container and registers a cleanup. The +// container name is returned. +func (az *Azurite) NewContainer(ctx context.Context, t *testing.T, prefix string) string { + t.Helper() + + cli := az.NewServiceClient(t) + name := uniqueName(prefix) + + if _, err := cli.CreateContainer(ctx, name, nil); err != nil { + t.Fatalf("create container %s: %v", name, err) + } + + t.Cleanup(func() { + _, _ = cli.DeleteContainer(context.Background(), name, nil) //nolint:errcheck // best-effort cleanup + }) + + return name +} + +// UploadBlockBlob uploads bytes as a block blob to (container, name). +func (az *Azurite) UploadBlockBlob(ctx context.Context, t *testing.T, ctr, name string, data []byte) { + t.Helper() + + cli := az.NewServiceClient(t) + if _, err := cli.UploadBuffer(ctx, ctr, name, data, nil); err != nil { + t.Fatalf("upload block blob %s/%s: %v", ctr, name, err) + } +} + +// UploadPageBlob uploads bytes as a page blob (used to exercise the +// EnforceBlockBlobOnly negative path). Size must be a multiple of 512. 
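+//
+// Example (sketch; the blob name and size are illustrative):
+//
+//	az.UploadPageBlob(ctx, t, ctr, "page.bin", 4096)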
+func (az *Azurite) UploadPageBlob(ctx context.Context, t *testing.T, ctr, name string, size int64) { + t.Helper() + + cred, err := azblob.NewSharedKeyCredential(az.AccountName(), az.AccountKey()) + if err != nil { + t.Fatalf("azurite shared key cred: %v", err) + } + + containerCli, err := container.NewClientWithSharedKeyCredential( + fmt.Sprintf("%s/%s", az.endpoint, ctr), cred, nil) + if err != nil { + t.Fatalf("container client: %v", err) + } + + pbCli := containerCli.NewPageBlobClient(name) + if _, err := pbCli.Create(ctx, size, &pageblob.CreateOptions{ + HTTPHeaders: &blob.HTTPHeaders{}, + }); err != nil { + t.Fatalf("create page blob: %v", err) + } + // Page blobs created here are zero-filled; tests don't read content + // because EnforceBlockBlobOnly should reject the GET first. +} + +// uniqueName returns a short random-suffixed name suitable for +// LocalStack buckets and Azurite containers. +func uniqueName(prefix string) string { + var b [4]byte + + _, _ = rand.Read(b[:]) //nolint:errcheck // crypto/rand never fails on linux + + return fmt.Sprintf("%s-%s", prefix, hex.EncodeToString(b[:])) +} diff --git a/internal/orca/inttest/client.go b/internal/orca/inttest/client.go new file mode 100644 index 00000000..78543451 --- /dev/null +++ b/internal/orca/inttest/client.go @@ -0,0 +1,127 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "encoding/xml" + "fmt" + "io" + "net/http" + "testing" +) + +// Client is a thin HTTP wrapper that targets a single replica's edge +// listener and provides typed helpers (GET, GET-Range, HEAD, LIST) for +// test assertions. +type Client struct { + BaseURL string + HTTP *http.Client +} + +// NewClient returns a Client targeting baseURL (e.g. http://127.0.0.1:34567). +func NewClient(baseURL string) *Client { + return &Client{ + BaseURL: baseURL, + HTTP: &http.Client{}, + } +} + +// GetResponse is the result of a GET / HEAD request. +type GetResponse struct { + Status int + Header http.Header + Body []byte +} + +// Get fetches the full body of /bucket/key. +func (c *Client) Get(ctx context.Context, t *testing.T, bucket, key string) GetResponse { + t.Helper() + + return c.do(ctx, t, http.MethodGet, fmt.Sprintf("/%s/%s", bucket, key), nil) +} + +// GetRange fetches a byte range from /bucket/key. +func (c *Client) GetRange(ctx context.Context, t *testing.T, bucket, key string, start, end int64) GetResponse { + t.Helper() + + hdr := http.Header{} + hdr.Set("Range", fmt.Sprintf("bytes=%d-%d", start, end)) + + return c.do(ctx, t, http.MethodGet, fmt.Sprintf("/%s/%s", bucket, key), hdr) +} + +// Head issues a HEAD against /bucket/key. +func (c *Client) Head(ctx context.Context, t *testing.T, bucket, key string) GetResponse { + t.Helper() + + return c.do(ctx, t, http.MethodHead, fmt.Sprintf("/%s/%s", bucket, key), nil) +} + +// ListBucketResult mirrors the (subset) S3 ListObjectsV2 XML response +// shape produced by the orca edge handler. +type ListBucketResult struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + Contents []struct { + Key string `xml:"Key"` + Size int64 `xml:"Size"` + ETag string `xml:"ETag"` + } `xml:"Contents"` +} + +// List issues a LIST against /bucket/?list-type=2&prefix=. 
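+//
+// Usage sketch (the prefix is illustrative):
+//
+//	out := c.List(ctx, t, bucket, "logs/")
+//	for _, obj := range out.Contents {
+//		t.Logf("%s (%d bytes)", obj.Key, obj.Size)
+//	}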
+func (c *Client) List(ctx context.Context, t *testing.T, bucket, prefix string) ListBucketResult { + t.Helper() + + resp := c.do(ctx, t, http.MethodGet, + fmt.Sprintf("/%s/?list-type=2&prefix=%s", bucket, prefix), nil) + if resp.Status != http.StatusOK { + t.Fatalf("LIST status=%d body=%s", resp.Status, string(resp.Body)) + } + + var out ListBucketResult + if err := xml.Unmarshal(resp.Body, &out); err != nil { + t.Fatalf("LIST decode: %v body=%s", err, string(resp.Body)) + } + + return out +} + +func (c *Client) do(ctx context.Context, t *testing.T, method, path string, hdr http.Header) GetResponse { + t.Helper() + + req, err := http.NewRequestWithContext(ctx, method, c.BaseURL+path, nil) + if err != nil { + t.Fatalf("build request: %v", err) + } + + for k, vs := range hdr { + for _, v := range vs { + req.Header.Add(k, v) + } + } + + resp, err := c.HTTP.Do(req) + if err != nil { + t.Fatalf("%s %s: %v", method, path, err) + } + + defer func() { _ = resp.Body.Close() }() //nolint:errcheck // body close best-effort in tests + + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + + return GetResponse{ + Status: resp.StatusCode, + Header: resp.Header, + Body: body, + } +} diff --git a/internal/orca/inttest/doc.go b/internal/orca/inttest/doc.go new file mode 100644 index 00000000..ac83f611 --- /dev/null +++ b/internal/orca/inttest/doc.go @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +// Package inttest contains integration tests for the Orca cache. +// +// Build tag `integrationtest` gates these tests; run via: +// +// make orca-inttest +// +// Equivalent to: +// +// go test -tags=integrationtest -race -timeout 15m \ +// ./internal/orca/inttest/... +// +// # Architecture +// +// The harness brings up real LocalStack and Azurite containers via +// testcontainers-go and constructs N in-process *app.App instances +// wired to those containers. By default StartCluster runs 3 replicas, +// matching the production deploy/orca topology. +// +// Every replica binds to 127.0.0.1 with an OS-assigned distinct +// internal port; the cluster.Peer struct now carries an explicit Port +// (zero in production, set in tests) and FillFromPeer dials peer.IP + +// peer.Port. This lets multi-replica tests run on every platform +// (Linux, macOS, Windows / WSL) without loopback-alias setup. +// +// Each replica owns its own StaticPeerSource (cluster.PeerSource). +// Tests that need to induce membership disagreement mutate one +// replica's source; the cluster's refresh goroutine picks up the +// change within MembershipRefresh (250 ms in tests). +// +// # Container lifecycle +// +// TestMain starts one LocalStack and one Azurite container per +// `go test` invocation; per-test buckets/containers prevent +// cross-test interference. +// +// # File layout +// +// - e2e_test.go - the canonical end-to-end suite (3 replicas). +// Boot-self-test, cold/warm GET, ranged GET, multi-chunk GET, +// LIST, HEAD, NotFound, rendezvous coordinator routing, +// singleflight collapse, peer-not-coordinator fallback (real). +// - azure_test.go - azureblob origin driver smoke against Azurite +// (3 replicas). +// +// Driver-level branch coverage (versioning gate, blob-type +// rejection) lives as fast unit tests in the respective driver +// packages (cachestore/s3, origin/azureblob), not here. +// +// # Adding a scenario +// +// 1. Pick the right entry point: StartCluster (3-replica default). 
+// Tests that need to assert on a boot-time failure mode that +// surfaces before any chunk fetch (versioning gate, blob-type +// rejection, etc.) should live as unit tests in the respective +// driver package. +// 2. Seed the origin: SeedS3 or SeedAzure. +// 3. Issue requests via cl.Get(i).HTTP.Get / GetRange / Head / List. +// 4. Assert byte-exact body, status code, and (where relevant) origin +// RPC counts via the optional CountingOrigin or peer 409 counts via +// CountingInternalHandlerWrap. +// +// # TODO (genuinely future work) +// +// - TestEtagChange (mid-fill mutation): requires a deterministic +// test seam in fetch.Coordinator (e.g. a hook that pauses between +// chunk fetches) so the test can rewrite the origin object +// between chunk 0 and chunk 1 of the same fill. +// - Fault-injection origin / cachestore decorators: useful for +// timeout, throttle, and 5xx retry-budget assertions. +package inttest diff --git a/internal/orca/inttest/e2e_test.go b/internal/orca/inttest/e2e_test.go new file mode 100644 index 00000000..c384fc61 --- /dev/null +++ b/internal/orca/inttest/e2e_test.go @@ -0,0 +1,496 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "net/http" + "strconv" + "sync" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/cluster" +) + +// e2e_test.go is the canonical end-to-end suite for orca: every +// scenario runs against a 3-replica in-process cluster pointed at +// LocalStack. Tests that exercise chunk fetching naturally exercise +// both the local-fill path (when self happens to win rendezvous for +// a chunk) and the cross-replica /internal/fill path (when a peer +// wins). +// +// Driver-level branch coverage (versioning gate, blob-type rejection, +// HTTP error mapping, range parsing, chunk arithmetic, config env +// fallback) lives as fast unit tests in the respective driver / server +// / chunk / config packages. The scenarios here are reserved for +// behavior that can only be verified end-to-end against real +// LocalStack (or Azurite, in azure_test.go) plus a real cluster of +// in-process orca instances. + +// TestColdAndWarmGet exercises GET twice for the same single-chunk +// blob: cold (origin fetch + cache commit) and warm (cachestore hit). +// The warm phase deletes the origin object first to prove the cache +// hit really happened. 
+func TestColdAndWarmGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 60*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + cold := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if cold.Status != http.StatusOK { + t.Fatalf("cold status=%d body=%s", cold.Status, string(cold.Body)) + } + + if !bytes.Equal(cold.Body, blob.Data) { + t.Fatalf("cold body mismatch: got %d bytes, want %d", len(cold.Body), len(blob.Data)) + } + + if cold.Header.Get("ETag") == "" { + t.Errorf("expected ETag header on cold GET") + } + + DeleteS3Object(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, blob.Key) + + warm := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if warm.Status != http.StatusOK { + t.Fatalf("warm status=%d body=%s", warm.Status, string(warm.Body)) + } + + if !bytes.Equal(warm.Body, blob.Data) { + t.Fatalf("warm body mismatch: got %d bytes, want %d", len(warm.Body), len(blob.Data)) + } +} + +// TestRangedGet verifies byte-range requests return 206 + +// Content-Range + the requested slice. Covers within-chunk, +// cross-chunk, and (against a 64-chunk blob) various boundary edge +// cases. The chunk-arithmetic branches are unit-tested separately in +// internal/orca/chunk; this verifies the end-to-end HTTP Range +// round-trip with real chunk bodies. +func TestRangedGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + medium := MediumBlob() // 1.5 MiB == 2 chunks at 1 MiB + huge := HugeBlob() // 64 MiB == 64 chunks at 1 MiB + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{medium, huge}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + resp := cl.Get(1).HTTP.GetRange(ctx, t, bucket, medium.Key, 100, 199) + if resp.Status != http.StatusPartialContent { + t.Fatalf("status=%d (want 206)", resp.Status) + } + + if cr := resp.Header.Get("Content-Range"); cr == "" { + t.Errorf("expected Content-Range header") + } + + want := medium.Data[100:200] + if !bytes.Equal(resp.Body, want) { + t.Fatalf("range body mismatch: got %d bytes, want %d", len(resp.Body), len(want)) + } + + chunkSize := int64(1024 * 1024) + resp2 := cl.Get(1).HTTP.GetRange(ctx, t, bucket, medium.Key, chunkSize-50, chunkSize+49) + + if resp2.Status != http.StatusPartialContent { + t.Fatalf("cross-chunk status=%d (want 206)", resp2.Status) + } + + want2 := medium.Data[chunkSize-50 : chunkSize+50] + if !bytes.Equal(resp2.Body, want2) { + t.Fatalf("cross-chunk range mismatch: got %d bytes, want %d", len(resp2.Body), len(want2)) + } + + t.Run("huge blob boundary cases", func(t *testing.T) { + const chunk = int64(1024 * 1024) + + cases := []struct { + name string + start, end int64 + }{ + {"starts exactly at chunk boundary 32", 32 * chunk, 32*chunk + 100}, + {"ends exactly at chunk boundary 47", 48*chunk - 100, 48*chunk - 1}, + {"covers chunks 10-12 (3 contiguous full chunks)", 10 * chunk, 13*chunk - 1}, + {"straddles 5 consecutive boundaries (chunks 20-25)", 20*chunk + 100, 25*chunk + 200}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + rr := cl.Get(1).HTTP.GetRange(ctx, t, bucket, huge.Key, tc.start, tc.end) + if rr.Status 
!= http.StatusPartialContent { + t.Fatalf("status=%d (want 206)", rr.Status) + } + + expected := huge.Data[tc.start : tc.end+1] + if !bytes.Equal(rr.Body, expected) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(rr.Body), len(expected)) + } + }) + } + }) +} + +// TestMultiChunkGet verifies a full GET of a 64-chunk blob assembles +// correctly across chunk boundaries. With 3 replicas and 64 chunks, +// rendezvous-hashed coordinator selection statistically guarantees +// every replica is the coordinator for many chunks, so this test +// exercises both fillLocal and FillFromPeer paths thoroughly in a +// single run. +func TestMultiChunkGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := HugeBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + resp := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } +} + +// TestRendezvousCoordinatorRouting verifies that a GET against a +// non-coordinator replica routes through /internal/fill to the +// coordinator and still returns the body. The CountingOrigin +// decorator confirms exactly one origin GetRange happened across the +// cluster (the coordinator's). +func TestRendezvousCoordinatorRouting(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + count := newCountingOriginForLocalStack(ctx, t, bucket) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + OriginOverride: count, + }) + + headResp := cl.Get(1).HTTP.Head(ctx, t, bucket, blob.Key) + + etag := stripQuotes(headResp.Header.Get("ETag")) + if etag == "" { + t.Fatalf("HEAD returned empty ETag: %+v", headResp.Header) + } + + k := chunk.Key{ + OriginID: "inttest-origin", + Bucket: bucket, + ObjectKey: blob.Key, + ETag: etag, + ChunkSize: int64(1024 * 1024), + Index: 0, + } + coord := cl.Get(1).App.Cluster.Coordinator(k) + + var nonCoord *Replica + + for _, r := range cl.Replicas { + if r.SelfIP != coord.IP || r.InternalPort != coord.Port { + nonCoord = r + break + } + } + + if nonCoord == nil { + t.Fatalf("could not find a non-coordinator replica; coord=%+v peers=%+v", + coord, cl.Get(1).App.Cluster.Peers()) + } + + count.Reset() + + resp := nonCoord.HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } + // Exactly one HEAD (HeadObject metadata cache) plus one GetRange + // (single chunk fetch). Cluster-wide dedup must not produce more. 
+ if got := count.GetRanges(); got != 1 { + t.Errorf("origin GetRange count=%d (want 1)", got) + } +} + +// TestSingleflightCollapse fires N concurrent GETs (one per replica) +// for the same key and asserts the origin saw exactly one GetRange +// per chunk (cluster-wide singleflight collapse). +func TestSingleflightCollapse(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := HugeBlob() // 64 chunks + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + count := newCountingOriginForLocalStack(ctx, t, bucket) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + OriginOverride: count, + }) + + count.Reset() + + var wg sync.WaitGroup + + wg.Add(cl.Len()) + + results := make([][]byte, cl.Len()) + statuses := make([]int, cl.Len()) + + for i := 1; i <= cl.Len(); i++ { + go func(i int) { + defer wg.Done() + + r := cl.Get(i).HTTP.Get(ctx, t, bucket, blob.Key) + results[i-1] = r.Body + statuses[i-1] = r.Status + }(i) + } + + wg.Wait() + + for i, s := range statuses { + if s != http.StatusOK { + t.Fatalf("replica %d status=%d", i+1, s) + } + + if !bytes.Equal(results[i], blob.Data) { + t.Fatalf("replica %d body mismatch: got %d bytes want %d", i+1, len(results[i]), len(blob.Data)) + } + } + // HugeBlob spans 64 chunks; cluster-wide singleflight should + // dedupe each chunk to exactly one origin GetRange. Allow up to + // 76 (~20% slack) to absorb timing-dependent races where a + // joiner arrives during in-flight commit. + if got := count.GetRanges(); got > 76 { + t.Errorf("origin GetRange count=%d (want <= 76 for 64-chunk blob)", got) + } + + if got := count.GetRanges(); got < 64 { + t.Errorf("origin GetRange count=%d (want >= 64 for 64-chunk cold fill)", got) + } +} + +// TestPeerNotCoordinatorFallback induces real membership disagreement +// and asserts the coordinator's /internal/fill returns 409 and the +// requesting replica's local-fill fallback succeeds. +// +// Setup: +// +// - 3-replica cluster with shared CountingInternalHandlerWrap so we +// can read 409 counts per receiving replica. +// - HEAD the seeded blob to learn ETag; compute Coordinator(k) for +// chunk 0 from replica 1's view (call it C). +// - Craft a phantom peer P (an unreachable IP/Port pair) whose +// rendezvous score for k is higher than C's. Mutate C's peer +// source to include P plus C itself; now C.IsCoordinator(k) +// returns false because P wins. +// - Find another replica R whose view still says C is the +// coordinator. GET via R. +// +// Expected: +// +// - R issues /internal/fill to C. +// - C responds 409 (its IsCoordinator returns false because P wins). +// - R falls through to fillLocal, fetches the origin, serves the +// body. +// - counter.Count(C, 409) >= 1. 
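+//
+// The phantom trick relies only on coordinator selection being
+// "highest rendezvous score wins". A minimal sketch of that selection
+// (assuming cluster.Score is the same scorer Coordinator uses
+// internally; the production selection may differ in detail):
+//
+//	pathBytes := []byte(k.Path())
+//	coord := peers[0]
+//	for _, p := range peers[1:] {
+//		if cluster.Score(p, pathBytes) > cluster.Score(coord, pathBytes) {
+//			coord = p
+//		}
+//	}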
+func TestPeerNotCoordinatorFallback(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + wrap := NewCountingInternalHandlerWrap() + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + InternalHandlerWrap: wrap, + }) + + headResp := cl.Get(1).HTTP.Head(ctx, t, bucket, blob.Key) + + etag := stripQuotes(headResp.Header.Get("ETag")) + if etag == "" { + t.Fatalf("HEAD returned empty ETag: %+v", headResp.Header) + } + + k := chunk.Key{ + OriginID: "inttest-origin", + Bucket: bucket, + ObjectKey: blob.Key, + ETag: etag, + ChunkSize: int64(1024 * 1024), + Index: 0, + } + coord := cl.Get(1).App.Cluster.Coordinator(k) + + coordReplica := cl.FindBySelfIPPort(coord.IP, coord.Port) + if coordReplica == nil { + t.Fatalf("coord %+v not found among replicas", coord) + } + + // Craft a phantom peer whose rendezvous score beats coord's for k. + // The phantom's IP/Port don't need to be reachable; it's never + // dialed, only used to skew rendezvous on coord's view. + pathBytes := []byte(k.Path()) + coordScore := cluster.Score(coord, pathBytes) + phantom := cluster.Peer{IP: "203.0.113.1"} // TEST-NET-3, unreachable + + for port := 1; port < 65536; port++ { + phantom.Port = port + if cluster.Score(phantom, pathBytes) > coordScore { + break + } + } + + if cluster.Score(phantom, pathBytes) <= coordScore { + t.Fatalf("could not find a phantom peer beating coord rendezvous score") + } + + // Build coord's new peer-set: original real peers plus the + // phantom. The StaticPeerSource will stamp Self=true only on the + // peer matching coord's (selfIP, selfPort), so coord still + // recognizes itself; but the phantom wins rendezvous, so + // coord.IsCoordinator(k) flips to false. + newPeers := make([]cluster.Peer, 0, cl.Len()+1) + for _, r := range cl.Replicas { + newPeers = append(newPeers, cluster.Peer{IP: r.SelfIP, Port: r.InternalPort}) + } + + newPeers = append(newPeers, phantom) + coordReplica.PeerSource.SetPeers(newPeers) + + if err := waitForCondition(ctx, 2*time.Second, func() bool { + return !coordReplica.App.Cluster.IsCoordinator(k) + }); err != nil { + t.Fatalf("coord did not relinquish coordinator status: %v", err) + } + // Find a replica R whose view still says coord is the coordinator. 
+ var requester *Replica + + for _, r := range cl.Replicas { + if r == coordReplica { + continue + } + + rc := r.App.Cluster.Coordinator(k) + if rc.IP == coord.IP && rc.Port == coord.Port { + requester = r + break + } + } + + if requester == nil { + t.Fatalf("no non-coord replica still views coord %+v as coordinator", coord) + } + + resp := requester.HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } + + coordKey := coord.IP + ":" + strconv.Itoa(coord.Port) + if got := wrap.Count(coordKey, http.StatusConflict); got < 1 { + t.Fatalf("expected at least one 409 from coord %s; got %d", + coordKey, got) + } +} + +func newCountingOriginForLocalStack(ctx context.Context, t *testing.T, bucket string) *CountingOrigin { + t.Helper() + + or, err := localStackOrigin(ctx, t, bucket) + if err != nil { + t.Fatalf("localStackOrigin: %v", err) + } + + return NewCountingOrigin(or) +} + +func stripQuotes(s string) string { + if len(s) >= 2 && s[0] == '"' && s[len(s)-1] == '"' { + return s[1 : len(s)-1] + } + + return s +} + +func waitForCondition(ctx context.Context, dl time.Duration, cond func() bool) error { + deadline := time.Now().Add(dl) + for time.Now().Before(deadline) { + if cond() { + return nil + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(25 * time.Millisecond): + } + } + + if cond() { + return nil + } + + return context.DeadlineExceeded +} diff --git a/internal/orca/inttest/harness.go b/internal/orca/inttest/harness.go new file mode 100644 index 00000000..ee4fd291 --- /dev/null +++ b/internal/orca/inttest/harness.go @@ -0,0 +1,366 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "io" + "log/slog" + "net" + "strconv" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/app" + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// ClusterOptions controls Harness.StartCluster. +type ClusterOptions struct { + // Replicas is the number of in-process orca instances. Defaults + // to 3 when zero, matching the production deploy/orca topology. + Replicas int + + // ChunkSize is the per-chunk byte count. The orca config validator + // enforces a 1 MiB minimum; tests typically use 1 MiB to keep test + // blob sizes manageable while still spanning multiple chunks. + ChunkSize int64 + + // OriginID is the logical origin identifier (echoed in chunk paths). + OriginID string + + // OriginBucket is the bucket on the origin LocalStack/Azurite. + OriginBucket string + + // OriginDriver is "awss3" (default) or "azureblob". + OriginDriver string + + // LocalStack is the LocalStack handle used for origin (when + // OriginDriver=="awss3") and always for cachestore. + LocalStack *LocalStack + + // Azurite is required when OriginDriver=="azureblob". + Azurite *Azurite + + // AzureContainer is the Azurite container name for the origin. + AzureContainer string + + // CachestoreBucket is the bucket on LocalStack used as the orca + // cachestore. If empty, a fresh bucket is allocated. + CachestoreBucket string + + // OriginOverride, when set, replaces the constructed origin driver. 
+ // Used to wire CountingOrigin around the real client. + OriginOverride origin.Origin + + // CacheStoreOverride, when set, replaces the constructed cachestore + // driver. + CacheStoreOverride cachestore.CacheStore + + // InternalHandlerWrap, when set, is registered with each replica's + // app.WithInternalHandlerWrap. Tests use this to install a 409 + // counter (CountingInternalHandlerWrap.WrapFor). + InternalHandlerWrap *CountingInternalHandlerWrap +} + +// Replica represents one running *app.App in the harness. +type Replica struct { + App *app.App + SelfIP string + InternalPort int + PeerSource *StaticPeerSource + HTTP *Client // pre-built client targeting this replica's edge +} + +// Cluster is a collection of Replicas plus the harness-owned context. +type Cluster struct { + Replicas []*Replica +} + +// Get returns replica i (1-indexed). +func (c *Cluster) Get(i int) *Replica { return c.Replicas[i-1] } + +// Len returns the replica count. +func (c *Cluster) Len() int { return len(c.Replicas) } + +// FindBySelfIPPort returns the replica whose (SelfIP, InternalPort) +// matches the given peer; nil if none. +func (c *Cluster) FindBySelfIPPort(ip string, port int) *Replica { + for _, r := range c.Replicas { + if r.SelfIP == ip && r.InternalPort == port { + return r + } + } + + return nil +} + +// StartCluster brings up `opts.Replicas` orca instances (default 3) +// pointed at the origin/cachestore described in opts. Every replica +// binds to 127.0.0.1 with an OS-assigned distinct internal port; one +// StaticPeerSource per replica is initialized with the full peer set +// (with explicit ports). Tests can mutate any replica's PeerSource +// independently. +// +// Cleanup (Shutdown of each app) is registered with t.Cleanup. +func StartCluster(ctx context.Context, t *testing.T, opts ClusterOptions) *Cluster { + t.Helper() + + if opts.Replicas == 0 { + opts.Replicas = 3 + } + + if opts.Replicas < 1 { + t.Fatalf("StartCluster: Replicas must be >= 1, got %d", opts.Replicas) + } + + if opts.ChunkSize == 0 { + opts.ChunkSize = 1024 * 1024 + } + + if opts.OriginDriver == "" { + opts.OriginDriver = "awss3" + } + + if opts.OriginID == "" { + opts.OriginID = "inttest-origin" + } + + if opts.LocalStack == nil { + t.Fatal("StartCluster: LocalStack handle required") + } + + if opts.OriginDriver == "azureblob" { + if opts.Azurite == nil { + t.Fatal("StartCluster: Azurite handle required for azureblob driver") + } + + if opts.AzureContainer == "" { + t.Fatal("StartCluster: AzureContainer required for azureblob driver") + } + } + + if opts.OriginBucket == "" && opts.OriginDriver == "awss3" { + t.Fatal("StartCluster: OriginBucket required for awss3 driver") + } + + cacheBucket := opts.CachestoreBucket + if cacheBucket == "" { + cacheBucket = opts.LocalStack.NewBucket(ctx, t, "orca-cache") + } + + // Allocate per-replica internal listeners up front (open) so each + // replica's peer source can advertise the full set with explicit + // ports from t=0. We hand the open listeners to app.Start via + // WithInternalListener/WithEdgeListener so there is no + // close-and-rebind window for races with concurrent tests. 
+ internalListeners := make([]net.Listener, opts.Replicas) + internalPorts := make([]int, opts.Replicas) + edgeListeners := make([]net.Listener, opts.Replicas) + + for i := range internalListeners { + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + closeListeners(internalListeners) + closeListeners(edgeListeners) + t.Fatalf("alloc internal port for replica %d: %v", i+1, err) + } + + internalListeners[i] = ln + internalPorts[i] = ln.Addr().(*net.TCPAddr).Port //nolint:errcheck // *net.TCPAddr from net.Listen + + eln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + closeListeners(internalListeners) + closeListeners(edgeListeners) + t.Fatalf("alloc edge port for replica %d: %v", i+1, err) + } + + edgeListeners[i] = eln + } + + allPeers := make([]cluster.Peer, opts.Replicas) + for i := range allPeers { + allPeers[i] = cluster.Peer{ + IP: "127.0.0.1", + Port: internalPorts[i], + } + } + + cl := &Cluster{} + + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + + for i := 0; i < opts.Replicas; i++ { + selfIP := "127.0.0.1" + selfPort := internalPorts[i] + ps := NewStaticPeerSource(selfIP, selfPort, allPeers) + + cfg := buildConfig(opts, cacheBucket) + cfg.Cluster.SelfPodIP = selfIP + cfg.Cluster.InternalListen = net.JoinHostPort(selfIP, strconv.Itoa(selfPort)) + cfg.Server.Listen = edgeListeners[i].Addr().String() + + appOpts := []app.Option{ + app.WithLogger(logger), + app.WithPeerSource(ps), + app.WithEdgeListener(edgeListeners[i]), + app.WithInternalListener(internalListeners[i]), + } + + if opts.OriginOverride != nil { + appOpts = append(appOpts, app.WithOrigin(opts.OriginOverride)) + } + + if opts.CacheStoreOverride != nil { + appOpts = append(appOpts, app.WithCacheStore(opts.CacheStoreOverride)) + } + + if opts.InternalHandlerWrap != nil { + appOpts = append(appOpts, app.WithInternalHandlerWrap(opts.InternalHandlerWrap.WrapFor(selfIP+":"+strconv.Itoa(selfPort)))) + } + + a, err := app.Start(ctx, cfg, appOpts...) + if err != nil { + t.Fatalf("app.Start replica %d: %v", i+1, err) + } + + r := &Replica{ + App: a, + SelfIP: selfIP, + InternalPort: selfPort, + PeerSource: ps, + HTTP: NewClient("http://" + a.EdgeAddr), + } + cl.Replicas = append(cl.Replicas, r) + + t.Cleanup(func() { + ctxShut, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + _ = a.Shutdown(ctxShut) //nolint:errcheck // shutdown logs already emitted + }) + } + // Wait for every replica's Cluster.Peers() to converge to the + // full set. 
+ if err := waitForPeers(ctx, cl, opts.Replicas, 2*time.Second); err != nil { + t.Fatalf("waitForPeers: %v", err) + } + + return cl +} + +func buildConfig(opts ClusterOptions, cacheBucket string) *config.Config { + cfg := &config.Config{ + Server: config.Server{ + Listen: "127.0.0.1:0", + Auth: config.ServerAuth{Enabled: false}, + }, + Origin: config.Origin{ + ID: opts.OriginID, + Driver: opts.OriginDriver, + TargetGlobal: 32, + QueueTimeout: 5 * time.Second, + Retry: config.OriginRetry{ + Attempts: 2, + BackoffInitial: 10 * time.Millisecond, + BackoffMax: 50 * time.Millisecond, + MaxTotalDuration: 2 * time.Second, + }, + }, + Cachestore: config.Cachestore{ + Driver: "s3", + S3: config.CachestoreS3{ + Endpoint: opts.LocalStack.Endpoint(), + Bucket: cacheBucket, + Region: opts.LocalStack.Region(), + AccessKey: opts.LocalStack.AccessKey(), + SecretKey: opts.LocalStack.SecretKey(), + UsePathStyle: true, + RequireUnversionedBucket: true, + }, + }, + Cluster: config.Cluster{ + Service: "orca-peers.test.svc.cluster.local", + MembershipRefresh: 250 * time.Millisecond, + InternalListen: "127.0.0.1:0", // overridden per replica + InternalTLS: config.InternalTLS{Enabled: false}, + TargetReplicas: opts.Replicas, + SelfPodIP: "127.0.0.1", // overridden per replica + }, + ChunkCatalog: config.ChunkCatalog{MaxEntries: 1024}, + Metadata: config.Metadata{ + TTL: 5 * time.Minute, + NegativeTTL: 5 * time.Second, + MaxEntries: 1024, + }, + Chunking: config.Chunking{Size: opts.ChunkSize}, + } + + switch opts.OriginDriver { + case "awss3": + cfg.Origin.AWSS3 = config.AWSS3{ + Endpoint: opts.LocalStack.Endpoint(), + Region: opts.LocalStack.Region(), + Bucket: opts.OriginBucket, + AccessKey: opts.LocalStack.AccessKey(), + SecretKey: opts.LocalStack.SecretKey(), + UsePathStyle: true, + } + case "azureblob": + cfg.Origin.Azureblob = config.Azureblob{ + Account: opts.Azurite.AccountName(), + AccountKey: opts.Azurite.AccountKey(), + Container: opts.AzureContainer, + EnforceBlockBlobOnly: true, + Endpoint: opts.Azurite.Endpoint(), + } + } + + return cfg +} + +// waitForPeers polls each replica's cluster.Peers() until every +// replica has at least the expected count or the deadline elapses. +func waitForPeers(ctx context.Context, cl *Cluster, want int, dl time.Duration) error { + deadline := time.Now().Add(dl) + + for time.Now().Before(deadline) { + ok := true + + for _, r := range cl.Replicas { + if len(r.App.Cluster.Peers()) < want { + ok = false + break + } + } + + if ok { + return nil + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(50 * time.Millisecond): + } + } + + return fmt.Errorf("peer-set did not converge to %d on all %d replicas within %s", + want, len(cl.Replicas), dl) +} + +func closeListeners(lns []net.Listener) { + for _, ln := range lns { + if ln != nil { + _ = ln.Close() //nolint:errcheck // best-effort cleanup + } + } +} diff --git a/internal/orca/inttest/images.go b/internal/orca/inttest/images.go new file mode 100644 index 00000000..9eb3c729 --- /dev/null +++ b/internal/orca/inttest/images.go @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +// Pinned container image tags. Bump centrally when upgrading. +const ( + // localstackImage is the LocalStack image used for both the origin + // (awss3) and cachestore (s3) backends. 3.8 matches the version + // referenced in design.md and the dev harness's awareness of the + // CRC64NVME checksum quirk. 
+ localstackImage = "localstack/localstack:3.8" + + // azuriteImage is the Azurite (Azure Blob emulator) image. We pin + // to a specific minor for reproducibility. + azuriteImage = "mcr.microsoft.com/azure-storage/azurite:3.34.0" + + // azuritePort is the blob-service port published by Azurite. + azuritePort = "10000" + + // azuriteAccountName is the well-known Azurite dev account. + azuriteAccountName = "devstoreaccount1" + + // azuriteAccountKey is the well-known Azurite dev account key. It + // is hard-coded by the emulator; not a secret. + azuriteAccountKey = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" +) diff --git a/internal/orca/inttest/internalwrap.go b/internal/orca/inttest/internalwrap.go new file mode 100644 index 00000000..67197393 --- /dev/null +++ b/internal/orca/inttest/internalwrap.go @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "net/http" + "sync" + "sync/atomic" +) + +// CountingInternalHandlerWrap is an http.Handler decorator factory +// that counts response status codes per receiving replica IP. Used +// by TestPeerNotCoordinatorFallback to assert a peer's +// /internal/fill handler returned 409 (proving the cluster.go 409 +// fallback path actually fired on the requesting replica). +// +// One CountingInternalHandlerWrap is shared across all replicas in +// the harness; each replica's wrapped handler stamps its self IP +// onto the response writer so counts can be attributed back. +type CountingInternalHandlerWrap struct { + mu sync.Mutex + counts map[string]map[int]*atomic.Int64 // selfIP -> status -> count + defined map[string]struct{} +} + +// NewCountingInternalHandlerWrap returns an empty wrapper. +func NewCountingInternalHandlerWrap() *CountingInternalHandlerWrap { + return &CountingInternalHandlerWrap{ + counts: make(map[string]map[int]*atomic.Int64), + defined: make(map[string]struct{}), + } +} + +// WrapFor returns a wrap function suitable for app.WithInternalHandlerWrap +// that attributes status-code counts back to the named selfIP. +func (w *CountingInternalHandlerWrap) WrapFor(selfIP string) func(http.Handler) http.Handler { + w.mu.Lock() + if _, ok := w.counts[selfIP]; !ok { + w.counts[selfIP] = make(map[int]*atomic.Int64) + } + + w.defined[selfIP] = struct{}{} + w.mu.Unlock() + + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + cw := &countingResponseWriter{ResponseWriter: rw, status: http.StatusOK} + next.ServeHTTP(cw, req) + w.record(selfIP, cw.status) + }) + } +} + +// Count returns the number of responses with the given status code +// observed at the named selfIP. +func (w *CountingInternalHandlerWrap) Count(selfIP string, status int) int64 { + w.mu.Lock() + defer w.mu.Unlock() + + byStatus, ok := w.counts[selfIP] + if !ok { + return 0 + } + + c, ok := byStatus[status] + if !ok { + return 0 + } + + return c.Load() +} + +// CountAcross returns the count summed across all known selfIPs. 
+func (w *CountingInternalHandlerWrap) CountAcross(status int) int64 { + w.mu.Lock() + defer w.mu.Unlock() + + var total int64 + + for _, byStatus := range w.counts { + if c, ok := byStatus[status]; ok { + total += c.Load() + } + } + + return total +} + +func (w *CountingInternalHandlerWrap) record(selfIP string, status int) { + w.mu.Lock() + + byStatus, ok := w.counts[selfIP] + if !ok { + byStatus = make(map[int]*atomic.Int64) + w.counts[selfIP] = byStatus + } + + c, ok := byStatus[status] + if !ok { + c = &atomic.Int64{} + byStatus[status] = c + } + + w.mu.Unlock() + c.Add(1) +} + +// countingResponseWriter records the first WriteHeader status; if no +// WriteHeader is ever called, http.StatusOK is recorded (matching the +// net/http default). +type countingResponseWriter struct { + http.ResponseWriter + status int + wroteHeader bool +} + +func (c *countingResponseWriter) WriteHeader(status int) { + if !c.wroteHeader { + c.status = status + c.wroteHeader = true + } + + c.ResponseWriter.WriteHeader(status) +} + +func (c *countingResponseWriter) Write(p []byte) (int, error) { + if !c.wroteHeader { + c.wroteHeader = true + } + + return c.ResponseWriter.Write(p) +} diff --git a/internal/orca/inttest/localstack.go b/internal/orca/inttest/localstack.go new file mode 100644 index 00000000..5abb404d --- /dev/null +++ b/internal/orca/inttest/localstack.go @@ -0,0 +1,180 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" +) + +// LocalStack is a running LocalStack container with helper accessors +// for constructing AWS S3 clients pointed at it. Use NewS3Client to +// get a configured client; use NewBucket to allocate a fresh bucket +// for a single test. +type LocalStack struct { + container testcontainers.Container + endpoint string + region string +} + +// AccessKey returns the LocalStack-default access key. LocalStack does +// not validate credentials but the AWS SDK requires non-empty values. +func (ls *LocalStack) AccessKey() string { return "test" } + +// SecretKey returns the LocalStack-default secret key. +func (ls *LocalStack) SecretKey() string { return "test" } + +// Endpoint returns the http:// URL of the LocalStack edge port. +func (ls *LocalStack) Endpoint() string { return ls.endpoint } + +// Region returns the static region the harness uses with LocalStack. +func (ls *LocalStack) Region() string { return ls.region } + +// StartLocalStack launches a LocalStack container and returns a handle +// once the edge port is healthy. Caller is responsible for terminating +// the container (via container.Terminate or t.Cleanup). +func StartLocalStack(ctx context.Context) (*LocalStack, error) { + req := testcontainers.ContainerRequest{ + Image: localstackImage, + ExposedPorts: []string{"4566/tcp"}, + Env: map[string]string{ + "SERVICES": "s3", + // LocalStack 3.8 returns InvalidRequest on the SDK's + // CRC64NVME default checksum. The orca s3 driver opts out + // at the SDK config level, but seeding clients in tests + // must do the same. 
We set the variables both in the + // container env (for any in-container tooling) and on the + // SDK config in NewS3Client. + "S3_SKIP_SIGNATURE_VALIDATION": "1", + }, + WaitingFor: wait.ForHTTP("/_localstack/health"). + WithPort("4566/tcp"). + WithStatusCodeMatcher(func(status int) bool { return status == 200 }), + } + + c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: req, + Started: true, + }) + if err != nil { + return nil, fmt.Errorf("start localstack: %w", err) + } + + host, err := c.Host(ctx) + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("localstack host: %w", err) + } + + port, err := c.MappedPort(ctx, "4566/tcp") + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("localstack port: %w", err) + } + + return &LocalStack{ + container: c, + endpoint: fmt.Sprintf("http://%s:%s", host, port.Port()), + region: "us-east-1", + }, nil +} + +// Terminate stops and removes the LocalStack container. +func (ls *LocalStack) Terminate(ctx context.Context) error { + return ls.container.Terminate(ctx) +} + +// NewS3Client returns an AWS S3 client with LocalStack-friendly +// settings (path-style addressing, dummy credentials, checksum quirks +// disabled). +func (ls *LocalStack) NewS3Client(ctx context.Context, t *testing.T) *s3.Client { + t.Helper() + + cfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(ls.region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + ls.AccessKey(), ls.SecretKey(), "", + )), + awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + t.Fatalf("aws config: %v", err) + } + + return s3.NewFromConfig(cfg, func(o *s3.Options) { + o.BaseEndpoint = aws.String(ls.endpoint) + o.UsePathStyle = true + }) +} + +// NewBucket creates a fresh bucket and registers a t.Cleanup hook to +// best-effort delete it. Returns the bucket name. +func (ls *LocalStack) NewBucket(ctx context.Context, t *testing.T, prefix string) string { + t.Helper() + + cli := ls.NewS3Client(ctx, t) + name := uniqueName(prefix) + + if _, err := cli.CreateBucket(ctx, &s3.CreateBucketInput{ + Bucket: aws.String(name), + }); err != nil { + t.Fatalf("create bucket %s: %v", name, err) + } + + t.Cleanup(func() { + emptyBucket(context.Background(), cli, name) + + _, _ = cli.DeleteBucket(context.Background(), &s3.DeleteBucketInput{ //nolint:errcheck // best-effort cleanup + Bucket: aws.String(name), + }) + }) + + return name +} + +// EnableVersioning toggles versioning on a bucket. Used by the +// versioning-gate negative test. +func (ls *LocalStack) EnableVersioning(ctx context.Context, t *testing.T, bucket string) { + t.Helper() + + cli := ls.NewS3Client(ctx, t) + if _, err := cli.PutBucketVersioning(ctx, &s3.PutBucketVersioningInput{ + Bucket: aws.String(bucket), + VersioningConfiguration: &s3types.VersioningConfiguration{ + Status: s3types.BucketVersioningStatusEnabled, + }, + }); err != nil { + t.Fatalf("enable versioning on %s: %v", bucket, err) + } +} + +// emptyBucket deletes every object in the bucket. Best-effort; errors +// are ignored. 
+func emptyBucket(ctx context.Context, cli *s3.Client, bucket string) { + out, err := cli.ListObjectsV2(ctx, &s3.ListObjectsV2Input{ + Bucket: aws.String(bucket), + }) + if err != nil { + return + } + + for _, obj := range out.Contents { + _, _ = cli.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort cleanup + Bucket: aws.String(bucket), + Key: obj.Key, + }) + } +} diff --git a/internal/orca/inttest/main_test.go b/internal/orca/inttest/main_test.go new file mode 100644 index 00000000..f793abd6 --- /dev/null +++ b/internal/orca/inttest/main_test.go @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "os" + "testing" + "time" +) + +// Package-level container handles shared across tests in this package. +// TestMain brings them up once and tears them down at the end. +var ( + pkgLocalStack *LocalStack + pkgAzurite *Azurite +) + +// TestMain provisions LocalStack + Azurite once per `go test` run. +// Per-test buckets / containers are allocated inside individual tests +// to avoid cross-test interference. +func TestMain(m *testing.M) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + ls, err := StartLocalStack(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "TestMain: start localstack: %v\n", err) + os.Exit(1) + } + + pkgLocalStack = ls + + az, err := StartAzurite(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "TestMain: start azurite: %v\n", err) + + _ = ls.Terminate(ctx) //nolint:errcheck // best-effort cleanup + + os.Exit(1) + } + + pkgAzurite = az + + code := m.Run() + + termCtx, termCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer termCancel() + + _ = pkgAzurite.Terminate(termCtx) //nolint:errcheck // best-effort + _ = pkgLocalStack.Terminate(termCtx) //nolint:errcheck // best-effort + + os.Exit(code) +} diff --git a/internal/orca/inttest/origins_test.go b/internal/orca/inttest/origins_test.go new file mode 100644 index 00000000..594b7596 --- /dev/null +++ b/internal/orca/inttest/origins_test.go @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "testing" + + "github.com/Azure/unbounded/internal/orca/origin" + "github.com/Azure/unbounded/internal/orca/origin/awss3" +) + +// localStackOrigin builds an awss3.Origin pointed at the package-level +// LocalStack with the given bucket. Used by tests that need to wrap +// the origin in a CountingOrigin decorator. +func localStackOrigin(ctx context.Context, t *testing.T, bucket string) (origin.Origin, error) { + t.Helper() + + return awss3.New(ctx, awss3.Config{ + Endpoint: pkgLocalStack.Endpoint(), + Region: pkgLocalStack.Region(), + Bucket: bucket, + AccessKey: pkgLocalStack.AccessKey(), + SecretKey: pkgLocalStack.SecretKey(), + UsePathStyle: true, + }) +} diff --git a/internal/orca/inttest/originwrap.go b/internal/orca/inttest/originwrap.go new file mode 100644 index 00000000..c215d9e8 --- /dev/null +++ b/internal/orca/inttest/originwrap.go @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "io" + "sync/atomic" + + "github.com/Azure/unbounded/internal/orca/origin" +) + +// CountingOrigin is an origin.Origin decorator that counts Head and +// GetRange calls. 
It is used by tests that need to assert +// singleflight collapse and coordinator routing. +type CountingOrigin struct { + inner origin.Origin + + heads atomic.Int64 + getRanges atomic.Int64 + lists atomic.Int64 +} + +// NewCountingOrigin wraps inner with call counters. +func NewCountingOrigin(inner origin.Origin) *CountingOrigin { + return &CountingOrigin{inner: inner} +} + +// Heads returns the number of Head() calls observed. +func (c *CountingOrigin) Heads() int64 { return c.heads.Load() } + +// GetRanges returns the number of GetRange() calls observed. +func (c *CountingOrigin) GetRanges() int64 { return c.getRanges.Load() } + +// Lists returns the number of List() calls observed. +func (c *CountingOrigin) Lists() int64 { return c.lists.Load() } + +// Reset zeroes all counters. +func (c *CountingOrigin) Reset() { + c.heads.Store(0) + c.getRanges.Store(0) + c.lists.Store(0) +} + +// Head implements origin.Origin. +func (c *CountingOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + c.heads.Add(1) + + return c.inner.Head(ctx, bucket, key) +} + +// GetRange implements origin.Origin. +func (c *CountingOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, length int64) (io.ReadCloser, error) { + c.getRanges.Add(1) + + return c.inner.GetRange(ctx, bucket, key, etag, off, length) +} + +// List implements origin.Origin. +func (c *CountingOrigin) List(ctx context.Context, bucket, prefix, marker string, maxKeys int) (origin.ListResult, error) { + c.lists.Add(1) + + return c.inner.List(ctx, bucket, prefix, marker, maxKeys) +} diff --git a/internal/orca/inttest/peersource.go b/internal/orca/inttest/peersource.go new file mode 100644 index 00000000..c349f601 --- /dev/null +++ b/internal/orca/inttest/peersource.go @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "sync" + + "github.com/Azure/unbounded/internal/orca/cluster" +) + +// StaticPeerSource implements cluster.PeerSource with a mutable peer +// list. Each replica in the harness owns its own StaticPeerSource so +// tests can mutate one replica's view of the cluster independently +// (used by TestPeerNotCoordinatorFallback to induce membership +// disagreement). +// +// The source knows its calling replica's identity (selfIP, selfPort) +// so it can stamp Peer.Self correctly even when multiple peers share +// an IP (the case in tests where every replica is on 127.0.0.1). +type StaticPeerSource struct { + mu sync.Mutex + selfIP string + selfPort int + peers []cluster.Peer +} + +// NewStaticPeerSource returns a peer source that stamps Self=true on +// any peer whose (IP, Port) matches the constructor arguments. +func NewStaticPeerSource(selfIP string, selfPort int, peers []cluster.Peer) *StaticPeerSource { + s := &StaticPeerSource{ + selfIP: selfIP, + selfPort: selfPort, + } + s.SetPeers(peers) + + return s +} + +// SetPeers replaces the current peer list. Each peer's Self bit is +// recomputed against the source's stored (selfIP, selfPort). +func (s *StaticPeerSource) SetPeers(peers []cluster.Peer) { + out := make([]cluster.Peer, len(peers)) + for i, p := range peers { + p.Self = p.IP == s.selfIP && p.Port == s.selfPort + out[i] = p + } + + s.mu.Lock() + defer s.mu.Unlock() + + s.peers = out +} + +// Peers satisfies cluster.PeerSource. 
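+// It returns a copy of the stored slice, so callers cannot mutate the
+// source's view in place; tests change membership only through
+// SetPeers, e.g. (hypothetical values):
+//
+//	ps := cl.Get(2).PeerSource
+//	ps.SetPeers([]cluster.Peer{{IP: "127.0.0.1", Port: 9001}})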
+func (s *StaticPeerSource) Peers(_ context.Context) ([]cluster.Peer, error) { + s.mu.Lock() + defer s.mu.Unlock() + + out := make([]cluster.Peer, len(s.peers)) + copy(out, s.peers) + + return out, nil +} diff --git a/internal/orca/inttest/seed.go b/internal/orca/inttest/seed.go new file mode 100644 index 00000000..c286bcdc --- /dev/null +++ b/internal/orca/inttest/seed.go @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/s3" +) + +// SeedBlob describes a single blob seeded into the origin. +type SeedBlob struct { + Key string + Data []byte +} + +// SmallBlob is one chunk's-worth (1 KiB). +func SmallBlob() SeedBlob { + return SeedBlob{Key: "sample-1k", Data: deterministicBytes(1024, 0xa1)} +} + +// MediumBlob spans two 1 MiB chunks. +func MediumBlob() SeedBlob { + return SeedBlob{Key: "sample-2chunk", Data: deterministicBytes(1024*1024+512*1024, 0xb2)} +} + +// HugeBlob spans 64 chunks at the harness's 1 MiB chunk size. With 3 +// replicas, rendezvous-hashed coordinator selection statistically +// covers every replica many times over (~21 chunks per replica), +// so any test using HugeBlob exercises the full local-fill + +// cross-replica /internal/fill matrix in a single run. +func HugeBlob() SeedBlob { + return SeedBlob{Key: "sample-64chunk", Data: deterministicBytes(64*1024*1024, 0xd4)} +} + +// AllBlobs returns the canonical seed set used across most tests. +func AllBlobs() []SeedBlob { + return []SeedBlob{SmallBlob(), MediumBlob(), HugeBlob()} +} + +// SeedS3 uploads each blob to the named bucket via the provided +// LocalStack-friendly S3 client. +func SeedS3(ctx context.Context, t *testing.T, cli *s3.Client, bucket string, blobs []SeedBlob) { + t.Helper() + + for _, b := range blobs { + if _, err := cli.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(b.Key), + Body: bytes.NewReader(b.Data), + }); err != nil { + t.Fatalf("seed %s/%s: %v", bucket, b.Key, err) + } + } +} + +// DeleteS3Object removes a blob from a LocalStack bucket. Used by +// warm-cache tests to prove that subsequent GETs are served from the +// cachestore and not refetched from the origin. +func DeleteS3Object(ctx context.Context, t *testing.T, cli *s3.Client, bucket, key string) { + t.Helper() + + if _, err := cli.DeleteObject(ctx, &s3.DeleteObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(key), + }); err != nil { + t.Fatalf("delete origin %s/%s: %v", bucket, key, err) + } +} + +// SeedAzure uploads each blob to the named container as block blobs. +func SeedAzure(ctx context.Context, t *testing.T, az *Azurite, ctr string, blobs []SeedBlob) { + t.Helper() + + for _, b := range blobs { + az.UploadBlockBlob(ctx, t, ctr, b.Key, b.Data) + } +} + +// deterministicBytes returns n bytes filled with a repeating pattern +// derived from seed. Useful for byte-exact assertions without random +// flakiness. +func deterministicBytes(n int, seed byte) []byte { + out := make([]byte, n) + for i := range out { + out[i] = seed ^ byte(i*31+17) + } + + return out +} diff --git a/internal/orca/manifests/doc.go b/internal/orca/manifests/doc.go new file mode 100644 index 00000000..a629d147 --- /dev/null +++ b/internal/orca/manifests/doc.go @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +// Package manifests holds tests that validate the orca deployment +// manifest templates render to syntactically correct, structurally +// reasonable Kubernetes YAML. +// +// These tests catch typos, missing required fields, and template +// regressions at compile time without needing a Kind cluster. They +// complement (but do not replace) hack/orca's actual `kubectl apply` +// validation. +package manifests diff --git a/internal/orca/manifests/manifests_test.go b/internal/orca/manifests/manifests_test.go new file mode 100644 index 00000000..bbab6cab --- /dev/null +++ b/internal/orca/manifests/manifests_test.go @@ -0,0 +1,307 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package manifests + +import ( + "bytes" + "errors" + "io" + "os" + "path/filepath" + "runtime" + "sort" + "strings" + "testing" + + "gopkg.in/yaml.v3" + + "github.com/Azure/unbounded/hack/cmd/render-manifests/render" +) + +// TestProductionManifestsRender renders every *.yaml.tmpl under +// deploy/orca/ (excluding the dev/ subdirectory which contains the +// in-Kind LocalStack/Azurite manifests) with realistic inputs and +// asserts the output is structurally valid Kubernetes YAML. +func TestProductionManifestsRender(t *testing.T) { + t.Parallel() + + root := repoRoot(t) + templatesDir := filepath.Join(root, "deploy", "orca") + + renderAndValidate(t, templatesDir, productionData(), + // One file at a time: walking the dev/ subdirectory is the dev + // suite's job, so we render-then-skip it here. + skipDir("dev"), + // Required kinds that MUST appear at least once across the + // rendered manifests. + expectKindsAtLeastOnce("Namespace", "Deployment", "Service", "ConfigMap"), + ) +} + +// TestDevManifestsRender renders the LocalStack + Azurite + init-Job +// manifests used by the Kind dev harness. +func TestDevManifestsRender(t *testing.T) { + t.Parallel() + + root := repoRoot(t) + templatesDir := filepath.Join(root, "deploy", "orca", "dev") + + renderAndValidate(t, templatesDir, devData(), + expectKindsAtLeastOnce("Deployment", "Service", "Job"), + ) +} + +// productionData supplies realistic template variables for the +// production-shape templates. Templates use sprig's `default` for +// missing keys; we set values that exercise the non-default paths +// where it matters. +func productionData() map[string]string { + return map[string]string{ + "Namespace": "orca-test", + "Image": "ghcr.io/example/orca:test", + "ImagePullPolicy": "IfNotPresent", + "TargetReplicas": "3", + "OriginID": "test-origin", + "OriginDriver": "awss3", + "OriginAWSS3Endpoint": "http://localstack:4566", + "OriginAWSS3Region": "us-east-1", + "OriginAWSS3Bucket": "orca-origin", + "OriginAWSS3UsePathStyle": "true", + "CachestoreEndpoint": "http://localstack:4566", + "CachestoreBucket": "orca-cache", + "CachestoreRegion": "us-east-1", + "ClusterService": "orca-peers.orca-test.svc.cluster.local", + "ServerAuthEnabled": "false", + "InternalTLSEnabled": "false", + "AzureAccount": "", + "AzureContainer": "", + "AzureEndpoint": "", + } +} + +func devData() map[string]string { + return map[string]string{ + "Namespace": "orca-test", + "CachestoreBucket": "orca-cache", + "OriginBucket": "orca-origin", + "AzuriteContainer": "orca-test", + } +} + +// renderAndValidate renders every template under templatesDir into a +// t.TempDir, then walks the output and applies each Validator. 
+func renderAndValidate(t *testing.T, templatesDir string, data map[string]string, validators ...Validator) { + t.Helper() + + outputDir := t.TempDir() + + if err := render.Render(templatesDir, outputDir, data); err != nil { + t.Fatalf("render.Render: %v", err) + } + // Collect every rendered .yaml file. Skip directories filtered + // by the validators. + skipDirs := skipDirsOf(validators) + + var renderedFiles []string + + walkErr := filepath.WalkDir(outputDir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + + if d.IsDir() { + rel, _ := filepath.Rel(outputDir, path) + if _, skip := skipDirs[rel]; skip { + return filepath.SkipDir + } + + return nil + } + + if strings.HasSuffix(path, ".yaml") { + renderedFiles = append(renderedFiles, path) + } + + return nil + }) + if walkErr != nil { + t.Fatalf("walk rendered output: %v", walkErr) + } + + if len(renderedFiles) == 0 { + t.Fatalf("no rendered manifests found under %s", outputDir) + } + + sort.Strings(renderedFiles) + + docs := parseRenderedDocs(t, renderedFiles) + + // Always-on basic structural validation. + for _, d := range docs { + validateBasicStructure(t, d) + } + + for _, v := range validators { + v.Validate(t, docs) + } +} + +// renderedDoc is one logical YAML document plus the source file it +// came from (multi-doc files split into multiple renderedDocs). +type renderedDoc struct { + SourcePath string + Index int + Doc map[string]any +} + +func parseRenderedDocs(t *testing.T, files []string) []renderedDoc { + t.Helper() + + var docs []renderedDoc + + for _, f := range files { + raw, err := os.ReadFile(f) + if err != nil { + t.Fatalf("read %s: %v", f, err) + } + + dec := yaml.NewDecoder(bytes.NewReader(raw)) + + for i := 0; ; i++ { + var doc map[string]any + if derr := dec.Decode(&doc); derr != nil { + if errors.Is(derr, io.EOF) { + break + } + + t.Fatalf("yaml decode %s doc %d: %v", f, i, derr) + } + + if doc == nil { + continue + } + + docs = append(docs, renderedDoc{SourcePath: f, Index: i, Doc: doc}) + } + } + + return docs +} + +func validateBasicStructure(t *testing.T, d renderedDoc) { + t.Helper() + + apiVersion, _ := d.Doc["apiVersion"].(string) + kind, _ := d.Doc["kind"].(string) + + if apiVersion == "" { + t.Errorf("%s doc %d: missing apiVersion", d.SourcePath, d.Index) + } + + if kind == "" { + t.Errorf("%s doc %d: missing kind", d.SourcePath, d.Index) + } + + meta, _ := d.Doc["metadata"].(map[string]any) + if meta == nil { + t.Errorf("%s doc %d (kind=%s): missing metadata", d.SourcePath, d.Index, kind) + return + } + + name, _ := meta["name"].(string) + if name == "" { + t.Errorf("%s doc %d (kind=%s): missing metadata.name", d.SourcePath, d.Index, kind) + } +} + +// Validator is a test-time check applied to the full set of +// rendered docs. 
+type Validator interface { + Validate(t *testing.T, docs []renderedDoc) + skipDir() string // empty when not a dir filter +} + +type kindsAtLeastOnce struct{ kinds []string } + +func (v kindsAtLeastOnce) Validate(t *testing.T, docs []renderedDoc) { + t.Helper() + + seen := map[string]bool{} + + for _, d := range docs { + if k, _ := d.Doc["kind"].(string); k != "" { + seen[k] = true + } + } + + for _, want := range v.kinds { + if !seen[want] { + t.Errorf("expected at least one document of kind %q, got kinds %v", want, sortedKeys(seen)) + } + } +} + +func (v kindsAtLeastOnce) skipDir() string { return "" } + +func expectKindsAtLeastOnce(kinds ...string) Validator { + return kindsAtLeastOnce{kinds: kinds} +} + +type dirSkipper struct{ name string } + +func (d dirSkipper) Validate(*testing.T, []renderedDoc) {} + +func (d dirSkipper) skipDir() string { return d.name } + +func skipDir(name string) Validator { + return dirSkipper{name: name} +} + +func skipDirsOf(vs []Validator) map[string]struct{} { + out := map[string]struct{}{} + + for _, v := range vs { + if d := v.skipDir(); d != "" { + out[d] = struct{}{} + } + } + + return out +} + +func sortedKeys(m map[string]bool) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + + sort.Strings(out) + + return out +} + +// repoRoot returns the absolute path to the repo root by walking up +// from this test file's directory until it finds a go.mod. +func repoRoot(t *testing.T) string { + t.Helper() + + _, file, _, ok := runtime.Caller(0) + if !ok { + t.Fatal("runtime.Caller(0) failed") + } + + dir := filepath.Dir(file) + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + + parent := filepath.Dir(dir) + if parent == dir { + t.Fatalf("reached filesystem root without finding go.mod (started at %s)", filepath.Dir(file)) + } + + dir = parent + } +} diff --git a/internal/orca/metadata/metadata.go b/internal/orca/metadata/metadata.go new file mode 100644 index 00000000..be7e3dd5 --- /dev/null +++ b/internal/orca/metadata/metadata.go @@ -0,0 +1,231 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package metadata is the per-replica object-metadata cache. +// +// Responsibilities: +// - bounded TTL'd cache of ObjectInfo keyed on (origin_id, bucket, +// key) +// - separate negative-TTL handling for 404 / unsupported-blob-type +// entries (design.md s12) +// - per-replica HEAD singleflight (s8.7) so concurrent misses +// collapse to one Origin.Head +package metadata + +import ( + "container/list" + "context" + "errors" + "fmt" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Cache is the per-replica metadata cache. +type Cache struct { + cfg config.Metadata + + mu sync.Mutex + ll *list.List + idx map[string]*list.Element + + sf sync.Map // map[string]*sfEntry +} + +type cacheEntry struct { + key string + info origin.ObjectInfo + negative bool + negErr error + expiresAt time.Time +} + +type sfEntry struct { + once sync.Once + done chan struct{} + info origin.ObjectInfo + err error +} + +// NewCache builds a Cache from config. 
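+//
+// A minimal usage sketch (org stands in for whatever origin.Origin
+// driver the caller configured; it is not part of this package):
+//
+//	c := NewCache(cfg.Metadata)
+//	info, err := c.LookupOrFetch(ctx, "origin-1", bucket, key,
+//		func(ctx context.Context) (origin.ObjectInfo, error) {
+//			return org.Head(ctx, bucket, key)
+//		})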
+func NewCache(cfg config.Metadata) *Cache { + if cfg.MaxEntries <= 0 { + cfg.MaxEntries = 10_000 + } + + if cfg.TTL <= 0 { + cfg.TTL = 5 * time.Minute + } + + if cfg.NegativeTTL <= 0 { + cfg.NegativeTTL = 60 * time.Second + } + + return &Cache{ + cfg: cfg, + ll: list.New(), + idx: make(map[string]*list.Element, cfg.MaxEntries), + } +} + +// Lookup returns the cached ObjectInfo if present and unexpired. +// +// Returns: +// - info, true, nil -> positive cache hit +// - {}, true, err -> negative cache hit (err is the cached error) +// - {}, false, nil -> miss; caller should LookupOrFetch +func (c *Cache) Lookup(originID, bucket, key string) (origin.ObjectInfo, bool, error) { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + el, ok := c.idx[k] + if !ok { + return origin.ObjectInfo{}, false, nil + } + + e, ok := el.Value.(*cacheEntry) + if !ok { + return origin.ObjectInfo{}, false, fmt.Errorf("metadata: list element is not *cacheEntry") + } + + if time.Now().After(e.expiresAt) { + c.ll.Remove(el) + delete(c.idx, k) + + return origin.ObjectInfo{}, false, nil + } + + c.ll.MoveToFront(el) + + if e.negative { + return origin.ObjectInfo{}, true, e.negErr + } + + return e.info, true, nil +} + +// LookupOrFetch returns the cached ObjectInfo on hit (positive or +// negative); on miss, runs the per-replica HEAD singleflight against +// fetch and caches the result with the appropriate TTL. +func (c *Cache) LookupOrFetch( + ctx context.Context, + originID, bucket, key string, + fetch func(ctx context.Context) (origin.ObjectInfo, error), +) (origin.ObjectInfo, error) { + if info, ok, err := c.Lookup(originID, bucket, key); ok { + return info, err + } + + k := mkKey(originID, bucket, key) + v, _ := c.sf.LoadOrStore(k, &sfEntry{done: make(chan struct{})}) + + sfe, ok := v.(*sfEntry) + if !ok { + return origin.ObjectInfo{}, fmt.Errorf("metadata: singleflight value is not *sfEntry") + } + + first := false + + sfe.once.Do(func() { + first = true + }) + + if first { + defer func() { + close(sfe.done) + c.sf.Delete(k) + }() + + info, err := fetch(ctx) + sfe.info = info + sfe.err = err + + if recErr := c.recordResult(originID, bucket, key, info, err); recErr != nil { + err = errors.Join(err, recErr) + } + + return info, err + } + // Joiner: wait for the leader. + select { + case <-ctx.Done(): + return origin.ObjectInfo{}, ctx.Err() + case <-sfe.done: + } + + return sfe.info, sfe.err +} + +// Invalidate drops the entry. +func (c *Cache) Invalidate(originID, bucket, key string) { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[k]; ok { + c.ll.Remove(el) + delete(c.idx, k) + } +} + +func (c *Cache) recordResult(originID, bucket, key string, info origin.ObjectInfo, err error) error { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + now := time.Now() + + var e *cacheEntry + + switch { + case err == nil: + e = &cacheEntry{key: k, info: info, expiresAt: now.Add(c.cfg.TTL)} + case errors.Is(err, origin.ErrNotFound): + e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)} + default: + var ube *origin.UnsupportedBlobTypeError + if errors.As(err, &ube) { + e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)} + } else { + // Other transient errors not cached. 
+ return nil + } + } + + if existing, ok := c.idx[k]; ok { + c.ll.Remove(existing) + delete(c.idx, k) + } + + el := c.ll.PushFront(e) + + c.idx[k] = el + for c.ll.Len() > c.cfg.MaxEntries { + oldest := c.ll.Back() + if oldest == nil { + break + } + + c.ll.Remove(oldest) + + oldEntry, ok := oldest.Value.(*cacheEntry) + if !ok { + return fmt.Errorf("metadata: list element is not *cacheEntry") + } + + delete(c.idx, oldEntry.key) + } + + return nil +} + +func mkKey(originID, bucket, key string) string { + return originID + "|" + bucket + "|" + key +} diff --git a/internal/orca/origin/awss3/awss3.go b/internal/orca/origin/awss3/awss3.go new file mode 100644 index 00000000..6d7e842c --- /dev/null +++ b/internal/orca/origin/awss3/awss3.go @@ -0,0 +1,291 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package awss3 is the AWS S3 (and S3-compatible) origin driver. It +// targets either real AWS S3 or a local S3-compatible endpoint such as +// LocalStack. Useful as a credential-free origin for the dev harness: +// LocalStack acts as both origin and cachestore (different buckets). +// +// This driver is read-only from Orca's perspective (Head, GetRange, +// List). The seed step that uploads test objects to the origin bucket +// happens out-of-band via aws-cli or similar. +package awss3 + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "strings" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/aws/smithy-go" + + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Adapter implements origin.Origin against an S3-compatible endpoint. +type Adapter struct { + cfg Config + client *s3.Client +} + +// Config is the awss3-driver configuration. Mirrors config.AWSS3 but +// kept package-local so the driver can be unit-tested without +// importing the whole config package. +type Config struct { + // Endpoint, when set, overrides the regional default and routes + // requests at a custom URL (LocalStack uses + // http://localstack:4566). Leave empty for real AWS S3. + Endpoint string + + // Region is the AWS region. LocalStack ignores this; the SDK + // requires a value. + Region string + + // Bucket is the source bucket holding origin objects. + Bucket string + + // AccessKey / SecretKey are static credentials. For LocalStack + // these are "test"/"test"; for real AWS, supply real creds. + AccessKey string + SecretKey string + + // UsePathStyle: true for LocalStack (host-based addressing + // requires DNS wildcards LocalStack does not provide). + UsePathStyle bool +} + +// New constructs an Adapter. +func New(ctx context.Context, cfg Config) (*Adapter, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("origin/awss3: bucket required") + } + + if cfg.Region == "" { + cfg.Region = "us-east-1" + } + + awsCfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(cfg.Region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + cfg.AccessKey, cfg.SecretKey, "", + )), + // Opt out of CRC64NVME default introduced in aws-sdk-go-v2 + // 1.32. LocalStack 3.8 returns InvalidRequest for unknown + // algorithms; real AWS S3 still works either way. 
+ awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + return nil, fmt.Errorf("origin/awss3: aws config: %w", err) + } + + client := s3.NewFromConfig(awsCfg, func(o *s3.Options) { + if cfg.Endpoint != "" { + o.BaseEndpoint = aws.String(cfg.Endpoint) + } + + o.UsePathStyle = cfg.UsePathStyle + }) + + return &Adapter{cfg: cfg, client: client}, nil +} + +// Head returns ObjectInfo for the named object. The bucket arg lets +// callers override the configured bucket; if empty, the configured +// bucket is used. +func (a *Adapter) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + out, err := a.client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(b), + Key: aws.String(key), + }) + if err != nil { + if isNotFound(err) { + return origin.ObjectInfo{LastStatus: http.StatusNotFound}, origin.ErrNotFound + } + + if isAuth(err) { + return origin.ObjectInfo{}, origin.ErrAuth + } + + return origin.ObjectInfo{}, fmt.Errorf("awss3 head: %w", err) + } + + info := origin.ObjectInfo{LastStatus: http.StatusOK} + if out.ContentLength != nil { + info.Size = *out.ContentLength + } + + if out.ETag != nil { + info.ETag = strings.Trim(*out.ETag, "\"") + } + + if out.ContentType != nil { + info.ContentType = *out.ContentType + } + + if out.LastModified != nil { + info.LastValidated = *out.LastModified + } + + return info, nil +} + +// GetRange fetches [off, off+n) of the object, sending If-Match: . +func (a *Adapter) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + rng := fmt.Sprintf("bytes=%d-%d", off, off+n-1) + + in := &s3.GetObjectInput{ + Bucket: aws.String(b), + Key: aws.String(key), + Range: aws.String(rng), + } + if etag != "" { + // S3 expects the etag wrapped in double quotes. + in.IfMatch = aws.String("\"" + etag + "\"") + } + + out, err := a.client.GetObject(ctx, in) + if err != nil { + if isPreconditionFailed(err) { + return nil, &origin.OriginETagChangedError{ + Bucket: b, Key: key, Want: etag, + } + } + + if isNotFound(err) { + return nil, origin.ErrNotFound + } + + if isAuth(err) { + return nil, origin.ErrAuth + } + + return nil, fmt.Errorf("awss3 get-range: %w", err) + } + + return out.Body, nil +} + +// List enumerates objects under prefix. 
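+// When ListResult.IsTruncated is set, callers feed NextMarker back in
+// as marker on the next call. A sketch (the "chunks/" prefix is
+// illustrative only):
+//
+//	var marker string
+//	for {
+//		res, err := a.List(ctx, "", "chunks/", marker, 1000)
+//		if err != nil {
+//			return err
+//		}
+//		// ... consume res.Entries ...
+//		if !res.IsTruncated {
+//			break
+//		}
+//		marker = res.NextMarker
+//	}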
+func (a *Adapter) List(ctx context.Context, bucket, prefix, marker string, maxResults int) (origin.ListResult, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + in := &s3.ListObjectsV2Input{ + Bucket: aws.String(b), + Prefix: aws.String(prefix), + MaxKeys: aws.Int32(int32(maxResults)), + } + if marker != "" { + in.ContinuationToken = aws.String(marker) + } + + out, err := a.client.ListObjectsV2(ctx, in) + if err != nil { + if isAuth(err) { + return origin.ListResult{}, origin.ErrAuth + } + + return origin.ListResult{}, fmt.Errorf("awss3 list: %w", err) + } + + res := origin.ListResult{} + + for _, item := range out.Contents { + entry := origin.ObjectEntry{} + if item.Key != nil { + entry.Key = *item.Key + } + + if item.Size != nil { + entry.Size = *item.Size + } + + if item.ETag != nil { + entry.ETag = strings.Trim(*item.ETag, "\"") + } + + res.Entries = append(res.Entries, entry) + } + + if out.IsTruncated != nil { + res.IsTruncated = *out.IsTruncated + } + + if out.NextContinuationToken != nil { + res.NextMarker = *out.NextContinuationToken + } + + return res, nil +} + +func isNotFound(err error) bool { + var nsk *s3types.NoSuchKey + if errors.As(err, &nsk) { + return true + } + + var nsb *s3types.NoSuchBucket + if errors.As(err, &nsb) { + return true + } + + var notFound *s3types.NotFound + if errors.As(err, ¬Found) { + return true + } + + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "NoSuchKey", "NotFound", "404": + return true + } + } + + return false +} + +func isAuth(err error) bool { + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "AccessDenied", "Unauthorized", "Forbidden", "InvalidAccessKeyId", "SignatureDoesNotMatch": + return true + } + } + + return false +} + +func isPreconditionFailed(err error) bool { + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "PreconditionFailed", "ConditionalRequestConflict": + return true + } + } + + return strings.Contains(err.Error(), "PreconditionFailed") || + strings.Contains(err.Error(), "412") +} diff --git a/internal/orca/origin/azureblob/azureblob.go b/internal/orca/origin/azureblob/azureblob.go new file mode 100644 index 00000000..ab17d422 --- /dev/null +++ b/internal/orca/origin/azureblob/azureblob.go @@ -0,0 +1,265 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package azureblob is the Azure Blob Storage adapter for the Origin +// interface. Block Blobs only (design.md s9). +package azureblob + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "strings" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" + + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Adapter implements origin.Origin against Azure Blob Storage. +type Adapter struct { + cfg config.Azureblob + client *azblob.Client +} + +// New builds an Adapter from config. 
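+// A minimal construction sketch (illustrative; the account, key,
+// endpoint, and container values below are placeholders, with
+// Azurite's well-known dev account shown as one common choice for the
+// dev harness):
+//
+//	a, err := azureblob.New(config.Azureblob{
+//		Account:    "devstoreaccount1",
+//		AccountKey: "<azurite well-known key>",
+//		Endpoint:   "http://azurite:10000/devstoreaccount1/",
+//		Container:  "origin",
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	info, err := a.Head(ctx, "", "path/to/blob")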
+func New(cfg config.Azureblob) (*Adapter, error) { + if cfg.Account == "" { + return nil, fmt.Errorf("azureblob: account required") + } + + if cfg.AccountKey == "" { + return nil, fmt.Errorf("azureblob: account_key required") + } + + cred, err := azblob.NewSharedKeyCredential(cfg.Account, cfg.AccountKey) + if err != nil { + return nil, fmt.Errorf("azureblob: shared-key credential: %w", err) + } + + endpoint := cfg.Endpoint + if endpoint == "" { + endpoint = fmt.Sprintf("https://%s.blob.core.windows.net/", cfg.Account) + } + + client, err := azblob.NewClientWithSharedKeyCredential(endpoint, cred, nil) + if err != nil { + return nil, fmt.Errorf("azureblob: client: %w", err) + } + + return &Adapter{cfg: cfg, client: client}, nil +} + +// Head returns ObjectInfo for the named blob. +// +// "bucket" maps to the configured container; the bucket arg is honored +// only if non-empty (allowing single-container deployments to use the +// configured container as the default). +func (a *Adapter) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + props, err := a.client.ServiceClient().NewContainerClient(cName). + NewBlobClient(key).GetProperties(ctx, nil) + if err != nil { + if isNotFound(err) { + return origin.ObjectInfo{LastStatus: http.StatusNotFound}, origin.ErrNotFound + } + + if isAuth(err) { + return origin.ObjectInfo{}, origin.ErrAuth + } + + return origin.ObjectInfo{}, fmt.Errorf("azureblob head: %w", err) + } + + if err := validateBlobType(a.cfg.EnforceBlockBlobOnly, cName, key, props.BlobType); err != nil { + return origin.ObjectInfo{}, err + } + + info := origin.ObjectInfo{LastStatus: http.StatusOK} + if props.ContentLength != nil { + info.Size = *props.ContentLength + } + + if props.ETag != nil { + info.ETag = strings.Trim(string(*props.ETag), "\"") + } + + if props.ContentType != nil { + info.ContentType = *props.ContentType + } + + if props.LastModified != nil { + info.LastValidated = *props.LastModified + } + + return info, nil +} + +// GetRange fetches [off, off+n) of the blob, sending If-Match: . +func (a *Adapter) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + bc := a.client.ServiceClient().NewContainerClient(cName).NewBlobClient(key) + opts := &azblob.DownloadStreamOptions{ + Range: blob.HTTPRange{Offset: off, Count: n}, + } + + if etag != "" { + etagVal := azcore.ETag(etag) + opts.AccessConditions = &blob.AccessConditions{ + ModifiedAccessConditions: &blob.ModifiedAccessConditions{ + IfMatch: to.Ptr(etagVal), + }, + } + } + + resp, err := bc.DownloadStream(ctx, opts) + if err != nil { + if isPreconditionFailed(err) { + return nil, &origin.OriginETagChangedError{ + Bucket: cName, Key: key, Want: etag, + } + } + + if isNotFound(err) { + return nil, origin.ErrNotFound + } + + if isAuth(err) { + return nil, origin.ErrAuth + } + + return nil, fmt.Errorf("azureblob get-range: %w", err) + } + + return resp.Body, nil +} + +// List enumerates blobs in the container matching prefix. 
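+// Only a single page is fetched per call: callers continue with the
+// returned NextMarker while IsTruncated is set, mirroring the awss3
+// driver's pagination contract.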
+func (a *Adapter) List(ctx context.Context, bucket, prefix, marker string, maxResults int) (origin.ListResult, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + cc := a.client.ServiceClient().NewContainerClient(cName) + max := int32(maxResults) + pager := cc.NewListBlobsFlatPager(&container.ListBlobsFlatOptions{ + Prefix: &prefix, + MaxResults: &max, + Marker: stringOrNil(marker), + }) + out := origin.ListResult{} + + if pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + if isAuth(err) { + return origin.ListResult{}, origin.ErrAuth + } + + return origin.ListResult{}, fmt.Errorf("azureblob list: %w", err) + } + + for _, item := range page.Segment.BlobItems { + entry := origin.ObjectEntry{} + if item.Name != nil { + entry.Key = *item.Name + } + + if item.Properties != nil { + if item.Properties.ContentLength != nil { + entry.Size = *item.Properties.ContentLength + } + + if item.Properties.ETag != nil { + entry.ETag = strings.Trim(string(*item.Properties.ETag), "\"") + } + + if item.Properties.BlobType != nil { + entry.BlobType = string(*item.Properties.BlobType) + } + } + + out.Entries = append(out.Entries, entry) + } + + if page.NextMarker != nil { + out.NextMarker = *page.NextMarker + out.IsTruncated = *page.NextMarker != "" + } + } + + return out, nil +} + +func stringOrNil(s string) *string { + if s == "" { + return nil + } + + return &s +} + +func isNotFound(err error) bool { + return bloberror.HasCode(err, bloberror.BlobNotFound) || + bloberror.HasCode(err, bloberror.ContainerNotFound) || + errors.Is(err, origin.ErrNotFound) +} + +func isAuth(err error) bool { + var rerr *azcore.ResponseError + if errors.As(err, &rerr) { + if rerr.StatusCode == http.StatusUnauthorized || rerr.StatusCode == http.StatusForbidden { + return true + } + } + + return bloberror.HasCode(err, bloberror.AuthenticationFailed) || + bloberror.HasCode(err, bloberror.AuthorizationFailure) +} + +func isPreconditionFailed(err error) bool { + var rerr *azcore.ResponseError + if errors.As(err, &rerr) && rerr.StatusCode == http.StatusPreconditionFailed { + return true + } + + return bloberror.HasCode(err, bloberror.ConditionNotMet) +} + +// validateBlobType returns an UnsupportedBlobTypeError when +// enforceBlockBlobOnly is set and the blob is a non-Block-Blob type +// (Page or Append). Returns nil for Block Blobs and when the gate is +// disabled. Extracted as a pure function so unit tests can cover all +// branches without an Azurite round-trip. +func validateBlobType(enforceBlockBlobOnly bool, container, key string, blobType *blob.BlobType) error { + if !enforceBlockBlobOnly || blobType == nil { + return nil + } + + if *blobType == blob.BlobTypeBlockBlob { + return nil + } + + return &origin.UnsupportedBlobTypeError{ + Bucket: container, + Key: key, + BlobType: string(*blobType), + } +} diff --git a/internal/orca/origin/azureblob/azureblob_test.go b/internal/orca/origin/azureblob/azureblob_test.go new file mode 100644 index 00000000..debfef96 --- /dev/null +++ b/internal/orca/origin/azureblob/azureblob_test.go @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package azureblob + +import ( + "errors" + "testing" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + + "github.com/Azure/unbounded/internal/orca/origin" +) + +// TestValidateBlobType covers every branch of the EnforceBlockBlobOnly +// gate. 
The integration suite previously only exercised the +// PageBlob-refused case; this unit test fills in disabled, nil, +// BlockBlob, and AppendBlob. +func TestValidateBlobType(t *testing.T) { + pageBlob := blob.BlobTypePageBlob + appendBlob := blob.BlobTypeAppendBlob + blockBlob := blob.BlobTypeBlockBlob + + tests := []struct { + name string + enforce bool + blobType *blob.BlobType + wantUnsupported bool + }{ + {"enforce off accepts any type", false, &pageBlob, false}, + {"nil blob type passes when enforced (no info)", true, nil, false}, + {"block blob accepted", true, &blockBlob, false}, + {"page blob refused", true, &pageBlob, true}, + {"append blob refused", true, &appendBlob, true}, + } + + const ( + container = "ctr" + key = "key" + ) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateBlobType(tt.enforce, container, key, tt.blobType) + + if (err != nil) != tt.wantUnsupported { + t.Fatalf("err=%v, wantUnsupported=%v", err, tt.wantUnsupported) + } + + if !tt.wantUnsupported { + return + } + + var ube *origin.UnsupportedBlobTypeError + if !errors.As(err, &ube) { + t.Fatalf("err type=%T (want *origin.UnsupportedBlobTypeError): %v", err, err) + } + + if ube.Bucket != container { + t.Errorf("Bucket=%q want %q", ube.Bucket, container) + } + + if ube.Key != key { + t.Errorf("Key=%q want %q", ube.Key, key) + } + + if tt.blobType != nil && ube.BlobType != string(*tt.blobType) { + t.Errorf("BlobType=%q want %q", ube.BlobType, string(*tt.blobType)) + } + }) + } +} diff --git a/internal/orca/origin/origin.go b/internal/orca/origin/origin.go new file mode 100644 index 00000000..06e53b32 --- /dev/null +++ b/internal/orca/origin/origin.go @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package origin defines the upstream-blob-store interface and shared +// types. Concrete adapters live under origin//. +// +// See design/orca/design.md s7 for the full interface. +package origin + +import ( + "context" + "errors" + "fmt" + "io" + "time" +) + +// Origin is a read-only view of an upstream blob store. +type Origin interface { + // Head returns object metadata. If the blob does not exist, returns + // ErrNotFound. If the blob is an unsupported type (e.g., azureblob + // non-BlockBlob), returns UnsupportedBlobTypeError. + Head(ctx context.Context, bucket, key string) (ObjectInfo, error) + + // GetRange fetches [off, off+n) bytes of the object. The etag is + // passed as `If-Match: ` so a mid-flight overwrite is detected + // at the wire (returns OriginETagChangedError). + GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) + + // List enumerates objects under prefix. Pagination via marker. + List(ctx context.Context, bucket, prefix, marker string, max int) (ListResult, error) +} + +// ObjectInfo is the result of a successful Head. +type ObjectInfo struct { + Size int64 + ETag string + ContentType string + LastValidated time.Time + LastStatus int +} + +// ListResult is the paginated result of List. +type ListResult struct { + Entries []ObjectEntry + NextMarker string + IsTruncated bool +} + +// ObjectEntry is one item in a ListResult. +type ObjectEntry struct { + Key string + Size int64 + ETag string + BlobType string // "" for s3; "BlockBlob" / "PageBlob" / "AppendBlob" for azureblob +} + +// Sentinel errors. Wrap with %w so callers use errors.Is. 
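+// A wrap/check sketch (illustrative):
+//
+//	// adapter side
+//	return origin.ObjectInfo{}, fmt.Errorf("awss3 head %q: %w", key, origin.ErrNotFound)
+//
+//	// caller side
+//	if errors.Is(err, origin.ErrNotFound) {
+//		http.Error(w, "NoSuchKey", http.StatusNotFound)
+//	}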
+var ( + ErrNotFound = errors.New("origin: not found") + ErrAuth = errors.New("origin: auth") + ErrThrottle = errors.New("origin: throttle") +) + +// OriginETagChangedError is returned by GetRange when the origin +// rejects the If-Match precondition. +type OriginETagChangedError struct { + Bucket string + Key string + Want string + Got string +} + +func (e *OriginETagChangedError) Error() string { + return fmt.Sprintf("origin etag changed for %s/%s: want=%q got=%q", + e.Bucket, e.Key, e.Want, e.Got) +} + +// UnsupportedBlobTypeError is returned by azureblob.Head when the +// target is a Page or Append blob (design.md s9). +type UnsupportedBlobTypeError struct { + Bucket string + Key string + BlobType string +} + +func (e *UnsupportedBlobTypeError) Error() string { + return fmt.Sprintf("origin unsupported blob type %s for %s/%s", + e.BlobType, e.Bucket, e.Key) +} diff --git a/internal/orca/server/server.go b/internal/orca/server/server.go new file mode 100644 index 00000000..2a1f5546 --- /dev/null +++ b/internal/orca/server/server.go @@ -0,0 +1,434 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package server holds the HTTP handlers for the client edge and the +// internal-listener. +// +// Client edge (8443): GET /{bucket}/{key} (with optional Range), HEAD, +// LIST. No auth in dev (server.auth.enabled=false). +// +// Internal listener (8444): GET /internal/fill?. No mTLS in +// dev (cluster.internal_tls.enabled=false). +package server + +import ( + "context" + "encoding/xml" + "errors" + "fmt" + "io" + "log/slog" + "net/http" + "strconv" + "strings" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// EdgeHandler implements the client-edge S3 surface. +type EdgeHandler struct { + fc edgeFetchAPI + cfg *config.Config + log *slog.Logger +} + +// edgeFetchAPI is the surface area EdgeHandler depends on. The real +// *fetch.Coordinator satisfies it; tests substitute small fakes for +// deterministic unit-level coverage. +type edgeFetchAPI interface { + HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetChunk(ctx context.Context, k chunk.Key) (io.ReadCloser, error) + Origin() origin.Origin +} + +// NewEdgeHandler wires the edge handler. +func NewEdgeHandler(fc edgeFetchAPI, cfg *config.Config, log *slog.Logger) *EdgeHandler { + return &EdgeHandler{fc: fc, cfg: cfg, log: log} +} + +// ServeHTTP routes incoming client requests. +// +// Routing (path-style only, since LocalStack and most dev clients +// use path-style): +// +// GET / -> ListBuckets (not supported; 405) +// GET /{bucket}/?list-type=2&prefix=... -> ListObjectsV2 +// GET /{bucket}/ -> ListObjectsV2 (default) +// GET /{bucket}/{key} -> GetObject (with optional Range) +// HEAD /{bucket}/{key} -> HeadObject +// HEAD /{bucket}/ -> HeadBucket (not supported; 405) +func (h *EdgeHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if h.cfg.Server.Auth.Enabled { + // Stub: production would dispatch to bearer/mTLS validation. + // In dev (auth.enabled=false) we skip entirely. 
+ http.Error(w, "auth required (server.auth.enabled=true) but not implemented in MVP", + http.StatusUnauthorized) + + return + } + + bucket, key := splitPath(r.URL.Path) + + switch r.Method { + case http.MethodHead: + if key == "" { + h.notImplemented(w, "HeadBucket") + return + } + + h.handleHead(w, r, bucket, key) + case http.MethodGet: + if key == "" { + h.handleList(w, r, bucket) + return + } + + h.handleGet(w, r, bucket, key) + default: + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + } +} + +func (h *EdgeHandler) handleHead(w http.ResponseWriter, r *http.Request, bucket, key string) { + info, err := h.fc.HeadObject(r.Context(), bucket, key) + if err != nil { + h.writeOriginError(w, err) + return + } + + setObjectHeaders(w, info) + // HEAD must report the Content-Length the GET response would carry. + w.Header().Set("Content-Length", strconv.FormatInt(info.Size, 10)) + w.WriteHeader(http.StatusOK) +} + +func (h *EdgeHandler) handleGet(w http.ResponseWriter, r *http.Request, bucket, key string) { + info, err := h.fc.HeadObject(r.Context(), bucket, key) + if err != nil { + h.writeOriginError(w, err) + return + } + + // Determine byte range. + var ( + rangeStart int64 + rangeEnd = info.Size - 1 + hasRange bool + statusCode = http.StatusOK + ) + if rh := r.Header.Get("Range"); rh != "" { + s, e, ok := parseSimpleByteRange(rh, info.Size) + if !ok { + http.Error(w, "invalid Range", http.StatusRequestedRangeNotSatisfiable) + return + } + + rangeStart, rangeEnd = s, e + hasRange = true + statusCode = http.StatusPartialContent + } + + if rangeStart > rangeEnd { + http.Error(w, "range not satisfiable", http.StatusRequestedRangeNotSatisfiable) + return + } + + chunkSize := h.cfg.Chunking.Size + firstChunk, lastChunk := chunk.IndexRange(rangeStart, rangeEnd, chunkSize, info.Size) + + // Set headers eagerly (Option D commit boundary == first byte from + // origin; for cache hit, immediate). + setObjectHeaders(w, info) + w.Header().Set("Content-Length", strconv.FormatInt(rangeEnd-rangeStart+1, 10)) + + if hasRange { + w.Header().Set("Content-Range", + fmt.Sprintf("bytes %d-%d/%d", rangeStart, rangeEnd, info.Size)) + } + + // Write status now; subsequent failures become mid-stream aborts. + w.WriteHeader(statusCode) + + for ci := firstChunk; ci <= lastChunk; ci++ { + ckey := chunk.Key{ + OriginID: h.cfg.Origin.ID, + Bucket: bucket, + ObjectKey: key, + ETag: info.ETag, + ChunkSize: chunkSize, + Index: ci, + } + + body, err := h.fc.GetChunk(r.Context(), ckey) + if err != nil { + // We've already sent headers; abort the response. + h.log.Warn("mid-stream chunk fetch failed", + "bucket", bucket, "key", key, "chunk", ci, "err", err) + + return + } + + off, length := chunk.ChunkSlice(ci, chunkSize, rangeStart, rangeEnd, info.Size) + if err := streamSlice(w, body, off, length); err != nil { + body.Close() //nolint:errcheck // chunk body close best-effort, response already streaming + h.log.Warn("mid-stream copy failed", + "bucket", bucket, "key", key, "chunk", ci, "err", err) + + return + } + + body.Close() //nolint:errcheck // chunk body close best-effort, response already streaming + + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + } +} + +// streamSlice copies length bytes starting at off from src to dst. 
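+// The offset skip exists because the first and last chunks of a ranged
+// GET may be only partially covered by the request. Worked example
+// (assuming 4 MiB chunks and zero-based chunk indexing, as handleGet's
+// use of chunk.IndexRange suggests): a request for bytes 5 MiB through
+// 6 MiB-1 falls entirely inside chunk 1, so streamSlice discards the
+// first 1 MiB of that chunk and copies the next 1 MiB.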
+func streamSlice(dst io.Writer, src io.Reader, off, length int64) error { + if off > 0 { + if _, err := io.CopyN(io.Discard, src, off); err != nil { + return err + } + } + + if length > 0 { + if _, err := io.CopyN(dst, src, length); err != nil { + return err + } + } + + return nil +} + +// handleList is a thin pass-through to Origin.List for v1 prototype. +func (h *EdgeHandler) handleList(w http.ResponseWriter, r *http.Request, bucket string) { + // Pass-through; very minimal S3 ListObjectsV2 shape. Reviewers can + // curl this for sanity but full S3 list semantics are not in MVP. + prefix := r.URL.Query().Get("prefix") + marker := r.URL.Query().Get("continuation-token") + maxStr := r.URL.Query().Get("max-keys") + maxKeys := 1000 + + if maxStr != "" { + if v, err := strconv.Atoi(maxStr); err == nil && v > 0 { + maxKeys = v + } + } + + type listEntry struct { + Key string `xml:"Key"` + Size int64 `xml:"Size"` + ETag string `xml:"ETag"` + } + + type listResult struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + MaxKeys int `xml:"MaxKeys"` + IsTruncated bool `xml:"IsTruncated"` + NextMarker string `xml:"NextContinuationToken,omitempty"` + Contents []listEntry `xml:"Contents"` + } + + or := h.fc.Origin() + + res, err := or.List(r.Context(), bucket, prefix, marker, maxKeys) + if err != nil { + h.writeOriginError(w, err) + return + } + + body := listResult{ + Name: bucket, + Prefix: prefix, + KeyCount: len(res.Entries), + MaxKeys: maxKeys, + IsTruncated: res.IsTruncated, + NextMarker: res.NextMarker, + } + for _, e := range res.Entries { + body.Contents = append(body.Contents, listEntry{Key: e.Key, Size: e.Size, ETag: e.ETag}) + } + + w.Header().Set("Content-Type", "application/xml") + w.WriteHeader(http.StatusOK) + enc := xml.NewEncoder(w) + _ = enc.Encode(body) //nolint:errcheck // headers already sent; mid-stream encode error not actionable +} + +func (h *EdgeHandler) notImplemented(w http.ResponseWriter, op string) { + http.Error(w, op+" not implemented in MVP", http.StatusNotImplemented) +} + +func (h *EdgeHandler) writeOriginError(w http.ResponseWriter, err error) { + switch { + case errors.Is(err, origin.ErrNotFound): + http.Error(w, "NoSuchKey", http.StatusNotFound) + case errors.Is(err, origin.ErrAuth): + http.Error(w, "Unauthorized origin", http.StatusBadGateway) + default: + var ( + ube *origin.UnsupportedBlobTypeError + ec *origin.OriginETagChangedError + ) + + switch { + case errors.As(err, &ube): + http.Error(w, "OriginUnsupported: "+ube.Error(), http.StatusBadGateway) + case errors.As(err, &ec): + http.Error(w, "OriginETagChanged", http.StatusBadGateway) + default: + h.log.Warn("origin error", "err", err) + http.Error(w, "OriginUnreachable", http.StatusBadGateway) + } + } +} + +func setObjectHeaders(w http.ResponseWriter, info origin.ObjectInfo) { + if info.ContentType != "" { + w.Header().Set("Content-Type", info.ContentType) + } + + if info.ETag != "" { + w.Header().Set("ETag", "\""+info.ETag+"\"") + } + + w.Header().Set("Accept-Ranges", "bytes") +} + +func splitPath(p string) (bucket, key string) { + p = strings.TrimPrefix(p, "/") + if p == "" { + return "", "" + } + + idx := strings.IndexByte(p, '/') + if idx < 0 { + return p, "" + } + + return p[:idx], p[idx+1:] +} + +func parseSimpleByteRange(h string, size int64) (start, end int64, ok bool) { + if !strings.HasPrefix(h, "bytes=") { + return 0, 0, false + } + + spec := strings.TrimPrefix(h, "bytes=") + + parts := strings.Split(spec, 
"-") + if len(parts) != 2 { + return 0, 0, false + } + + if parts[0] == "" { + // Suffix: -N (last N bytes) + n, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil || n <= 0 || n > size { + return 0, 0, false + } + + return size - n, size - 1, true + } + + s, err := strconv.ParseInt(parts[0], 10, 64) + if err != nil || s < 0 { + return 0, 0, false + } + + if parts[1] == "" { + return s, size - 1, true + } + + e, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil || e < s { + return 0, 0, false + } + + if e >= size { + e = size - 1 + } + + return s, e, true +} + +// InternalHandler implements GET /internal/fill on the internal +// listener. Plain HTTP/2 (no mTLS) in dev. +type InternalHandler struct { + fc internalFetchAPI + cl *cluster.Cluster + log *slog.Logger +} + +// internalFetchAPI is the surface area InternalHandler depends on. The +// real *fetch.Coordinator satisfies it; tests substitute small fakes. +type internalFetchAPI interface { + FillForPeer(ctx context.Context, k chunk.Key) (io.ReadCloser, error) +} + +// NewInternalHandler wires the internal handler. +func NewInternalHandler(fc internalFetchAPI, cl *cluster.Cluster, log *slog.Logger) *InternalHandler { + return &InternalHandler{fc: fc, cl: cl, log: log} +} + +// ServeHTTP handles GET /internal/fill?. +func (h *InternalHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/internal/fill" { + http.NotFound(w, r) + return + } + + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + if r.Header.Get("X-Orca-Internal") != "1" { + http.Error(w, "missing X-Orca-Internal header", http.StatusBadRequest) + return + } + + k, err := cluster.DecodeChunkKey(r.URL.Query()) + if err != nil { + http.Error(w, "invalid chunk key: "+err.Error(), http.StatusBadRequest) + return + } + + if !h.cl.IsCoordinator(k) { + http.Error(w, `{"reason":"not_coordinator"}`, http.StatusConflict) + return + } + + body, err := h.fc.FillForPeer(r.Context(), k) + if err != nil { + h.log.Warn("internal fill failed", "chunk", k.String(), "err", err) + http.Error(w, "fill failed", http.StatusBadGateway) + + return + } + defer body.Close() //nolint:errcheck // internal-fill body close best-effort + + w.Header().Set("Content-Type", "application/octet-stream") + w.WriteHeader(http.StatusOK) + + if _, copyErr := io.Copy(w, body); copyErr != nil { + h.log.Warn("internal fill copy failed", "chunk", k.String(), "err", copyErr) + } +} + +// Compile-time check that the cachestore.ErrNotFound mapping survives +// dead-code elimination across handlers (used only via errors.Is in +// production code paths). +var ( + _ = cachestore.ErrNotFound + _ = context.Canceled +) diff --git a/internal/orca/server/server_test.go b/internal/orca/server/server_test.go new file mode 100644 index 00000000..64999464 --- /dev/null +++ b/internal/orca/server/server_test.go @@ -0,0 +1,482 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package server + +import ( + "context" + "encoding/xml" + "errors" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// fakeEdgeAPI satisfies edgeFetchAPI with canned responses for unit +// tests. Only the field for the call you want to mock needs to be +// set; an unset *Func panics if the test invokes the corresponding +// method. 
+type fakeEdgeAPI struct { + HeadObjectFunc func(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetChunkFunc func(ctx context.Context, k chunk.Key) (io.ReadCloser, error) + OriginVal origin.Origin +} + +func (f *fakeEdgeAPI) HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + return f.HeadObjectFunc(ctx, bucket, key) +} + +func (f *fakeEdgeAPI) GetChunk(ctx context.Context, k chunk.Key) (io.ReadCloser, error) { + return f.GetChunkFunc(ctx, k) +} + +func (f *fakeEdgeAPI) Origin() origin.Origin { return f.OriginVal } + +// fakeOrigin satisfies origin.Origin for handler tests. Only the +// fields used in the test need to be populated. +type fakeOrigin struct { + HeadFunc func(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetRangeFunc func(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) + ListFunc func(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) +} + +func (f *fakeOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + return f.HeadFunc(ctx, bucket, key) +} + +func (f *fakeOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + return f.GetRangeFunc(ctx, bucket, key, etag, off, n) +} + +func (f *fakeOrigin) List(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) { + return f.ListFunc(ctx, bucket, prefix, marker, max) +} + +// TestWriteOriginError covers all five branches of the error mapping. +// Previously only ErrNotFound was exercised (via integration test). +func TestWriteOriginError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + wantStatus int + wantBody string + }{ + { + name: "not found", + err: origin.ErrNotFound, + wantStatus: http.StatusNotFound, + wantBody: "NoSuchKey", + }, + { + name: "auth", + err: origin.ErrAuth, + wantStatus: http.StatusBadGateway, + wantBody: "Unauthorized origin", + }, + { + name: "unsupported blob type", + err: &origin.UnsupportedBlobTypeError{ + Bucket: "ctr", + Key: "page-blob", + BlobType: "PageBlob", + }, + wantStatus: http.StatusBadGateway, + wantBody: "OriginUnsupported", + }, + { + name: "etag changed", + err: &origin.OriginETagChangedError{ + Bucket: "b", Key: "k", Want: "old", Got: "new", + }, + wantStatus: http.StatusBadGateway, + wantBody: "OriginETagChanged", + }, + { + name: "generic error", + err: errors.New("unexpected"), + wantStatus: http.StatusBadGateway, + wantBody: "OriginUnreachable", + }, + } + + h := &EdgeHandler{log: discardLogger()} + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rr := httptest.NewRecorder() + h.writeOriginError(rr, tt.err) + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d", rr.Code, tt.wantStatus) + } + + if !strings.Contains(rr.Body.String(), tt.wantBody) { + t.Errorf("body %q does not contain %q", rr.Body.String(), tt.wantBody) + } + }) + } +} + +// TestHandleHead covers metadata propagation and the not-found error +// path on HEAD requests. 
+func TestHandleHead(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + info origin.ObjectInfo + err error + wantStatus int + wantHdrs map[string]string + }{ + { + name: "normal blob", + info: origin.ObjectInfo{ + Size: 1024, + ETag: "abc123", + ContentType: "application/octet-stream", + }, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "1024", + "ETag": `"abc123"`, + "Content-Type": "application/octet-stream", + }, + }, + { + name: "missing content type omits header", + info: origin.ObjectInfo{Size: 99, ETag: "x"}, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "99", + "ETag": `"x"`, + }, + }, + { + name: "missing etag omits header", + info: origin.ObjectInfo{Size: 7}, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "7", + }, + }, + { + name: "origin not found yields 404", + err: origin.ErrNotFound, + wantStatus: http.StatusNotFound, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return tt.info, tt.err + }, + } + h := NewEdgeHandler(fc, &config.Config{}, discardLogger()) + + req := httptest.NewRequest(http.MethodHead, "/bucket/key", nil) + rr := httptest.NewRecorder() + h.handleHead(rr, req, "bucket", "key") + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d", rr.Code, tt.wantStatus) + } + + for k, want := range tt.wantHdrs { + got := rr.Header().Get(k) + if got != want { + t.Errorf("header %s=%q want %q", k, got, want) + } + } + + if rr.Body.Len() != 0 && tt.wantStatus == http.StatusOK { + t.Errorf("HEAD body should be empty; got %d bytes", rr.Body.Len()) + } + }) + } +} + +// TestHandleList covers the XML pass-through, prefix propagation, +// truncation, and empty-list handling. 
+func TestHandleList(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + prefix string + listResult origin.ListResult + listErr error + wantStatus int + wantKeys []string + wantTrunc bool + wantNextTok string + }{ + { + name: "normal list", + prefix: "alpha/", + listResult: origin.ListResult{ + Entries: []origin.ObjectEntry{ + {Key: "alpha/one", Size: 3, ETag: "e1"}, + {Key: "alpha/two", Size: 5, ETag: "e2"}, + }, + }, + wantStatus: http.StatusOK, + wantKeys: []string{"alpha/one", "alpha/two"}, + }, + { + name: "empty list", + prefix: "missing/", + listResult: origin.ListResult{}, + wantStatus: http.StatusOK, + wantKeys: nil, + }, + { + name: "truncated list", + listResult: origin.ListResult{ + Entries: []origin.ObjectEntry{{Key: "k1"}}, + IsTruncated: true, + NextMarker: "next-page", + }, + wantStatus: http.StatusOK, + wantKeys: []string{"k1"}, + wantTrunc: true, + wantNextTok: "next-page", + }, + { + name: "origin error yields 502", + listErr: errors.New("upstream broken"), + wantStatus: http.StatusBadGateway, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + or := &fakeOrigin{ + ListFunc: func(_ context.Context, bucket, prefix, _ string, _ int) (origin.ListResult, error) { + if bucket != "b" { + t.Errorf("bucket=%q want %q", bucket, "b") + } + + if prefix != tt.prefix { + t.Errorf("prefix=%q want %q", prefix, tt.prefix) + } + + return tt.listResult, tt.listErr + }, + } + fc := &fakeEdgeAPI{OriginVal: or} + h := NewEdgeHandler(fc, &config.Config{}, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, + "/b/?list-type=2&prefix="+tt.prefix, nil) + rr := httptest.NewRecorder() + h.handleList(rr, req, "b") + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d body=%s", rr.Code, tt.wantStatus, rr.Body.String()) + } + + if tt.wantStatus != http.StatusOK { + return + } + + var got struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + IsTruncated bool `xml:"IsTruncated"` + NextMarker string `xml:"NextContinuationToken"` + Contents []struct { + Key string `xml:"Key"` + } `xml:"Contents"` + } + if err := xml.Unmarshal(rr.Body.Bytes(), &got); err != nil { + t.Fatalf("xml decode: %v body=%s", err, rr.Body.String()) + } + + if got.Name != "b" { + t.Errorf("Name=%q want %q", got.Name, "b") + } + + if got.Prefix != tt.prefix { + t.Errorf("Prefix=%q want %q", got.Prefix, tt.prefix) + } + + if got.KeyCount != len(tt.wantKeys) { + t.Errorf("KeyCount=%d want %d", got.KeyCount, len(tt.wantKeys)) + } + + if got.IsTruncated != tt.wantTrunc { + t.Errorf("IsTruncated=%v want %v", got.IsTruncated, tt.wantTrunc) + } + + if got.NextMarker != tt.wantNextTok { + t.Errorf("NextMarker=%q want %q", got.NextMarker, tt.wantNextTok) + } + + gotKeys := make([]string, 0, len(got.Contents)) + for _, c := range got.Contents { + gotKeys = append(gotKeys, c.Key) + } + + if !equalStrings(gotKeys, tt.wantKeys) { + t.Errorf("keys=%v want %v", gotKeys, tt.wantKeys) + } + }) + } +} + +// TestParseSimpleByteRange covers all parser branches. 
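+// The parser accepts only the single-range forms of the HTTP Range
+// grammar ("bytes=start-end", "bytes=start-", "bytes=-suffix");
+// multi-range specs are rejected.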
+func TestParseSimpleByteRange(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + header string + size int64 + wantStart int64 + wantEnd int64 + wantOK bool + }{ + {"normal range", "bytes=0-99", 1024, 0, 99, true}, + {"suffix range", "bytes=-100", 1024, 924, 1023, true}, + {"open-ended", "bytes=100-", 1024, 100, 1023, true}, + {"end clamped to size", "bytes=0-9999", 1024, 0, 1023, true}, + {"start > end rejected", "bytes=100-50", 1024, 0, 0, false}, + {"missing prefix rejected", "0-99", 1024, 0, 0, false}, + {"multi-range rejected", "bytes=0-99,200-299", 1024, 0, 0, false}, + {"empty rejected", "", 1024, 0, 0, false}, + {"bytes= alone rejected", "bytes=", 1024, 0, 0, false}, + {"suffix larger than size rejected", "bytes=-9999", 1024, 0, 0, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s, e, ok := parseSimpleByteRange(tt.header, tt.size) + if ok != tt.wantOK { + t.Fatalf("ok=%v want %v (s=%d e=%d)", ok, tt.wantOK, s, e) + } + + if !ok { + return + } + + if s != tt.wantStart || e != tt.wantEnd { + t.Errorf("(s,e)=(%d,%d) want (%d,%d)", s, e, tt.wantStart, tt.wantEnd) + } + }) + } +} + +// TestSplitPath covers path splitting edge cases. +func TestSplitPath(t *testing.T) { + t.Parallel() + + tests := []struct { + in string + wantBucket string + wantKey string + }{ + {"", "", ""}, + {"/", "", ""}, + {"/bucket", "bucket", ""}, + {"/bucket/", "bucket", ""}, + {"/bucket/key", "bucket", "key"}, + {"/bucket/path/to/key", "bucket", "path/to/key"}, + } + + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + b, k := splitPath(tt.in) + if b != tt.wantBucket || k != tt.wantKey { + t.Errorf("splitPath(%q)=(%q,%q) want (%q,%q)", + tt.in, b, k, tt.wantBucket, tt.wantKey) + } + }) + } +} + +// TestSetObjectHeaders covers header propagation including the +// always-set Accept-Ranges and the conditionally-set fields. +func TestSetObjectHeaders(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + info origin.ObjectInfo + want map[string]string + }{ + { + name: "all fields set", + info: origin.ObjectInfo{ETag: "abc", ContentType: "text/plain"}, + want: map[string]string{ + "ETag": `"abc"`, + "Content-Type": "text/plain", + "Accept-Ranges": "bytes", + }, + }, + { + name: "missing content type", + info: origin.ObjectInfo{ETag: "abc"}, + want: map[string]string{ + "ETag": `"abc"`, + "Content-Type": "", + "Accept-Ranges": "bytes", + }, + }, + { + name: "missing etag", + info: origin.ObjectInfo{ContentType: "text/plain"}, + want: map[string]string{ + "ETag": "", + "Content-Type": "text/plain", + "Accept-Ranges": "bytes", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rr := httptest.NewRecorder() + setObjectHeaders(rr, tt.info) + + for k, want := range tt.want { + if got := rr.Header().Get(k); got != want { + t.Errorf("header %s=%q want %q", k, got, want) + } + } + }) + } +} + +// helpers + +func discardLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +func equalStrings(a, b []string) bool { + if len(a) != len(b) { + return false + } + + for i := range a { + if a[i] != b[i] { + return false + } + } + + return true +}
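+
+// TestHandleGetRangeSketch is an illustrative sketch of how a ranged
+// GET flows through handleGet with the in-package fakes. It assumes
+// config.Config.Chunking is an addressable value struct (so Size is
+// settable on a zero value) and that chunk.IndexRange tolerates a
+// chunk size larger than the object; the assertions therefore stick to
+// the headers handleGet writes itself and do not depend on
+// chunk-slicing internals.
+func TestHandleGetRangeSketch(t *testing.T) {
+	t.Parallel()
+
+	const payload = "0123456789abcdef" // 16-byte object
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return origin.ObjectInfo{Size: int64(len(payload)), ETag: "e"}, nil
+		},
+		GetChunkFunc: func(_ context.Context, _ chunk.Key) (io.ReadCloser, error) {
+			return io.NopCloser(strings.NewReader(payload)), nil
+		},
+	}
+
+	cfg := &config.Config{}
+	cfg.Chunking.Size = 1 << 20 // hypothetical 1 MiB chunk size
+
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	req := httptest.NewRequest(http.MethodGet, "/b/k", nil)
+	req.Header.Set("Range", "bytes=4-7")
+	rr := httptest.NewRecorder()
+	h.handleGet(rr, req, "b", "k")
+
+	if rr.Code != http.StatusPartialContent {
+		t.Fatalf("status=%d want %d", rr.Code, http.StatusPartialContent)
+	}
+
+	if got := rr.Header().Get("Content-Range"); got != "bytes 4-7/16" {
+		t.Errorf("Content-Range=%q want %q", got, "bytes 4-7/16")
+	}
+
+	if got := rr.Header().Get("Content-Length"); got != "4" {
+		t.Errorf("Content-Length=%q want %q", got, "4")
+	}
+}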