diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 76acf952..fa261da4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -128,6 +128,34 @@ jobs: retention-days: 7 if-no-files-found: ignore + # ---------- Orca Integration Tests ---------- + # Spins up LocalStack and Azurite via testcontainers-go and runs the + # orca in-process integration suite (internal/orca/inttest). Docker + # is preinstalled on GitHub-hosted Ubuntu runners; no extra services: + # block is required. + orca-inttest: + name: Orca Integration Tests + needs: [frontend] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Download frontend dist + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 + with: + name: frontend-dist + path: internal/net/html/dist + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + cache-dependency-path: go.sum + + - name: Run orca-inttest + run: make orca-inttest + # ---------- Build ---------- build: name: Build diff --git a/Makefile b/Makefile index 5be64f18..1c0134c8 100644 --- a/Makefile +++ b/Makefile @@ -74,6 +74,14 @@ STAMP_LDFLAGS=-X github.com/Azure/unbounded/internal/version.Version=$(VERSION) METALMAN_IMAGE=$(CONTAINER_REGISTRY)/metalman:$(VERSION) +# Orca configuration +ORCA_BIN=bin/orca +ORCA_CMD=./cmd/orca +ORCA_IMAGE ?= $(CONTAINER_REGISTRY)/orca:$(VERSION) +ORCA_NAMESPACE ?= unbounded-kube +ORCA_MANIFEST_TEMPLATES_DIR := deploy/orca +ORCA_MANIFEST_RENDERED_DIR := deploy/orca/rendered + # kubectl-unbounded also stamps the metalman image reference. KUBECTL_UNBOUNDED_LDFLAGS=$(STAMP_LDFLAGS) -X github.com/Azure/unbounded/cmd/kubectl-unbounded/app.MetalmanImage=$(METALMAN_IMAGE) @@ -112,6 +120,7 @@ REACT_DEV ?= false .PHONY: all help fmt lint test build vulncheck check-deps kubectl-unbounded kubectl-unbounded-build install-tools install-protoc generate kubectl-unbounded forge unbounded-agent machina machina-build machina-oci machina-oci-push machina-manifests machine-ops-controller machine-ops-controller-build machine-ops-controller-oci machine-ops-controller-oci-push machine-ops-manifests metalman metalman-build metalman-oci metalman-oci-push gomod docs-serve unbounded-net-controller unbounded-net-node unbounded-net-routeplan-debug unping unroute notice notice-check .PHONY: net-frontend net-frontend-clean net-build-ebpf net-manifests release-manifests .PHONY: image-machina-local image-machine-ops-controller-local image-metalman-local image-net-controller-local image-net-node-local images-local +.PHONY: orca orca-build orca-manifests orca-oci orca-oci-push orca-up orca-down orca-reset orca-inttest image-orca-local ##@ General @@ -176,6 +185,8 @@ help: ## Show this help @echo " machina-oci-push Build machina image and push" @echo " machine-ops-controller-oci-push Build machine-ops-controller image and push" @echo " metalman-oci-push Build metalman image and push" + @echo " image-orca-local Build orca image" + @echo " orca-oci-push Build orca image and push" @echo "" @echo "Net Frontend:" @echo " net-frontend Build frontend into \$$(NET_FRONTEND_DIST_DIR) (cached)" @@ -188,10 +199,19 @@ help: ## Show this help @echo " machina-manifests Render machina manifests into deploy/machina/rendered" @echo " machine-ops-manifests Render machine-ops manifests into deploy/machine-ops/rendered" @echo " net-manifests Render net manifests into 
\$$(NET_MANIFEST_RENDERED_DIR)" + @echo " orca-manifests Render orca manifests into deploy/orca/rendered" @echo "" @echo "Net Kubernetes (apply to current kubectl context):" @echo " See \`make -C hack/net help\` for cluster deploy/undeploy targets." @echo "" + @echo "Orca Dev Harness (Kind cluster):" + @echo " orca | orca-build Build orca binary (with/without lint/test)" + @echo " orca-up Bring up Orca dev harness in Kind" + @echo " orca-down Tear down Orca dev harness Kind cluster" + @echo " orca-reset Rebuild image and rollout-restart deployment" + @echo " orca-inttest Run orca integration tests (Docker required)" + @echo " See \`make -C hack/orca help\` for full list." + @echo "" @echo "Documentation:" @echo " docs-serve Start local Hugo dev server" @echo "" @@ -570,6 +590,58 @@ metalman-oci: image-metalman-local ## Alias for image-metalman-local metalman-oci-push: metalman-oci ## Build and push the metalman container image $(CONTAINER_ENGINE) push $(METALMAN_IMAGE) +##@ Orca + +orca-build: ## Build the orca binary (no lint/test) + $(GOBUILD) -ldflags '$(STAMP_LDFLAGS)' -o $(ORCA_BIN) $(ORCA_CMD)/main.go + +orca: test orca-build ## Build the orca binary (implies test) + +orca-manifests: ## Render orca deployment manifests into deploy/orca/rendered + @mkdir -p $(ORCA_MANIFEST_RENDERED_DIR) + @find $(ORCA_MANIFEST_RENDERED_DIR) -mindepth 1 -not -name .gitignore -delete 2>/dev/null || true + $(GOCMD) run ./hack/cmd/render-manifests \ + --templates-dir $(ORCA_MANIFEST_TEMPLATES_DIR) \ + --output-dir $(ORCA_MANIFEST_RENDERED_DIR) \ + --set Namespace=$(ORCA_NAMESPACE) \ + --set Image=$(ORCA_IMAGE) + @echo "Rendered orca manifests into $(ORCA_MANIFEST_RENDERED_DIR) (image: $(ORCA_IMAGE))" + +image-orca-local: ## Build the orca container image locally (single-arch) + $(CONTAINER_ENGINE) build \ + --build-arg VERSION=$(VERSION) \ + --build-arg GIT_COMMIT=$(GIT_COMMIT) \ + --build-arg BUILD_TIME=$(BUILD_TIME) \ + -t orca:$(VERSION) -t $(ORCA_IMAGE) \ + -f ./images/orca/Containerfile . + +orca-oci: image-orca-local ## Alias for image-orca-local + +orca-oci-push: orca-oci ## Build and push the orca container image + $(CONTAINER_ENGINE) push $(ORCA_IMAGE) + +# Dev-cluster proxy targets. The actual implementations live in +# hack/orca/Makefile (see AGENTS.md convention; mirrors hack/net/). +orca-up: ## Bring up the Orca dev harness in a Kind cluster + $(MAKE) -C hack/orca up + +orca-down: ## Tear down the Orca dev harness Kind cluster + $(MAKE) -C hack/orca down + +orca-reset: ## Rebuild orca image and rolling-restart the dev deployment + $(MAKE) -C hack/orca reset + +# orca-inttest mirrors the test/test-race pattern: race detector in CI +# (ubuntu-latest has gcc), no -race locally so developers without a C +# toolchain can still run integration tests. +ifdef CI +orca-inttest: ## Run orca integration tests (LocalStack + Azurite via testcontainers; requires Docker) + $(GOTEST) -tags=integrationtest -race -timeout 15m ./internal/orca/inttest/... +else +orca-inttest: ## Run orca integration tests (LocalStack + Azurite via testcontainers; requires Docker) + $(GOTEST) -tags=integrationtest -timeout 15m ./internal/orca/inttest/... 
+endif + image-net-controller-local: net-frontend resources/cni-plugins-linux-$(HOST_GOARCH)-$(CNI_PLUGINS_VERSION).tgz ## Build the unbounded-net-controller image locally (single-arch) $(CONTAINER_ENGINE) build \ --target controller \ diff --git a/cmd/orca/main.go b/cmd/orca/main.go new file mode 100644 index 00000000..f7ea8484 --- /dev/null +++ b/cmd/orca/main.go @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package main + +import "github.com/Azure/unbounded/cmd/orca/orca" + +func main() { + orca.Run() +} diff --git a/cmd/orca/orca/orca.go b/cmd/orca/orca/orca.go new file mode 100644 index 00000000..a770bdd7 --- /dev/null +++ b/cmd/orca/orca/orca.go @@ -0,0 +1,99 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package orca wires the Orca cache binary together. It is invoked by +// cmd/orca/main.go and is responsible for parsing flags, loading the +// YAML config, and delegating to internal/orca/app for actual runtime +// wiring. +package orca + +import ( + "context" + "fmt" + "log/slog" + "os" + "os/signal" + "syscall" + "time" + + "github.com/spf13/cobra" + + "github.com/Azure/unbounded/internal/orca/app" + "github.com/Azure/unbounded/internal/orca/config" +) + +// Run is the entrypoint invoked by cmd/orca/main.go. +func Run() { + root := &cobra.Command{ + Use: "orca", + Short: "Orca origin cache - S3-compatible read-only cache fronting Azure / S3 origins", + } + root.AddCommand(newServeCmd()) + + if err := root.Execute(); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func newServeCmd() *cobra.Command { + var configPath string + + cmd := &cobra.Command{ + Use: "serve", + Short: "Run the Orca cache server", + RunE: func(cmd *cobra.Command, _ []string) error { + return serve(cmd.Context(), configPath) + }, + } + cmd.Flags().StringVarP(&configPath, "config", "c", "/etc/orca/config.yaml", + "path to YAML config file") + + return cmd +} + +func serve(parent context.Context, configPath string) error { + log := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: slog.LevelInfo, + })) + slog.SetDefault(log) + + log.Info("orca starting", "config_path", configPath) + + cfg, err := config.Load(configPath) + if err != nil { + return fmt.Errorf("load config: %w", err) + } + + log.Info("config loaded", + "origin_id", cfg.Origin.ID, + "replicas_target", cfg.Cluster.TargetReplicas, + "target_global", cfg.Origin.TargetGlobal, + "internal_tls", cfg.Cluster.InternalTLS.Enabled, + "client_auth", cfg.Server.Auth.Enabled, + ) + + ctx, cancel := signal.NotifyContext(parent, os.Interrupt, syscall.SIGTERM) + defer cancel() + + a, err := app.Start(ctx, cfg, app.WithLogger(log)) + if err != nil { + return err + } + + if waitErr := a.Wait(ctx); waitErr != nil { + log.Error("listener exited with error", "err", waitErr) + cancel() + } else { + log.Info("shutdown signal received") + } + + shutdownCtx, shCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer shCancel() + + _ = a.Shutdown(shutdownCtx) //nolint:errcheck // shutdown errors already logged inside App.Shutdown + + log.Info("orca stopped") + + return nil +} diff --git a/deploy/orca/01-namespace.yaml.tmpl b/deploy/orca/01-namespace.yaml.tmpl new file mode 100644 index 00000000..fd353a35 --- /dev/null +++ b/deploy/orca/01-namespace.yaml.tmpl @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca diff 
--git a/deploy/orca/02-rbac.yaml.tmpl b/deploy/orca/02-rbac.yaml.tmpl new file mode 100644 index 00000000..5961196b --- /dev/null +++ b/deploy/orca/02-rbac.yaml.tmpl @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca diff --git a/deploy/orca/03-config.yaml.tmpl b/deploy/orca/03-config.yaml.tmpl new file mode 100644 index 00000000..811e2fb6 --- /dev/null +++ b/deploy/orca/03-config.yaml.tmpl @@ -0,0 +1,71 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: orca-config + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +data: + config.yaml: | + # Orca origin cache configuration. + # Secret values (account keys, S3 access/secret) are sourced from + # environment variables ORCA_AZUREBLOB_ACCOUNT_KEY, + # ORCA_CACHESTORE_S3_ACCESS_KEY, ORCA_CACHESTORE_S3_SECRET_KEY, + # populated by the orca-credentials Secret via envFrom. + + server: + listen: "0.0.0.0:8443" + auth: + # Dev: disabled. Production: enable bearer or mtls. + enabled: {{ default "false" .ServerAuthEnabled }} + + origin: + id: {{ default "azureblob-default" .OriginID | quote }} + driver: {{ default "azureblob" .OriginDriver }} + target_global: {{ default "192" .TargetGlobal }} + queue_timeout: 5s + retry: + attempts: 3 + backoff_initial: 100ms + backoff_max: 2s + max_total_duration: 5s + azureblob: + account: {{ default "" .AzureAccount | quote }} + container: {{ default "" .AzureContainer | quote }} + endpoint: {{ default "" .AzureEndpoint | quote }} + enforce_block_blob_only: true + awss3: + endpoint: {{ default "" .OriginAWSS3Endpoint | quote }} + region: {{ default "us-east-1" .OriginAWSS3Region | quote }} + bucket: {{ default "" .OriginAWSS3Bucket | quote }} + use_path_style: {{ default "false" .OriginAWSS3UsePathStyle }} + + cachestore: + driver: s3 + s3: + endpoint: {{ default "http://localstack.unbounded-kube.svc.cluster.local:4566" .CachestoreEndpoint | quote }} + bucket: {{ default "orca-cache" .CachestoreBucket | quote }} + region: {{ default "us-east-1" .CachestoreRegion | quote }} + use_path_style: true + require_unversioned_bucket: true + + cluster: + service: {{ default "orca-peers.unbounded-kube.svc.cluster.local" .ClusterService | quote }} + membership_refresh: 5s + internal_listen: "0.0.0.0:8444" + target_replicas: {{ default "3" .TargetReplicas }} + internal_tls: + # Dev: disabled (plain HTTP/2 between peers). Production: true. + enabled: {{ default "false" .InternalTLSEnabled }} + + chunk_catalog: + max_entries: 100000 + + metadata: + ttl: 5m + negative_ttl: 60s + max_entries: 10000 + + chunking: + size: 8388608 diff --git a/deploy/orca/04-deployment.yaml.tmpl b/deploy/orca/04-deployment.yaml.tmpl new file mode 100644 index 00000000..44a0eb80 --- /dev/null +++ b/deploy/orca/04-deployment.yaml.tmpl @@ -0,0 +1,76 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + replicas: {{ default "3" .TargetReplicas }} + # Required pod-anti-affinity below pins one Orca pod per node. + # In the dev harness the worker count == replica count, so default + # RollingUpdate can't surge: the new pod has no node to land on. + # maxSurge=0 / maxUnavailable=1 walks the replicas one-at-a-time. 
+ strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: orca + template: + metadata: + labels: + app.kubernetes.io/name: orca + spec: + serviceAccountName: orca + # Required anti-affinity: at most one Orca pod per node so that a + # single node failure does not knock out multiple replicas. The + # dev harness Kind cluster has 3 worker nodes to match the default + # 3 replicas. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: orca + topologyKey: kubernetes.io/hostname + containers: + - name: orca + image: {{ default "ghcr.io/azure/orca:latest" .Image | quote }} + imagePullPolicy: {{ default "IfNotPresent" .ImagePullPolicy }} + args: + - serve + - --config=/etc/orca/config.yaml + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + envFrom: + - secretRef: + name: orca-credentials + ports: + - containerPort: 8443 + name: edge + protocol: TCP + - containerPort: 8444 + name: internal + protocol: TCP + resources: + requests: + cpu: {{ default "200m" .ResourceCPURequest }} + memory: {{ default "256Mi" .ResourceMemoryRequest }} + limits: + cpu: {{ default "2" .ResourceCPULimit }} + memory: {{ default "1Gi" .ResourceMemoryLimit }} + volumeMounts: + - name: config + mountPath: /etc/orca + readOnly: true + volumes: + - name: config + configMap: + name: orca-config diff --git a/deploy/orca/05-service.yaml.tmpl b/deploy/orca/05-service.yaml.tmpl new file mode 100644 index 00000000..36dba4fd --- /dev/null +++ b/deploy/orca/05-service.yaml.tmpl @@ -0,0 +1,43 @@ +--- +# Client-facing Service: standard ClusterIP. Clients of the cache (e.g. +# tools speaking S3 to fetch objects) connect here. Kube-proxy load +# balances across the 3 replicas. +apiVersion: v1 +kind: Service +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: orca + ports: + - name: edge + port: 8443 + targetPort: edge + protocol: TCP + +--- +# Peer-discovery Service: headless (ClusterIP: None). LookupHost on +# orca-peers..svc.cluster.local returns all pod IPs, enabling +# rendezvous-hash coordination among Orca replicas. 
+apiVersion: v1 +kind: Service +metadata: + name: orca-peers + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + type: ClusterIP + clusterIP: None + publishNotReadyAddresses: true + selector: + app.kubernetes.io/name: orca + ports: + - name: internal + port: 8444 + targetPort: internal + protocol: TCP diff --git a/deploy/orca/dev/01-localstack.yaml.tmpl b/deploy/orca/dev/01-localstack.yaml.tmpl new file mode 100644 index 00000000..87dfcc02 --- /dev/null +++ b/deploy/orca/dev/01-localstack.yaml.tmpl @@ -0,0 +1,83 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: localstack + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: localstack + ports: + - name: edge + port: 4566 + targetPort: 4566 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: localstack + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: localstack + template: + metadata: + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev + spec: + containers: + - name: localstack + # 3.8 is community-tier; 'latest' became Pro-only and exits + # with code 55 ("License activation failed"). + image: {{ default "localstack/localstack:3.8" .LocalstackImage | quote }} + imagePullPolicy: IfNotPresent + ports: + - containerPort: 4566 + name: edge + protocol: TCP + env: + - name: SERVICES + value: s3 + - name: DEBUG + value: "0" + - name: PERSISTENCE + value: "0" + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 1 + memory: 1Gi + readinessProbe: + httpGet: + path: /_localstack/health + port: 4566 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + livenessProbe: + httpGet: + path: /_localstack/health + port: 4566 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + volumeMounts: + - name: data + mountPath: /var/lib/localstack + volumes: + - name: data + emptyDir: {} diff --git a/deploy/orca/dev/02-init-job.yaml.tmpl b/deploy/orca/dev/02-init-job.yaml.tmpl new file mode 100644 index 00000000..0eb41832 --- /dev/null +++ b/deploy/orca/dev/02-init-job.yaml.tmpl @@ -0,0 +1,80 @@ +--- +# Init Job: creates the cachestore + origin S3 buckets in LocalStack so +# that Orca can pass the versioningGate boot check and so that reviewers +# have an origin bucket to seed sample objects into. Idempotent: +# CreateBucket returns BucketAlreadyOwnedByYou on rerun, swallowed by +# the script. +# +# Cachestore bucket: versioning left unset (which is what +# require_unversioned_bucket=true expects). +# Origin bucket: no versioning constraint; sample objects live here. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: orca-buckets-init + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev +spec: + backoffLimit: 6 + template: + metadata: + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev + spec: + restartPolicy: OnFailure + containers: + - name: aws-cli + image: {{ default "amazon/aws-cli:latest" .AwsCliImage | quote }} + env: + - name: AWS_ACCESS_KEY_ID + value: test + - name: AWS_SECRET_ACCESS_KEY + value: test + - name: AWS_DEFAULT_REGION + value: us-east-1 + - name: CACHESTORE_BUCKET + value: {{ default "orca-cache" .CachestoreBucket | quote }} + - name: ORIGIN_BUCKET + value: {{ default "orca-origin" .OriginBucket | quote }} + - name: ENDPOINT + value: http://localstack.{{ default "unbounded-kube" .Namespace }}.svc.cluster.local:4566 + command: + - /bin/sh + - -c + - | + set -e + echo "Waiting for LocalStack at $ENDPOINT ..." + for i in $(seq 1 60); do + if aws --endpoint-url "$ENDPOINT" s3api list-buckets >/dev/null 2>&1; then + echo "LocalStack ready." + break + fi + sleep 2 + done + + ensure_bucket() { + bucket="$1" + echo "Ensuring bucket $bucket (idempotent) ..." + if aws --endpoint-url "$ENDPOINT" s3api head-bucket --bucket "$bucket" >/dev/null 2>&1; then + echo "Bucket $bucket already exists." + else + aws --endpoint-url "$ENDPOINT" s3api create-bucket --bucket "$bucket" + echo "Bucket $bucket created." + fi + } + + ensure_bucket "$CACHESTORE_BUCKET" + ensure_bucket "$ORIGIN_BUCKET" + + # Verify cachestore bucket versioning is unset (Orca's + # versioningGate rejects Enabled or Suspended). + status=$(aws --endpoint-url "$ENDPOINT" s3api get-bucket-versioning --bucket "$CACHESTORE_BUCKET" --query Status --output text 2>/dev/null || echo "None") + echo "Cachestore bucket versioning: $status (None means unset, which is required)." + if [ "$status" = "Enabled" ] || [ "$status" = "Suspended" ]; then + echo "ERROR: cachestore bucket versioning is $status; Orca requires unset/None." + exit 1 + fi + echo "Init complete." diff --git a/deploy/orca/dev/03-azurite.yaml.tmpl b/deploy/orca/dev/03-azurite.yaml.tmpl new file mode 100644 index 00000000..4282c248 --- /dev/null +++ b/deploy/orca/dev/03-azurite.yaml.tmpl @@ -0,0 +1,108 @@ +--- +# Azurite is Microsoft's official Azure Storage emulator. We use it as +# an alternative origin in the dev harness so reviewers can exercise +# the azureblob origin driver path without a real Azure account. 
+# +# Well-known dev account/key (documented at +# https://learn.microsoft.com/azure/storage/common/storage-use-azurite): +# AccountName: devstoreaccount1 +# AccountKey: Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== +# BlobURL: http://azurite..svc.cluster.local:10000/devstoreaccount1 +apiVersion: v1 +kind: Service +metadata: + name: azurite + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: azurite + ports: + - name: blob + port: 10000 + targetPort: 10000 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: azurite + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: azurite + template: + metadata: + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev + spec: + containers: + - name: azurite + image: {{ default "mcr.microsoft.com/azure-storage/azurite:3.33.0" .AzuriteImage | quote }} + imagePullPolicy: IfNotPresent + # Bind to 0.0.0.0 so the Service can reach it; default is + # 127.0.0.1. + # --skipApiVersionCheck allows newer Azure SDK clients + # (which advertise API versions Azurite hasn't yet caught up + # with) to talk to it. + # --loose disables strict validation of newer SDK headers. + # --disableProductStyleUrl forces path-style URL parsing. + # Without it, Azurite parses the first DNS label of the Host + # header as the account name (so requests to azurite.... + # would be misinterpreted as account="azurite" rather than + # account="devstoreaccount1"). + # --debug routes Azurite's internal request log to a file; + # tail it via `kubectl exec ... -- cat /tmp/azurite-debug.log` + # when triaging 4xx responses. + args: + - azurite-blob + - --blobHost + - 0.0.0.0 + - --blobPort + - "10000" + - --skipApiVersionCheck + - --loose + - --disableProductStyleUrl + - --debug + - /tmp/azurite-debug.log + - --location + - /data + ports: + - containerPort: 10000 + name: blob + protocol: TCP + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + readinessProbe: + tcpSocket: + port: 10000 + initialDelaySeconds: 3 + periodSeconds: 5 + timeoutSeconds: 3 + livenessProbe: + tcpSocket: + port: 10000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + emptyDir: {} diff --git a/deploy/orca/dev/04-azurite-init.yaml.tmpl b/deploy/orca/dev/04-azurite-init.yaml.tmpl new file mode 100644 index 00000000..8ad9433f --- /dev/null +++ b/deploy/orca/dev/04-azurite-init.yaml.tmpl @@ -0,0 +1,54 @@ +--- +# Init Job: creates the Azure container in Azurite so Orca's azureblob +# origin driver has somewhere to read from. Idempotent: az container +# create with --fail-on-exist false treats existence as success. +# +# Uses the well-known Azurite dev creds (devstoreaccount1 + the +# documented public key); these are baked into Azurite and not +# secrets. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: orca-azurite-container-init + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev +spec: + backoffLimit: 6 + template: + metadata: + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev + spec: + restartPolicy: OnFailure + containers: + - name: az-cli + image: {{ default "mcr.microsoft.com/azure-cli:latest" .AzCliImage | quote }} + env: + - name: AZURE_STORAGE_CONNECTION_STRING + value: "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite.{{ default "unbounded-kube" .Namespace }}.svc.cluster.local:10000/devstoreaccount1;" + - name: CONTAINER + value: {{ default "orca-test" .AzuriteContainer | quote }} + command: + - /bin/sh + - -c + - | + set -e + echo "Waiting for Azurite ..." + for i in $(seq 1 60); do + if az storage container list --output none 2>/dev/null; then + echo "Azurite ready." + break + fi + sleep 2 + done + echo "Ensuring container ${CONTAINER} (idempotent) ..." + if az storage container exists --name "${CONTAINER}" --query exists --output tsv | grep -qi true; then + echo "Container ${CONTAINER} already exists." + else + az storage container create --name "${CONTAINER}" --output none + echo "Container ${CONTAINER} created." + fi + echo "Init complete." \ No newline at end of file diff --git a/deploy/orca/rendered/.gitignore b/deploy/orca/rendered/.gitignore new file mode 100644 index 00000000..f79c394d --- /dev/null +++ b/deploy/orca/rendered/.gitignore @@ -0,0 +1,3 @@ +# rendered manifests are gitignored; produced by `make orca-manifests`. +* +!.gitignore diff --git a/design/orca/brief.md b/design/orca/brief.md new file mode 100644 index 00000000..43940c82 --- /dev/null +++ b/design/orca/brief.md @@ -0,0 +1,368 @@ +# Orca - Origin Cache - Architecture Brief + +A short brief intended for technical leads who need to understand the +shape of the system, the load-bearing decisions, and what is in v1 +without wading through the full design. Drill-down references point at +[design.md](./design.md). + +## 1. Problem and approach + +Cloud blob origins (AWS S3, Azure Blob) are slow and expensive when +read from on-prem at scale. The intended workload is large immutable +artifacts (job inputs, model weights, training shards) read by +thousands of clients with strongly correlated cold starts (job +launches, distributed-training kickoffs), including FUSE-mounted +filesystems where edge clients perform interactive `ls` and +directory navigation. Naive direct access stampedes origin egress +and cost. + +Orca is a read-only S3-compatible HTTP cache deployed inside +the on-prem datacenter as a multi-replica Kubernetes Deployment +fronting AWS S3 and Azure Blob. It serves chunked, ETag-keyed bytes +out of a shared in-DC backing store, dedupes concurrent fills both +within and across replicas, and presents the same `GetObject` / +`HeadObject` / `ListObjectsV2` surface clients already use. + +## 2. Goals and non-goals + +Goals (v1): +- Read-only S3-compatible API at the edge: `GetObject` (with byte-range + `Range`), `HeadObject`, `ListObjectsV2`. +- Multi-PB working set; thousands of concurrent clients. +- Multi-DC deployment; each DC independent (no cross-DC peering). +- Negligible origin stampede under correlated cold-access bursts. +- Low **TTFB** (time to first byte) on both warm and cold paths. 
+- Atomic, durable commit of fetched chunks; safe under concurrent + fills. +- Bounded staleness: `metadata_ttl` (default 5m) on contract violation, + `negative_metadata_ttl` (default 60s) on create-after-404; zero + otherwise. + +Non-goals (v1): +- Write path, multipart upload, object versioning. +- Cross-DC peering. +- SigV4 verification at the edge (bearer / mTLS only). +- Multi-tenant quotas or per-tenant credentials. +- Per-client / per-IP edge rate limiting. +- Mutable-blob invalidation beyond ETag identity. +- Encryption at rest beyond what the backing store provides. + +## 3. System at a glance + +Each request lands on one replica (the **assembler**), which iterates +the requested range chunk by chunk. Hits read directly from the +shared **CacheStore**. Misses route to the chunk's **coordinator** +(selected by rendezvous hashing on pod IP from the headless-Service +membership), which runs a singleflight + tee + spool fill against the +**Origin** and atomically commits to the CacheStore. The coordinator +may be the assembler itself (local fill) or a different replica +(per-chunk internal mTLS fill RPC). + +### Diagram A: System overview + +```mermaid +graph TB + subgraph DC["On-prem datacenter"] + Clients["Edge clients"] + Service["Service (ClusterIP / LB)
client traffic"] + subgraph Replicas["orca Deployment"] + R1["Replica 1"] + R2["Replica 2"] + R3["Replica N"] + end + Headless["Headless Service
peer discovery"] + Internal["Internal listener :8444
per-chunk fill RPC
(mTLS, peer-IP authz)"] + CS[("CacheStore
in-DC S3 / posixfs / localfs")] + end + subgraph Cloud["Cloud origins"] + S3[("AWS S3")] + Azure[("Azure Blob
Block Blobs only")] + end + Clients -- "S3 GET / HEAD / LIST
+ Range" --> Service + Service --> R1 + Service --> R2 + Service --> R3 + R1 -. "DNS refresh
default 5s" .-> Headless + R2 -.-> Headless + R3 -.-> Headless + R1 <--> Internal + R2 <--> Internal + R3 <--> Internal + R1 <--> CS + R2 <--> CS + R3 <--> CS + R1 -- "miss-fill
If-Match: etag" --> S3 + R2 -- "miss-fill
If-Match: etag" --> S3 + R3 -- "miss-fill
If-Match: etag" --> Azure +``` + +## 4. Components + +Named building blocks. The first five (Origin, CacheStore, ChunkCatalog, +Cluster, Spool) are formal Go interfaces in +[design.md s7](./design.md#7-internal-interfaces); the request-edge +components (Server, fetch.Coordinator, Singleflight, Auth) are +process-internal and are described in +[design.md s4](./design.md#4-architecture) and +[s8](./design.md#8-stampede-protection). + +- **Server** - the S3-compatible HTTP edge for clients, plus a + separate internal listener for per-chunk fill RPCs between + replicas. Two listeners with two distinct trust roots. +- **fetch.Coordinator** - orchestrates the per-request fan-out: + per-chunk routing, origin concurrency bounding, internal-RPC + client. The brain of the assembler. +- **Singleflight** - per-`ChunkKey` in-flight dedupe so concurrent + cold misses for the same chunk collapse into one origin GET. + Prevents process-local thundering herds. +- **Spool** - bounded local-disk staging for in-flight fills. + Tees bytes in parallel with the client write (s5.2), giving + slow joiners a uniform fallback across all CacheStore drivers + and serving as the source for the asynchronous CacheStore + commit. +- **ChunkCatalog** - in-memory LRU recording which chunks the + CacheStore holds. Pure hot-path optimization; CacheStore is + source of truth. +- **Origin** - read-only adapter to the upstream cloud blob store + (AWS S3, Azure Blob). Sends `If-Match: ` on every range + read so mid-flight overwrites are detected at the wire. +- **CacheStore** - shared in-DC chunk store, source of truth for + chunk presence. Pluggable: `localfs`, `posixfs`, `s3`. Driver + choice invisible above the cachestore boundary. +- **Cluster** - peer discovery from the headless Service plus + rendezvous hashing on pod IP to pick the coordinator per + `ChunkKey`. Refreshes membership every 5s by default. +- **Auth** - bearer / mTLS on the client edge and mTLS plus + peer-IP authorization on the internal listener. Separate trust + roots. + +## 5. Five load-bearing mechanisms + +### 5.1 Chunking and identity + +The cache works in fixed-size chunks (default 8 MiB, configurable +4-16 MiB). The `ChunkKey` is +`{origin_id, bucket, object_key, etag, chunk_size, chunk_index}` and +is the on-store path for that chunk. ETag is treated as identity, not +freshness: any change of origin bytes (under the contract in s5.5) +produces a new ETag, which deterministically yields a new chunk path. +The cache cannot, by construction, serve old bytes for a new ETag. +See [design.md s5](./design.md#5-chunk-model). + +### 5.2 Singleflight + tee + spool + +Per-`ChunkKey` singleflight on the coordinator collapses concurrent +misses to a single origin GET. Cold-path bytes stream **directly +from origin to client**: bounded **pre-header origin retry** +(default 3 attempts, 5s total budget) handles transient origin +failures invisibly before any HTTP response header is sent; the +commit boundary is the first byte arrival from origin. Once +committed, the leader streams bytes to the client as they arrive. +In parallel, the leader tees bytes into a small in-memory ring +buffer (low-TTFB joiners) and a bounded local-disk **Spool** +(slow joiners that fall behind the ring head, plus uniform +behavior across all CacheStore drivers). The CacheStore commit +happens asynchronously after the response completes. The spool +is NOT on the client TTFB path in v1. 
See +[design.md s8.1](./design.md#81-per-chunkkey-singleflight), +[s8.2](./design.md#82-ttfb-tee--spool), and +[s8.6](./design.md#86-failure-handling-without-re-stampede). + +### 5.3 Per-chunk coordinator (rendezvous hashing) + +Each replica polls a headless Service for peer IPs (default every +5s) and selects the coordinator per `ChunkKey` by rendezvous (Highest +Random Weight) hash on pod IP. The assembler fans out per-chunk fill +RPCs over a separate internal mTLS listener (`:8444`) to coordinators +that are not self. One client request spanning N chunks may use N +different coordinators; this is intentional for highly correlated +cold-access workloads, where any single hot key would otherwise pin +its assembler. Loop prevention is enforced by a header marker plus a +membership self-check (`409 Conflict` fallback to local fill on +disagreement). See [design.md s8.3](./design.md#83-cluster-wide-deduplication-via-per-chunk-fill-rpc) +and [s8.8](./design.md#88-internal-rpc-listener). + +### 5.4 Atomic-commit primitive + +The leader publishes a chunk to the CacheStore in a single no-clobber +operation: the second concurrent commit MUST lose without overwriting +the winner. Two equivalent shapes are picked per driver: object-store +`PutObject + If-None-Match: *` (used by `cachestore/s3`) and POSIX +`link()` (or `renameat2(RENAME_NOREPLACE)`) returning `EEXIST` (used +by `cachestore/localfs` and `cachestore/posixfs`). Both atomic; both +report the loser as `commit_lost`. Each driver runs +`SelfTestAtomicCommit` at boot and refuses to start if the backend +does not honor its primitive. See +[design.md s10.1](./design.md#101-atomic-commit-per-cachestore-driver). + +### 5.5 Bounded staleness contract + +Correctness rests on an **immutable-origin contract** with the +operator: for any given `(origin_id, bucket, key)`, the underlying +bytes are immutable for the life of the key; replacement MUST publish +a new key. Because the +cache key includes ETag (s5.1), as long as the contract holds the +cache cannot serve stale bytes. If the contract is violated by an +in-place overwrite, the cache may serve old bytes for at most one +`metadata_ttl` window (default 5m), bounded by the metadata cache +TTL. This is the load-bearing semantic for correctness and MUST +appear in the consumer-API documentation. Defense in depth: every +`Origin.GetRange` carries `If-Match: `, so a mid-flight +overwrite is caught at fill time and increments +`origin_etag_changed_total`. See +[design.md s11](./design.md#11-bounded-staleness-contract). A +symmetric bound applies to **create-after-404** (a key uploaded after +a client already saw a 404 on it): at most one `negative_metadata_ttl` +window per replica that observed the original 404 (default 60s) +before the cache reflects the upload. See +[design.md s12](./design.md#12-create-after-404-and-negative-cache-lifecycle). +Operators with workloads requiring shorter effective windows on hot +keys can opt into a **bounded-freshness mode** (default off): a +per-replica background loop proactively re-Heads frequently- +accessed keys ahead of `metadata_ttl`, shrinking the effective +window for those keys to `refresh_ahead_ratio * metadata_ttl` +(default 3.5m). See +[design.md s11.2](./design.md#112-bounded-freshness-mode-optional). + +## 6. Backing-store options + +The CacheStore is pluggable; choice is a deployment-time decision and +is invisible above the `cachestore` package boundary. Three drivers +ship in v1: + +- `localfs` - dev only; one POSIX FS per replica; not shared. 
+- `posixfs` - shared POSIX FS mounted on every replica at the same + path. Supported backends: NFSv4.1+ (baseline), Weka native + (`-t wekafs`), CephFS, Lustre, GPFS / IBM Spectrum Scale. Same + `link()` / `EEXIST` primitive as `localfs`. Alluxio FUSE is hard- + refused (no `link(2)`, no atomic no-overwrite rename). +- `s3` - in-DC S3-compatible object store (e.g. VAST). `PutObject` + + `If-None-Match: *`. + +See [design.md s10.1](./design.md#101-atomic-commit-per-cachestore-driver) +for atomic-commit specifics per driver. + +## 7. A request, end-to-end (cold miss with cross-replica fill) + +The diagram below traces a cold miss on replica A where the chunk's +coordinator is replica B. The hot path (cache hit on A) skips +straight from the catalog lookup to a direct CacheStore read; the +local-coordinator path (B == A) skips the internal RPC. Cold-path +bytes stream from origin -> coordinator -> assembler -> client +in parallel with the spool tee on B. Pre-header retry on B handles +transient origin failures invisibly; the CacheStore commit happens +asynchronously after the client has the full chunk. + +### Diagram B: Cold miss, cross-replica coordinator + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant A as Replica A (assembler) + participant B as Replica B (coordinator for k) + participant SF as Singleflight (on B) + participant Sp as Spool (B local disk) + participant O as Origin + participant CS as CacheStore (shared) + C->>A: GET /bucket/key Range + A->>CS: Stat(k) + CS-->>A: ErrNotFound + A->>B: /internal/fill?key=k (mTLS) + B->>SF: Acquire(k) [leader] + SF->>O: GetRange(..., If-Match: etag)
(pre-header retry s8.6) + O-->>SF: first byte + Note over SF: commit boundary - origin healthy + par stream + SF-->>B: bytes as they arrive + B-->>A: stream + A-->>C: 200/206 + headers + body + and tee to spool + SF->>Sp: bytes (in parallel) + end + O-->>SF: remaining bytes + SF->>Sp: Commit (fsync + close) [after stream] + SF-)CS: PutObject (or link()) commit [async] + CS--)SF: 200 (commit_won) or failure +``` + +## 8. Top risks worth your attention + +1. **Immutable-origin contract** - Correctness rests on operators + publishing new keys instead of overwriting. Bounded violation + window is `metadata_ttl` (5m default). Must be visible in + consumer-API documentation. See + [design.md s11](./design.md#11-bounded-staleness-contract). +2. **Commit-after-serve failure** - The CacheStore commit happens + asynchronously after the client response is complete (cold-path + bytes stream origin -> client directly with pre-header retry on + the cache side). If the async commit fails after the client has + the full chunk, the chunk is silently uncached and the next + request refills. Sustained failure is visible only via + `commit_after_serve_total{result="failed"}`; alerting is required. + See [design.md s8.6](./design.md#86-failure-handling-without-re-stampede). +3. **Spool locality** - The Spool MUST live on a local block device + by default (boot-time `statfs(2)` check refuses to start on + NFS / SMB / CephFS / Lustre / GPFS / FUSE). With the v1 streaming + design the spool is no longer on the client TTFB path, so this + contract is defense-in-depth: a network-FS spool would only + degrade joiner-fallback latency, not first byte. Operators with + unusual placements MAY relax via `spool.require_local_fs: false`; + production deployments are expected to keep the default. See + [design.md s10.4](./design.md#104-spool-locality-contract). +4. **Per-replica origin semaphore is approximate** - Origin + concurrency is capped per-replica at + `floor(target_global / cluster.target_replicas)` (default 64 + slots/replica at `target_global=192`, + `cluster.target_replicas=3`). Realized cluster-wide concurrency + tracks `target_global` only when actual replica count matches + `cluster.target_replicas`; scale-out without updating the knob + over-allocates against origin, scale-in under-allocates. + Origin throttling is handled by the leader's pre-header retry + loop (exponential backoff) rather than by a hard coordinated + cap. A coordinated cluster-wide limiter and dynamic recompute + are deferred future work; see + [design.md s15.5](./design.md#155-coordinated-cluster-wide-origin-limiter) + and + [design.md s15.6](./design.md#156-dynamic-per-replica-origin-cap). +5. **POSIX backend hardening** - NFS exports MUST be `sync` (not + `async`); Weka NFS `link()`/`EEXIST` is not docs-confirmed and + is gated by `SelfTestAtomicCommit` at boot; Alluxio FUSE is + hard-refused with a documented workaround + (`cachestore.driver: s3` against the Alluxio S3 gateway). See + [design.md s10.1.2](./design.md#1012-cachestoreposixfs). +6. **Create-after-404 staleness** - A key uploaded after clients + already observed it as `404` will return stale `404` for up to + `negative_metadata_ttl` (default 60s) per replica that observed + the original miss. Round-robin LB can produce alternating `404` + / `200` during the drain. 
No event-driven invalidation or admin- + invalidation in v1 (the immutable-origin contract makes them + unnecessary for the documented workload); operators must wait + the TTL after uploading a previously-missing key. Mitigation: + short default TTL, `metadata_negative_*` metrics. See + [design.md s12](./design.md#12-create-after-404-and-negative-cache-lifecycle). + +## 9. Where to go next + +`design.md` (full mechanism + flow): +- [s2 Decisions](./design.md#2-decisions) - locked design choices. +- [s3 Terminology](./design.md#3-terminology) - full glossary. +- [s4 Architecture and onward](./design.md#4-architecture) - + architecture, request flow, internal interfaces, stampede protection. +- [s8.4 Origin backpressure](./design.md#84-origin-backpressure) - + per-replica static cap and pre-header retry for throttle handling. +- [s10.1 Atomic commit per driver](./design.md#101-atomic-commit-per-cachestore-driver) +- [s11 Bounded staleness](./design.md#11-bounded-staleness-contract) + - [s11.2 Bounded-freshness mode (optional)](./design.md#112-bounded-freshness-mode-optional) +- [s12 Create-after-404 and negative-cache lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle) +- [s13 Eviction and capacity](./design.md#13-eviction-and-capacity) - + passive lifecycle and optional active eviction; ChunkCatalog + size-awareness operational guidance. +- [s15 Deferred optimizations](./design.md#15-deferred-optimizations) - + v1 scope-discipline catalog (edge rate limiting, cluster-wide HEAD + singleflight, cluster-wide LIST coordinator, mid-stream origin + resume, coordinated cluster-wide origin limiter, dynamic per- + replica origin cap). +- 12 inline mermaid diagrams covering hits, misses, cross-replica + fills, atomic commit, create-after-404 timeline, and membership + flux. diff --git a/design/orca/design.md b/design/orca/design.md new file mode 100644 index 00000000..f131fe59 --- /dev/null +++ b/design/orca/design.md @@ -0,0 +1,2711 @@ +# Orca - Origin Cache - Design (mechanism & flow) + +Status: draft for review (round 2 incorporating reviewer feedback) +Owner: TBD + +--- + +## Table of contents + +### Sections + +1. [Overview](#1-overview) +2. [Decisions](#2-decisions) +3. [Terminology](#3-terminology) +4. [Architecture](#4-architecture) +5. [Chunk model](#5-chunk-model) +6. [Request flow](#6-request-flow) + - [6.1 HEAD request flow](#61-head-request-flow) + - [6.2 LIST request flow](#62-list-request-flow) + - [6.3 HTTP error-code mapping](#63-http-error-code-mapping) +7. [Internal interfaces](#7-internal-interfaces) +8. [Stampede protection](#8-stampede-protection) + - [8.1 Per-`ChunkKey` singleflight](#81-per-chunkkey-singleflight) + - [8.2 TTFB tee + spool](#82-ttfb-tee--spool) + - [8.3 Cluster-wide deduplication via per-chunk fill RPC](#83-cluster-wide-deduplication-via-per-chunk-fill-rpc) + - [8.4 Origin backpressure](#84-origin-backpressure) + - [8.5 Cancellation safety](#85-cancellation-safety) + - [8.6 Failure handling without re-stampede](#86-failure-handling-without-re-stampede) + - [8.7 Metadata-layer singleflight](#87-metadata-layer-singleflight) + - [8.8 Internal RPC listener](#88-internal-rpc-listener) +9. [Azure adapter: Block Blob only](#9-azure-adapter-block-blob-only) +10. 
[Concurrency, durability, correctness](#10-concurrency-durability-correctness) + - [10.1 Atomic commit (per CacheStore driver)](#101-atomic-commit-per-cachestore-driver) + - [10.2 Catalog correctness, typed errors, circuit breaker](#102-catalog-correctness-typed-errors-circuit-breaker) + - [10.3 Range, sizes, and edge cases](#103-range-sizes-and-edge-cases) + - [10.4 Spool locality contract](#104-spool-locality-contract) + - [10.5 Readiness probe (`/readyz`)](#105-readiness-probe-readyz) +11. [Bounded staleness contract](#11-bounded-staleness-contract) + - [11.1 The contract and the staleness window](#111-the-contract-and-the-staleness-window) + - [11.2 Bounded-freshness mode (optional)](#112-bounded-freshness-mode-optional) +12. [Create-after-404 and negative-cache lifecycle](#12-create-after-404-and-negative-cache-lifecycle) +13. [Eviction and capacity](#13-eviction-and-capacity) + - [13.1 Passive eviction (lifecycle)](#131-passive-eviction-lifecycle) + - [13.2 Active eviction (opt-in, access-frequency)](#132-active-eviction-opt-in-access-frequency) + - [13.3 ChunkCatalog size awareness](#133-chunkcatalog-size-awareness-load-bearing-operational-note) + - [13.4 Spool capacity](#134-spool-capacity) + - [13.5 `chunk_size` config-change capacity impact](#135-chunk_size-config-change-capacity-impact) + - [13.6 Eviction interactions](#136-eviction-interactions) +14. [Horizontal scale](#14-horizontal-scale) +15. [Deferred optimizations](#15-deferred-optimizations) + - [15.1 Edge rate limiting](#151-edge-rate-limiting) + - [15.2 Cluster-wide HEAD singleflight](#152-cluster-wide-head-singleflight) + - [15.3 Cluster-wide LIST coordinator](#153-cluster-wide-list-coordinator) + - [15.4 Mid-stream origin resume](#154-mid-stream-origin-resume) + - [15.5 Coordinated cluster-wide origin limiter](#155-coordinated-cluster-wide-origin-limiter) + - [15.6 Dynamic per-replica origin cap](#156-dynamic-per-replica-origin-cap) + +### Request scenarios + +Concrete request-flow narratives. Each scenario has a stable letter +identifier reused in the diagram heading. + +- **Scenario A** - warm read (cache hit): [Diagram 3](#diagram-3-scenario-a---warm-read-cache-hit) +- **Scenario B** - cold miss, local coordinator: [Diagram 4](#diagram-4-scenario-b---cold-miss-local-coordinator) +- **Scenario C** - concurrent miss, same-replica joiner: [Diagram 5](#diagram-5-scenario-c---concurrent-miss-same-replica-joiner) +- **Scenario D** - cold miss, remote coordinator (cross-replica fill): [Diagram 6](#diagram-6-scenario-d---cold-miss-remote-coordinator) +- **Scenario E** - range spanning multiple coordinators: [Diagram 7](#diagram-7-scenario-e---range-spanning-multiple-coordinators) +- **Scenario F** - Azure non-BlockBlob rejection: [Diagram 8](#diagram-8-scenario-f---azure-non-blockblob-rejection) +- **Scenario G** - create-after-404 (operator upload after client miss): [Diagram 10](#diagram-10-scenario-g---create-after-404-timeline) +- **Scenario H** - rolling restart membership flux: [Diagram 12](#diagram-12-scenario-h---rolling-restart-membership-flux) + +Other diagrams (D1, D2, D9, D11) depict architecture, math, or +mechanism rather than request scenarios and are reachable from the +Sections list above. + +--- + +## 1. Overview + +Edge devices inside an on-prem datacenter need read access to large files +held in cloud blob storage (S3, Azure Blob). Direct egress per device is +unacceptable (cost, latency, throughput, security boundary). 
Orca is +a read-only caching layer, deployed inside each datacenter, that fronts +cloud blob storage with an S3-compatible API. Clients issue range reads; +Orca serves from a shared in-DC store when present, otherwise +fetches from the cloud origin, stores the chunk, and returns it. + +This document describes the mechanism: decisions, components, request flow, +stampede protection, atomic commit, and horizontal-scale coordination. + +## 2. Decisions + +| Area | Decision | +|---|---| +| Client API | S3-compatible HTTP; `GET` + `HEAD` + `ListObjectsV2`; supports `Range`. | +| Auth (v1) | Network-perimeter trust + bearer / mTLS. No SigV4 verification yet. | +| Origins | S3 + Azure Blob behind a pluggable `Origin` interface. | +| Azure constraint | Block Blobs only. Append/Page Blobs rejected at `Head`. | +| Backing store | Pluggable `CacheStore`; `localfs` for dev, `s3` (VAST or any S3-compatible in-DC object store) **or** `posixfs` (NFSv4.1+, Weka native, CephFS, Lustre, GPFS, or any shared POSIX FS that honors `link()` / `EEXIST` and directory `fsync`) for prod. The CacheStore is the source of truth for chunk presence. Driver choice is a deployment-time decision per replica set; `s3` and `posixfs` are interchangeable from the cache layer's perspective. | +| In-DC S3 vs. cloud S3 | The in-DC S3-compatible store is treated identically to cloud S3 at the protocol level. The only difference is "much faster, in-DC". Both `Origin` and the `cachestore/s3` driver are thin S3-client adapters with no special-casing. The `cachestore/posixfs` driver replaces the S3 protocol with shared-POSIX primitives but presents the same `CacheStore` interface, so nothing above s7 changes. | +| CacheStore atomic-commit primitive | Two equivalent primitives, picked per driver: object-store `PutObject + If-None-Match: *` (used by `cachestore/s3`) and POSIX `link()` / `renameat2(RENAME_NOREPLACE)` returning `EEXIST` (used by `cachestore/localfs` and `cachestore/posixfs`). Both are atomic, no-clobber, and have a "you lost the race" failure mode that maps cleanly onto `commit_lost`. Each driver runs `SelfTestAtomicCommit` at boot and refuses to start on backends that don't honor its primitive. | +| Chunking | Fixed 8 MiB default (configurable 4-16 MiB). `chunk_size` baked into `ChunkKey`. | +| Consistency | **Origin objects are immutable per operator contract**: an `(origin_id, bucket, key)` never has its bytes modified once published; replacement must be a new key. `ETag` is identity, not freshness. `If-Match: ` on every `Origin.GetRange` is defense-in-depth that traps in-flight overwrites only. Bounded staleness uses two TTLs: `metadata_ttl` (default 5m) on positive entries (caps in-place-overwrite contract violations; see [s11](#11-bounded-staleness-contract)) and `negative_metadata_ttl` (default 60s) on negative entries (caps the create-after-404 unavailability window after an operator uploads a previously-missing key; see [s12](#12-create-after-404-and-negative-cache-lifecycle)). | +| Catalog | In-memory `ChunkCatalog` fronting `CacheStore.Stat`. No persistent local index. Per-entry access-frequency tracking (s10.2) feeds the optional active-eviction loop (s13.2). Bounded by `chunk_catalog.max_entries`; size to estimated working-set chunks (s13.3). | +| Eviction | Two-tier. Passive: bounded LRU on the in-memory ChunkCatalog (always on); CacheStore lifecycle (S3 lifecycle / posixfs operator sweep) for storage-side cleanup. 
Active: opt-in access-frequency-driven eviction loop (`chunk_catalog.active_eviction.enabled`, default `false`) that deletes cold chunks from the CacheStore via `CacheStore.Delete`. Operators using `cachestore/posixfs` typically enable active eviction since posixfs has no native lifecycle. See [s13](#13-eviction-and-capacity). | +| Prefetch | Sequential read-ahead by default. Configurable depth, capped concurrency. | +| Cluster | Kubernetes Deployment + headless Service for peer discovery + ClusterIP/LB for client traffic. Rendezvous hashing on pod IP selects the coordinator per `ChunkKey` for miss-fills only; receiving replica is the **assembler** that fans out per-chunk fill RPCs to coordinators (s8.3). All replicas can read all chunks directly from the CacheStore on hits. | +| Inter-replica auth | Separate internal mTLS listener (default `:8444`) chained to an internal CA distinct from the client mTLS CA; authorization = "presenter source IP is in current peer-IP set" (s8.8). | +| Local spool | Every fill writes origin bytes through a local spool (`internal/orca/fetch/spool`) in parallel with streaming to the client; serves as a slow-joiner fallback and as the source for the asynchronous CacheStore commit. The spool is NOT on the client-TTFB path in v1; client bytes flow origin -> client directly (s8.2 / s8.6). | +| Atomic commit | `localfs` and `posixfs` stage inside `/.staging/` with parent-dir fsync, then `link()` no-clobber (returns `EEXIST` to the loser); `s3` uses direct `PutObject` with `If-None-Match: *`. Each driver runs `SelfTestAtomicCommit` at boot: `s3` proves the backend honors `If-None-Match: *`; `posixfs` proves the backend honors `link()` / `EEXIST` and that directory fsync is durable, and additionally enforces `nfs.minimum_version` (default `4.1`, with opt-in `nfs.allow_v3`) and refuses to start on Alluxio FUSE backends. Cold-path bytes stream directly from origin to client; bounded leader-side **pre-header origin retry** (s8.6) handles transient origin failures invisibly before response headers are committed. The spool tees in parallel for joiners (s8.2) and as the CacheStore-commit source. CacheStore commit happens asynchronously after the response completes; commit-after-serve failure becomes `commit_after_serve_total{result="failed"}` rather than a client error (s8.6). | +| Versioned buckets on cachestore/s3 | Not supported. The `cachestore/s3` driver requires the bucket to have versioning **disabled**. AWS S3 honors `If-None-Match: *` on both versioned and unversioned buckets, but VAST Cluster (and likely other S3-compatible backends) only honors it on unversioned buckets ([VAST KB][vast-kb-conditional-writes]). The driver enforces this at boot via an explicit `GetBucketVersioning` versioning gate (s10.1.3); refusing to start on enabled or suspended versioning avoids a class of silent atomic-commit failures. | +| LIST caching | Per-replica TTL'd LIST cache (s6.2 / FW3) in front of `Origin.List`, sized for the FUSE-`ls` workload pattern. Default `list_cache.ttl=60s`, configurable. Cluster-wide LIST coordination is a deferred optimization ([s15.3](#153-cluster-wide-list-coordinator)). | +| Origin concurrency cap | Per-replica token bucket sized `floor(target_global / cluster.target_replicas)`. Default `target_global=192` and `cluster.target_replicas=3`, giving 64 slots per replica. Origin throttling responses (503 / 429) are handled by the leader's pre-header retry loop (s8.6) with exponential backoff. 
A coordinated cluster-wide limiter and dynamic recompute from `len(Cluster.Peers())` are deferred optimizations; see [s15.5](#155-coordinated-cluster-wide-origin-limiter) and [s15.6](#156-dynamic-per-replica-origin-cap). | +| Bounded-freshness mode | Optional, opt-in via `metadata_refresh.enabled` (default `false`). When enabled, a per-replica background loop proactively re-Heads hot keys (`AccessCount >= access_threshold`) ahead of `metadata_ttl` to shrink the effective bounded-staleness window for popular content. See [s11.2](#112-bounded-freshness-mode-optional). | +| Tenancy | Single tenant, single origin credential set in v1. | +| Edge rate limiting | Documented v1 gap; see [s15.1](#151-edge-rate-limiting). v1 has implicit hot-client mitigation via the per-replica origin limiter (s8.4) and singleflight (s8.1); per-client / per-IP / per-credential edge rate limiting is deferred future work. | +| Repo home | This repo. Layout mirrors `machina`. | + +[vast-kb-conditional-writes]: https://kb.vastdata.com/documentation/docs/s3-conditional-writes + +## 3. Terminology + +Terms used throughout this document. Forward-references point at the +section that defines or implements the full mechanism. + +- **Replica** - one running pod of the `orca` Deployment. All + replicas are interchangeable; there is no per-pod state. +- **Client** - external caller using an S3-compatible HTTP API (e.g. + `aws-sdk`, `boto3`). +- **Origin** - upstream cloud blob store (AWS S3 or Azure Blob); read-only + from our perspective. Interface defined in + [s7](#7-internal-interfaces). +- **CacheStore** - the in-DC durable store that holds cached chunk bytes + and is shared by all replicas. Pluggable: `localfs` for dev, `s3` (e.g. + VAST or any S3-compatible in-DC object store) and `posixfs` (shared + POSIX FS - NFSv4.1+, Weka native, CephFS, Lustre, GPFS) for prod; + driver choice is a deployment-time decision and is invisible above the + cachestore boundary. Treated as the source of truth for chunk presence. + Interface in [s7](#7-internal-interfaces); commit semantics in + [s10](#10-concurrency-durability-correctness). +- **Chunk** - a fixed-size byte range of an origin object (default 8 MiB); + the unit of caching and fill. +- **ChunkKey** - the immutable identifier for a chunk: + `{origin_id, bucket, object_key, etag, chunk_size, chunk_index}`. Full + definition in [s5](#5-chunk-model). +- **Headless Service** - Kubernetes `Service` with `clusterIP: None`; its + DNS A-record resolves to the IPs of all Ready pods. We poll it (default + every 5s) to discover the current peer set. +- **Rendezvous hashing** (a.k.a. Highest Random Weight, HRW) - for a given + key, score each peer with `hash(peer_ip || key)` and pick the argmax. + Stable under membership changes that don't add or remove the winning + peer. We use it to pick exactly one coordinator per chunk from the + current peer set. +- **Coordinator** - the replica that rendezvous hashing selects to perform + the miss-fill for a particular chunk. Ownership is **per chunk**, not + per request and not per object: a single client request spanning N + chunks may have N different coordinators. +- **Assembler** - the replica that received the client request. It is + responsible for stitching the client response. For each chunk in the + requested range, the assembler either (a) reads from CacheStore on a + hit, (b) runs a local miss-fill if it is the coordinator for that + chunk, or (c) issues an internal fill RPC to the coordinator otherwise. 
+ See [s8.3](#83-cluster-wide-deduplication-via-per-chunk-fill-rpc). +- **Singleflight** - a per-key in-process deduplication primitive. + Concurrent requests for the same `ChunkKey` share a single in-flight + fill: the first arrival is the **leader** (issues the origin GET); + subsequent arrivals are **joiners** (wait on the leader's stream). Full + mechanism in [s8.1](#81-per-chunkkey-singleflight). +- **Tee** - the leader's origin byte stream is split two ways: into a + small in-memory ring buffer for low-TTFB joiners, and into the Spool + (below) for slow joiners that fall behind the ring head. Joiners + therefore stream through the leader rather than waiting for the full + disk write. Full mechanism in [s8.2](#82-ttfb-tee--spool). +- **Spool** - bounded local-disk staging area for in-flight fills + (`internal/orca/fetch/spool`). Ensures slow joiners always have a + local fallback regardless of CacheStore driver. Detail in + [s8.2](#82-ttfb-tee--spool). +- **Atomic CacheStore commit** - the leader publishes the completed chunk + in a single no-clobber operation: `link()` / + `renameat2(RENAME_NOREPLACE)` for `localfs`; `PutObject` + + `If-None-Match: *` for `s3`. Concurrent commits cannot overwrite each + other; the loser is recorded as `commit_lost`. See + [s10](#10-concurrency-durability-correctness). +- **Per-chunk internal fill RPC** - `GET /internal/fill?key=` over mTLS on the internal listener (default `:8444`). The + assembler calls the coordinator when a chunk is missed and the + coordinator is not self. See [s8.8](#88-internal-rpc-listener). +- **Immutable origin contract** - operator promise that an + `(origin_id, bucket, key)` never has its bytes modified once published; + replacement is always a new key. The cache trusts this contract; on + violation, the bounded staleness window is `metadata_ttl` (default 5m). + Full statement in [s11](#11-bounded-staleness-contract). +- **Pre-header retry** - the leader retries `Origin.GetRange` on + transient errors **before** sending HTTP response headers to the + client, making transient origin failures invisible to the client. + Bounded by `origin.retry.attempts` (default 3) and + `origin.retry.max_total_duration` (default 5s). The "commit + boundary" is the first byte arrival from origin: once received, + the cache sends headers and starts streaming; subsequent origin + failures become mid-stream client aborts (handled by S3 SDK + retry via `Content-Length` mismatch). `OriginETagChangedError` + is non-retryable. Detail in + [s8.6](#86-failure-handling-without-re-stampede). Mid-stream + origin resume is deferred future work + ([s15.4](#154-mid-stream-origin-resume)). +- **CacheStore circuit breaker** - per-process error-rate breaker around + `CacheStore` calls. On sustained `ErrTransient` / `ErrAuth`, the + breaker opens, short-circuits writes, and surfaces via metrics and + `/readyz`. Defaults: 10 errors / 30s window, 30s open, 3 half-open + probes. Detail in [s10.2](#102-catalog-correctness-typed-errors-circuit-breaker). +- **Negative-cache entry** - a metadata-cache entry recording an + authoritative `404` (or unsupported-blob-type rejection) from + origin. Reused for `negative_metadata_ttl` (default 60s) before + re-Heading. Bounds the create-after-404 unavailability window; + see [s12](#12-create-after-404-and-negative-cache-lifecycle). +- **Shared-POSIX CacheStore** - the `cachestore/posixfs` driver: a + `CacheStore` backed by a shared POSIX-style filesystem mounted on every + replica at the same path. 
Concrete supported backends are NFSv4.1+ (the + baseline), Weka native (`-t wekafs`), CephFS (`-t ceph`), Lustre + (`-t lustre`), and IBM Spectrum Scale / GPFS (`-t gpfs`). Disqualified + on purpose: Alluxio FUSE (no `link(2)`, no atomic no-overwrite rename, + no NFS gateway). The driver depends on + `internal/orca/cachestore/internal/posixcommon/` (link-based + commit, dir-fsync, staging-dir helpers, fan-out path layout) which is + also depended on by `cachestore/localfs`. Detail in + [s10.1.2](#1012-cachestoreposixfs). +- **Atomic-commit primitive** - the no-clobber publish step that ends a + fill. Two equivalent shapes: object-store + `PutObject + If-None-Match: *` (used by `cachestore/s3`) and POSIX + `link()` / `renameat2(RENAME_NOREPLACE)` returning `EEXIST` to the + loser (used by `cachestore/localfs` and `cachestore/posixfs`). Both are + atomic, return a "you lost the race" signal that becomes + `commit_lost`, and are validated at boot by `SelfTestAtomicCommit`. + Detail in [s10.1](#101-atomic-commit-per-cachestore-driver). +- **Spool locality contract** - the local Spool (`spool.dir`) MUST live + on a local block device. The cache layer enforces this at boot via + `statfs(2)` against a denylist of network filesystems + (NFS / SMB / Ceph / Lustre / GPFS / FUSE) and refuses to start on + violation. Governed by `spool.require_local_fs` (default `true`). The + rationale and the boot check are in + [s10.4](#104-spool-locality-contract); the spool's role in the + cold-path TTFB barrier is in [s8.2](#82-ttfb-tee--spool). +- **LIST cache** - per-replica TTL'd cache of `Origin.List` responses + keyed on the full query tuple `(origin_id, bucket, prefix, + continuation_token, start_after, delimiter, max_keys)`. Default + `list_cache.ttl=60s`, configurable. Sized for the FUSE-`ls` + workload pattern (s6.2). Cluster-wide LIST coordination is a + deferred optimization ([s15.3](#153-cluster-wide-list-coordinator)). +- **Active eviction** - optional, opt-in background loop in the + cache layer (`chunk_catalog.active_eviction.enabled`, default + `false`) that uses access-frequency tracking on the + `ChunkCatalog` to delete cold chunks from the CacheStore via + `CacheStore.Delete`. Recommended for `cachestore/posixfs` + deployments without external sweep tooling. Detail in + [s13.2](#132-active-eviction-opt-in-access-frequency). +- **Bounded-freshness mode** - optional, opt-in + (`metadata_refresh.enabled`, default `false`) per-replica + background loop that proactively re-Heads hot keys ahead of + `metadata_ttl`. Shrinks the effective bounded-staleness window + for popular content from `metadata_ttl` to + `refresh_ahead_ratio * metadata_ttl` (default 3.5m). Hot-key + detection uses access-frequency counters on the metadata cache + (parallel to the ChunkCatalog tracking from FW8). Detail in + [s11.2](#112-bounded-freshness-mode-optional). +- **S3 versioning gate** - boot-time `GetBucketVersioning` check + by `cachestore/s3` that refuses to start if the bucket has + versioning enabled or suspended. Required because + `If-None-Match: *` is not honored on versioned buckets across + all S3-compatible backends; without this gate the atomic-commit + primitive silently degrades. Detail in + [s10.1.3](#1013-cachestores3). + +## 4. Architecture + +A single binary, `orca`, deployed as a Kubernetes Deployment. +Replicas discover each other through a headless Service and refresh the +peer set on a configurable interval (default 5s). 
A request from a client +lands on one replica - the **assembler** - which iterates the requested +range chunk-by-chunk. For each `ChunkKey`, the assembler reads directly +from the shared CacheStore on a hit; on a miss it routes to the chunk's +**coordinator** (selected by rendezvous hashing on the current peer-IP +set) for a singleflight + tee + spool + atomic-commit fill. The +coordinator may be the assembler itself, in which case the fill runs +locally; otherwise the assembler issues a per-chunk internal fill RPC. +All terms are defined in [s3](#3-terminology). Single tenant. One origin +credential set per deployment. + +### Diagram 1: System overview + +```mermaid +graph TB + subgraph DC["On-prem datacenter"] + Clients["Edge clients"] + Service["Service (ClusterIP / LB)
client traffic"] + subgraph Replicas["orca Deployment"] + R1["Replica 1"] + R2["Replica 2"] + R3["Replica N"] + end + Headless["Headless Service
peer discovery"] + Internal["Internal listener :8444
per-chunk fill RPC
(mTLS, peer-IP authz)"] + CS[("CacheStore
in-DC S3 / posixfs / localfs")] + end + subgraph Cloud["Cloud origins"] + S3[("AWS S3")] + Azure[("Azure Blob
Block Blobs only")] + end + Clients -- "S3 GET / HEAD / LIST
+ Range" --> Service + Service --> R1 + Service --> R2 + Service --> R3 + R1 -. "DNS refresh
default 5s" .-> Headless + R2 -.-> Headless + R3 -.-> Headless + R1 <--> Internal + R2 <--> Internal + R3 <--> Internal + R1 <--> CS + R2 <--> CS + R3 <--> CS + R1 -- "miss-fill
If-Match: etag" --> S3 + R2 -- "miss-fill
If-Match: etag" --> S3 + R3 -- "miss-fill
If-Match: etag" --> Azure
+```
+
+## 5. Chunk model
+
+- `ChunkKey = {origin_id, bucket, object_key, etag, chunk_size, chunk_index}`.
+  - `origin_id` is a deployment-scoped identifier from config (e.g.
+    `aws-us-east-1-prod`, `azure-eastus-research`). Required. Namespaces
+    cache key derivation and the on-store path so two deployments can
+    safely share a CacheStore bucket.
+  - `etag` captures immutability. A new ETag is treated as a new logical
+    object and gets a fresh set of chunks. Old chunks age out via the
+    CacheStore's lifecycle policy.
+  - `chunk_size` is part of the key so a runtime config change does not
+    silently corrupt or shadow existing data.
+- `chunk_index = floor(byte / chunk_size)`.
+- An object metadata cache holds `{origin_id, bucket, key} -> {size, etag,
+  content_type, last_validated, last_status}` with a small TTL. Avoids
+  re-`HEAD`ing origin on every request.
+
+The CacheStore's namespace **is** the chunk index. `ChunkKey`
+deterministically produces a path. Cache key derivation uses canonical
+length-prefixed encoding to remove ambiguity from separators that may
+appear in any field:
+
+```
+LP(s) = LE64(uint64(len(s))) || s
+hashKey = sha256(
+    LP(origin_id) ||
+    LP(bucket) ||
+    LP(key) ||
+    LP(etag) ||
+    LE64(chunk_size)
+  )
+path = "<origin_id>/<hashKey>/<chunk_index>"
+```
+
+`origin_id` appears in the path in the clear (and `chunk_size` is folded
+into the hash, not the path) so operators can run per-origin lifecycle
+policies and target a specific deployment with `aws s3 rm --recursive
+<cachestore-bucket>/<origin_id>/`.
+
+The `cachestore/posixfs` driver inserts a 2-character hex fan-out
+between `<origin_id>` and `<hashKey>` to keep directory sizes
+manageable on multi-PB working sets; that variant and its
+`cachestore.posixfs.fanout_chars` knob are specified in
+[s10.1.2](#1012-cachestoreposixfs). The `s3` and `localfs` drivers use
+the unmodified path above.
+
+**Operational note: changing `chunk_size`.** Because `chunk_size` is a
+field of `ChunkKey` and is folded into the path hash, changing it in
+deployment config never corrupts or shadows existing chunks; old-sized
+chunks remain valid byte ranges of the old logical layout but are no
+longer addressable. Operators should plan for transient storage
+doubling and a cold-period origin-cost spike when changing
+`chunk_size` on a hot working set: the working set is rebuilt at the
+new size on demand while the old set ages out via the CacheStore
+lifecycle policy (or, on `posixfs`, the operator's external sweep -
+see [s13](#13-eviction-and-capacity)).
+
+Whether a chunk is present is answered by `CacheStore.Stat(key)`. An
+in-memory `ChunkCatalog` LRU memoizes recent positive lookups so the hot
+path never touches the CacheStore for metadata. The catalog is purely a
+hot-path optimization; it can be dropped at any time without affecting
+correctness.
+
+For a request `Range: bytes=A-B`:
+
+```
+firstChunk = A / chunk_size
+lastChunk  = B / chunk_size
+for cid := firstChunk; cid <= lastChunk; cid++ {   // streaming iterator
+    fetchOrServe(cid)                              // + sliding prefetch window
+    sliceWithin(cid, max(A, cid*sz), min(B, (cid+1)*sz - 1))
+}
+```
+
+The chunk loop is a **streaming iterator**: at no point is the full
+`[]ChunkKey` for the range materialized into a slice. Prefetch operates on
+a sliding window of `min(prefetch_depth, lastChunk - cid)` ahead of the
+current cursor. A configurable `server.max_response_bytes` cap returns
+`400 RequestSizeExceedsLimit` (with header `x-orca-cap-exceeded: true`)
+before any cache lookup if the computed response size exceeds the cap;
+`416` is reserved for true Range-vs-object-size violations (s6.3).
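+
+To make the derivation concrete, here is a minimal Go sketch of the
+length-prefixed hash and path layout above. It assumes the `ChunkKey`
+field names from this section; the helper names (`lp`, `CachePath`) and
+the example values are illustrative only and do not reflect the actual
+helpers under `internal/orca/`.
+
+```go
+package main
+
+import (
+	"crypto/sha256"
+	"encoding/binary"
+	"encoding/hex"
+	"fmt"
+	"path"
+)
+
+type ChunkKey struct {
+	OriginID   string
+	Bucket     string
+	ObjectKey  string
+	ETag       string
+	ChunkSize  int64
+	ChunkIndex int64
+}
+
+// lp implements LP(s): an 8-byte little-endian length prefix followed by
+// the bytes of s, so field boundaries are unambiguous in the hash input.
+func lp(s string) []byte {
+	b := make([]byte, 8, 8+len(s))
+	binary.LittleEndian.PutUint64(b, uint64(len(s)))
+	return append(b, s...)
+}
+
+// CachePath renders "<origin_id>/<hashKey>/<chunk_index>"; chunk_size is
+// folded into the hash, not the path, exactly as described above.
+func (k ChunkKey) CachePath() string {
+	h := sha256.New()
+	h.Write(lp(k.OriginID))
+	h.Write(lp(k.Bucket))
+	h.Write(lp(k.ObjectKey))
+	h.Write(lp(k.ETag))
+	var sz [8]byte
+	binary.LittleEndian.PutUint64(sz[:], uint64(k.ChunkSize))
+	h.Write(sz[:])
+	hashKey := hex.EncodeToString(h.Sum(nil))
+	return path.Join(k.OriginID, hashKey, fmt.Sprintf("%d", k.ChunkIndex))
+}
+
+func main() {
+	k := ChunkKey{
+		OriginID:   "aws-us-east-1-prod", // example values, not real config
+		Bucket:     "models",
+		ObjectKey:  "weights/shard-00.bin",
+		ETag:       "686897696a7c876b7e",
+		ChunkSize:  8 << 20, // 8 MiB default
+		ChunkIndex: 3,       // floor(25 MiB / 8 MiB)
+	}
+	fmt.Println(k.CachePath())
+}
+```
+
+With the default 8 MiB `chunk_size`, a read at byte offset 25 MiB maps
+to `chunk_index = 3`, matching the `floor(byte / chunk_size)` rule above.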
+ +### Diagram 2: Range request -> chunk index mapping + +```mermaid +flowchart LR + Req["GET /bucket/key
Range: bytes=A-B"] --> Math["chunk_size = 8 MiB
firstChunk = A / chunk_size
lastChunk = B / chunk_size"] + Math --> Iter["streaming iterator
cid := firstChunk..lastChunk
sliding prefetch window"] + Iter --> Keys["per cid: ChunkKey =
{origin_id, bucket, key,
etag, chunk_size, cid}"] + Keys --> Path["path =
origin_id /
hex(sha256(LP(origin_id) || ...)) /
cid"] + Path --> CS[("CacheStore
address")] +``` + +## 6. Request flow + +1. `GET /{bucket}/{key}` arrives with optional `Range`. +2. Auth middleware (bearer / mTLS) validates the caller. +3. `fetch.Coordinator` looks up object metadata in the metadata cache. On + miss, **per-replica** singleflight at the metadata layer issues at most + one `HEAD` per object per replica per metadata-cache window. Cluster-wide + bound is therefore N HEADs per object per window worst case where N is + the current peer-set size; this is acceptable in v1 (a cluster-wide HEAD + singleflight is a deferred optimization; see [s15.2](#152-cluster-wide-head-singleflight)). + Two TTLs apply, asymmetric by design (s12): + **positive entries** (`200` + ETag) are reused for `metadata_ttl` + (default 5m), which also bounds the staleness window if the + immutable-origin contract (s11) is violated. **Negative entries** + (`404`, unsupported-blob-type) are reused for `negative_metadata_ttl` + (default 60s), which bounds the create-after-404 unavailability window + after an operator uploads a previously-missing key. +4. If the request has `Range`, validate against `ObjectInfo.Size`; serve + `416` if unsatisfiable. Compute `firstChunk` and `lastChunk`. If + `server.max_response_bytes > 0` and the computed response size exceeds + it, return `400 RequestSizeExceedsLimit` (S3-style XML error body) + with `x-orca-cap-exceeded: true`. `416` is reserved for true + Range-vs-object-size violations. +5. Iterate the chunk range as a streaming iterator. For each `ChunkKey`: + - **ChunkCatalog hit:** open reader from `CacheStore`. Typed + `CacheStore` errors (s7) are honored: only `ErrNotFound` triggers a + refill; `ErrTransient` surfaces as `503 Slow Down` with `Retry-After`, + `ErrAuth` surfaces as `502 Bad Gateway` and counts toward the + `/readyz` `ErrAuth` threshold (default 3 consecutive -> NotReady). + - **ChunkCatalog miss:** call `CacheStore.Stat(key)`. If present, + record in the catalog and serve from the CacheStore. If absent, take + the miss-fill path (s8), which routes to the coordinator for that + specific chunk via local singleflight or per-chunk internal RPC. +6. **Cold path: stream directly with pre-header retry**. On a chunk + miss, the leader issues `Origin.GetRange` with bounded retry + (s8.6) **before** any HTTP response header is sent to the client. + Transient origin failures (5xx, network errors) on retryable + attempts are invisible to the client: the leader retries up to + `origin.retry.attempts` (default 3) with exponential backoff + capped by `origin.retry.max_total_duration` (default 5s). The + commit boundary is the **first byte arrival from origin**: once + the leader has received any byte, response headers + (`Content-Length`, `Content-Range`, `ETag`, + `Accept-Ranges: bytes`) are sent immediately and the leader + begins streaming bytes to the client as they arrive from origin. + The leader simultaneously tees bytes into the local Spool (s8.2) + for joiner support and for the asynchronous CacheStore commit. + `Content-Length` and `Content-Range` are computable from + `ObjectInfo.Size` and the chunk math, so headers can be sent + before the body completes. Pre-commit failures + (`OriginETagChangedError`, retry budget exhausted, internal RPC + failure, semaphore timeout) return a clean HTTP error before + any byte is sent (typically `502 Bad Gateway` or `503 Slow + Down`). 
The CacheStore commit happens asynchronously after the + client response completes, using whichever atomic primitive the + configured driver advertises (`PutObject + If-None-Match: *` for + `s3`; `link()` / `EEXIST` for `localfs` and `posixfs`). The + assembler is driver-agnostic: it calls `CacheStore.PutChunk` and + treats the typed error the same way regardless of backing store. + Commit-after-serve failure does NOT affect the in-flight client + response; it increments + `orca_commit_after_serve_total{result="failed"}` and the + chunk is **not** recorded in the `ChunkCatalog` (the next + request will refill). +7. **Mid-stream failure**: once any body byte has been written + (i.e., after the commit boundary), no HTTP error status is + possible. Mid-stream failures (origin disconnect after first + byte, or any post-commit error) abort the response (HTTP/2 + `RST_STREAM` with `INTERNAL_ERROR`; HTTP/1.1 `Connection: close` + after the partial write) and increment + `orca_responses_aborted_total{phase="mid_stream",reason}`. + S3 clients (aws-sdk, boto3, etc.) detect this via + `Content-Length` mismatch and retry. Mid-stream origin resume + (re-issue origin GET with `Range: bytes=-` and continue + feeding the client transparently) is deferred future work + ([s15.4](#154-mid-stream-origin-resume)). +8. If sequential prefetch is enabled, the iterator schedules asynchronous + fills for the next N chunks (capped per blob and globally) one chunk + ahead of the cursor. + +### Diagram 3: Scenario A - warm read (cache hit) + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant R as Replica + participant Cat as ChunkCatalog + participant CS as CacheStore + C->>R: GET /bucket/key Range: bytes=A-B + R->>R: chunk math -> streaming iterator + Note over R: defer headers until first chunk in hand + loop each ChunkKey (streaming) + R->>Cat: Lookup(k) + Cat-->>R: hit (ChunkInfo) + R->>CS: GetChunk(k, off, n) + CS-->>R: bytes + opt first chunk + R-->>C: 200/206 + Content-Length, Content-Range, ETag + end + R-->>C: stream slice + end + Note over R,CS: All replicas read directly from shared CacheStore on hit
and no peer is involved on the hit path +``` + +### Diagram 4: Scenario B - cold miss, local coordinator + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant R as Replica (assembler == coordinator) + participant Cat as ChunkCatalog + participant SF as Singleflight + participant Sp as Spool + participant O as Origin + participant CS as CacheStore + C->>R: GET /bucket/key Range + R->>Cat: Lookup(k) + Cat-->>R: miss + R->>CS: Stat(k) + CS-->>R: ErrNotFound + R->>SF: Acquire(k) [leader] + SF->>O: GetRange(bucket, key, etag, off, n)
If-Match: etag
(pre-header retry s8.6) + O-->>SF: first byte + Note over SF: commit boundary - origin healthy + par stream to client + SF-->>R: stream bytes as they arrive from origin + R-->>C: 200/206 + headers + body + and tee to spool + SF->>Sp: write bytes (in parallel) + end + O-->>SF: remaining bytes + SF->>Sp: Commit (fsync + close) [after stream complete] + SF-)CS: PutObject(final, body, If-None-Match: *) [async] + CS--)SF: 200 (commit_won) or failure + alt commit ok + SF->>Cat: Record(k, info) + Note over SF: commit_after_serve_total{result=ok}++ + else commit failed + Note over SF: commit_after_serve_total{result=failed}++
chunk NOT recorded - next request refills + end + SF->>SF: Release(k) + SF->>Sp: release after joiners drain +``` + +### 6.1 HEAD request flow + +`HEAD /{bucket}/{key}` is served entirely from object metadata; no +chunk lookup is performed. + +1. Auth as for GET. +2. `fetch.Coordinator` looks up `ObjectInfo` in the metadata cache. + On miss, the metadata-layer singleflight (s8.7) issues at most one + `Origin.Head` per object per replica per `metadata_ttl` window. +3. On success, return `200 OK` with `Content-Length: + ObjectInfo.Size`, `ETag: "ObjectInfo.ETag"`, `Content-Type: + ObjectInfo.ContentType`, `Accept-Ranges: bytes`. No + `CacheStore.Stat` and no `CacheStore.GetChunk` calls. +4. Negative cases reuse the GET error mapping (s6.3): `404` is + negatively cached for `negative_metadata_ttl` (s12); an unsupported azureblob + blob type (s9) returns `502 OriginUnsupported` with the + `x-orca-reject-reason` header. + +HEAD does NOT validate `If-Match` / `If-None-Match` / `If-Modified-Since` +preconditions against the cache state in v1; conditional HEAD is a +read-only client-side concern that operates on the returned `ETag`. + +### 6.2 LIST request flow + +`GET /{bucket}/?list-type=2&prefix=...` (S3 ListObjectsV2). v1 LIST +serves from a per-replica **LIST cache** (s6.2 introduces it; FW3) +in front of the existing per-replica LIST singleflight. The cache +is sized and tuned for the FUSE-`ls` workload pattern: thousands of +edge clients implementing FUSE filesystems perform interactive +`ls` and directory navigation against the S3 API, generating +prefix-clustered LIST traffic where the same query is repeated +many times within a short window. Per-replica caching is naturally +effective for FUSE clients because they typically pin to one +replica via HTTP/2 keepalive. + +**Cache key**: the full LIST query tuple +`(origin_id, bucket, prefix, continuation_token, start_after, +delimiter, max_keys)`. Pagination tokens are part of the key, so +sequential page-through caches each page independently and does +not collide. + +**TTL**: governed by `list_cache.ttl` (default 60s, configurable +typical range 5s - 30m). The 60s default trades freshness vs. +origin load: a freshly-uploaded key is invisible to LIST clients +for up to 60s. Acceptable for the immutable-artifact workload; +operators with write-and-immediately-list patterns should tune +shorter. + +**Eviction**: bounded LRU on `list_cache.max_entries` (default +1024). Memory math: 1024 entries times ~10 KB typical (1000-key +listing) = ~10 MB worst case. + +**Response-size cap**: very large LIST responses +(>`list_cache.max_response_bytes`, default 1 MiB) bypass the cache +entirely; the response is served to the client but not stored. + +**Steps**: + +0. **Cache lookup**. Compute the cache key from the request + parameters. On hit, serve the cached `ListResult` directly with + header `x-orca-list-cache-age: `. No origin + call. No singleflight acquisition. `list_cache_hit_total{origin_id, + result="hit"}++`. + +1. Auth as for GET. + +2. On cache miss, the request parameters `(prefix, continuation-token + / start-after, max-keys, delimiter)` are forwarded verbatim to + `Origin.List`. The continuation token returned to the client is + the origin's token passed through unchanged. There is no token + rewriting. + +3. **Per-replica LIST singleflight** keyed on the same cache-key + tuple collapses concurrent identical LIST calls on the same + replica during the cache miss. 
There is no cluster-wide LIST + singleflight in v1; cluster-wide bound is up to `N` `Origin.List` + calls per identical query per `list_cache.ttl` window where `N` + is peer-set size. Acceptable at v1 scale; a cluster-wide LIST + coordinator is a deferred optimization + ([s15.3](#153-cluster-wide-list-coordinator)). + +4. **azureblob origin**: when `cachestore.azureblob.list_mode = filter` + (the default), non-BlockBlob entries are stripped while + continuation tokens are preserved (s9). `passthrough` mode + disables filtering and returns the entire listing including + unsupported blob types. + +5. **Cache populate** on successful `Origin.List`. If the serialized + `ListResult` exceeds `list_cache.max_response_bytes`, skip the + populate (serve the response normally) and increment + `list_cache_evict_total{reason="response_too_large"}`. Otherwise + store with TTL = `list_cache.ttl`. Negative responses (errors) + are NOT cached; errors fall through every time. Empty-result + listings ARE cached (an authoritative "this prefix has no keys" + for the TTL window). + +6. LIST does NOT populate the metadata cache for individual entries. + A subsequent GET / HEAD on a listed key still triggers an + `Origin.Head` (subject to its own singleflight and TTL). + Rationale: eager metadata population on large listings would + balloon the metadata cache, and the FUSE workload typically + reads only a fraction of listed entries. + +7. Origin failures during LIST surface as `502 Bad Gateway` + (`ErrTransient` upstream) or the corresponding S3 error code; + LIST does NOT trip the CacheStore circuit breaker because it + never touches the CacheStore. + +**Stale-while-revalidate** is opt-in via +`list_cache.swr_enabled: false` default. When enabled with +`list_cache.swr_threshold_ratio: 0.5` (default), an entry whose +age exceeds half of `list_cache.ttl` is served immediately AND +triggers a background `Origin.List` to refresh; the user-observed +latency stays at cache-hit speed even at TTL boundaries. Adds +small extra origin load (one refresh per entry per TTL window). +Useful for heavy interactive FUSE deployments where `ls` latency +spikes at TTL expiry are user-visible. + +**Toggle**: `list_cache.enabled: true` default. Set `false` to +disable the cache layer for diagnostics; LIST falls through to the +existing pass-through behavior with per-replica singleflight only. + +### 6.3 HTTP error-code mapping + +The complete catalog of HTTP statuses the cache layer can return on +the **client edge**. Internal-listener (`:8444`, s8.8) statuses are +listed inline in s8.3 and are not reproduced here. + +| Status | S3-style code | Reason | Triggered by | Client retry? | +|---|---|---|---|---| +| `200 OK` / `206 Partial Content` | (none) | normal hit or successful fill | hit + range OK; cold-path fill after pre-header-retry commit (s8.6) | n/a | +| `400 RequestSizeExceedsLimit` | `RequestSizeExceedsLimit` | response would exceed `server.max_response_bytes` | range math at request entry; `x-orca-cap-exceeded: true` | no (different range) | +| `416 Requested Range Not Satisfiable` | `InvalidRange` | range vs. 
`ObjectInfo.Size` violation | range math at request entry | no (different range) | +| `502 Bad Gateway` | `OriginUnreachable` | origin error before commit boundary | `Origin.GetRange` 5xx; origin DNS failure; semaphore exhausted past wait | yes, small backoff | +| `502 Bad Gateway` | `OriginRetryExhausted` | leader retry budget exhausted (`origin.retry.attempts` or `origin.retry.max_total_duration`) before any byte from origin (s8.6) | sustained transient origin failures during pre-header retry | yes (origin may recover) | +| `502 Bad Gateway` | `OriginETagChanged` | `OriginETagChangedError` from `Origin.GetRange` (s8.6) | mid-flight overwrite caught by `If-Match`; non-retryable | yes (next request re-Heads) | +| `502 Bad Gateway` | `OriginUnsupported` | non-BlockBlob azureblob (s9) | `Origin.Head` returns unsupported blob type | no | +| `502 Bad Gateway` | `BackendUnavailable` | CacheStore `ErrAuth` | CacheStore credentials rejected | no (operator) | +| `503 Slow Down` | `SlowDown` | CacheStore `ErrTransient` | CacheStore 5xx / timeout / throttle | yes | +| `503 Slow Down` | `SlowDown` | spool full | `spool.max_inflight` exhausted past wait | yes | +| `503 Slow Down` | `SlowDown` | breaker open | per-process CacheStore breaker open (s10.2) | yes | +| `503 Service Unavailable` | (probe) | replica NotReady | `/readyz` failing predicates (s10.5) | n/a (LB drain) | +| (mid-stream abort) | n/a | post-commit-boundary failure | origin disconnect after first byte sent to client; CacheStore commit failure does NOT cause this (commit is post-response) | client SDK detects via `Content-Length` mismatch and retries; mid-stream resume deferred (s15.4) | + +`Retry-After: 1s` is set on every `503 Slow Down`. Pre-first-byte +errors carry an S3-style XML body (`......`). +Mid-stream aborts terminate the response (`HTTP/2 RST_STREAM(INTERNAL_ERROR)` +or `HTTP/1.1 Connection: close`) and increment +`orca_responses_aborted_total{phase="mid_stream",reason}`. + +## 7. Internal interfaces + +The mechanism's named seams. Implementations live under +`internal/orca/`. + +```go +// Origin: read-only view of upstream blob store. GetRange takes the etag +// from the prior Head and uses it as an If-Match precondition; mid-flight +// overwrite returns OriginETagChangedError. +type Origin interface { + Head(ctx context.Context, bucket, key string) (ObjectInfo, error) + GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) + List(ctx context.Context, bucket, prefix, marker string, max int) (ListResult, error) +} + +// OriginETagChangedError is returned by Origin.GetRange when the origin +// rejects the If-Match precondition. The fill is refused and the metadata +// cache entry for {origin_id, bucket, key} is invalidated; the next +// request re-Heads and gets a fresh ChunkKey.etag. +type OriginETagChangedError struct { + Bucket, Key string + Want, Got string // Want = ETag we expected; Got = current ETag if known +} + +// CacheStore: where chunk bytes physically live in the DC. Treated as the +// source of truth for chunk presence; backed by an in-DC S3-like service +// in production and a local directory in dev. PutChunk is atomic and +// no-clobber; the second concurrent PutChunk for the same key returns a +// CommitLost error. Read/Stat methods return typed errors: +// - ErrNotFound: chunk is absent. ONLY this error triggers a refill. +// - ErrTransient: backend hiccup (5xx, timeout, throttle). Surfaced as +// 503 Slow Down + Retry-After. 
Counts toward the
+//   per-process circuit breaker (see s10.2).
+// - ErrAuth: backend rejected credentials (401/403). Surfaced as
+//   502 Bad Gateway. Counts toward the breaker AND toward
+//   the /readyz consecutive-ErrAuth threshold (default 3
+//   -> NotReady).
+//
+// Delete removes a chunk; used by active eviction (s13.2). Idempotent;
+// ErrNotFound on a missing chunk is treated as success by the eviction
+// loop. Delete errors count toward the same circuit breaker as Get / Put.
+type CacheStore interface {
+	GetChunk(ctx context.Context, k ChunkKey, off, n int64) (io.ReadCloser, error)
+	PutChunk(ctx context.Context, k ChunkKey, size int64, r io.Reader) error // atomic, no-clobber
+	Stat(ctx context.Context, k ChunkKey) (ChunkInfo, error)
+	Delete(ctx context.Context, k ChunkKey) error // s13.2 active eviction
+	SelfTestAtomicCommit(ctx context.Context) error // startup probe
+}
+
+// CacheStore typed errors. Wrap with %w so callers use errors.Is.
+var (
+	ErrNotFound  = errors.New("cachestore: not found")
+	ErrTransient = errors.New("cachestore: transient")
+	ErrAuth      = errors.New("cachestore: auth")
+)
+
+// ChunkCatalog: in-memory, best-effort record of chunks known to be
+// present in the CacheStore. Purely a hot-path optimization; the
+// CacheStore is the source of truth. A Lookup miss falls through to
+// CacheStore.Stat; the result is Recorded for subsequent requests.
+//
+// Lookup has a side effect: it increments the matched entry's
+// AccessCount and updates LastAccessed (s10.2). These access counters
+// are consumed by the optional active eviction loop (s13.2). Side
+// effects are atomic; Lookup remains safe for concurrent callers.
+//
+// Forget is invoked when an entry is known to be invalid:
+// - on OriginETagChangedError, the assembler Forgets the now-stale
+//   ChunkKey (its etag has been superseded);
+// - on a CacheStore.GetChunk returning ErrNotFound for a key that
+//   was previously Recorded (lifecycle eviction caught the entry);
+// - by the active eviction loop (s13.2) after a successful
+//   CacheStore.Delete.
+// In v1 there are no other callers.
+type ChunkCatalog interface {
+	Lookup(k ChunkKey) (ChunkInfo, bool)
+	Record(k ChunkKey, info ChunkInfo)
+	Forget(k ChunkKey)
+}
+
+// Cluster: peer discovery + rendezvous hashing. Returns the coordinator
+// peer for a given ChunkKey. self == coordinator means handle locally.
+// InternalDial returns a transport (HTTP/2 over mTLS) for issuing
+// internal RPCs to a non-self peer. ServerName returns the stable SAN
+// (default "orca.<namespace>.svc") used for TLS verification across
+// rolling restarts and pod-IP churn; per-replica internal-listener certs
+// MUST include this SAN.
+type Cluster interface {
+	Coordinator(k ChunkKey) Peer // returns self or remote Peer
+	Self() Peer
+	Peers() []Peer // current membership snapshot
+	InternalDial(ctx context.Context, p Peer) (InternalClient, error)
+	ServerName() string // e.g. "orca.<namespace>.svc"
+}
+
+// Spool: bounded local-disk staging area for in-flight fills. Every fill
+// writes through the spool so slow joiners can fall back from the leader's
+// ring buffer to a local disk reader regardless of CacheStore driver.
+type Spool interface { + Begin(k ChunkKey, size int64) (SpoolWriter, error) + Reader(k ChunkKey, off int64) (io.ReadCloser, error) + Release(k ChunkKey) // drop spool entry once all in-flight readers are done +} + +type SpoolWriter interface { + io.Writer + Commit() error // fsync + close + Abort() error // discard +} + +// --------------------------------------------------------------------- +// Supporting types referenced by the interfaces above. +// --------------------------------------------------------------------- + +// ObjectInfo: result of a successful Origin.Head and the metadata-cache +// entry shape. LastValidated and LastStatus are advisory and used for +// negative-cache TTL accounting (s8.6). +type ObjectInfo struct { + Size int64 + ETag string + ContentType string + LastValidated time.Time + LastStatus int // last HTTP status seen from the origin +} + +// ChunkInfo: result of a successful CacheStore.Stat or +// ChunkCatalog.Lookup. Size is the on-store byte length, which equals +// chunk_size for all chunks except the last chunk of an object (which +// is partial; see s10.3). +// +// AccessCount, LastAccessed, and LastEntered are set by the +// ChunkCatalog as access-frequency tracking for the optional active +// eviction loop (s13.2). They are zero-valued on freshly-Recorded +// entries and are atomically updated by Lookup. +type ChunkInfo struct { + Size int64 + Committed time.Time + AccessCount uint32 // s13.2; saturates at MaxUint32 + LastAccessed time.Time // s13.2; updated on Lookup hit + LastEntered time.Time // s13.2; set on Record; never updated +} + +// ListResult: paginated result from Origin.List. +type ListResult struct { + Entries []ObjectEntry + NextMarker string + IsTruncated bool +} + +// ObjectEntry: one item in a ListResult. BlobType is azureblob-specific +// and lets the cache filter non-BlockBlob entries while preserving +// continuation tokens (s9). +type ObjectEntry struct { + Key string + Size int64 + ETag string + BlobType string // "" for s3 origin; "BlockBlob" / "PageBlob" / "AppendBlob" for azureblob +} + +// Peer: a single replica in the current peer-set snapshot returned by +// Cluster.Peers / Cluster.Coordinator / Cluster.Self. +type Peer struct { + IP string // pod IP from the headless Service A-record + Self bool // true iff this is the current process +} + +// InternalClient: HTTP/2 over mTLS client to a peer's internal listener. +// Returned by Cluster.InternalDial. v1 exposes the per-chunk fill RPC +// only. +type InternalClient interface { + Fill(ctx context.Context, k ChunkKey) (io.ReadCloser, error) +} + +// MetadataCacheEntry: per-entry shape of the metadata cache (s8.7, +// s11.2). Access tracking is set unconditionally on Lookup hit but +// only consumed by the optional bounded-freshness mode (s11.2). +type MetadataCacheEntry struct { + ObjectInfo + AccessCount uint32 // s11.2; saturates at MaxUint32 + LastAccessed time.Time // s11.2; updated on Lookup hit + LastEntered time.Time // s11.2; set on Record; never updated +} +``` + +Implementations: + +- `Origin`: `origin/s3`, `origin/azureblob` (Block Blob only). Both pass + the caller's `etag` as `If-Match` on the underlying GET; both translate + the backend's "precondition failed" status into `OriginETagChangedError`. +- `CacheStore`: `cachestore/localfs` (dev), `cachestore/s3` (in-DC + S3-compatible object store, e.g. VAST), `cachestore/posixfs` (shared + POSIX FS: NFSv4.1+ baseline, plus Weka native, CephFS, Lustre, GPFS). 
+ See [s10.1](#101-atomic-commit-per-cachestore-driver) for atomic-commit + specifics per driver. The two POSIX-shaped drivers (`localfs` and + `posixfs`) share their commit primitives (`link()` no-clobber, dir + fsync, staging-dir layout, optional fan-out) via + `internal/orca/cachestore/internal/posixcommon/`; this is an + internal-to-cachestore package and is not visible to the rest of the + cache layer. +- `ChunkCatalog`: a single in-memory LRU implementation with + optional access-frequency tracking driving the active eviction + loop (s13.2). Bounded by `chunk_catalog.max_entries`. +- `Cluster`: a single implementation that polls the headless Service + (default 5s), computes rendezvous hashes against pod IPs, and exposes + an mTLS HTTP/2 client for the internal listener. +- `Spool`: a single implementation backed by a configured local directory + (`spool.dir`) with a capacity cap (`spool.max_bytes`) and an in-flight + cap (`spool.max_inflight`). + +## 8. Stampede protection + +The single most important hot-path correctness issue. Layered defense. + +### 8.1 Per-`ChunkKey` singleflight + +Process-local map `inflight: map[ChunkKey]*Fill`, guarded by a mutex. Each +`*Fill` has a `done` channel, an error slot, the resulting `ChunkInfo`, a +bounded ring buffer, a `Spool` handle (s8.2), and a refcount. Acquire +path: under the lock, either return the existing entry as a joiner or +insert a new entry and become the leader. Release path: leader removes +the entry from the map after signalling, so any thread arriving while the +entry is mapped joins; any thread arriving after removal records the +chunk in the `ChunkCatalog` (which the leader populated before releasing) +and serves a normal hit. + +### 8.2 TTFB tee + spool + +In v1 the leader streams origin bytes directly to the requesting +client (after pre-header retry confirms a healthy origin +connection, s8.6) AND simultaneously tees the bytes into two +side channels for joiner support and the asynchronous CacheStore +commit: + +1. **Ring buffer** (in-memory, bounded 1-2 MiB by default). Joiners + obtain a `Reader` over this buffer that replays buffered bytes + and blocks on a condition variable for more. Delivers low TTFB + for on-pace joiners. +2. **Spool** (local disk file via the `Spool` interface). The + leader writes every byte to a local spool file in parallel + with the client write and the CacheStore upload. A slow joiner + that falls behind the ring buffer head transparently switches + to a `Spool.Reader(k, off)`. The spool exists because the + production `cachestore/s3` driver streams directly into + `PutObject` and does not produce a readable on-disk tmp file - + without the spool, slow joiners on the s3 path would have no + local fallback. The spool unifies joiner-fallback behavior + across `localfs`, `s3`, and `posixfs` drivers. + +**The spool is NOT on the client TTFB path in v1.** Cold-path +client TTFB is bounded by origin first-byte latency plus a small +amount of pre-header retry overhead (s8.6). The leader does NOT +wait for the chunk to be fully written or fsynced into the spool +before sending bytes to the client. The spool is a parallel +side-channel for joiner support and CacheStore commit; the client +write is independent of and in parallel with the spool write. + +**Spool locality is required (with a documented override).** The +Spool MUST live on a local block device by default. 
At boot, the +cache layer runs `statfs(2)` against `spool.dir` and refuses to +start (exit non-zero) if the filesystem magic matches a network FS +denylist (NFS, SMB / CIFS, CephFS, Lustre, GPFS, FUSE including +Alluxio FUSE), incrementing +`orca_spool_locality_check_total{result="refused"}`. +Governed by `spool.require_local_fs` (default `true`). The +rationale is now defense-in-depth: with the v1 streaming design +the spool no longer gates client TTFB, but joiner-fallback latency +still benefits materially from local NVMe (a remote-FS spool would +convert microsecond-class read-from-spool to milliseconds-class +network-round-trip on every joiner switchover). Operators with +unusual placements (e.g., large RAM-disk) MAY relax the contract +via `spool.require_local_fs: false`; production deployments are +expected to keep the default. See +[s10.4](#104-spool-locality-contract) for the full check. + +**CacheStore commit timing.** After the leader has streamed the +full chunk to the client (and the spool has finished receiving), +the leader performs the CacheStore commit asynchronously +(`PutObject + If-None-Match: *` for `s3`; `link()` for `localfs` +and `posixfs`). Success increments +`commit_after_serve_total{result="ok"}`; failure increments +`commit_after_serve_total{result="failed"}` AND skips +`ChunkCatalog.Record` so the next request refills. The client +response is unaffected either way - by this point the client has +already received the full chunk. + +Capacity: `spool.max_bytes` caps total spool footprint (default 8 +GiB); `spool.max_inflight` caps concurrent fills using the spool. +When the spool is full, new fills wait briefly on the +`spool.max_inflight` semaphore; on timeout they return `503 Slow +Down` to the client. + +After the leader's CacheStore commit succeeds, the spool entry is +retained briefly so any in-flight joiner can finish reading; once +joiner refcount hits zero the spool entry is released. On commit- +after-serve failure the spool entry is released the same way; the +cache layer simply does not record the chunk and the next request +refills. + +### Diagram 5: Scenario C - concurrent miss, same-replica joiner + +```mermaid +sequenceDiagram + autonumber + participant A as Client A (leader request) + participant B as Client B (joiner) + participant R as Replica + participant SF as Singleflight + participant Ring as Ring buffer (1-2 MiB) + participant Sp as Spool (local disk) + participant O as Origin + participant CS as CacheStore + participant Cat as ChunkCatalog + A->>R: GET k + R->>SF: Acquire(k) [leader = A] + SF->>O: GetRange(..., If-Match: etag)
(pre-header retry s8.6) + O-->>SF: first byte + Note over SF: commit boundary - origin healthy + par tee to ring + SF->>Ring: bytes + and tee to spool + SF->>Sp: bytes + and stream to A + SF-->>A: stream bytes as they arrive + end + O-->>SF: remaining bytes + B->>R: GET k (concurrent) + R->>SF: Acquire(k) [joiner = B] + SF-->>B: stream from Ring + Note over B: B falls behind ring head + SF-->>B: switch to Spool.Reader + SF->>Sp: Commit (fsync + close) [after stream complete] + SF-)CS: PutObject(final, body, If-None-Match: *) [async] + CS--)SF: 200 (commit_won) or failure + alt commit ok + SF->>Cat: Record(k, info) + else commit failed + Note over SF: commit_after_serve_total{result=failed}++
chunk NOT recorded + end + SF->>SF: Release(k) + SF->>Sp: Release after joiners drain +``` + +### 8.3 Cluster-wide deduplication via per-chunk fill RPC + +Rendezvous hashing on `ChunkKey` against the current pod-IP set selects +**one coordinator per chunk**. A range request can span N chunks; those +chunks may have N distinct coordinators. The replica that receives the +client request is therefore the **assembler**, not a forwarder of the +whole HTTP request. For each `ChunkKey k` in the requested range: + +- **Hit** (Catalog or `Stat` says present): assembler reads from + `CacheStore` directly. No internal RPC. +- **Miss + `Coordinator(k) == self`**: assembler runs the local + singleflight + tee + spool + commit path (s8.1, s8.2, s10). +- **Miss + `Coordinator(k) != self`**: assembler issues + `GET /internal/fill?key=` to the coordinator on the + coordinator's internal listener (s8.8). The coordinator runs the + singleflight + tee + spool + commit path locally and streams the chunk + bytes back. The assembler stitches the returned bytes into the client + response, slicing the first and last chunk to match the client's `Range`. + +**Loop prevention**: the assembler sets `X-Origincache-Internal: 1` on +internal RPCs. A receiver seeing this header MUST self-check: +`Cluster.Coordinator(k) == Cluster.Self()`. On disagreement (membership +flux), the receiver returns `409 Conflict` with body +`{"reason":"not_coordinator"}`; the assembler falls back to local fill +for that chunk (one duplicate fill possible during flux; observable via +the duplicate-fills metric below). Receivers MUST NOT chain forward +internal RPCs. + +Combined with s8.1, exactly one origin GET per cold chunk per cluster in +steady state. During membership change we accept up to one duplicate fill +per chunk (loser drops on commit collision; observable via +`orca_origin_duplicate_fills_total{result="commit_lost"}`). The +duplicate-fill metric is the leading indicator that this routing is +working: a sustained non-zero `commit_lost` rate signals chronic +membership flux or a bug in the hash distribution. + +### Diagram 6: Scenario D - cold miss, remote coordinator + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant A as Replica A (assembler) + participant B as Replica B (coordinator for k) + participant SF as Singleflight @ B + participant Sp as Spool @ B + participant O as Origin + participant CS as CacheStore + C->>A: GET /bucket/key Range + A->>A: rendezvous(k, peer IPs) = B + Note over A: B != self + A->>B: GET /internal/fill?key=k
X-Origincache-Internal: 1
(mTLS, internal listener :8444) + B->>B: self-check: Coordinator(k) == self? + Note over B: yes, proceed + B->>SF: Acquire(k) [leader] + SF->>O: GetRange(..., If-Match: etag)
(pre-header retry s8.6) + O-->>SF: first byte + Note over SF: commit boundary - origin healthy + par stream to A + SF-->>B: stream bytes as they arrive + B-->>A: chunk bytes (stream) + A-->>C: stream slice + and tee to spool @ B + SF->>Sp: write bytes (in parallel) + end + O-->>SF: remaining bytes + SF->>Sp: Commit (fsync + close) [after stream complete] + SF-)CS: PutObject(final, body, If-None-Match: *) [async] + CS--)SF: 200 (commit_won) or failure + Note over A,B: On membership disagreement at B
B returns 409 and A falls back to local fill + Note over A,B: On hit (chunk in CacheStore)
A reads CacheStore directly with no internal RPC +``` + +### Diagram 7: Scenario E - range spanning multiple coordinators + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant A as Replica A (assembler) + participant CS as CacheStore + participant B as Coordinator(k2) + participant D as Coordinator(k3) + Note over A: Range bytes=X-Y -> chunks {k1, k2, k3} + C->>A: GET /bucket/key Range + A->>A: streaming chunk iterator + Note over A: k1: Stat hit -> read CacheStore + A->>CS: GetChunk(k1) + CS-->>A: bytes + A-->>C: stream slice (first chunk -> headers go out) + Note over A: k2: miss, Coordinator(k2) = B != self + A->>B: GET /internal/fill?key=k2 (mTLS) + B-->>A: chunk bytes + A-->>C: stream slice + Note over A: k3: miss, Coordinator(k3) = D != self + A->>D: GET /internal/fill?key=k3 (mTLS) + D-->>A: chunk bytes + A-->>C: stream slice +``` + +### 8.4 Origin backpressure + +Each replica enforces a **per-replica token bucket** that caps +concurrent `Origin.GetRange` calls. The bucket is sized to a +conservative per-replica fraction of the desired cluster-wide +concurrency: + +``` +target_per_replica = floor(target_global / N_typical) +``` + +where `N_typical` is the expected replica count in steady state +(`cluster.target_replicas`, default 3). Defaults: `target_global=192`, +giving `target_per_replica=64`. + +This is approximate. Realized cluster-wide concurrency depends on +the actual replica count `N_actual`: + +- `N_actual == N_typical`: realized cap is `target_global` exactly. +- `N_actual > N_typical` (scaled out without updating + `cluster.target_replicas`): realized cap exceeds `target_global` + by up to `(N_actual - N_typical) * target_per_replica`. +- `N_actual < N_typical` (scaled in): realized cap falls below + `target_global` by `(N_typical - N_actual) * target_per_replica`. + +Operators MUST update `cluster.target_replicas` after any sustained +scale change. Dynamic recompute of the cap from `len(Cluster.Peers())` +is a deferred optimization; see +[s15.6](#156-dynamic-per-replica-origin-cap). + +Origin throttling responses (HTTP 503 SlowDown, 429, retryable +5xx) are handled by the leader's pre-header retry loop (s8.6 / +Option D), which provides exponential backoff transparent to the +client. If the retry budget exhausts, the leader returns +`502 OriginRetryExhausted`. The system self-regulates without +cluster-wide coordination: an over-loaded origin slows individual +fills via backoff; the per-replica cap bounds inflight per pod; +the singleflight (s8.1) collapses concurrent identical fills. + +When the bucket is saturated, leaders queue with bounded wait +(`origin.queue_timeout`, default 5s); on timeout, the request +returns `503 Slow Down` to the client so clients back off. +Joiners on existing fills do not consume slots. + +The current saturation is exposed as +`orca_origin_inflight{origin}` (per-replica gauge). +Operators can sum across replicas in their monitoring stack to +observe approach to `target_global`. + +A real coordinated cluster-wide limiter (Kubernetes-Lease-elected +authority + slot-lease tokens + RPC-based slot acquisition + +graceful fallback) is a deferred optimization; see +[s15.5](#155-coordinated-cluster-wide-origin-limiter) for the +full design, trigger conditions, and v1 bound. Build only when +measured deployment scale (>10 replicas with steady-state slot +under-utilization) justifies the additional surface area. + +Optional token bucket on origin bytes/sec layered on top of the +slot-based concurrency cap. 
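+
+As a concrete illustration of the per-replica cap, the sketch below
+models the slot bucket as a buffered channel with a bounded wait. The
+identifiers (`originSlots`, `errOriginQueueTimeout`) are hypothetical
+and chosen for this example; the real limiter lives under
+`internal/orca/` and may differ in shape.
+
+```go
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+)
+
+// errOriginQueueTimeout is surfaced to the client as 503 Slow Down.
+var errOriginQueueTimeout = errors.New("origin: timed out waiting for a fill slot")
+
+// originSlots is a per-replica slot bucket capping concurrent
+// Origin.GetRange calls. A buffered channel is the whole mechanism.
+type originSlots struct {
+	ch           chan struct{}
+	queueTimeout time.Duration
+}
+
+// newOriginSlots sizes the bucket to floor(targetGlobal / targetReplicas),
+// e.g. 192 / 3 = 64 slots per replica with the defaults from this section.
+func newOriginSlots(targetGlobal, targetReplicas int, queueTimeout time.Duration) *originSlots {
+	return &originSlots{
+		ch:           make(chan struct{}, targetGlobal/targetReplicas),
+		queueTimeout: queueTimeout,
+	}
+}
+
+// acquire waits up to queueTimeout for a slot. Only fill leaders call
+// this; joiners ride an existing fill and never consume a slot.
+func (s *originSlots) acquire(ctx context.Context) error {
+	t := time.NewTimer(s.queueTimeout)
+	defer t.Stop()
+	select {
+	case s.ch <- struct{}{}:
+		return nil // slot held until release()
+	case <-t.C:
+		return errOriginQueueTimeout
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+}
+
+func (s *originSlots) release() { <-s.ch }
+
+// inflight feeds the orca_origin_inflight per-replica gauge.
+func (s *originSlots) inflight() int { return len(s.ch) }
+
+func main() {
+	slots := newOriginSlots(192, 3, 5*time.Second)
+	if err := slots.acquire(context.Background()); err != nil {
+		fmt.Println("would return 503 Slow Down:", err)
+		return
+	}
+	defer slots.release()
+	fmt.Printf("origin fill running, inflight=%d of %d\n", slots.inflight(), cap(slots.ch))
+}
+```
+
+A leader wraps each `Origin.GetRange` in `acquire` / `release`; with the
+defaults (`target_global=192`, `cluster.target_replicas=3`,
+`origin.queue_timeout=5s`) this yields 64 slots per replica and a
+`503 Slow Down` to the client when the 5s wait expires.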
+ +### 8.5 Cancellation safety + +`Fill.run()` uses an internal long-lived context, not any single client's +context. The fill outlives any single requester. If every joiner cancels +we still finish the fill (cheap insurance; configurable to abort). A +joiner cancelling unblocks only itself. + +### 8.6 Failure handling without re-stampede + +- **Retryable error**: short-lived negative entry in the singleflight map + (cooldown 100 ms - 1 s) so concurrent joiners share the failure rather + than each retrying immediately. +- **`OriginETagChangedError`**: leader (a) invalidates the metadata cache + entry for `{origin_id, bucket, key}`, (b) fails the in-flight fill, (c) + joiners receive the same error and abort their responses (or, if + pre-commit, get a `502 Bad Gateway`). The next request triggers a + fresh `Head` and a new `ChunkKey` with the new ETag. Old chunks under + the old ETag age out via the CacheStore lifecycle. Increments + `orca_origin_etag_changed_total`. +- **Hard 404 / unsupported blob type**: cached in the metadata cache as + a negative entry for `negative_metadata_ttl` (default 60s, + configurable). Per-replica HEAD singleflight (s8.7) caps origin HEAD + load at one HEAD per object per replica per window. The full + negative-cache lifecycle and the create-after-404 case (an operator + uploads `K` after a client has already observed `404` on `K`) are in + [s12](#12-create-after-404-and-negative-cache-lifecycle). +- **Pre-header origin retry (the v1 cold-path retry mechanism)**: + the leader retries `Origin.GetRange` on transient errors **before** + any HTTP response header is sent to the client, making transient + origin failures invisible to the client. The retry budget is + bounded by both attempt count and total wall-clock duration: + - `origin.retry.attempts` (default 3): max attempts. + - `origin.retry.backoff_initial` (default 100ms), + `origin.retry.backoff_max` (default 2s): exponential backoff + cap per attempt. + - `origin.retry.max_total_duration` (default 5s): absolute + wall-clock cap; if exceeded the leader returns `502 Bad Gateway` + even before all attempts complete. + + The **commit boundary** is the first byte arrival from origin: + once received, the leader sends headers + first byte, then + streams. Pre-commit failures return clean HTTP errors (`502 + Bad Gateway` with code `OriginUnreachable` or + `OriginRetryExhausted`); post-commit failures become mid-stream + client aborts (s6 step 7). `OriginETagChangedError` is + non-retryable (the object identity changed; refilling under the + old ETag is the bug we are preventing); the leader returns + `502 OriginETagChanged` immediately. Joiners sit through retries + on the same `Fill`. Outcomes are exposed as + `orca_origin_retry_total{result="success|exhausted_attempts|exhausted_duration|etag_changed"}` + (one increment per request that entered the retry loop) and + `orca_origin_retry_attempts` (histogram of attempt count + per request). + + The retry budget defaults are intentionally smaller than typical + S3 SDK read timeouts (aws-sdk-go: 30s; boto3: 60s) so retries + complete before clients time out. +- **`CommitFailedAfterServe`**: the CacheStore commit happens + asynchronously after the client response is complete (s8.2). A + failure here is NOT visible to the client. The leader increments + `orca_commit_after_serve_total{result="failed"}` and + does NOT call `ChunkCatalog.Record`. 
Joiners on the same fill + that are still draining the Spool finish normally; the next + request for the same `ChunkKey` re-runs the fill (one extra + origin GET worst case). Sustained non-zero `failed` rate is a + CacheStore-health alert, not a per-request error path. +- **Typed `CacheStore` errors during read**: `ErrNotFound` triggers the + miss-fill path; `ErrTransient` surfaces as `503 Slow Down` with + `Retry-After: 1s`; `ErrAuth` surfaces as `502 Bad Gateway`. Sustained + `ErrTransient` / `ErrAuth` trips the per-process **CacheStore circuit + breaker** (s10.2). Sustained `ErrAuth` (default 3 consecutive) flips + `/readyz` to NotReady so load balancers drain the replica. + +### 8.7 Metadata-layer singleflight + +Same pattern at the metadata cache: +`metaInflight: map[ObjectKey]*MetaFill`. Without this, a flood of +distinct cold keys shifts the storm from chunk GETs to chunk HEADs. +Stale-while-revalidate behavior: serve stale within a small margin while +one background refresh runs. The singleflight is **per-replica**: a +cluster-wide cold-fan-out can cause up to N HEADs per object per +`metadata_ttl` window where N is the current peer-set size. This is +acceptable in v1; a cluster-wide HEAD singleflight is a deferred +optimization (see [s15.2](#152-cluster-wide-head-singleflight)). + +**LIST cache singleflight (FW3, s6.2).** A parallel per-replica +singleflight collapses concurrent identical `Origin.List` calls +keyed on the full LIST query tuple. Sits in front of the LIST +cache; reused on cache miss. Cluster-wide bound is up to N origin +LIST per identical query per `list_cache.ttl`; a cluster-wide LIST +coordinator is a deferred optimization (s15.3). + +**Bounded-freshness mode interaction (FW5, s11.2).** When +`metadata_refresh.enabled: true`, background refresh workers are +gated by the same per-replica HEAD singleflight: if both an +on-demand miss-fill and a background refresh fire for the same +object key concurrently, they share one `Origin.Head` and both +consumers receive the result. New entries Recorded on a miss-fill +start with `AccessCount=0` and `LastEntered=now`; the cold-start +protection (`min_age`) prevents these from being immediately +eligible for refresh. + +### 8.8 Internal RPC listener + +Per-chunk fill RPCs (`GET /internal/fill?key=`) are +served on a separate listener bound to a distinct port (default `:8444`, +config `cluster.internal_listen`). This isolates inter-replica traffic +from the client edge. + +- **Transport**: HTTP/2 over mTLS. +- **Server cert**: per-replica cert (e.g. cert-manager-issued) chained to + a configured **internal CA** (`cluster.internal_tls.ca_file`). The + internal CA is **distinct** from the client mTLS CA so a leaked client + cert cannot be used to dial the internal listener. The cert MUST + include the stable SAN `cluster.internal_tls.server_name` (default + `orca..svc`); pod-IP SANs are NOT used because pod IPs + change on rolling restart. +- **Client auth**: peer presents a client cert chained to the internal CA + AND the peer's source IP must be in the current peer-IP set + (`Cluster.Peers()`). The IP-set check guards against a leaked internal + cert being usable from outside the Deployment. +- **TLS verification**: the dialer pins `tls.Config.ServerName` to the + value returned by `Cluster.ServerName()` (the same stable SAN above) + rather than to the destination pod IP. This keeps verification + consistent across rolling restarts and pod-IP churn. 
+- **Authorization scope**: the internal listener serves `GET + /internal/fill?key=` only - the per-chunk + fill RPC (s8.3). No client identity is propagated from the + assembler because chunk content is identity-independent: any + authorized client at the assembler is entitled to the chunk + bytes, and the coordinator is doing the same fill it would do + for a local request. +- **NetworkPolicy**: ingress on `:8444` allowed only from pods with + label `app=orca` in the same namespace. +- **Loop prevention**: receiver enforces `X-Origincache-Internal: 1` -> + self must be coordinator for the requested `ChunkKey`, else + `409 Conflict`. + +Metrics: `orca_cluster_internal_fill_requests_total{direction= +"sent|received|conflict"}`, +`orca_cluster_internal_fill_duration_seconds`. + +## 9. Azure adapter: Block Blob only + +Hardened constraint. + +- Enforced in `internal/orca/origin/azureblob.Head`. Block type is + immutable on an existing blob (you have to delete and recreate to change + it, which produces a new ETag), so checking once per `(container, blob, + etag)` is sufficient. +- Detection via `Get Blob Properties` -> `BlobType` field. Reject anything + other than `BlockBlob` with a typed error `UnsupportedBlobTypeError` + exported from `internal/orca/origin`. +- Surfaced to clients as HTTP `502 Bad Gateway` with S3 error code + `OriginUnsupported`, body containing reason, plus + `x-orca-reject-reason: azure-blob-type=` header. +- Negatively cached in the metadata cache for `negative_metadata_ttl` + (default 60s; see [s12](#12-create-after-404-and-negative-cache-lifecycle)) + and + singleflighted at the metadata layer to prevent re-probing. +- `ListObjectsV2` defaults to `filter` mode: non-Block Blob entries are + skipped while preserving continuation tokens. `passthrough` mode is + available for debugging. +- Config schema reserves `enforce_block_blob_only: true`. Setting it to + false is rejected at startup. +- `Origin.GetRange` on the azureblob adapter uses `If-Match: ` on + the underlying Get Blob; `412 Precondition Failed` is translated to + `OriginETagChangedError` (s8.6). +- Prometheus counter: + `orca_origin_rejected_total{origin="azureblob",reason="non_block_blob",blob_type=...}`. + +### Diagram 8: Scenario F - Azure non-BlockBlob rejection + +```mermaid +flowchart TD + Req["client GET /bucket/key
(azureblob origin)"] --> Meta["Metadata cache lookup"] + Meta -- "hit: BlockBlob" --> OkPath["proceed: chunk path
(GetRange uses If-Match: etag)"] + Meta -- "hit: rejected" --> Reject1["502 OriginUnsupported
(neg cache TTL)"] + Meta -- "miss" --> Head["Origin Get Blob Properties
(metadata-layer singleflight)"] + Head --> Type{"BlobType?"} + Type -- "BlockBlob" --> CacheOk["metadata cache:
BlockBlob
(default TTL)"] + Type -- "PageBlob | AppendBlob" --> CacheReject["metadata cache:
UnsupportedBlobTypeError
(negative_metadata_ttl)
+ rejected_total++"] + CacheOk --> OkPath + CacheReject --> Reject2["502 OriginUnsupported
x-orca-reject-reason:
azure-blob-type=type"] + LR["ListObjectsV2
(list_mode=filter)"] --> Filter["skip non-BlockBlob entries,
preserve continuation tokens"] +``` + +## 10. Concurrency, durability, correctness + +### 10.1 Atomic commit (per CacheStore driver) + +The leader publishes a chunk to the CacheStore atomically and +no-clobber: the second concurrent commit for the same key MUST lose +without overwriting the winner. Cold-path commit happens +asynchronously **after** the client response is complete (s8.2 / s6 +step 6), so a commit failure here does NOT affect the +in-flight client response; it only increments +`orca_commit_after_serve_total{result="failed"}` and skips +`ChunkCatalog.Record` (next request refills). + +Three drivers ship in v1, mapped onto two equivalent atomic-commit +primitives. `localfs` and `posixfs` both use POSIX `link()` (or +`renameat2(RENAME_NOREPLACE)` on Linux) returning `EEXIST` to the +loser, and share their helpers via +`internal/orca/cachestore/internal/posixcommon/`. `s3` uses +`PutObject + If-None-Match: *` returning `412` to the loser. All three +drivers run `SelfTestAtomicCommit` at boot. + +Commit outcomes are recorded as label values on the metric +`orca_origin_duplicate_fills_total{result="commit_won|commit_lost"}` +(s8.3). Throughout this section "increment commit_won" / "increment +commit_lost" is shorthand for "increment that counter with the +matching label value". + +#### 10.1.1 cachestore/localfs + +1. Leader stages the chunk inside `/.staging/` (a fixed + subdirectory of the CacheStore root, NOT `/tmp` and NOT the spool + directory). Staging inside the root keeps the file on the same + filesystem as the destination, which is required for `link()` to + succeed; the spool MAY be on a different filesystem and so cannot + also serve as the staging area. +2. After write, `fsync()` then `fsync()`. +3. Commit: `link(/.staging/, )`. POSIX `link()` is + atomic and returns `EEXIST` if the destination exists. On `EEXIST`, + the leader treats the existing `` as the source of truth, + `unlink(/.staging/)`, `fsync(/.staging/)`, and + increments commit_lost. On success, `unlink(/.staging/)`, + `fsync(/.staging/)`, `fsync()`, and + increment commit_won. +4. On Linux, `renameat2(RENAME_NOREPLACE)` is preferred when available + (single syscall) with the same parent-dir fsync sequencing; the + `link` + `unlink` form is the portable fallback (also works on + macOS dev environments). Plain `rename()` is **never** used because + it overwrites the destination on POSIX. +5. Crash recovery: a periodic background sweep (default every 1 hour) + unlinks `/.staging/` entries older than + `cachestore.localfs.staging_max_age` (default 1h), with a + `fsync(/.staging/)` after the batch. Nothing breaks if a + staging file lingers briefly. Each sweep increments + `orca_localfs_dir_fsync_total{result}`. + +#### 10.1.2 cachestore/posixfs + +`posixfs` runs the same `link()` no-clobber primitive as `localfs`, but +against a shared POSIX-style filesystem mounted on every replica at the +same mount point and the same ``. All replicas race the same +`link()` syscall against the same destination inode; the kernel (NFS +server, Weka, CephFS MDS, Lustre MDS, GPFS, etc.) is the arbiter, and +exactly one wins. + +1. Backend selection and detection. At boot the driver inspects the + filesystem under `` via `statfs(2)` (`f_type`) and + `/proc/mounts` and emits an info gauge + `orca_posixfs_backend{type,version,major,minor}` (e.g. + `type="nfs",version="4.1"`, `type="wekafs"`, `type="ceph"`, + `type="lustre"`, `type="gpfs"`). 
Operators MAY override the detected + `type` via `cachestore.posixfs.backend_type` for backends with + ambiguous magic numbers; the override is logged loudly. Detected + `type="fuse"` triggers an extra check: if `/proc/mounts` source + matches `alluxio` (case-insensitive), the driver increments + `orca_posixfs_alluxio_refusal_total` and exits non-zero with + `cachestore/posixfs: Alluxio FUSE is unsupported (no link(2), no + atomic no-overwrite rename, no NFS gateway); use cachestore.driver: + s3 against the Alluxio S3 gateway instead`. +2. NFS minimum version. If `type="nfs"`, the driver reads the + negotiated NFS version from `/proc/mounts` (the `vers=` option). If + the version is below `cachestore.posixfs.nfs.minimum_version` + (default `4.1`), the driver refuses to start. NFSv3 is opt-in only + via `cachestore.posixfs.nfs.allow_v3: true`, which logs a loud + warning and increments + `orca_posixfs_nfs_v3_optin_total`. Rationale: NFSv3 has weak + retransmit semantics; NFSv4.0 has atomic CREATE EXCLUSIVE but no + session idempotency; NFSv4.1+ provides session-based idempotency + that makes `link()` / `EEXIST` safe under client retries. +3. Path layout adds a 2-character hex fan-out to keep directory sizes + manageable on multi-PB working sets: + `////` where `hash` + is the existing s5 hex hash. Fan-out width is governed by + `cachestore.posixfs.fanout_chars` (default `2`, 0 disables). The + `localfs` driver does NOT add fan-out by default (small dev working + sets), but the `posixcommon` helper supports it on both drivers. +4. Stage + commit + recovery: identical to `localfs` (steps 1-5 above) + with the fan-out parent dirs created lazily and `fsync`ed on first + use, and `cachestore.posixfs.staging_max_age` (default 1h) governing + the sweep. +5. **Startup self-test** (`SelfTestAtomicCommit`): on driver init the + `posixfs` driver creates a staging file, links it to a probe final, + then attempts a second `link()` to the same probe final and asserts + `EEXIST`. It then writes a known-size payload to the linked file via + a separate handle and asserts the size is observable to a re-`stat` + after `fsync()`. If `EEXIST` is not returned (the + second `link()` succeeds, or returns a different error), or if the + size verification fails, the driver exits non-zero with + `cachestore/posixfs: backend does not honor link()/EEXIST or + directory fsync; refusing to start`. Governed by + `cachestore.posixfs.require_atomic_link_self_test` (default `true`; + never disabled in production). On success, the driver records + `orca_posixfs_selftest_last_success_timestamp`. +6. NFS export hardening. `posixfs` documents (and the operator runbook + enforces) that NFS exports MUST use `sync` (not `async`); an `async` + export weakens the dir-fsync guarantee that the commit primitive + depends on. The driver cannot detect server-side `async` directly; + the runbook is the contract, and the boot self-test catches the most + common misconfigurations by re-`stat`ing through the negotiated + client cache. + +#### 10.1.3 cachestore/s3 + +1. Leader streams origin bytes (via the Spool, s8.2) into a single + `PutObject(final_key, body, If-None-Match: "*")`. There is no tmp + key and no copy hop. +2. `200 OK` -> commit_won. `412 Precondition Failed` -> commit_lost + (treat the existing object as the source of truth; no cleanup + needed because no tmp object was created). +3. 
**Startup self-test** (`SelfTestAtomicCommit`): on driver init the + `cachestore/s3` driver writes a probe key, then attempts a second + `PutObject(probe_key, ..., If-None-Match: "*")` and asserts a + `412` response. If the backend returns `200` instead (silently + overwrites), the driver fails to start with `cachestore/s3: + backend does not honor If-None-Match: *; refusing to start`. This + prevents silent double-writes on backends that don't implement the + precondition. Verified backends as of v1: AWS S3 (since 2024-08), + MinIO, VAST Cluster (**non-versioned buckets only**). VAST + documents that `If-None-Match: *` is honored on `PutObject` and + `CompleteMultipartUpload` against unversioned buckets but is NOT + supported on versioned buckets ([VAST KB: S3 Conditional + Writes][vast-kb-conditional-writes], 2026-01-26). +4. **Startup versioning gate**: to prevent silent atomic-commit + failures the driver also issues `GetBucketVersioning(bucket)` at + boot. If the response indicates `Status: Enabled` OR + `Status: Suspended` (suspended also disables `If-None-Match`- + based atomic writes on AWS S3), the driver exits non-zero with + `cachestore/s3: bucket has versioning enabled or + suspended; If-None-Match: * is not honored on versioned buckets + and the atomic-commit primitive cannot guarantee no-clobber. + Disable bucket versioning to use cachestore/s3.` Governed by + `cachestore.s3.require_unversioned_bucket` (default `true`; + never disabled in production). The gate emits + `orca_s3_versioning_check_total{result="ok|refused"}` once + per boot. + +[vast-kb-conditional-writes]: https://kb.vastdata.com/documentation/docs/s3-conditional-writes + +### 10.2 Catalog correctness, typed errors, circuit breaker + +The CacheStore is the source of truth. The `ChunkCatalog` is purely an +optimization and may be dropped at any time without affecting correctness; +a `Lookup` miss falls through to `CacheStore.Stat` and refills the +catalog. Catalog entries that point at a now-absent chunk (e.g. evicted +by lifecycle) result in a `CacheStore.GetChunk` returning `ErrNotFound`, +which is the only error treated as a miss and refilled. + +`CacheStore` returns three typed error classes (s7); the cache layer +honors them distinctly: + +- **`ErrNotFound`** (chunk absent): triggers the miss-fill path. Normal + cold-path behavior; not an error from the operator's perspective. +- **`ErrTransient`** (5xx, timeout, throttle): surfaced to the client as + `503 Slow Down` with `Retry-After: 1s`. Counts toward the breaker. + Does NOT trigger refill (would amplify load against an already-degraded + backend). +- **`ErrAuth`** (401/403): surfaced as `502 Bad Gateway`. Counts toward + the breaker. Counts toward the `/readyz` consecutive-`ErrAuth` + threshold (default 3); on threshold the replica reports NotReady and + load balancers drain it. A single non-`ErrAuth` success resets the + counter. + +To prevent amplifying degradation under sustained backend failure, a +**per-process CacheStore circuit breaker** wraps every `CacheStore` +call. 
Defaults (configurable): + +- `error_window: 30s` +- `error_threshold: 10` (`ErrTransient` + `ErrAuth` count; `ErrNotFound` + does not) +- `open_duration: 30s` +- `half_open_probes: 3` + +State machine: **closed** (normal pass-through) -> **open** (immediately +short-circuits CacheStore writes with `ErrTransient`; reads still attempt +once per `open_duration / 10` for liveness probing) -> **half-open** +(allows up to `half_open_probes` test calls; on all-success returns to +closed; on any failure returns to open). Transitions are exposed as +`orca_cachestore_breaker_transitions_total{from,to}` and the +current state as `orca_cachestore_breaker_state` (0=closed, +1=open, 2=half_open). + +**Access-frequency tracking on `Lookup`.** Per FW8 (s13.2), each +`ChunkCatalog.Lookup` hit has a side effect: it increments the +matched entry's `AccessCount` and updates `LastAccessed`. This data +is consumed by the optional active-eviction loop (s13.2). The side +effect is correctness-irrelevant: catalog `Lookup` continues to be +safe to call from any goroutine; access counters are stored +atomically. New entries Recorded by `ChunkCatalog.Record` start with +`AccessCount=0` and `LastEntered=now`. + +**`CacheStore.Delete` breaker integration.** Active eviction +(s13.2) calls `CacheStore.Delete` in the background. `Delete` +errors count toward the same breaker as `Get` / `Put` errors: +sustained `ErrTransient` or `ErrAuth` from `Delete` opens the +breaker, which short-circuits subsequent writes (including the +eviction loop's deletes). The eviction loop checks breaker state +at run start and skips entirely if the breaker is open +(`active_eviction_runs_total{result="breaker_open"}++`). This +prevents the eviction loop from amplifying load against a +degraded backend. + +### 10.3 Range, sizes, and edge cases + +- Partial last chunk of a blob stored at its actual size; `ChunkInfo.Size` + records it; range math respects it. +- `416 Requested Range Not Satisfiable` is returned by the server before + any cache lookup, using object metadata, **only** for true Range vs. + object-size violations. +- `server.max_response_bytes` overflow returns + `400 RequestSizeExceedsLimit` (S3-style XML error body) with + `x-orca-cap-exceeded: true` (s6). It is reported as `400` and + not `416` because the cap is a server policy, not a property of the + object: clients cannot fix it by re-requesting a different Range past + EOF. +- Origin failure during fill never commits the staging file or makes a + final PutObject. Pre-commit (before first byte from origin): the + pre-header retry loop (s8.6) handles transient cases; if the retry + budget exhausts, the leader returns `502 Bad Gateway` to the client + and records a transient negative singleflight entry. Post-commit + (after first byte sent to client): the response aborts mid-stream + (s6 step 7); any CacheStore commit failure is invisible to the + client and recorded as `commit_after_serve_total{result="failed"}` + (s8.6). Mid-stream origin resume is deferred future work + (s15.4). + +### Diagram 9: Atomic commit (localfs vs posixfs vs s3 CacheStore) + +```mermaid +flowchart TB + Leader["Singleflight leader
finishes origin read
(via Spool tee; client response
already complete)"] --> Driver{"CacheStore
driver"} + Driver -- "localfs" --> L1["stage in <root>/.staging/<uuid>
fsync(file) + fsync(staging dir)"] + L1 --> L2["link(staging, final)
or renameat2(RENAME_NOREPLACE)"] + L2 -- "EEXIST" --> Llost["unlink staging
fsync(staging dir)
commit_lost++
treat existing final as truth"] + L2 -- "ok" --> Lwon["unlink staging
fsync(staging dir) + fsync(final parent dir)
commit_won++"] + Driver -- "posixfs" --> P1["stage in <root>/.staging/<uuid>
fsync(file) + fsync(staging dir)
(shared FS - same primitive as localfs)"] + P1 --> P2["link(staging, final)
across NFSv4.1+ / Weka / CephFS / Lustre / GPFS"] + P2 -- "EEXIST" --> Plost["unlink staging
fsync(staging dir)
commit_lost++
treat existing final as truth"] + P2 -- "ok" --> Pwon["unlink staging
fsync(staging dir) + fsync(final parent dir)
commit_won++"] + Driver -- "s3" --> S1["PutObject(final, body,
If-None-Match: *)"] + S1 -- "200" --> Swon["commit_won++"] + S1 -- "412" --> Slost["commit_lost++
treat existing object as truth"] + Lwon --> Pub["ChunkCatalog.Record(k, info)"] + Llost --> Pub + Pwon --> Pub + Plost --> Pub + Swon --> Pub + Slost --> Pub + Pub --> Done["chunk visible to all replicas"] + Sweep["periodic sweep cleans
stale <root>/.staging/<uuid>
older than staging_max_age"] -.-> L1 + Sweep -.-> P1 + SelfTestS3["startup SelfTestAtomicCommit (s3)
refuse to start if
If-None-Match not honored"] -.-> S1 + SelfTestPosix["startup SelfTestAtomicCommit (posixfs)
link EEXIST + dir-fsync + size verify
refuse on Alluxio FUSE
refuse if NFS < minimum_version
(opt-in via nfs.allow_v3)"] -.-> P1 + Failed["any commit failure
after client response complete"] -.-> CASF["commit_after_serve_total{failed}++
skip Catalog.Record"] +``` + +### 10.4 Spool locality contract + +The local Spool (s8.2) is no longer on the cold-path client-TTFB +path in v1: bytes stream origin -> client directly (s6 step 6 / +s8.6 pre-header retry). The spool is a parallel side-channel that +serves joiner-fallback reads and feeds the asynchronous CacheStore +commit. + +Even so, the spool benefits materially from a local block device. +A joiner that falls behind the in-memory ring buffer head +transparently switches to a `Spool.Reader(k, off)`. Local NVMe +serves these reads in microsecond-class latency; a network +filesystem (NFS, CephFS, Lustre, GPFS, FUSE) instead pays a +network round-trip on every read, which is tens of milliseconds +at best and seconds during congestion. That converts smooth +joiner-fallback into multi-second TTFB stalls for slow joiners. +Network-FS spools also weaken the durability semantics that the +asynchronous CacheStore commit relies on. + +To prevent foot-gun deployments, the cache layer enforces a +**boot-time locality check** before any client traffic is +accepted, governed by `spool.require_local_fs` (default `true`): + +1. Resolve `spool.dir` to an absolute path; resolve symlinks. +2. Call `statfs(2)` on the resolved path. Read `f_type`. +3. Compare `f_type` against a denylist (these magic numbers indicate a + network or virtual FS that violates the locality contract): + - `NFS_SUPER_MAGIC` (`0x6969`) - any NFS version, including + NFSv4.1+. + - `SMB2_MAGIC_NUMBER` (`0xfe534d42`), `CIFS_MAGIC_NUMBER` + (`0xff534d42`) - SMB / CIFS. + - `CEPH_SUPER_MAGIC` (`0x00c36400`) - CephFS kernel client. + - `LUSTRE_SUPER_MAGIC` (`0x0bd00bd0`) - Lustre. + - `GPFS_SUPER_MAGIC` (`0x47504653`) - IBM Spectrum Scale. + - `FUSE_SUPER_MAGIC` (`0x65735546`) - any FUSE mount, including + Alluxio FUSE. +4. On match: increment + `orca_spool_locality_check_total{result="refused",fs_type=""}`, + log `spool: is on a network filesystem (); + joiner-fallback latency would be unbounded. Refusing to start. + Set spool.dir to a local-NVMe-backed path or, for unusual + placements (e.g., RAM-disk), set spool.require_local_fs=false`, + and exit non-zero. +5. On no match: increment + `orca_spool_locality_check_total{result="ok",fs_type=""}` + and proceed. + +**Relaxation**. `spool.require_local_fs: false` allows operators +with unusual placements (RAM-disk, tmpfs, exotic local FS not on +the denylist) to bypass the check. The override is supported but +not recommended for production: with the v1 streaming design the +spool no longer gates client TTFB, but joiner-fallback latency +still benefits materially from local block storage. The metric +label `result="bypassed"` distinguishes overridden runs from +clean ones, and the boot log carries a loud `WARN +spool.require_local_fs is disabled; joiner-fallback latency is +best-effort` line. + +The check is in `internal/orca/fetch/spool/` and runs from +`cmd/orca/orca/main.go` before the HTTP listener binds. +It runs before any CacheStore self-test so a misconfigured spool +fails fast even on backends that would otherwise pass their own +self-test. + +### 10.5 Readiness probe (`/readyz`) + +The HTTP `/readyz` endpoint reports whether the replica should +receive client traffic. It is checked by the Kubernetes readiness +probe and by front-of-cluster load balancers. Distinct from +`/livez`, which is a process-liveness check only. + +**Response shape.** + +- `200 OK`, body `{"ready": true}`, when **all** of the following + predicates hold: + 1. 
boot self-tests have passed (`SelfTestAtomicCommit` for the + configured CacheStore driver; spool locality check, s10.4); + 2. the per-process CacheStore circuit breaker (s10.2) is `closed` + or `half_open`; + 3. consecutive `ErrAuth` count from the CacheStore is below + `readyz.errauth_consecutive_threshold` (default 3); + 4. peer discovery (s14) has completed at least one successful DNS + refresh since boot (the empty-peer fallback in s14 keeps the + replica functional, but `/readyz` still requires one + successful refresh so a totally broken DNS path does not stay + silently masked); + 5. the local Spool has free capacity below `spool.max_bytes`. + +- `503 Service Unavailable`, body + `{"ready": false, "reasons": ["..."]}`, when any predicate above + fails. The `reasons` array names the failing predicates by stable + string keys (`selftest_pending`, `selftest_failed`, + `breaker_open`, `errauth_threshold`, `peer_discovery_pending`, + `spool_full`) so operators can triage from a probe response + alone. + +**NotReady -> Ready transitions.** The endpoint is stateless apart +from reading the underlying components. Predicates clear themselves +as the system recovers: + +- breaker `open` -> `closed` after `half_open_probes` successful + probes (s10.2); +- `ErrAuth` consecutive counter resets on any non-`ErrAuth` success; +- spool fullness clears as in-flight fills drain; +- peer discovery flips to "completed" on the first successful + refresh and stays sticky for the lifetime of the process. + +**`/livez`.** A liveness-only check that returns `200 OK` if the +process is running and the HTTP listener is bound; it does NOT +consider any of the predicates above and is intentionally trivial. +This separation lets the readiness probe drain a misconfigured +replica without restarting it (so operators can inspect logs). + +`/readyz` and `/livez` are bound to the same client listener as the +S3 API; they are NOT served on the internal listener (`:8444`, +s8.8) because the internal listener's authorization scope is +restricted to the `/internal/fill` per-chunk fill RPC. + +## 11. Bounded staleness contract + +Orca trusts an **operator contract** for correctness, and bounds +the consequences of contract violation by configuration. + +### 11.1 The contract and the staleness window + +**The contract.** For a given `(origin_id, bucket, object_key)`, the +underlying bytes are immutable for the life of the key. If the data +changes, operators MUST publish it under a new key. Replacement in place +is a contract violation. + +**Why we trust it.** Cache key derivation includes the origin `ETag` +(s5), and a new ETag deterministically yields a new `ChunkKey` and a +fresh chunk path on the CacheStore. As long as the contract holds, the +cache cannot serve stale bytes: every change of identity is a change of +key. + +**What happens if the contract is violated.** The cache may serve the +old bytes for up to one **`metadata_ttl`** window (default 5m, +configurable). Mechanism: + +- Object metadata (`size`, `etag`, `content_type`) is cached for + `metadata_ttl` to avoid re-`HEAD`ing on every request. +- During that window, requests resolve to the old `etag`, derive the + same `ChunkKey`, and serve from cached chunks. +- After the window expires, the next request triggers a fresh `Head`, + observes the new ETag, derives a new `ChunkKey`, and refills. + +**Why this is acceptable for v1.** The intended workload is large +immutable artifacts (job inputs, model weights, training shards). 
The +contract matches how those are produced. The 5m window is a tunable +upper bound, not a typical case: a flood of distinct cold keys reads the +correct ETag on first contact with the cache. + +**Defense in depth.** `If-Match: ` is sent on every +`Origin.GetRange` (s8.6). If an in-flight fill races with an in-place +overwrite, the origin returns `412 Precondition Failed` and the leader +fails the fill, invalidates the metadata cache entry for +`{origin_id, bucket, key}`, and increments +`orca_origin_etag_changed_total`. This catches the narrow window +where a violation happens between the cache's `Head` and its `GetRange`. +It does NOT catch a violation that happens between two complete +request lifecycles within the same `metadata_ttl` window; the +`metadata_ttl` cap is what bounds that case. + +### 11.2 Bounded-freshness mode (optional) + +The default v1 posture is "trust the contract, cap the window". Some +workloads benefit from shorter effective staleness windows on hot keys +(typically: deployments where contract violations are operationally +possible, or where TTL-boundary cold-miss latency on popular content +is unacceptable). For those workloads, FW5 adds an opt-in +**bounded-freshness mode** that proactively re-Heads hot keys ahead +of `metadata_ttl`. + +**Opt-in via config**: `metadata_refresh.enabled: false` (default). +When `false`, no background activity; the cache behaves exactly as +described in s11.1. + +**Hot-key tracking**. Bounded-freshness mode requires per-entry access +tracking on the metadata cache, parallel to the chunk-catalog access +tracking from FW8 (s13.2). Each `MetadataCacheEntry` gains: +- `AccessCount` (uint32, increments on Lookup hit) +- `LastAccessed` (updated on Lookup hit) +- `LastEntered` (set on Record; never updated) + +This tracking is independent of the chunk-catalog tracking; metadata +hotness can diverge from chunk hotness (e.g., random-range reads +access many chunks of one object). + +**Eligibility**. An entry is eligible for proactive refresh when ALL +of: +- `AccessCount >= access_threshold` (default 5; "hot" key) +- `now - LastEntered >= refresh_ahead_ratio * metadata_ttl` (default + 0.7 * 5m = 3.5m; approaching TTL) +- `now - LastEntered < metadata_ttl` (still valid) +- `now - LastEntered >= min_age` (default `metadata_ttl/4` = 75s; + cold-start protection) +- no in-flight refresh for this key (per-replica HEAD singleflight, + s8.7, gates this) + +**Negative entries** (404, unsupported blob type) are NOT refreshed. +Refreshing them would generate HEAD load to confirm a known-missing +key; `negative_metadata_ttl` (default 60s, s12) handles the +create-after-404 recovery instead. 
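+
+The refresh loop below gates candidates on an `eligible(e)` check. As a
+minimal sketch only (illustrative type and config names, not the shipped
+code), the ALL-of conditions above translate to:
+
+```go
+package metarefresh // hypothetical placement, for illustration only
+
+import "time"
+
+// Minimal mirror of the MetadataCacheEntry fields named above; the real
+// entry also carries the cached object metadata.
+type MetadataCacheEntry struct {
+    AccessCount  uint32
+    LastAccessed time.Time
+    LastEntered  time.Time
+}
+
+type refreshConfig struct {
+    MetadataTTL       time.Duration // metadata_ttl, default 5m
+    AccessThreshold   uint32        // access_threshold, default 5
+    RefreshAheadRatio float64       // refresh_ahead_ratio, default 0.7
+    MinAge            time.Duration // min_age, default MetadataTTL/4
+}
+
+// eligible applies the ALL-of conditions from this section. The "no
+// in-flight refresh" condition is intentionally absent: the per-replica
+// HEAD singleflight (s8.7) enforces it outside this predicate.
+func eligible(e MetadataCacheEntry, cfg refreshConfig, now time.Time) bool {
+    age := now.Sub(e.LastEntered)
+    refreshAhead := time.Duration(cfg.RefreshAheadRatio * float64(cfg.MetadataTTL))
+    return e.AccessCount >= cfg.AccessThreshold && // hot key
+        age >= refreshAhead && // approaching metadata_ttl
+        age < cfg.MetadataTTL && // entry still valid
+        age >= cfg.MinAge // cold-start protection
+}
+```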
+ +**Refresh loop**: + +``` +every metadata_refresh.interval: # default 1m + candidates = [] + scan metadata cache: + for each entry e: + if eligible(e): + candidates.append(e) + sort candidates: + primary: highest AccessCount first + secondary: oldest LastEntered first + refresh_count = min(len(candidates), max_refreshes_per_run) # 100 + spawn refresh workers (concurrency: refresh_concurrency, default 8) + for first refresh_count entries: + result = Origin.Head(e.bucket, e.key) + case result of: + ok with same ETag: + metadata_cache.RefreshTTL(e.key) # extend TTL + metric: metadata_refresh_total{result="ok"}++ + ok with new ETag: + metadata_cache.Update(e.key, result) + metric: metadata_refresh_total{result="etag_changed"}++ + metric: origin_etag_changed_total++ # existing metric + # old chunks orphaned; lifecycle / active eviction (s13) + # cleans up + err: + # don't extend TTL; entry expires naturally + metric: metadata_refresh_total{result="error"}++ +``` + +**Origin HEAD load bound**. Per-replica per cycle: at most +`max_refreshes_per_run` HEADs (default 100). Per minute (default +interval): 100 HEADs. At 3 replicas: 300 HEADs/min. Negligible +against documented S3 / Azure HEAD rate limits. + +The refresh workers compete for the existing **origin limiter** +(s8.4) so they cannot starve on-demand fills. If the limiter is +saturated, refresh requests queue with bounded wait and skip past +timeout (`metric: metadata_refresh_total{result="skipped_limiter_busy"}`). + +**Effective staleness window** with bounded-freshness enabled: +`refresh_ahead_ratio * metadata_ttl` for hot keys (default 3.5m). +Cold keys still bounded by full `metadata_ttl` (default 5m). Negative +entries bounded by `negative_metadata_ttl` (default 60s). + +**Cluster-wide HEAD bound** with bounded-freshness enabled: each +replica refreshes its own metadata cache independently. With N +replicas and H hot keys, refresh load is up to N*H HEADs per refresh +cycle. The cluster-wide HEAD coordinator (deferred future work, see +s15.2) would naturally absorb this load if N grows large enough to +matter. + +**Failure modes**: +- `Origin.Head` error during refresh: don't extend TTL; entry expires + naturally at `metadata_ttl`; on-demand miss re-Heads. Log + metric. +- Origin limiter saturated: refresh worker times out; entry expires + naturally. +- Loop hangs / crashes: metadata cache continues to age; entries + expire at `metadata_ttl`. Detected via + `metadata_refresh_runs_total` not advancing. +- Refresh detects ETag change: metadata updated; old chunks orphaned; + active eviction (FW8 / s13.2) or CacheStore lifecycle handles + cleanup. + +**When to enable**: +- Workload has identifiable hot keys with sub-`metadata_ttl` + staleness sensitivity. +- Operators want shorter effective windows on popular content. +- Origin can absorb the additional HEAD load (typically small for + bounded hot-key sets). + +**When to leave disabled (default)**: +- Strict immutable-contract workload where `metadata_ttl` staleness + is acceptable. +- Origin HEAD rate is constrained. +- Hot-key set is unbounded (every key appears hot - refresh load + matches request load, defeating the purpose). 
+ +Cross-references: [s2 Decisions / Consistency](#2-decisions), +[s8.6 Failure handling](#86-failure-handling-without-re-stampede), +[s8.7 Metadata-layer singleflight](#87-metadata-layer-singleflight), +[s10.2 Catalog correctness](#102-catalog-correctness-typed-errors-circuit-breaker), +[s12 Create-after-404 and negative-cache lifecycle](#12-create-after-404-and-negative-cache-lifecycle), +[s13.2 Active eviction](#132-active-eviction-opt-in-access-frequency). + +## 12. Create-after-404 and negative-cache lifecycle + +### 12.1 The scenario + +A client GETs a key `K` before the operator has uploaded it to +origin. The cache observes `404` from `Origin.Head(K)`, records a +negative metadata-cache entry, and returns `404` to the client. The +operator then uploads `K`. Subsequent client requests still see +`404` until the negative entry expires - the "we forgot to upload +that" case. + +This is operationally indistinguishable from a contract violation +(s11): from the client's perspective, the bytes for `K` changed +without the cache being told. Event-driven origin invalidation is +intentionally not in v1 scope (the immutable-origin contract makes +it unnecessary for the documented workload); the cache can only +bound how long it serves the stale `404`. + +### 12.2 Two TTLs (positive vs negative) + +The metadata cache uses two TTLs: + +| TTL | Default | Bounds | Rationale | +|---|---|---|---| +| `metadata_ttl` | 5m | positive entry (`200` + ETag) reuse without re-Head | immutable-origin contract (s11); long TTL keeps HEAD load low | +| `negative_metadata_ttl` | 60s | negative entry (`404` / unsupported blob type) reuse without re-Head | operator "oops upload" recovery should be fast | + +Asymmetric defaults reflect asymmetric operational reality: +positive-entry staleness only matters on contract violation; +negative-entry staleness matters every time an operator uploads a +previously-missing key, which is a normal operational event. + +Per-replica HEAD singleflight (s8.7) caps the HEAD load that a short +negative TTL would otherwise create: a flood of distinct missing +keys generates at most one HEAD per object per replica per +`negative_metadata_ttl` window. At default settings (60s, 3 +replicas) origin sees at most 3 HEADs per missing key per minute, +well under any S3 / Azure HEAD rate limit. + +### 12.3 Worst-case unavailability window + +After an operator uploads a previously-missing key: + +- A replica that observed the original `404` keeps serving `404` + for up to `negative_metadata_ttl` from its OWN observation time, + regardless of when the upload happened. The TTL is + observation-anchored, not upload-anchored, because the cache + cannot know about the upload. +- A replica that did NOT observe the `404` will Head fresh on the + first request after the upload and serve `200` immediately. +- Worst case across replicas: `negative_metadata_ttl` after the + LATEST replica's observation of the old `404`. Under round-robin + load balancing, clients can see alternating `404` / `200` + responses during the drain window (Diagram 10). + +There is no active invalidation in v1: neither event-driven +invalidation (origin-pushed) nor an admin-invalidation RPC is in +v1 scope. Operator workaround: wait `negative_metadata_ttl` after +upload before announcing the key. + +### 12.4 Defense-in-depth and observability + +`If-Match: ` (s8.6) does NOT defend against this case: there +is no in-flight fill for a `404`'d key, so no precondition exists +to trip on. The TTL is the only bound. 
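+
+As a minimal sketch only (hypothetical names, not the shipped code), the
+observation-anchored bound from s12.3 reduces to a per-replica check
+against the replica's own observation time:
+
+```go
+package negcache // hypothetical placement, for illustration only
+
+import "time"
+
+// negativeEntry stands in for the negative metadata-cache entry; the
+// real entry also records why the key was negative (404 vs. unsupported
+// blob type, s9).
+type negativeEntry struct {
+    ObservedAt time.Time // when THIS replica observed the 404/rejection
+}
+
+// expired reports whether the replica must re-Head instead of serving
+// the cached 404. The window is anchored to ObservedAt, not to the
+// operator's upload time, because the cache never learns about the
+// upload; worst-case staleness is therefore negative_metadata_ttl after
+// the latest replica's observation.
+func (e negativeEntry) expired(now time.Time, negativeMetadataTTL time.Duration) bool {
+    return now.Sub(e.ObservedAt) >= negativeMetadataTTL // default 60s
+}
+```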
+ +Negative-cache metrics let operators observe drain progress after +an upload: + +- `orca_metadata_negative_entries` (gauge) - current count + of negative entries. +- `orca_metadata_negative_hit_total{origin_id}` (counter) - + returns served from a negative entry. A spike after a known + upload signals ongoing drain. +- `orca_metadata_negative_age_seconds{origin_id}` + (histogram) - age of negative entries at hit time. Use + upper-bound percentiles to size `negative_metadata_ttl`. + +Cross-references: [s2 Decisions / Consistency](#2-decisions), +[s6 Request flow](#6-request-flow), +[s8.6 Failure handling](#86-failure-handling-without-re-stampede), +[s8.7 Metadata-layer singleflight](#87-metadata-layer-singleflight), +[s11 Bounded staleness contract](#11-bounded-staleness-contract). + +### Diagram 10: Scenario G - create-after-404 timeline + +```mermaid +sequenceDiagram + autonumber + participant Op as Operator + participant C as Client + participant A as Replica A + participant B as Replica B + participant O as Origin + Note over A,B: t=0 K not yet uploaded + C->>A: GET /bucket/K + A->>O: Head(K) + O-->>A: 404 + Note over A: cache K -> 404
TTL = negative_metadata_ttl (60s) + A-->>C: 404 + Note over Op,O: t=30s operator uploads K + Op->>O: PUT /bucket/K + Note over A,B: t=45s drain period + C->>B: GET /bucket/K (LB routes to B) + B->>O: Head(K) + O-->>B: 200 + ETag + B->>O: GetRange (fill path) + O-->>B: bytes + B-->>C: 200 + bytes + Note over A,B: inconsistent results across replicas during drain + C->>A: GET /bucket/K (LB routes to A again) + Note over A: negative entry still valid
age 45s less than 60s + A-->>C: 404 STALE + Note over A: t=60s+ negative entry expires + C->>A: GET /bucket/K (t=70s) + A->>O: Head(K) + O-->>A: 200 + ETag + A->>O: GetRange (fill path) + O-->>A: bytes + A-->>C: 200 + bytes + Note over A,B: drain complete - all replicas consistent +``` + +## 13. Eviction and capacity + +Two complementary mechanisms govern CacheStore footprint in v1: +**passive lifecycle eviction** (always on, driver-dependent) and +**optional active eviction** by the cache layer itself (opt-in, +access-frequency-driven). Operators choose one, the other, or both +depending on CacheStore driver and workload. + +### 13.1 Passive eviction (lifecycle) + +Eviction is delegated to the CacheStore's storage system in the +default v1 configuration. Recommended baseline is age-based +expiration on the chunk prefix with a TTL chosen to fit the +deployment's working set in the available capacity. Operators tune +the TTL based on `orca_origin_bytes_total` and capacity +utilization metrics exposed by the CacheStore. Because the +on-store path is namespaced by `origin_id` (s5), per-origin +lifecycle policies can be configured independently on the same +CacheStore bucket. + +**`cachestore/s3` deployments**: AWS S3, MinIO, and VAST all +support bucket lifecycle policies for age-based expiration. +Configure the lifecycle directly on the bucket (or delegate to the +in-DC object store's tooling). + +**`cachestore/posixfs` deployments**: shared POSIX filesystems +(NFSv4.1+, Weka native, CephFS, Lustre, GPFS) do not provide +native object-lifecycle policies. Two options for posixfs: +- **External sweep**: schedule an age-based sweep against + `//` from cron or a Kubernetes `CronJob` (e.g. + `find / -type f -atime + -delete`). The + sweep runs out-of-band; `CacheStore.GetChunk` on a swept entry + returns `ErrNotFound` and re-enters the miss-fill path. + Operators SHOULD NOT sweep the staging subdirectory + `/.staging/` - that is managed by the driver's own + background sweep (`cachestore.posixfs.staging_max_age`, default + 1h, s10.1.2). +- **Active eviction** (s13.2): enable the cache layer's + access-frequency-driven eviction loop. This is the recommended + posixfs path when external sweep tooling is impractical. + +### 13.2 Active eviction (opt-in, access-frequency) + +When `chunk_catalog.active_eviction.enabled: true` (default +`false`), each replica runs a background eviction loop that +deletes cold chunks from BOTH the in-memory `ChunkCatalog` AND +the CacheStore. The decision uses **access-frequency tracking** +recorded in the catalog on every `Lookup` hit. 
+ +**Per-entry tracking** added by FW8 to each `ChunkCatalogEntry`: + +```go +type ChunkCatalogEntry struct { + ChunkInfo + AccessCount uint32 // increments on each Lookup hit; + // saturates at MaxUint32 (practically + // unreachable) + LastAccessed time.Time // updated on each Lookup hit + LastEntered time.Time // set on Record; never updated +} +``` + +**Eviction policy**: a chunk is eligible for active eviction when +ALL of: +- `now - LastAccessed > inactive_threshold` (default 24h) +- `AccessCount < access_threshold` (default 5) +- `now - LastEntered >= min_age` (default 5m, cold-start protection + preventing newly-recorded entries from being evicted before they + accumulate hits) + +**Score** for ordering candidates (lowest first = most evictable): +- primary: `AccessCount` +- tiebreak: oldest `LastAccessed` + +**Loop**: every `eviction_interval` (default 10m), scan the +catalog, identify eligible candidates, sort by score, evict up to +`max_evictions_per_run` (default 1000) per cycle. For each +evicted entry: call `CacheStore.Delete(k)`, then +`ChunkCatalog.Forget(k)` on success. Bounded per-run cost +prevents pathological delete-storms on a large catalog; the next +cycle catches the remainder. + +**Failure handling**: +- `Delete` returns `ErrNotFound` (already gone) - treat as success + and Forget. +- `Delete` returns `ErrTransient` - do NOT Forget; retry next + cycle. Counter feeds the existing per-process circuit breaker + (s10.2). +- `Delete` returns `ErrAuth` - stop the entire run; do NOT + Forget; metric increments. Circuit breaker integrates as usual. +- Circuit breaker open - skip the eviction run entirely + (`active_eviction_runs_total{result="breaker_open"}++`) to + avoid amplifying load against a degraded backend. + +**Counter saturation, no decay in v1**: AccessCount is `uint32` +and saturates at ~4 billion (practically unreachable). New entries +start at 0 and must compete with old popular entries once past +`min_age`. The cold-start protection covers this; for steady-state +workloads the relative ordering remains correct. + +### 13.3 ChunkCatalog size awareness (load-bearing operational note) + +The ChunkCatalog is the active-eviction policy's window into +chunk activity. Its size relative to the CacheStore working set +determines eviction quality: + +- **catalog == working set**: full visibility; eviction policy + considers every chunk; quality is optimal. +- **catalog < working set**: many chunks live in the CacheStore + but are NOT tracked by the catalog. They cannot be considered + for active eviction; they live indefinitely until external + lifecycle (if any) cleans them up. Active eviction has + incomplete visibility; effective behavior is "evict from the + visible subset only". +- **catalog > working set**: wasted RAM but no correctness or + eviction-quality cost. 
+ +**Sizing guidance for operators**: + +``` +target_catalog_entries = 1.2 * estimated_active_working_set_chunks + (where chunk = chunk_size, default 8 MiB) + +memory_estimate = target_catalog_entries * ~120 bytes/entry +``` + +| Active working set | Chunks at 8 MiB | Catalog entries | RAM (~120 B/entry) | +|---|---|---|---| +| 100 GiB | ~13K | 16K | ~2 MB | +| 1 TiB | ~130K | 160K | ~20 MB | +| 10 TiB | ~1.3M | 1.6M | ~190 MB | +| 100 TiB | ~13M | 16M | ~1.9 GB | + +For very large working sets (>1 PiB at 8 MiB chunks), operators +should consider one of: +- larger `chunk_size` (e.g., 16 MiB) to reduce catalog entry count + by half (note: changing `chunk_size` orphans the existing chunk + set, see s5); +- disabling active eviction and relying on CacheStore lifecycle + exclusively (the default v1 posture); +- a future external/persistent catalog (deferred future work, + not in v1). + +**Metrics for detecting undersizing**: +- `orca_chunk_catalog_hit_rate` (derived from `_hit_total`): + sustained < 0.7 suggests undersizing. +- `orca_chunk_catalog_evict_total{reason="size"}`: high + rate means LRU eviction is fighting the access-frequency policy; + catalog is too small. +- `orca_chunk_catalog_entries`: pinned at `max_entries` + may indicate undersizing. + +### 13.4 Spool capacity + +The local **spool** (s8.2) is bounded by `spool.max_bytes`; +full-spool conditions block new fills briefly, then return `503 +Slow Down` to clients. Spool entries are released as soon as +in-flight readers drain. Spool capacity is independent of the +ChunkCatalog and CacheStore footprint. + +### 13.5 `chunk_size` config-change capacity impact + +See the operational note in [s5](#5-chunk-model): changing +`chunk_size` orphans the existing chunk set under the old size; +storage transiently doubles and the working set is rebuilt at the +new size on demand. The CacheStore lifecycle policy (or, on +posixfs with active eviction enabled, the access-frequency loop +detecting the orphans as cold) ages the orphaned chunks out. + +### 13.6 Eviction interactions + +Operators using BOTH passive lifecycle AND active eviction need +to understand the interaction: +- Lifecycle deletes a chunk -> active eviction sees `ErrNotFound` + on `Delete`; treats as success. No conflict. +- Active eviction deletes a chunk -> lifecycle sees it gone. No + conflict. +- Both aggressive on the same chunk -> "double eviction" with no + correctness impact, but the chunk is gone slightly faster than + either policy alone would have removed it. Operators should + pick one as the primary mechanism and configure the other as + defense-in-depth (e.g., long lifecycle TTL + short active + eviction `inactive_threshold`). + +## 14. Horizontal scale + +Cluster membership comes from the headless Service: an A-record lookup +returns the IPs of all Ready pods backing the Service. Cluster code +consumes that list, refreshes it on a configurable interval (default 5s), +and rendezvous-hashes `ChunkKey` against pod IPs to select a coordinator +**per chunk**. The replica that received the client request acts as the +**assembler** (s8.3): for each chunk in the requested range, it serves +from CacheStore on hit, performs a local singleflight + tee + spool + +commit if it is the coordinator, or issues a per-chunk +`GET /internal/fill?key=` to the coordinator on the coordinator's +internal mTLS listener (s8.8). The assembler stitches returned bytes into +the client response, slicing the first and last chunk to match the +client `Range`. 
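+
+As a minimal sketch only (the hash function, string-typed keys/IPs, and
+names are illustrative assumptions, not the shipped implementation), the
+per-chunk coordinator selection from Diagram 11 is plain rendezvous /
+highest-random-weight hashing over the current peer-IP set:
+
+```go
+package cluster // hypothetical placement, for illustration only
+
+import "hash/fnv"
+
+// coordinator picks the owner of one chunk key: w(ip, k) = hash(ip || k),
+// take the argmax. FNV-1a is a stand-in hash for the sketch.
+func coordinator(peerIPs []string, selfIP, chunkKey string) string {
+    if len(peerIPs) == 0 {
+        // Empty/unavailable peer set (see the fallback below): treat
+        // self as the only peer so every fill runs locally.
+        return selfIP
+    }
+    var bestIP string
+    var bestW uint64
+    for _, ip := range peerIPs {
+        h := fnv.New64a()
+        h.Write([]byte(ip))
+        h.Write([]byte(chunkKey))
+        if w := h.Sum64(); bestIP == "" || w > bestW {
+            bestIP, bestW = ip, w
+        }
+    }
+    return bestIP
+}
+```
+
+Because every replica evaluates the same weights over the same peer set,
+any replica can answer "who coordinates chunk k" without coordination,
+and a membership change only remaps the keys whose argmax actually
+changes.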
+ +Pod names are not stable under a Deployment; we never address peers by +name, only by the IPs the headless Service publishes. + +We accept up to one duplicate fill per chunk during membership flux (e.g. +rolling restarts when a pod's IP changes); the duplicate-fill metric +makes that visible. + +Replication factor = 1 in v1 (cache loss is recoverable from origin). +Every replica sees the entire CacheStore. No replica owns bytes; +replica loss never strands data. + +**Empty / unavailable peer set.** If `Cluster.Peers()` returns an +empty set (the headless Service has no Ready endpoints, the DNS +record returns NXDOMAIN, or the kube-dns / CoreDNS path is broken), +the replica treats itself as the only peer: rendezvous hashing +returns self for every `ChunkKey` and all fills run locally. The +replica does NOT refuse to serve; cluster-wide deduplication +(s8.3) degrades to per-replica deduplication for the duration. A +subsequent successful DNS refresh re-introduces peers without +process restart. + +DNS-refresh outcomes are exposed as +`orca_cluster_dns_refresh_total{result="ok|fail|empty"}` and +the current peer-set size as `orca_cluster_peers` (gauge). +Boot-time failure is logged at WARN; sustained empty-peer state is +trivially observable from the gauge. The `/readyz` predicate +(s10.5) requires that **at least one** DNS refresh has succeeded +since boot; a totally broken DNS path therefore keeps the replica +NotReady and load balancers drain it, even though the empty-peer +local-fill fallback would otherwise let it serve. + +### Diagram 11: Membership & rendezvous hash + +```mermaid +flowchart LR + DNS["headless Service
A-record lookup
(every 5s)"] --> IPs["pod IP set:
[10.0.1.5,
10.0.1.6,
10.0.1.7]"] + Req["incoming request
ChunkKey k"] --> Hash["for each IP:
w(IP, k) = hash(IP || k)
argmax(w)"] + IPs --> Hash + Hash --> Coord["coordinator IP
(e.g. 10.0.1.6)"] + Coord --> Decide{"== self?"} + Decide -- "yes" --> Local["local fill path
(singleflight + tee + spool + commit)"] + Decide -- "no" --> Forward["GET /internal/fill?key=k
(mTLS, internal listener)"] +``` + +### Diagram 12: Scenario H - rolling restart membership flux + +```mermaid +sequenceDiagram + autonumber + participant A as Replica A + participant DNS as headless Service DNS + participant B as Replica B (old IP) + participant Bp as Replica B' (new IP) + participant CS as CacheStore + Note over A,B: t=0 peers (A's view) = {A, B}
chunk k owned by B + A->>DNS: refresh + DNS-->>A: [ip(A), ip(B)] + Note over B,Bp: t=5s rolling restart: B terminates,
B' starts with a new IP + Note over A: A's cached membership still {A, B}
until next refresh + A->>A: rendezvous(k, {A,B}) = B (stale) + A->>B: /internal/fill (connection refused) + A->>A: fallback: fill locally + A->>CS: PutObject(final, ..., If-None-Match: *) + Note over Bp: B' bootstraps, refreshes DNS
peers (B's view) = {A, B'} + Bp->>Bp: rendezvous(k, {A,B'}) = B' + Bp->>CS: PutObject(final, ..., If-None-Match: *) + CS-->>A: 200 commit_won + CS-->>Bp: 412 commit_lost + Note over A,Bp: duplicate_fills_total{commit_lost} += 1 + Note over A,DNS: t=10s A refreshes DNS
peers converge to {A, B'}
steady state restored +``` + +## 15. Deferred optimizations + +This section catalogs concerns that are intentionally NOT in v1. Each +entry names what is deferred, why v1 ships without it, what operational +evidence would justify building it, and a sketch of how it would fit +into the existing surface area. None of these items require breaking +changes to v1 interfaces. + +### 15.1 Edge rate limiting + +**What**: Per-client / per-IP / per-credential token-bucket rate +limiting at the S3 edge; '429 Too Many Requests' on exhaustion; +identity from auth subject (mTLS cert subject or bearer-token claim) +with source-IP fallback when no auth identity is established. + +**Why deferred**: v1 has implicit hot-client mitigation - the per- +replica origin semaphore (s8.4) and singleflight (s8.1) +coalesce concurrent identical work and cap cold-fill concurrency +regardless of caller. No measured noisy-neighbor evidence at v1 +scale; cost of building edge rate limiting (token-bucket per +identity, identity extraction, new HTTP error path, new metric) +outweighs the speculative benefit. + +**Trigger**: Operator reports a single client / credential is +measurably monopolizing TTFB or driving disproportionate origin +load past internal mechanisms. + +**Sketch (if built)**: Token bucket per identity in +`internal/orca/server/edgelimit/`; refill rate per identity +configurable; per-replica enforcement (no cluster-wide +coordination); returns `429 Too Many Requests` with +`Retry-After: 1s`. New metric +`orca_edge_ratelimit_total{identity,result}`. + +**Known v1 limitation**: documented gap. Multi-tenant deployments +worried about single-client monopolization should layer rate +limiting at an upstream proxy or LB until this lands. + +### 15.2 Cluster-wide HEAD singleflight + +**What**: A second coordinator role parallel to the chunk fill +coordinator (s8.3): rendezvous-hash on `(origin_id, bucket, key)` +to pick exactly one HEAD coordinator per object per cluster. New +`/internal/head` RPC. After: exactly one `Origin.Head` per object +per `metadata_ttl` window cluster-wide. + +**Why deferred**: Per-replica HEAD singleflight (s8.7) caps +cluster-wide HEAD load at `N * (objects / metadata_ttl)`. At +documented v1 scale (3-5 replicas, 5m TTL), this is well under +documented S3 / Azure HEAD rate limits. Savings only become +material at much larger scale. + +**Trigger**: any of: +- peer-set size exceeds ~10 replicas, AND keys cluster under + shared prefixes approaching per-prefix rate limits (5500/sec on + AWS S3); +- `metadata_ttl` configured short enough that HEAD storms repeat + frequently; +- operator measures HEAD throttling on origin. + +**Sketch (if built)**: New `ObjectKey = {origin_id, bucket, +object_key}` type. New `Cluster.HeadCoordinator(ObjectKey) Peer` +parallel to `Coordinator(ChunkKey) Peer`. New +`InternalClient.Head(ctx, ObjectKey) (ObjectInfo, error)`. New +endpoint `GET /internal/head?origin_id=...&bucket=...&key=...` on +existing internal listener (s8.8); reuses mTLS + peer-IP authz. +Same `409 Conflict` membership-flux fallback as chunk fill. +Coordinator-unreachable degrades to local `Origin.Head`. New +`cluster_internal_head_*` metrics. The bounded-freshness mode +(s11.2) would naturally route its background HEADs through this +same coordinator pattern. + +**Known v1 bound**: at N replicas and `metadata_ttl=5m`, cold +popular-key fan-out generates **N HEADs per object per 5 minutes +cluster-wide**. Documented and acceptable at v1 scale. 
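+
+For concreteness, the interface additions named in the sketch above
+might take the following shape. This is a hypothetical outline of
+deferred work, not v1 code; the placeholder type declarations exist only
+to make the sketch self-contained:
+
+```go
+package cluster // hypothetical placement, for illustration only
+
+// Placeholder stand-ins for types that live in internal/orca (s7).
+type (
+    Peer     struct{ IP string }
+    ChunkKey struct{ Canonical string }
+)
+
+// ObjectKey is the identity the deferred HEAD coordinator would
+// rendezvous-hash on, parallel to ChunkKey for chunk fills.
+type ObjectKey struct {
+    OriginID string
+    Bucket   string
+    Key      string
+}
+
+// Cluster would gain HeadCoordinator alongside the existing
+// Coordinator; both use the same rendezvous hash over the peer set.
+type Cluster interface {
+    Coordinator(k ChunkKey) Peer      // existing v1 surface
+    HeadCoordinator(k ObjectKey) Peer // deferred (this entry)
+}
+```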
+ +### 15.3 Cluster-wide LIST coordinator + +**What**: Extend FW2's coordinator pattern to LIST: rendezvous- +hash on the full LIST query tuple `(origin_id, bucket, prefix, +continuation_token, start_after, delimiter, max_keys)` to pick +one coordinator per query per cluster. New `/internal/list` RPC. +Coordinator's per-replica LIST cache (s6.2) becomes the de facto +cluster cache. After: exactly one `Origin.List` per identical +query per `list_cache.ttl` cluster-wide. + +**Why deferred**: v1 ships with per-replica LIST cache (s6.2, +default 60s TTL). For the documented FUSE-`ls` workload, FUSE +clients are typically pinned to one replica via HTTP/2 keepalive, +making per-replica caching naturally effective for any single +client. Across many clients sharing prefixes, per-replica caching +holds origin LIST load to N per popular prefix per +`list_cache.ttl` window - well under any documented rate limit +at v1 scale. + +**Trigger**: any of: +- peer-set size exceeds ~10 replicas, AND +- highly-shared FUSE prefixes, AND +- tight `ls` latency budgets (so the additional 5-20ms internal- + RPC hop is acceptable in trade for reduced origin load); +- OR operator measures sustained LIST throttling on origin. + +**Sketch (if built)**: Symmetric to s15.2. New +`Cluster.ListCoordinator(ListKey) Peer`. New +`InternalClient.List` RPC. Coordinator runs the LIST cache and +the existing per-replica LIST singleflight; non-coordinators +route to it on cache miss. Same `409 Conflict` membership-flux +fallback. Coordinator-unreachable degrades to local +`Origin.List`. The internal-RPC latency overhead matters more +for FUSE-`ls` than chunk fills, so caching at the coordinator +must be aggressive (TTL >= 60s). + +**Known v1 bound**: cluster-wide LIST load is up to N origin LIST +calls per identical query per `list_cache.ttl` window where N is +peer count. Acceptable at v1 scale. + +### 15.4 Mid-stream origin resume + +**What**: After the commit boundary (s8.6 / s6 step 6) the v1 cache +streams origin bytes directly to the client. If the origin +connection breaks mid-chunk, the response aborts (HTTP/2 +`RST_STREAM` or HTTP/1.1 `Connection: close`); the S3 SDK detects +the `Content-Length` mismatch and retries. Mid-stream origin +resume would replace the abort with a transparent re-issue: the +leader tracks bytes sent to client; on origin disconnect, it +re-issues `Origin.GetRange` with `Range: bytes=-` (and +the same `If-Match: `) and continues feeding the client +without ever showing an error. + +**Why deferred**: v1 relies on the SDK retry behavior (every +mainstream S3 client handles this case correctly) which is +acceptable for the documented workload. Mid-stream resume +requires non-trivial state tracking (bytes-sent counter, retry +budget for the resume itself, interaction with the singleflight +joiner state), and the abort case is handled by the SDK so the +operational impact is small. + +**Trigger**: any of: +- mid-stream client aborts measurably impact tail TTFB on the + documented workload (visible via + `responses_aborted_total{phase="mid_stream"}` rate); +- workload uses non-S3-compatible clients without robust retry + (uncommon); +- post-commit origin failures are systematically more frequent + than pre-commit (e.g., long-tail origin connections that + succeed initially then drop). + +**Sketch (if built)**: extend `fetch.Coordinator` to track +`bytesSent` per fill. 
On `Origin.GetRange` error after the commit +boundary, retry origin with `Range: bytes=-` (within +the requested chunk's range; bounded by a separate +`origin.resume.attempts` budget, e.g. 1-2 attempts). Joiners reading +through the leader's tee transparently see the gap closed. The +spool tee continues unaffected; the resumed bytes flow through +the same ring buffer + spool. New metric: +`orca_origin_resume_total{result="success|exhausted|error"}`. + +**Known v1 bound**: post-commit origin failures abort the client +response; client SDK retries from scratch +(`responses_aborted_total{phase="mid_stream"}` increments). +Acceptable for the documented workload at v1 scale. + +### 15.5 Coordinated cluster-wide origin limiter + +**What**: Replace the per-replica static cap (s8.4) with a true +cluster-wide cap on concurrent `Origin.GetRange` calls. Mechanism: +Kubernetes-Lease-elected **limiter authority** + in-memory +counting semaphore at the elected leader + slot-lease tokens +(batched) issued over an internal RPC + per-peer local bucket +that auto-refills + graceful fallback to the v1 per-replica +static cap when the authority is unreachable. + +**Why deferred**: at documented v1 scale (3-5 replicas), the +per-replica static cap (s8.4) is approximate but acceptable; +cluster-wide concurrency tracks `target_global` within a small +margin during steady state, and the pre-header retry loop (s8.6) +handles origin throttling responses (`503 SlowDown` / `429`) +self-correctingly. The K8s Lease design adds substantial surface +area (election machinery, slot-lease tokens, batching, fallback +mode, RBAC, ~12 metrics, ~10 tests, an additional `Limiter` +interface plus `LimiterToken` type, three new internal RPC +endpoints) that is not justified at v1 scale. Reviewer feedback +flagged the cumulative complexity as not earning its keep. + +**Trigger**: any of: +- peer-set size grows past ~10 replicas, AND measured steady- + state slot under-utilization (one replica saturated while + others are idle for the same hot work) is causing + `503 Slow Down` to clients; +- operator requires a hard cluster-wide cap (e.g., dedicated + origin pipe sized for X concurrent connections; cost-sensitive + deployment cannot tolerate the static cap's worst-case + overshoot); +- origin imposes an account-wide rate limit (rather than + per-prefix) that the static cap would routinely exceed. + +**Sketch (if built)**: + +- **Election**: standard `client-go/tools/leaderelection` against + a single `coordination.k8s.io/v1.Lease` resource named e.g. + `orca-limiter` in the deployment's namespace. RBAC: + `get / list / watch / create / update / patch` on the named + Lease, scoped to the deployment's namespace. Steady-state K8s + API load: ~6-30 writes/min/deployment (the elected leader + renews; non-leaders do not write). + +- **Authority**: holds an in-memory counting semaphore of + `cluster.limiter.target_global` slots (default 192). Serves + three RPCs over the existing internal listener (s8.8): + `POST /internal/limiter/acquire` (issues a lease token holding + N batched slots; default `batch.size=8`, configurable; + `token.ttl=30s` wall-clock expiry); `POST /internal/limiter/extend` + (bumps an existing token's expiry; returns `unknown_token` or + `expired` if reclaimed); `POST /internal/limiter/release` + (returns slots; idempotent). Background sweep every 5s reclaims + expired tokens. 
+ +- **Peer**: each non-authority replica holds a small local bucket + of slots acquired in batches; auto-refill triggers when remaining + slots fall to or below `cluster.limiter.batch.refill_threshold` + (default 2). Tokens auto-extend when their age exceeds + `cluster.limiter.token.extend_at_ratio * token.ttl` (default + 0.5 * 30s = 15s). When the local bucket empties, the replica + releases the old token and acquires a fresh one. + +- **Authority changeover**: when the K8s Lease holder changes, + the new authority starts with an empty slot table while old + lease tokens at peers continue draining. Cluster-wide inflight + may transiently exceed `target_global` by up to one full set + of tokens; drains within `lease.duration + token.ttl` = + 45s worst case with defaults. Acceptable because the limiter + is a soft cap; correctness is unaffected. + +- **Fallback mode**: peer cannot reach authority -> activates the + v1 per-replica static cap (the same `floor(target_global / N)` + semaphore from s8.4). Transparent to the client. Reconnects + automatically on `cluster.limiter.fallback.check_interval` + (default 5s). Limiter authority unreachability is intentionally + NOT a `/readyz` predicate: replicas in fallback are still + serving correctly. + +- **Disable toggle**: `cluster.limiter.enabled: false` returns + the v1 per-replica static cap permanently. No K8s API access; + no Lease object created. Useful for deployments without RBAC + for the Lease resource, or for isolated debugging. + +- **New metrics**: `orca_limiter_state{role="authority|peer|fallback"}`, + `orca_limiter_target_global`, + `orca_limiter_slots_available` (authority-only), + `orca_limiter_slots_granted` (authority-only), + `orca_limiter_slots_local` (per-peer), + `orca_limiter_acquire_total{result}`, + `orca_limiter_acquire_duration_seconds`, + `orca_limiter_extend_total{result}`, + `orca_limiter_release_total`, + `orca_limiter_election_total{result}`, + `orca_limiter_lease_expired_total`, + `orca_limiter_fallback_active`. + +- **New interfaces in s7**: `Limiter` (`Acquire(ctx) (Slot, error)`, + `State() LimiterState`); `Slot` (`Release()`); `LimiterToken` + struct (`ID`, `Slots`, `ExpiresAt`); `InternalClient` gains + `LimiterAcquire`, `LimiterExtend`, `LimiterRelease`. + +- **Composition with [s15.6](#156-dynamic-per-replica-origin-cap)**: + the coordinated authority (this entry) and dynamic per-replica + recompute (s15.6) are orthogonal mechanisms. If both ever + ship, dynamic per-replica is the uncoordinated baseline that + coordination tightens further. + +**Known v1 limitation**: per-replica static cap; cluster-wide +concurrency tracks `target_global` only when `N_actual == +cluster.target_replicas`. Documented and acceptable at v1 +documented scale. + +### 15.6 Dynamic per-replica origin cap + +**What**: Derive `target_per_replica` at runtime from +`len(Cluster.Peers())` rather than from the static +`cluster.target_replicas` config knob. The per-replica origin +semaphore is resized on each membership-refresh, keeping +realized cluster-wide concurrency close to `target_global` +regardless of actual replica count. + +**Why deferred**: v1 ships with `cluster.target_replicas` as a +static config knob (s8.4). Static is simpler, deterministic, +and matches the operator's mental model when the deployment has +a stable replica count (the documented v1 target of 3-5 +replicas without HPA). 
Dynamic adds: + +- a resizable-semaphore primitive (the Go standard library and + `golang.org/x/sync/semaphore` both fix capacity at + construction; a custom wrapper is required, ~30-40 lines); +- a peer-change notification channel on the `Cluster` interface + (`PeersChanges() <-chan []Peer` or equivalent); +- a watcher goroutine that recomputes the cap on each membership + change; +- edge-case handling (empty peer set, current inflight exceeding + the new cap, rapid peer-set churn). + +Roughly 60-80 lines of code plus ~5 new tests. Modest in +isolation but composes with the broader complaint that the v1 +design has too many moving parts. + +**Trigger**: any of: + +- HPA-driven autoscaling produces frequent replica-count + changes; +- operators routinely scale the deployment without updating + `cluster.target_replicas`, leaving the realized cap + mis-sized; +- operator measures sustained over- or under-allocation against + `target_global` (sum of per-replica `origin_inflight` gauges + diverging persistently from `target_global`). + +**Sketch (if built)**: + +- `internal/orca/origin/semaphore.go`: resizable semaphore + wrapper with `Acquire(ctx)`, `Release()`, `SetCapacity(n)`. +- `Cluster` interface gains a peer-change notification surface + (channel or callback). +- Watcher goroutine recomputes on each membership change: + `target_per_replica = floor(target_global / max(1, len(peers)))`. + The `max(1, ...)` matches the empty-peer fallback (s14): a + lone replica gets `target_global` slots, which is correct for + the last-replica-standing case. +- Edge cases: current inflight exceeds new cap (existing holders + complete naturally; new acquires queue against the new cap); + rapid peer-set churn (optional debouncing or rate-limiting on + `SetCapacity` calls). +- Composes naturally with [s15.5](#155-coordinated-cluster-wide-origin-limiter): + the coordinated authority (s15.5) and per-replica dynamic cap + (this entry) are orthogonal mechanisms; if both ever ship, + dynamic is the uncoordinated baseline that coordination + tightens further. + +**Known v1 limitation**: the static cap is approximate. Realized +cluster-wide concurrency depends on `N_actual`: + +- `N_actual > N_typical`: realized cap exceeds `target_global` by + up to `(N_actual - N_typical) * target_per_replica`. +- `N_actual < N_typical`: realized cap falls below `target_global` + by `(N_typical - N_actual) * target_per_replica`. + +Over-allocation may stress origin; under-allocation wastes +capacity. Operators MUST update `cluster.target_replicas` after +any sustained scale change. diff --git a/design/orca/plan.md b/design/orca/plan.md new file mode 100644 index 00000000..e1ef33d3 --- /dev/null +++ b/design/orca/plan.md @@ -0,0 +1,1554 @@ +# Orca - Origin Cache - Implementation & Operations Plan + +Status: draft for review (round 2 incorporating reviewer feedback) +Owner: TBD +Targets: Phase 0 walking skeleton in this repo, growing to multi-PB multi-replica cluster + +> Mechanism, decisions, internal interfaces, and flow diagrams: see [design.md](./design.md). +> Terminology and component glossary: see [design.md#3-terminology](./design.md#3-terminology). + +--- + +## 1. Goal + +Ship a read-only S3-compatible blob caching layer ("Orca") inside an +on-prem datacenter, fronting cloud blob storage (AWS S3 + Azure Blob). +Clients issue range reads against Orca; Orca serves from a +shared in-DC store when present, otherwise fetches from the cloud origin, +stores the chunk, and returns it. There is no client-initiated write path. 
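+
+For orientation, a client-side range read against Orca might look like
+the following (the endpoint, the path-style `/bucket/key` layout, and
+the `Authorization: Bearer` header are assumptions for illustration;
+the client API surface is specified in section 2 and auth in section 4):
+
+```go
+// Sketch of a ranged GetObject issued directly over HTTP against the
+// Orca client edge; the response is the same whether the bytes came
+// from the in-DC CacheStore (hit) or were just filled from the origin.
+package main
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+)
+
+func main() {
+	req, err := http.NewRequest(http.MethodGet,
+		"https://orca.dc.example.internal:8443/example-data/datasets/train-000.bin", nil)
+	if err != nil {
+		panic(err)
+	}
+	req.Header.Set("Range", "bytes=0-8388607") // first 8 MiB
+	req.Header.Set("Authorization", "Bearer "+os.Getenv("ORCA_TOKEN"))
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		panic(err)
+	}
+	defer resp.Body.Close()
+
+	fmt.Println(resp.Status, resp.Header.Get("Content-Range")) // 206 Partial Content
+	n, _ := io.Copy(io.Discard, resp.Body)
+	fmt.Println("read", n, "bytes")
+}
+```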
+ +This document covers deliverable scope, repo layout, configuration, auth, +observability, phasing, testing, risks, and the approval checklist. The +mechanism that delivers this behavior is described in +[design.md](./design.md). + +## 2. Scope + +In scope (v1): + +- Read-only S3-compatible client API: `GetObject` (with `Range`), + `HeadObject`, `ListObjectsV2`. +- Origin adapters for AWS S3 and Azure Blob (Block Blobs only - see + [design.md#9-azure-adapter-block-blob-only](./design.md#9-azure-adapter-block-blob-only)). +- Pluggable backing store ("CacheStore"): local filesystem for development; + in-DC S3-compatible store (e.g. VAST) for production. +- Fixed-size chunking with stampede protection (singleflight + tee + + spool). +- ETag-based immutable-blob model with strict `If-Match` enforcement on + every origin range read - see + [design.md#8-stampede-protection](./design.md#8-stampede-protection). +- Sequential read-ahead. +- Single-tenant deployment, network-perimeter trust (bearer / mTLS) on the + client edge, separate internal mTLS listener for inter-replica RPCs, no + SigV4 verification in v1. +- Multi-replica Kubernetes Deployment from day one. All replicas share a + single in-DC CacheStore; rendezvous hashing on `ChunkKey` selects the + coordinator for miss-fills; the receiving replica is the assembler that + fans out per-chunk fill RPCs. +- Observable (Prometheus), operable (health probes, manifests, container + image), testable in CI against `minio` and `azurite`. + +Out of scope (v1): + +- Writes, multipart uploads, object versioning. +- Cross-DC cache peering. +- S3 SigV4 verification on the client edge. +- Multi-tenant quotas and per-tenant credentials. +- Mutable-blob invalidation / origin event subscriptions. +- Encryption at rest beyond what the underlying CacheStore provides. + +## 3. Repo layout (mirrors `machina`) + +``` +cmd/orca/ + main.go # thin wrapper -> orca.Run() + orca/ + orca.go # cobra root, config load, wiring + server/ # S3-compatible HTTP handlers (client edge) + internal/ # internal listener handlers + # GET /internal/fill?key= +internal/orca/ + types.go # ChunkKey, ObjectInfo, ChunkInfo, Config + chunker/ # range <-> chunk math (streaming iterator) + fetch/ # Coordinator: meta + chunk SF, semaphore, + # assembler fan-out, internal RPC client + spool/ # bounded local-disk staging area for in-flight + # fills; slow-joiner fallback regardless of + # CacheStore driver + chunkcatalog/ # in-memory LRU fronting CacheStore.Stat + cachestore/ + localfs/ # dev; link()/renameat2(RENAME_NOREPLACE); + # uses internal/posixcommon for staging, + # link-commit, dir-fsync helpers + posixfs/ # prod; shared POSIX FS (NFSv4.1+ baseline, + # plus Weka native, CephFS, Lustre, GPFS); + # same primitive as localfs via posixcommon; + # adds backend detection, NFS minimum-version + # gate, Alluxio-FUSE refusal, fan-out path + # layout, SelfTestAtomicCommit at startup + s3/ # VAST and other in-DC S3-like stores; + # PutObject + If-None-Match: *; + # SelfTestAtomicCommit at startup + internal/ + posixcommon/ # shared link()/EEXIST commit primitive, + # staging-dir layout, dir-fsync, optional + # 2-char hex fan-out; consumed by + # cachestore/localfs and cachestore/posixfs + # only; not visible above the cachestore + # package boundary + origin/ + types.go # Origin interface, error types incl. 
+ # OriginETagChangedError, UnsupportedBlobTypeError + s3/ # If-Match: on every GetRange + azureblob/ # Block Blob only; If-Match on Get Blob + singleflight/ # per-key in-flight dedupe + tee + cluster/ # membership refresh from headless Service + # DNS (default 5s); rendezvous hashing on + # pod IP; per-chunk internal fill RPC + # client + server helpers + auth/ # bearer / mTLS verification (client edge); + # internal-listener mTLS + peer-IP authz + metrics/ # Prometheus collectors +deploy/orca/ + 01-namespace.yaml.tmpl + 02-rbac.yaml.tmpl + 03-config.yaml.tmpl + 04-deployment.yaml.tmpl # exposes container ports 8443 (client), + # 8444 (internal), 9090 (metrics) + 05-service.yaml.tmpl # headless service for membership + 06-service-clientvip.yaml.tmpl # ClusterIP for client traffic + 07-networkpolicy.yaml.tmpl # restricts ingress on :8444 to pods + # labelled app=orca in-namespace; + # rendered only when + # networkpolicy.enabled=true (omit in dev) + # 08-storage-pvc.yaml.tmpl - RESERVED for Phase 2 cachestore/posixfs + # deployments that wire the shared FS in via + # a PVC + CSI driver rather than a kubelet + # mount or hostPath; content deferred + dev/ # dev-only manifests overlay + 01-localstack-deployment.yaml # LocalStack pod (ephemeral; no PVC); + # pinned to localstack/localstack:3.8 + # (community) + 02-localstack-service.yaml # ClusterIP exposing :4566 + 03-localstack-init-job.yaml # Job that creates the chunks bucket + # via awslocal at bring-up + embed.go + rendered/ # gitignored, produced by render-manifests +images/orca/ + Containerfile +design/orca/ + plan.md # this file + design.md # mechanism + flow diagrams + brief.md # stakeholder-facing brief +hack/orca/ + Makefile # dev-cluster targets: up, down, reset, + # render, port-forward, status, logs, + # seed-azure (real Azure only). + # Top-level Makefile may add `orca-` + # prefixed proxies that invoke + # `make -C hack/orca ` + # (matches the hack/net/ convention). + dev-harness.md # how to use the dev harness in Kind + # (LocalStack as cachestore/s3, real Azure + # as origin) + inttest.md # integration test guide for + # internal/orca/inttest/ + up.sh # kind create + image build + load + render + # manifests + apply + wait-for-ready + down.sh # kind delete cluster + reset.sh # rebuild image + kind load + rollout + # restart + clear-cache.sh # delete LocalStack pod (recreated; cache + # state wiped without rebuilding the + # cluster) + seed-azure.sh # generate small/medium/large blobs and + # upload to the configured Azure account + port-forward.sh # kubectl port-forward orca client + # service to localhost + sample-get.sh, sample-list.sh # example S3 client invocations + logs.sh # tail logs across replicas + .env.example # AZURE_STORAGE_ACCOUNT, AZURE_STORAGE_KEY, + # AZURE_CONTAINER, ORCA_REPLICAS, + # ORCA_IMAGE_TAG + kind-config.yaml # 1 control + 3 worker nodes (one Orca + # replica per worker via required + # anti-affinity) +``` + +`Makefile` additions: `orca`, `orca-build`, `orca-image`, +`orca-manifests`. `make` continues to build everything. + +## 4. Auth (v1) + +Two listeners with two distinct trust roots. + +### 4.1 Client edge listener (default `:8443`) + +- Bearer token middleware: HMAC token validated against a shared secret in + a Kubernetes Secret. +- Optional mTLS: client cert validated against a configured **client CA + bundle** (`server.tls.client_ca_file`). +- Pluggable so SigV4 verification can land later without rewriting the + request pipeline. 
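+
+A minimal sketch of that pluggable seam, assuming an `Authenticator`
+interface, an `Authorization: Bearer` header, and a 403 on failure
+(all illustrative; the plan does not fix the middleware's API):
+
+```go
+// Sketch: pluggable client-edge authenticator so SigV4 verification can
+// slot in later without rewriting the request pipeline.
+package auth
+
+import (
+	"crypto/subtle"
+	"errors"
+	"net/http"
+	"strings"
+)
+
+var errUnauthorized = errors.New("orca: unauthorized")
+
+// Authenticator is the hypothetical seam: bearer today, SigV4 later.
+type Authenticator interface {
+	Authenticate(r *http.Request) error
+}
+
+// Bearer checks a shared-secret token loaded at startup from
+// server.auth.bearer_secret_file (mounted from a Kubernetes Secret).
+type Bearer struct {
+	secret []byte
+}
+
+func NewBearer(secret []byte) *Bearer { return &Bearer{secret: secret} }
+
+func (b *Bearer) Authenticate(r *http.Request) error {
+	const prefix = "Bearer "
+	h := r.Header.Get("Authorization")
+	if !strings.HasPrefix(h, prefix) {
+		return errUnauthorized
+	}
+	// Constant-time compare; whether the token is the raw shared secret or
+	// an HMAC derived from it is an implementation choice.
+	if subtle.ConstantTimeCompare([]byte(strings.TrimPrefix(h, prefix)), b.secret) != 1 {
+		return errUnauthorized
+	}
+	return nil
+}
+
+// Middleware guards the S3-compatible handler chain on the client edge.
+func Middleware(a Authenticator, next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if err := a.Authenticate(r); err != nil {
+			// S3-style clients generally expect 403 AccessDenied; XML body omitted.
+			http.Error(w, "AccessDenied", http.StatusForbidden)
+			return
+		}
+		next.ServeHTTP(w, r)
+	})
+}
+```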
+ +### 4.2 Internal listener (default `:8444`) + +Serves `GET /internal/fill?key=` for per-chunk fill RPCs +between replicas. Implementation follows +[design.md#88-internal-rpc-listener](./design.md#88-internal-rpc-listener). + +- Transport: HTTP/2 over mTLS. +- Server cert: per-replica cert (e.g. cert-manager-issued) chained to a + configured **internal CA** (`cluster.internal_tls.ca_file`). The + internal CA is **distinct** from the client mTLS CA so a leaked client + cert cannot be used to dial the internal listener. +- Client auth: peer presents a client cert chained to the internal CA AND + the peer's source IP must be in the current peer-IP set + (`Cluster.Peers()`). +- NetworkPolicy (`07-networkpolicy.yaml.tmpl`) restricts ingress on `:8444` + to pods with label `app=orca` in the same namespace. +- Loop prevention: receiver enforces `X-Origincache-Internal: 1` and + self-checks `Cluster.Coordinator(k) == Self()`; on disagreement returns + `409 Conflict` and the assembler falls back to local fill (one duplicate + fill possible during membership flux, observable via + `orca_origin_duplicate_fills_total{result="commit_lost"}`). + +## 5. Configuration shape + +```yaml +server: + listen: 0.0.0.0:8443 + max_response_bytes: 0 # 0 = no cap; >0 returns + # 400 RequestSizeExceedsLimit + # (S3-style XML) with header + # x-orca-cap-exceeded: true + # before any cache lookup. + # 416 is reserved for true + # Range vs. object-size violations. + tls: + cert_file: /etc/orca/tls/tls.crt + key_file: /etc/orca/tls/tls.key + client_ca_file: /etc/orca/tls/client-ca.crt # optional, enables mTLS + auth: + enabled: true # production: true. Dev: + # set false to disable client + # auth entirely (no token / + # cert required). NOT a + # dev_mode flag - just an + # auth-on/off knob. + mode: bearer # bearer | mtls | both + # (only meaningful when + # enabled=true) + bearer_secret_file: /etc/orca/secret/token + +readyz: + errauth_consecutive_threshold: 3 # mark NotReady after this many + # consecutive CacheStore ErrAuth; + # one non-ErrAuth success resets + +metadata_ttl: 5m # bounded-staleness window + # (design.md#11-bounded-staleness-contract); + # default 5m. Upper bound on + # serving stale ETag if the + # immutable-origin contract + # is violated by an operator. + +negative_metadata_ttl: 60s # negative-cache window + # (design.md#12-create-after-404-and-negative-cache-lifecycle); + # default 60s. Upper bound on + # serving stale 404 / unsupported- + # blob-type after the operator + # uploads a previously-missing + # key. Independent of metadata_ttl; + # short by design so create-after-404 + # recovery is fast. 
+ +chunking: + size: 8MiB # 4-16 MiB + prefetch: + enabled: true + depth: 4 + max_inflight_per_blob: 8 + max_inflight_global: 256 + +list_cache: # per-replica TTL'd cache + # of Origin.List responses; + # sized for FUSE-`ls` workload + # (design.md s6.2 / FW3) + enabled: true # default true; toggle off + # for diagnostics + ttl: 60s # default 60s; configurable + # 5s - 30m typical range + max_entries: 1024 # bounded LRU + max_response_bytes: 1MiB # responses larger than this + # bypass the cache entirely + swr_enabled: false # stale-while-revalidate; + # off by default + swr_threshold_ratio: 0.5 # background refresh trigger + # when entry age > ratio * ttl; + # only meaningful when + # swr_enabled=true + +chunk_catalog: # in-memory chunk presence + # cache + access tracking + # (design.md s10.2 / s13.2) + max_entries: 100000 # default 100K (~12 MB at + # ~120B/entry); SIZE TO + # WORKING SET per s13.3 + active_eviction: + enabled: false # default false; opt-in + # (preserves v1 lifecycle- + # only behavior); enable + # for posixfs deployments + # without external sweep + interval: 10m # eviction loop period + inactive_threshold: 24h # entry must be older than + # this since last access + access_threshold: 5 # evict only if AccessCount + # < threshold + min_age: 5m # cold-start protection; + # never evict entries + # younger than this + max_evictions_per_run: 1000 # bound per-cycle work + +metadata_refresh: # opt-in bounded-freshness + # mode (design.md s11.2 / + # FW5); proactively re-Heads + # hot keys ahead of + # metadata_ttl + enabled: false # default false; preserves + # "trust the contract" + # posture + interval: 1m # refresh-loop period + refresh_ahead_ratio: 0.7 # eligible when entry age + # >= ratio * metadata_ttl + # (default 0.7 * 5m = 3.5m) + access_threshold: 5 # only refresh hot keys + # (AccessCount >= threshold) + min_age: 75s # cold-start protection; + # never refresh entries + # younger than this + # (default = metadata_ttl/4) + max_refreshes_per_run: 100 # bound per-cycle work + refresh_concurrency: 8 # parallel refresh workers + +spool: + dir: /var/lib/orca/spool # bounded local-disk staging + max_bytes: 8GiB # full-spool -> 503 Slow Down + max_inflight: 64 # concurrent fills using spool + tmp_max_age: 1h # crash-recovery sweep age + require_local_fs: true # boot statfs(2) check; refuse + # to start if spool.dir is on + # NFS/SMB/CephFS/Lustre/GPFS/ + # FUSE. Defense-in-depth: the + # spool is no longer on the + # client TTFB path in v1, but + # joiner-fallback latency + # benefits materially from + # local block storage. + # Operators with unusual + # placements MAY relax to + # false; production deploys + # are expected to keep the + # default. + # See design.md#104-spool-locality-contract. + +origin: # leader-side pre-header + # retry budget; transient + # origin failures retry + # invisibly to the client + # before HTTP response + # headers are committed + # (design.md s8.6 / Option D) + retry: + attempts: 3 # max attempts before giving + # up and returning 502 + # OriginRetryExhausted + backoff_initial: 100ms # initial backoff + backoff_max: 2s # capped backoff per attempt + max_total_duration: 5s # absolute wall-clock cap; + # 502 if exhausted regardless + # of attempt count. Bounded + # well below typical S3 SDK + # read timeouts (aws-sdk-go + # 30s; boto3 60s) so retries + # complete before clients + # time out. 
+ +cachestore: + driver: localfs # localfs | posixfs | s3 + localfs: + root: /var/lib/orca/chunks + staging_max_age: 1h # sweep /.staging/ + # entries older than this; staging + # MUST live inside to keep + # link()/renameat2 atomic on the + # same filesystem + posixfs: # shared POSIX FS backend; same + # link()/EEXIST primitive as + # localfs but mounted on every + # replica at the same path + root: /mnt/orca/chunks # mount point + base dir; MUST + # be the same on every replica + staging_max_age: 1h # sweep /.staging/ + # entries older than this + fanout_chars: 2 # 2-char hex fan-out under + # / to bound dir + # sizes; 0 disables. localfs + # does NOT enable this by + # default; posixfs does. + backend_type: "" # "" = auto-detect via + # statfs(2) f_type + /proc/mounts + # (nfs|wekafs|ceph|lustre|gpfs|...); + # operator override allowed for + # backends with ambiguous magic + # numbers, logged loudly. + nfs: + minimum_version: "4.1" # refuse to start if mount + # negotiates a lower NFS version; + # see design.md#1012-cachestoreposixfs + allow_v3: false # opt-in NFSv3 with loud warning + # and posixfs_nfs_v3_optin_total++; + # NEVER set true in production + mount_check: true # parse /proc/mounts at boot to + # confirm vers= and sync export + # options; warn (not refuse) on + # async export + require_atomic_link_self_test: true # SelfTestAtomicCommit at startup; + # refuse to start if backend + # does not honor link()/EEXIST, + # directory fsync, or size verify + # via re-stat. Never disabled in + # production. + s3: + endpoint: https://vast.dc.example.internal + bucket: orca-chunks + region: us-east-1 + credentials_file: /etc/orca/cachestore-creds + atomic_commit_self_test: true # SelfTestAtomicCommit at + # startup; refuse to start if + # backend silently overwrites + # despite If-None-Match: * + require_unversioned_bucket: true # boot-time GetBucketVersioning + # check (design.md s10.1.3); + # refuse to start if Status: + # Enabled or Suspended; + # required because + # If-None-Match: * is not + # honored on versioned buckets + # across all S3-compatible + # backends (notably VAST) + circuit_breaker: # per-process breaker around all + # CacheStore calls; trips on + # sustained ErrTransient/ErrAuth + # to prevent amplifying degradation + enabled: true + error_window: 30s + error_threshold: 10 # ErrTransient + ErrAuth count; + # ErrNotFound does NOT + open_duration: 30s + half_open_probes: 3 + +chunkcatalog: + max_entries: 1_000_000 # ~128 MiB at ~128 B/entry + +origin: + id: aws-us-east-1-prod # deployment-scoped origin + # identifier; required; + # baked into ChunkKey and the + # on-store path so two + # deployments can safely share + # one CacheStore bucket + target_global: 192 # desired cluster-wide cap + # on concurrent + # Origin.GetRange (design.md + # s8.4). Per-replica cap is + # floor(target_global / + # cluster.target_replicas). + # Realized cluster-wide cap + # tracks target_global only + # when actual replica count + # equals + # cluster.target_replicas. + # Coordinated cluster-wide + # limiter is deferred future + # work (design.md s15.5). 
+ queue_timeout: 5s # bounded wait when the + # per-replica bucket is + # saturated; on timeout the + # request returns 503 Slow + # Down so clients back off + driver: s3 # s3 | azureblob + s3: + region: us-east-1 + bucket: example-data + credentials: env # env | irsa | file + azureblob: + account: exampleacct + container: data + auth: managed-identity # managed-identity | sas | key + enforce_block_blob_only: true # locked true; setting false + # is rejected at startup + list_mode: filter # filter | passthrough + metadata_ttl: 5m + rejection_ttl: 5m + +cluster: + enabled: true + service: orca.orca.svc.cluster.local + port: 8443 # client edge port on peers + # (used only as a discovery + # convention; internal RPCs + # use internal_listen below) + membership_refresh: 5s # headless Service DNS poll + internal_listen: 0.0.0.0:8444 # per-chunk fill RPC listener + internal_tls: + enabled: true # production: true (mTLS). + # Dev: set false to listen + # plain HTTP/2; binary logs + # WARN at startup. NOT a + # dev_mode flag - just a + # security knob. + cert_file: /etc/orca/internal-tls/tls.crt + key_file: /etc/orca/internal-tls/tls.key + ca_file: /etc/orca/internal-tls/ca.crt # internal CA, distinct + # from client CA + server_name: orca..svc # stable SAN; pinned as + # tls.Config.ServerName by + # internal-RPC dialers + # (NOT pod IPs); per-replica + # certs MUST include this SAN + target_replicas: 3 # expected replica count; + # used to compute the + # per-replica origin + # concurrency cap + # (target_per_replica = + # floor(origin.target_global / + # cluster.target_replicas)) + # (design.md s8.4). + # MUST be updated after + # any sustained scale + # change. Dynamic recompute + # is deferred future work + # (design.md s15.6). +``` + +CacheStore eviction (TTL / lifecycle) is configured separately on the +underlying storage system and is not a cache-layer concern. See +`operations.md` for recommended baselines. + +## 6. Observability + +- Prometheus collectors: + - `orca_requests_total{op,status}` + - `orca_request_duration_seconds{op}` (histogram) + - `orca_responses_aborted_total{phase,reason}` -- mid-stream + aborts after first byte sent (HTTP/2 `RST_STREAM` or HTTP/1.1 + `Connection: close`); `phase` in `pre_first_byte|mid_stream` + - `orca_chunk_hits_total`, `orca_chunk_misses_total` + - `orca_chunkcatalog_hits_total`, `orca_chunkcatalog_misses_total` + - `orca_chunkcatalog_entries` + - `orca_cachestore_stat_total{result="present|absent|error"}` + - `orca_cachestore_stat_duration_seconds` (histogram) + - `orca_origin_requests_total{origin,op,status}` + - `orca_origin_bytes_total{origin}` + - `orca_origin_request_duration_seconds{origin,op}` (histogram) + - `orca_origin_rejected_total{origin,reason,blob_type}` + - `orca_origin_etag_changed_total{origin}` -- count of `412 + Precondition Failed` responses to `If-Match: ` GETs; + leading indicator of mid-flight overwrite or stale metadata cache + - `orca_origin_retry_total{result="success|exhausted_attempts|exhausted_duration|etag_changed"}` + -- one increment per request that entered the pre-header retry + loop ([design.md s8.6](./design.md#86-failure-handling-without-re-stampede)). + `success` = origin returned a first byte after some attempts; + `exhausted_attempts` = ran out of attempts within the time + budget -> 502 OriginRetryExhausted; + `exhausted_duration` = exceeded `origin.retry.max_total_duration` + -> 502 OriginRetryExhausted; + `etag_changed` = OriginETagChangedError (non-retryable) -> 502 + OriginETagChanged. 
Sustained non-zero `exhausted_*` rates + indicate origin health issues. + - `orca_origin_retry_attempts` -- histogram of attempt + count per request that entered the retry loop. p50 should be + 1 (first attempt succeeds); a long tail toward + `origin.retry.attempts` indicates degraded origin. + - `orca_responses_aborted_total{phase="pre_commit|mid_stream",reason}` + -- response abort counters. `pre_commit` covers errors before + response headers are sent (mostly diagnostic; the request + typically returns a clean HTTP error). `mid_stream` covers + aborts after the commit boundary (origin disconnect after + first byte) and is the metric to watch for the cost paid by + the v1 streaming design. Sustained non-zero `mid_stream` rate + is the trigger for considering mid-stream origin resume + ([design.md s15.4](./design.md#154-mid-stream-origin-resume)). + - `orca_origin_duplicate_fills_total{result="commit_won|commit_lost"}` + - increments at every CacheStore commit attempt. The `commit_lost` rate + quantifies cross-replica fill duplication that escaped coordinator + routing (e.g. during membership flux during rolling restart). See + [design.md#8-stampede-protection](./design.md#8-stampede-protection) + and [design.md#14-horizontal-scale](./design.md#14-horizontal-scale). + - `orca_inflight_fills` + - `orca_singleflight_joiners_total` + - `orca_spool_bytes` -- current spool footprint + - `orca_spool_evictions_total{reason="committed|aborted|full"}` + - `orca_cluster_internal_fill_requests_total{direction="sent|received|conflict"}` + -- `conflict` increments whenever the receiver returns `409 Conflict` + because of a coordinator-membership disagreement + - `orca_cluster_internal_fill_duration_seconds` (histogram) + - `orca_cluster_membership_size` + - `orca_cluster_membership_refresh_duration_seconds` (histogram) + - `orca_cachestore_self_test_total{result="ok|failed"}` -- + incremented once per process start by `SelfTestAtomicCommit` + - `orca_cachestore_errors_total{kind="not_found|transient|auth"}` + -- typed CacheStore error counts (see + [design.md#102-catalog-correctness-typed-errors-circuit-breaker](./design.md#102-catalog-correctness-typed-errors-circuit-breaker)); + `not_found` is normal cold-path traffic, `transient` and `auth` + feed the breaker and (for `auth`) the `/readyz` threshold + - `orca_cachestore_breaker_state` -- 0=closed, 1=open, + 2=half_open + - `orca_cachestore_breaker_transitions_total{from,to}` -- + breaker state-transition counter + - `orca_origin_inflight{origin}` -- per-replica gauge of + in-flight `Origin.GetRange` calls; cap is + `floor(target_global / N_replicas)` per + [design.md#84-origin-backpressure](./design.md#84-origin-backpressure) + - `orca_metadata_origin_heads_total{origin,result}` -- + per-replica HEAD calls that actually reached the origin (not + served from the metadata cache); cluster-wide bound is N per + object per `metadata_ttl` window in v1 + - `orca_metadata_negative_entries` -- gauge of negative + metadata-cache entries (404 / unsupported-blob-type) currently + held by this replica. Drains as entries expire after + `negative_metadata_ttl`. See + [design.md#12-create-after-404-and-negative-cache-lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle). + - `orca_metadata_negative_hit_total{origin_id}` -- counter + of requests served from a negative entry. A spike following a + known operator upload signals create-after-404 drain in + progress. 
+ - `orca_metadata_negative_age_seconds{origin_id}` -- + histogram of negative-entry age at hit time. Upper-bound + percentiles inform `negative_metadata_ttl` tuning. + - `orca_list_cache_entries` -- gauge of LIST cache size + (current LRU population). Approaches `list_cache.max_entries` + indicate undersizing for the workload. See + [design.md s6.2](./design.md#62-list-request-flow). + - `orca_list_cache_hit_total{origin_id,result="hit|miss"}` + -- LIST cache hit rate; `result="hit"` increments on cache + serve, `result="miss"` on origin pass-through. Hit rate is the + primary indicator of LIST cache effectiveness for the FUSE + workload. + - `orca_list_cache_evict_total{reason="size|ttl|response_too_large"}` + -- LIST cache evictions by trigger. `size` = LRU bound; + `ttl` = lazy expiration on lookup; `response_too_large` = + response exceeded `list_cache.max_response_bytes` and bypassed + cache. + - `orca_list_cache_origin_calls_total{origin_id,result}` + -- LIST calls that actually reached origin (cache miss + + singleflight collapse). With per-replica caching, cluster-wide + bound is N origin LIST per identical query per + `list_cache.ttl`. + - `orca_list_cache_swr_refresh_total{origin_id,result}` + -- background stale-while-revalidate refreshes. Only emitted + when `list_cache.swr_enabled=true`. + - `orca_chunk_catalog_entries` -- gauge of in-memory + ChunkCatalog size. Pinned at `chunk_catalog.max_entries` + suggests undersizing relative to the working set + ([design.md s13.3](./design.md#133-chunkcatalog-size-awareness-load-bearing-operational-note)). + - `orca_chunk_catalog_hit_total{result="hit|miss"}` -- + catalog Lookup outcomes. Sustained hit_rate < 0.7 suggests + undersizing. + - `orca_chunk_catalog_evict_total{reason="size|active|forget"}` + -- catalog evictions by trigger. `size` = LRU bound (passive); + `active` = active eviction loop deleted from CacheStore; + `forget` = explicit Forget (ETag changed, GetChunk ErrNotFound). + - `orca_chunk_catalog_active_eviction_runs_total{result="ok|breaker_open|aborted"}` + -- active eviction loop completions. `breaker_open` means the + loop skipped this cycle because the CacheStore breaker is + open. Only emitted when + `chunk_catalog.active_eviction.enabled=true`. + - `orca_chunk_catalog_active_eviction_candidates` -- + histogram of per-run candidate count. Visibility into + eligible-but-not-yet-evicted entries. + - `orca_cachestore_delete_total{result="ok|not_found|transient|auth"}` + -- `CacheStore.Delete` outcomes (called by active eviction). + `not_found` is treated as success by the eviction loop + (idempotent). `transient` and `auth` count toward the + CacheStore circuit breaker. + - `orca_metadata_refresh_runs_total{result="ok|aborted|breaker_open"}` + -- bounded-freshness mode (FW5) per-loop completions. Only + emitted when `metadata_refresh.enabled=true`. See + [design.md s11.2](./design.md#112-bounded-freshness-mode-optional). + - `orca_metadata_refresh_total{result="ok|etag_changed|error|skipped_limiter_busy"}` + -- per-key refresh outcomes. `etag_changed` indicates an + immutable-contract violation detected proactively (the metric + `orca_origin_etag_changed_total` also increments). + - `orca_metadata_refresh_candidates` -- histogram of + eligible candidates per refresh-loop run. Visibility into the + hot-key set size. + - `orca_metadata_refresh_lag_seconds` -- histogram of + `(now - LastEntered)` at refresh time; should cluster around + `metadata_refresh.refresh_ahead_ratio * metadata_ttl`. 
+ - `orca_s3_versioning_check_total{result="ok|refused"}` -- + once-per-boot emission from the `cachestore/s3` versioning + gate ([design.md s10.1.3](./design.md#1013-cachestores3)). + `refused` indicates the bucket has versioning enabled or + suspended; the process exits non-zero immediately after. + - `orca_commit_after_serve_total{result="ok|failed"}` -- + asynchronous CacheStore commits that run after the client + response is complete; `failed` means the + client response succeeded but the chunk was NOT recorded in the + `ChunkCatalog` (next request refills); see + [design.md#86-failure-handling-without-re-stampede](./design.md#86-failure-handling-without-re-stampede) + - `orca_localfs_dir_fsync_total{result="ok|failed"}` -- + `fsync()` of the `/.staging/` and final-parent directories + on every commit, sweep, and orphaned-staging cleanup + - `orca_posixfs_link_total{result="commit_won|commit_lost|error"}` -- + every `link()` no-clobber commit attempt by `cachestore/posixfs`; + the loser of a race is `commit_lost` (returned `EEXIST`); other + failures are `error` and feed the breaker. See + [design.md#1012-cachestoreposixfs](./design.md#1012-cachestoreposixfs). + - `orca_posixfs_dir_fsync_total{result="ok|failed"}` -- + `fsync()` of `/.staging/` and `` directories + by `cachestore/posixfs`; rate matters because a network FS may + silently degrade dir-fsync semantics under an `async` export. + - `orca_posixfs_backend{type,version,major,minor}` -- info + gauge (value=1) labelled with the auto-detected (or + operator-overridden) backend at boot, e.g. + `type="nfs",version="4.1"`; `type="wekafs"`; `type="ceph"`; + `type="lustre"`; `type="gpfs"`. Used to tag every other posixfs + metric in dashboards via `group_left`. + - `orca_posixfs_selftest_last_success_timestamp` -- unix + seconds of the last successful `SelfTestAtomicCommit`; absent if + the driver never reached a green self-test. + - `orca_posixfs_nfs_v3_optin_total` -- count of boot-time + NFSv3 opt-in events (operator set + `cachestore.posixfs.nfs.allow_v3: true`); should be `0` in + production. + - `orca_posixfs_alluxio_refusal_total` -- count of boot + refusals because the detected backend was Alluxio FUSE; should be + `0`. Operators MUST switch to `cachestore.driver: s3` against the + Alluxio S3 gateway. + - `orca_spool_locality_check_total{result="ok|refused|bypassed",fs_type}` -- + boot `statfs(2)` outcome for `spool.dir`; `refused` means the FS + is on the network-FS denylist and the process exited non-zero; + `bypassed` means `spool.require_local_fs=false` (test-only). + See [design.md#104-spool-locality-contract](./design.md#104-spool-locality-contract). + - `orca_readyz_errauth_consecutive` -- current count of + consecutive `ErrAuth` responses from CacheStore; flips `/readyz` + to NotReady at `readyz.errauth_consecutive_threshold` (default 3) +- Structured logs with request IDs propagated to origin SDKs. +- `/healthz` and `/readyz`. Ready when the CacheStore is reachable, the + CacheStore startup self-test has succeeded (s10 of design.md), the + internal listener is bound, and origin credentials are valid. There is + no persistent local state to load. +- Admin endpoints (gated by separate listener / auth): + dump cluster topology, lookup chunk, force-`Forget` a catalog entry, + dump current spool inventory. +- `kubectl unbounded orca` subcommand for inspection (later phase). + +## 7. 
Phased delivery + +| Phase | Scope | Definition of done | +|---|---|---| +| **0 - skeleton** | `cmd/orca` boilerplate; `Origin` and `CacheStore` interfaces; `origin/s3`; `cachestore/localfs`; in-memory `chunkcatalog`; single-process Range GET; streaming chunk iterator; `make` integration; basic unit tests | One process serves a Range GET against a real S3 bucket and re-serves it from `localfs` | +| **1 - prod basics** | `fetch.Coordinator` with chunk + meta singleflight + tee; `chunkcatalog` LRU + Stat-on-miss path with **per-entry access-frequency tracking** (FW8) and bounded by `chunk_catalog.max_entries` with size-awareness operational guidance ([design.md s13.3](./design.md#133-chunkcatalog-size-awareness-load-bearing-operational-note)); atomic CacheStore writes (`localfs` `link`/`renameat2(RENAME_NOREPLACE)` with **staging inside `/.staging/` + parent-dir fsync**); metadata cache with `metadata_ttl=5m` and **`negative_metadata_ttl=60s`** (asymmetric defaults; bounds the create-after-404 unavailability window per [design.md s12](./design.md#12-create-after-404-and-negative-cache-lifecycle)) including `metadata_negative_entries` / `metadata_negative_hit_total` / `metadata_negative_age_seconds` metrics; **per-replica LIST cache** (FW3) with default `list_cache.ttl=60s`, `max_entries=1024`, sized for FUSE-`ls` workload ([design.md s6.2](./design.md#62-list-request-flow)); **active eviction** (FW8) opt-in via `chunk_catalog.active_eviction.enabled` (default off; recommended on for posixfs deployments without external sweep) including `CacheStore.Delete` interface method; **bounded-freshness mode** (FW5) opt-in via `metadata_refresh.enabled` (default off) with hot-key detection via metadata-cache access counters ([design.md s11.2](./design.md#112-bounded-freshness-mode-optional)); **distributed origin limiter** is deferred future work (see [design.md s15.5](./design.md#155-coordinated-cluster-wide-origin-limiter)); v1 ships with a per-replica token bucket sized `floor(origin.target_global / cluster.target_replicas)` (default 64 slots/replica at `target_global=192`, `target_replicas=3`), with origin throttling responses handled by the leader's pre-header retry loop ([design.md s8.4](./design.md#84-origin-backpressure)); **bounded staleness contract documented**; **strict `If-Match: ` on every `Origin.GetRange` plus `OriginETagChangedError` handling**; **typed `CacheStore` errors (`ErrNotFound|ErrTransient|ErrAuth`)** with only `ErrNotFound` triggering refill; **per-replica HEAD singleflight wording** in metadata layer; **pre-header origin retry** (`origin.retry.attempts=3`, `origin.retry.max_total_duration=5s` defaults) as the cold-path commit boundary - cold-path bytes stream origin -> client directly with bounded leader-side retry handling transient origin failures invisibly before HTTP response headers are committed; spool tees in parallel for joiner support and as the asynchronous CacheStore-commit source ([design.md s8.6](./design.md#86-failure-handling-without-re-stampede)); **mid-stream abort** on post-first-byte failure (`RST_STREAM` / `Connection: close`); **`server.max_response_bytes` cap returns `400 RequestSizeExceedsLimit`** (S3-style XML; 416 reserved for Range vs. 
EOF); `HeadObject`; `ListObjectsV2`; `origin/azureblob` (Block Blob only); **`cachestore/s3` versioning gate** ([design.md s10.1.3](./design.md#1013-cachestores3)) refusing to start on versioned buckets; Prometheus; structured logging; health / readiness | One replica deployed in a dev K8s cluster serving traffic against both S3 and Azure (multi-replica clustering lands in Phase 3) | +| **2 - prod backend & ops** | `cachestore/s3` for VAST with `PutObject` + `If-None-Match: *` and **`SelfTestAtomicCommit` at startup** (refuse to start if backend silently overwrites); **`cachestore/posixfs` for shared POSIX FS deployments** (NFSv4.1+ baseline, plus Weka native, CephFS, Lustre, GPFS) sharing `link()`/`EEXIST` + dir-fsync helpers with `cachestore/localfs` via `internal/orca/cachestore/internal/posixcommon/`, with **`SelfTestAtomicCommit` at startup** (refuse to start on Alluxio FUSE, on NFS below `nfs.minimum_version=4.1` unless `nfs.allow_v3` is set, or on any backend that fails the link-EEXIST + dir-fsync + size-verify self-test) and 2-char hex fan-out under `/`; **`internal/orca/fetch/spool` layer** (slow-joiner fallback regardless of CacheStore driver) **with mandatory boot `statfs(2)` locality check** that refuses to start when `spool.dir` is on a network FS (NFS / SMB / CephFS / Lustre / GPFS / FUSE); **`commit_after_serve_total{ok|failed}` async-commit metric path**; **per-process CacheStore circuit breaker** (`enabled,error_window=30s,error_threshold=10,open_duration=30s,half_open_probes=3`); **per-replica origin semaphore documented** with formula `floor(target_global / N_replicas)` + `origin_inflight` gauge; **`localfs` `staging_max_age=1h` orphaned-staging sweeper** (and equivalent `posixfs.staging_max_age=1h`); **`/readyz` ErrAuth threshold (default 3 consecutive -> NotReady)**; sequential read-ahead; bearer / mTLS auth on the client edge; `deploy/orca/` manifests (incl. `07-networkpolicy.yaml.tmpl`); `images/orca/` Containerfile; `hack/orca/` published with CacheStore lifecycle policy guidance and POSIX-backend support matrix | Production-shaped service running against VAST in a real DC with the self-test green, AND a parallel green run against at least one shared-POSIX backend (NFSv4.1+ baseline) | +| **3 - cluster** | `cluster/` peer discovery from headless Service DNS; rendezvous hashing on pod IP; **per-chunk internal fill RPC** (assembler fan-out); **internal mTLS listener on `:8444`** with internal CA + peer-IP authz + **stable `ServerName=orca..svc`** pinned by dialers (per-replica certs MUST include this SAN) + `X-Origincache-Internal` loop prevention + `409 Conflict` on coordinator disagreement; NetworkPolicy applied; `kubectl unbounded orca` inspection subcommand | Multi-replica Deployment sustaining target throughput; `commit_lost` rate near zero in steady state | +| **4 - optional** | NVMe / HDD tiering; S3 SigV4 verification; adaptive prefetch; deferred optimizations catalogued in [design.md s15](./design.md#15-deferred-optimizations) (edge rate limiting, cluster-wide HEAD singleflight, cluster-wide LIST coordinator) if measured to be needed | As needed | + +Estimated calendar: Phase 0 + 1 ~= 3-4 focused weeks. Phase 2 + 3 another +4-6 weeks depending on ops depth. + +## 8. Test strategy + +- `chunker` and `singleflight`: table-driven + fuzz (`go test -fuzz`). + Iterator must never materialize the full `[]ChunkKey` for a range; + test with `lastChunk - firstChunk = 1_000_000` and assert bounded + allocation. 
+- `chunkcatalog`: LRU eviction behavior, concurrent `Lookup` / + `Record` / `Forget`, bounded entry count. +- `cachestore/localfs`: temp-dir integration tests including: + - crash simulation (kill mid-write, verify `*.tmp.*` cleanup and + recovery via the periodic sweep); + - **two-leader race**: two goroutines both call `PutChunk(k, ..)` with + distinct payloads; assert exactly one wins (`commit_won`), the other + sees `EEXIST` and reports `commit_lost`, and the on-disk content + matches the winner. +- `cachestore/s3`: integration tests against `minio` covering: + - direct `PutObject(final, body, If-None-Match: "*")` commit; + - **`SelfTestAtomicCommit` pass** (real `minio` returns `412` on the + second probe write); + - **`SelfTestAtomicCommit` fail** (mock S3 server that always returns + `200`; assert process exits with the documented error); + - **412 commit_lost path**: two concurrent leaders, distinct payloads; + assert exactly one `commit_won` and one `commit_lost`, and the stored + object equals the winner's bytes; + - idempotent re-PUT (committed key + repeated PutObject yields 412 + without data loss). +- `origin/s3`: contract tests against `minio` in CI, including: + - **`If-Match: ` header is sent on every `GetRange`** (assert via + request capture); + - **412 -> `OriginETagChangedError`**: overwrite the object mid-test, + issue `GetRange` with the old etag, assert typed error and that the + metadata cache entry for `{origin_id, bucket, key}` is invalidated. +- `origin/azureblob`: contract tests against `azurite` in CI, including: + - One Block Blob, one Page Blob, one Append Blob. + - GETs against Page / Append return `502 OriginUnsupported` and + increment `orca_origin_rejected_total`. + - `ListObjectsV2` in `filter` mode returns only the Block Blob and + preserves continuation tokens across pages. + - 1000 concurrent requests for the same Page Blob produce exactly one + upstream `HEAD`. + - `If-Match: ` sent on every Get Blob; 412 -> `OriginETagChangedError`. +- `fetch.Coordinator` stampede tests: + - 1000 goroutines requesting the same `ChunkKey`; mock origin called + exactly once; all readers receive identical bytes. + - Same as above but origin returns an error after N bytes; all + pre-first-byte joiners get a `502`; mid-stream joiners get an aborted + response (`RST_STREAM` or `Connection: close`); a follow-up request + triggers exactly one new origin call. + - All joiners cancel mid-fill; chunk still lands in cache. + - **Mid-fill `OriginETagChangedError`**: after N bytes, mock origin + returns 412 on `If-Match`; assert (a) leader fails the fill with + `OriginETagChangedError`, (b) metadata cache entry invalidated, (c) + `orca_origin_etag_changed_total` increments, (d) pre-first-byte + joiners receive `502`, mid-stream joiners are aborted, (e) the next + request issues a fresh `Head`, gets a new etag, derives a new + `ChunkKey`, and successfully fills. + - **Slow-joiner spool fallback**: leader streams from origin via + spool + ring buffer; one joiner is artificially slowed beyond the + ring buffer head; assert the joiner transparently switches to + `Spool.Reader` and receives identical bytes; spool entry is released + after refcount hits zero. + - **Spool exhaustion**: fill `spool.max_bytes` with held-open joiners; + assert subsequent fill requests time out on `spool.max_inflight` and + return `503 Slow Down` to the client. 
+- Cold-start: a freshly started replica receives a request for a chunk + already present in the CacheStore; assert exactly one + `CacheStore.Stat`, no origin call, chunk served from CacheStore, + `ChunkCatalog` populated; subsequent request hits the catalog. +- Cluster: + - in-process 3-replica test for assembler fan-out and per-chunk + coordinator routing against a shared CacheStore; assert + `orca_origin_duplicate_fills_total{result="commit_lost"}` = 0 + under steady-state membership; + - **internal-listener authz**: peer with valid internal cert but source + IP outside `Cluster.Peers()` is rejected; client cert chained only to + the *client* CA is rejected; + - **loop prevention**: replica A forwards `/internal/fill` to replica B + with `X-Origincache-Internal: 1`; B's view of `Coordinator(k)` is C; + assert B returns `409 Conflict` and A falls back to local fill; + - **1000-chunk fan-out**: client requests a `Range` spanning 1000 + distinct cold chunks across 3 replicas; assert the assembler issues + fan-out fill RPCs concurrently up to the configured cap, response + body is byte-identical to a direct origin read, and total origin + GETs equal exactly 1000. +- End-to-end: docker-compose with `minio` (origin) + a second `minio` + (CacheStore) + a single `orca` process; scripted range-read + scenarios incl. mid-test object overwrite to exercise the `If-Match` + path end-to-end. +- Load test: `vegeta` / `k6` against a process backed by a mock origin with + injected latency. Confirm origin RPS stays at exactly 1 per cold chunk + and at most semaphore-limited overall, while client RPS scales linearly. +- **T-1a metadata_ttl bound** (`metadata` package): seed metadata cache + with `etag=v1` at t=0; at t=`metadata_ttl - jitter`, assert reads + still see `v1` without a new HEAD; at t=`metadata_ttl + jitter`, + overwrite origin to `etag=v2`, assert next request triggers HEAD, + observes `v2`, and derives a new `ChunkKey`. Asserts the staleness + cap from + [design.md#11-bounded-staleness-contract](./design.md#11-bounded-staleness-contract). +- **T-create-after-404a stale window** + (`metadata` + `fetch.Coordinator`): origin returns `404` for key `K` + at t=0; assert the cache returns `404` to the client and records a + negative metadata entry. Operator-side mock uploads `K` to origin at + t=`negative_metadata_ttl / 2`. At t=`negative_metadata_ttl - jitter`, + re-issue the client GET against the same replica; assert `404` is + still returned (negative entry still valid) and that + `metadata_negative_hit_total` was incremented. Asserts the bound in + [design.md#12-create-after-404-and-negative-cache-lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle). +- **T-create-after-404b recovery** + (`metadata` + `fetch.Coordinator`): same setup as 404a, but at + t=`negative_metadata_ttl + jitter` re-issue the GET against the same + replica; assert the cache re-Heads, observes `200`, and serves the + newly-uploaded bytes via the normal fill path. +- **T-create-after-404c per-replica fan-out** (multi-replica integration): + in a 2-replica deployment, route the original `404` GET to replica A + only; upload `K` to origin; route a follow-up GET to replica B and + assert it serves `200` immediately (replica B never observed the + 404, so its metadata cache is fresh); route another follow-up to + replica A and assert it still returns `404` until its own + `negative_metadata_ttl` window expires. 
+- **T-list-cache-hit** (`metadata` + `fetch.Coordinator`): identical + LIST queries within `list_cache.ttl` -> first triggers + `Origin.List`, second served from cache; assert + `list_cache_hit_total{result="hit"}` increments and origin LIST + count = 1. +- **T-list-cache-ttl-expiry**: identical LIST query at `t=0` and + `t=list_cache.ttl + jitter` -> two `Origin.List` calls; assert + cache expired correctly. +- **T-list-cache-response-too-large**: mock `Origin.List` returning + a response that exceeds `list_cache.max_response_bytes` -> response + served to client but cache not populated; assert + `list_cache_evict_total{reason="response_too_large"}` incremented. +- **T-list-cache-error-passthrough**: `Origin.List` returns 503 -> + error passed to client; subsequent retry calls origin again (no + negative caching). +- **T-list-cache-pagination**: continuation tokens are part of the + cache key -> different tokens cache independently; sequential + page-through doesn't collide. +- **T-list-cache-swr-trigger**: with `list_cache.swr_enabled=true`, + query at `t=0`, query at `t=ttl*ratio + jitter` -> assert + immediate cached response AND background refresh fires; assert + origin LIST count = 2 over the window. +- **T-list-cache-fuse-pattern**: simulate FUSE `ls` workload (1 query + / 5s for 5 minutes against same prefix at `list_cache.ttl=60s`) -> + assert origin LIST count == 5 (one per minute); assert all client- + observed latencies are sub-millisecond except the 5 cache-miss + instances. +- **T-catalog-access-tracking** (`chunkcatalog`): Lookup hits + increment `AccessCount`; `LastAccessed` updates; cold entries + score lower than warm entries by the eviction ordering. +- **T-catalog-cold-start-protection**: entry created at t=0 not + eligible for active eviction at `t < min_age` regardless of + `AccessCount`. +- **T-active-eviction-cold-chunk** (`chunkcatalog` + `cachestore`): + chunk in CacheStore + catalog entry with `AccessCount=0`, + `LastEntered=t-25h`, `chunk_catalog.active_eviction.enabled=true`. + Run eviction loop. Assert `CacheStore.Delete` called; catalog + Forgets the entry; metric + `cachestore_delete_total{result="ok"}` increments. +- **T-active-eviction-popular-chunk**: chunk with `AccessCount=10`. + Run eviction loop. Assert NOT deleted. +- **T-active-eviction-bounded-run**: 5000 eligible candidates, + `max_evictions_per_run=1000`. Assert exactly 1000 deleted, 4000 + remain (next cycle catches them). +- **T-active-eviction-breaker-open**: simulate `CacheStore.Delete` + returning `ErrTransient` repeatedly until breaker opens. Assert + subsequent eviction runs skip with + `active_eviction_runs_total{result="breaker_open"}`. +- **T-catalog-size-undersized**: `chunk_catalog.max_entries=10`, + working set=100 entries. Assert hit rate < 0.7; assert + `chunk_catalog_evict_total{reason="size"}` increments steadily. +- **T-metadata-refresh-hot-key** (`metadata`): hot entry + (`AccessCount=10`) at age `0.7 * metadata_ttl` is refreshed by the + bounded-freshness loop; `LastEntered` updates; client sees no + observable change. Requires `metadata_refresh.enabled=true`. +- **T-metadata-refresh-cold-key-skipped**: cold entry + (`AccessCount=2`) NOT refreshed even when eligible by age. +- **T-metadata-refresh-cold-start-protected**: entry created at t=0, + hot, NOT refreshed at `t < min_age`. 
+- **T-metadata-refresh-etag-changed**: background refresh detects + new ETag; metadata cache updates; old `ChunkKey`s are orphaned; + next chunk request derives new `ChunkKey`s; metric + `metadata_refresh_total{result="etag_changed"}` increments; + `origin_etag_changed_total` also increments. +- **T-metadata-refresh-bounded**: 500 eligible candidates, + `max_refreshes_per_run=100` -> exactly 100 refreshed per cycle; + remaining catch up on subsequent cycles. +- **T-metadata-refresh-disabled**: `enabled=false` -> no background + activity; behaves like v1. +- **T-metadata-refresh-singleflight-race**: on-demand HEAD and + background refresh fire concurrently for the same key; per-replica + HEAD singleflight collapses to one origin HEAD; both consumers + get the result. +- **T-metadata-refresh-negative-entries-not-refreshed**: negative + entry (404) under `negative_metadata_ttl` is NOT refreshed; + expires naturally. +- **T-origin-per-replica-cap** (`origin` + mock origin): with + `cluster.target_replicas=3` and `origin.target_global=192` + (giving per-replica cap = 64), launch 100 concurrent + `Origin.GetRange` calls on a single replica. Assert at most 64 + hit origin concurrently; the remainder queue up to + `origin.queue_timeout` (5s) before returning `503 Slow Down` to + the client. Validates the simple per-replica token bucket + (design.md s8.4). +- **T-origin-throttle-handled-by-retry** (`origin` + + `fetch.Coordinator` + mock origin): origin returns `503 SlowDown` + on the first attempt and `200` on the second. Assert client sees + a clean 200 response; assert + `origin_retry_total{result="success"}=1`. Validates that origin + throttling does NOT require a coordinated cluster-wide cap; + pre-header retry handles it. +- **T-s3-versioned-bucket-refusal** (`cachestore/s3`): configure + `cachestore/s3` against a bucket with versioning enabled; assert + process exits non-zero with the documented error message and + metric `s3_versioning_check_total{result="refused"}=1`. +- **T-s3-unversioned-bucket-ok** (`cachestore/s3`): configure + `cachestore/s3` against an unversioned bucket; assert + `GetBucketVersioning` returns `Status: Disabled`; gate passes; + metric `s3_versioning_check_total{result="ok"}=1`; driver proceeds + to `SelfTestAtomicCommit`. +- **T-pre-header-retry-success** (`fetch.Coordinator` + mock origin): + origin returns transient 503 on attempt 1, 200 + bytes on attempt 2; + assert client sees clean 200 response with no observable abort; + assert `origin_retry_total{result="success"}=1`; assert + `origin_retry_attempts` records 2 attempts. +- **T-pre-header-retry-exhausted-attempts**: origin returns 503 on + every attempt within the duration budget; assert client receives + clean `502 Bad Gateway` with code `OriginRetryExhausted` after + `origin.retry.attempts` exhaust; assert + `origin_retry_total{result="exhausted_attempts"}=1`. +- **T-pre-header-retry-exhausted-duration**: origin slow-503 with + hangs that push total wall-clock past + `origin.retry.max_total_duration`; assert client receives `502` + before all attempts complete; assert + `origin_retry_total{result="exhausted_duration"}=1`. +- **T-pre-header-retry-etag-changed-non-retryable**: origin returns + `OriginETagChangedError` on attempt 1; assert NO retry happens; + assert `502` with code `OriginETagChanged`; assert + `origin_retry_total{result="etag_changed"}=1`; assert metadata + cache invalidated. 
+- **T-pre-header-retry-cold-path-ttfb** (`fetch` + mock origin): + with origin returning bytes after 10ms first-byte latency, + assert client TTFB < 50ms (sum of origin first-byte + small + pre-header retry overhead); assert NO chunk-download wait on + the TTFB path. Validates Option D's TTFB claim + ([design.md s8.6](./design.md#86-failure-handling-without-re-stampede)). +- **T-mid-stream-abort-first-chunk-after-commit** (`fetch` + + `spool` + mock origin): origin succeeds for first byte; cache + commits headers + first byte; origin disconnects at 50% of + chunk; assert client connection aborts (HTTP/2 RST_STREAM or + HTTP/1.1 Connection: close); assert + `responses_aborted_total{phase="mid_stream"}=1`; client SDK + retries (validated separately via real aws-sdk-go integration + test). +- **T-spool-tee-joiner-during-streaming** (`fetch` + `spool`): + leader streams 8 MiB chunk to client A; joiner B arrives at + 50% point through the singleflight; B reads from ring buffer + while on-pace; B falls behind; B switches to spool reader; both + finish with full chunk byte-for-byte. Confirms the spool tee + works in parallel with client streaming and joiner-fallback is + unaffected by the drop of the spool-fsync gate. +- **T-commit-after-serve failure** (`fetch` + `spool` + `cachestore`): + inject CacheStore commit error after the client response is + complete; assert the client response completes successfully + byte-for-byte; assert + `orca_commit_after_serve_total{result="failed"}` == 1; + assert `ChunkCatalog.Lookup(k)` is still a miss; assert a + follow-up request triggers exactly one new origin GET. +- **T-3 typed CacheStore errors** (`cachestore` + `fetch`): inject each + of `ErrNotFound|ErrTransient|ErrAuth` from `CacheStore.GetChunk`: + - `ErrNotFound` -> miss-fill path runs, eventual 200/206 to client; + - `ErrTransient` -> client receives `503 Slow Down` with + `Retry-After: 1s` and `cachestore_errors_total{kind="transient"}` + increments; no refill attempted; + - `ErrAuth` -> client receives `502 Bad Gateway`, + `cachestore_errors_total{kind="auth"}` increments, + `readyz_errauth_consecutive` increments. +- **T-3 circuit breaker** (`cachestore`): inject 10 `ErrTransient` over + 30s; assert breaker opens (`breaker_state=1`, + `breaker_transitions_total{from="closed",to="open"}` == 1); subsequent + calls short-circuit; after 30s, the next 3 probes are allowed (half-open + state); on all-success, breaker closes; on any failure during half-open, + breaker re-opens. +- **T-4a per-replica origin semaphore** (`fetch`): set semaphore to 4; + drive 16 concurrent cold misses across 16 distinct chunks; assert + in-flight `Origin.GetRange` never exceeds 4; assert + `orca_origin_inflight{origin}` saturates at 4; remaining 12 + fills queue and complete in 4-wide batches. +- **T-6a localfs staging-inside-root** (`cachestore/localfs`): assert + every commit writes to `/.staging/` (NOT `/tmp` and NOT + the spool dir); assert `link()` to final and `unlink()` of staging + both happen on the same filesystem; inject orphaned staging entries + older than `staging_max_age=1h`, run sweep, assert they are removed + and `localfs_dir_fsync_total` increments. Verify parent-dir fsync is + invoked by intercepting the syscall via a test seam (no strace + required). 
+- **T-posixfs-nfs link-EEXIST race** (`cachestore/posixfs`): two + goroutines on two simulated replicas (two open mount handles to a + loopback `nfsd` v4.1 export in CI) call `PutChunk(k, ..)` with + distinct payloads; assert exactly one wins (`commit_won`, + `posixfs_link_total{result="commit_won"}` == 1), the other observes + `EEXIST` and reports `commit_lost` + (`posixfs_link_total{result="commit_lost"}` == 1), and the on-disk + content visible from a third reader matches the winner. Repeat + against `tmpfs` (treated as local) as a control. +- **T-posixfs-nfs SelfTestAtomicCommit success** (`cachestore/posixfs`): + boot the driver against a CI loopback `nfsd` v4.1 export with `sync`; + assert `posixfs_selftest_last_success_timestamp` is set and the + process accepts traffic. Repeat against an `async` export and assert + the runbook warning is logged (note: detecting server-side `async` + is best-effort; the size-verify step still runs and may pass even + with `async` because the kernel client cache is consistent within a + process). +- **T-posixfs-nfs SelfTestAtomicCommit failure** (`cachestore/posixfs`): + boot against a mock POSIX backend (FUSE shim) that + (a) returns `0` instead of `EEXIST` from a second `link()`, OR + (b) silently drops the size-verify check; assert the process exits + non-zero with the documented `cachestore/posixfs: backend does not + honor link()/EEXIST or directory fsync; refusing to start` message. +- **T-posixfs-nfs version gate** (`cachestore/posixfs`): boot against + a loopback NFSv3 export with `cachestore.posixfs.nfs.allow_v3: + false` (default); assert the process exits non-zero. Then set + `allow_v3: true` and reboot; assert the process starts with a loud + WARN log line and `posixfs_nfs_v3_optin_total` == 1. Boot against + NFSv4.0 with the default config; assert exit non-zero (4.0 < 4.1 + minimum and 4.0 is not v3-opt-in eligible). +- **T-posixfs-nfs Alluxio refusal** (`cachestore/posixfs`): boot + against a FUSE mount whose `/proc/mounts` source string contains + `alluxio` (case-insensitive); assert the process exits non-zero + with the `cachestore/posixfs: Alluxio FUSE is unsupported` message + and `posixfs_alluxio_refusal_total` == 1. Repeat with a non-Alluxio + FUSE mount (e.g. a test FUSE shim) and assert the process still + refuses (because FUSE_SUPER_MAGIC also fails the spool-locality + check when `spool.dir` is on the same FS, AND `cachestore/posixfs` + treats a generic FUSE backend as unverified). +- **T-posixfs-fanout** (`cachestore/posixfs`): with + `fanout_chars: 2`, assert chunk paths under + `////`; with + `fanout_chars: 0`, assert paths under + `///`; assert `localfs` default + (`fanout_chars: 0` for localfs) produces the flat layout. Verify + the same `posixcommon` package powers both code paths via a unit + test on the helper. +- **T-spool-locality refusal** (`spool` + `cmd/orca`): boot + with `spool.dir` on a tmpfs-backed loopback NFS mount (CI helper); + assert the process exits non-zero with the `spool: ... is on a + network filesystem (nfs); ... Refusing to start` message and + `orca_spool_locality_check_total{result="refused",fs_type="nfs"}` + == 1. Repeat with `spool.require_local_fs: false`; assert the + process starts, `result="bypassed"` is emitted, and the boot log + carries the `WARN spool.require_local_fs is disabled` line. + Separately assert a clean local-FS run emits `result="ok"`. 
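+
+  A minimal Go sketch of the `statfs(2)` locality gate this test covers
+  (Linux-only sketch; the magic numbers are values from `linux/magic.h`,
+  the list is deliberately truncated, and the package/function names are
+  hypothetical):
+
+  ```go
+  package spoolsketch
+
+  import (
+  	"fmt"
+
+  	"golang.org/x/sys/unix"
+  )
+
+  // Subset of filesystem magic numbers (linux/magic.h); the real check
+  // would cover the full NFS/SMB/CephFS/Lustre/GPFS/FUSE list.
+  var networkFS = map[int64]string{
+  	0x6969:     "nfs",
+  	0xFF534D42: "smb", // CIFS
+  	0x65735546: "fuse",
+  }
+
+  // checkSpoolLocality refuses a spool.dir that sits on a known network
+  // filesystem when requireLocal (spool.require_local_fs) is true;
+  // otherwise the caller logs the documented WARN and continues.
+  func checkSpoolLocality(dir string, requireLocal bool) error {
+  	var st unix.Statfs_t
+  	if err := unix.Statfs(dir, &st); err != nil {
+  		return err
+  	}
+  	if fsType, isNetwork := networkFS[int64(st.Type)]; isNetwork && requireLocal {
+  		return fmt.Errorf("spool: %s is on a network filesystem (%s); refusing to start", dir, fsType)
+  	}
+  	return nil
+  }
+  ```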
+- **T-D3 internal mTLS ServerName** (`cluster`): boot 3 replicas with + per-replica certs whose only SAN is `orca..svc`; + rolling-restart one pod so its IP changes; assert the dialer pins + `tls.Config.ServerName = orca..svc` and the handshake + succeeds against the new pod IP without cert reissuance. +- **T-D4 readyz on ErrAuth** (`cachestore` + `server`): inject 1 + `ErrAuth` -> `/readyz` still 200; inject 3 consecutive `ErrAuth` -> + `/readyz` returns 503 NotReady and + `readyz_errauth_consecutive` == 3; interleave a non-auth `ErrNotFound` + between failures and assert it does NOT reset the counter (only a + successful CacheStore call resets); inject success after the + threshold trips, assert counter resets to 0 and `/readyz` returns + 200 again. +- **T-edge cap-exceeded 400** (`server`): set `max_response_bytes=1MiB`; + request `Range: bytes=0-2097151` (2 MiB); assert response is + `400 RequestSizeExceedsLimit` (S3-style XML body) with + `x-orca-cap-exceeded: true`; separately, request a Range past + EOF and assert response is `416 Requested Range Not Satisfiable` + (cap-exceeded MUST NOT be reported as 416). + +## 9. Out of scope for v1 (explicit) + +Re-stated to prevent drift: + +- No write path, multipart upload, or object versioning. +- No cross-DC peering. +- No SigV4 verification. +- No multi-tenant quotas or per-tenant credentials. +- No mutable-blob invalidation. ETag change is the only signal we honor, + and it is enforced at the origin via `If-Match` on every GET (no + opt-out). +- No encryption at rest beyond what the underlying CacheStore provides. + +## 10. Open questions / risks + +- **Origin immutability is an operator contract**: Orca trusts + that an `(origin_id, bucket, object_key)` is immutable for the life + of the key (replacement must use a new key); the bounded violation + window is `metadata_ttl` (default 5m). `If-Match: ` on every + `Origin.GetRange` is defense-in-depth that catches in-flight + overwrites only. Operators MUST surface this contract in the consumer + API documentation. See + [design.md#11-bounded-staleness-contract](./design.md#11-bounded-staleness-contract). +- **Commit-after-serve failure** (decision 2b): with v1 Option D + the cold-path bytes stream origin -> client directly; the + CacheStore commit is async and happens after the client response + is complete. A failure there leaves the client successful but + the chunk uncached. Repeated + failures are visible only via + `orca_commit_after_serve_total{result="failed"}` and the + CacheStore circuit breaker; operators MUST alert on a sustained + non-zero rate (it indicates CacheStore degradation, not request + errors). +- **Per-replica origin semaphore is approximate**: each replica + enforces `floor(origin.target_global / cluster.target_replicas)` + (default 64 slots/replica at `target_global=192`, + `target_replicas=3`). Realized cluster-wide concurrency tracks + `target_global` only when `N_actual == cluster.target_replicas`; + scale-out without updating the knob over-allocates against + origin (cluster-wide cap exceeds `target_global` by + `(N_actual - target_replicas) * target_per_replica`); scale-in + under-allocates. Mitigations: operators MUST update + `cluster.target_replicas` after sustained scale changes; a + coordinated cluster-wide limiter (s15.5) and dynamic recompute + from `len(Cluster.Peers())` (s15.6) are deferred future work. 
+ Origin throttling responses (`503 SlowDown` / `429`) are handled + by the leader's pre-header retry loop (s8.6) with exponential + backoff regardless; origin self-protects against the static-cap + overshoot. +- **VAST `If-None-Match: *` requires unversioned bucket**: the + `cachestore/s3` driver relies on the backend honoring + `If-None-Match: *` to enforce no-clobber atomic commit. AWS S3 + (since 2024-08), MinIO, and VAST Cluster (non-versioned buckets + only) are verified. The driver runs a boot-time `GetBucketVersioning` + versioning gate ([design.md s10.1.3](./design.md#1013-cachestores3)) + and refuses to start on enabled or suspended versioning. VAST KB + citation is in design.md. The `SelfTestAtomicCommit` probe is the + defense-in-depth backstop if any future S3-compatible backend + reports versioning correctly but silently overwrites anyway. +- **LocalStack community-tier image must be pinned**: the + dev harness uses LocalStack as the `cachestore/s3` backend + (`hack/orca/dev-harness.md`). The `localstack/localstack:latest` + tag now requires a Pro auth token and exits with code 55 on the + free tier. Dev manifests pin to `localstack/localstack:3.8`, the + last known-stable community-tier release whose S3 implementation + honors `PutObject + If-None-Match: *` (verified locally; both the + `SelfTestAtomicCommit` and the `GetBucketVersioning` versioning + gate pass). Future LocalStack releases may diverge; if the dev + harness fails to start, the first action is to verify `If-None-Match: *` + + `GetBucketVersioning` against the pinned image. +- **NFS export `async` weakens dir-fsync**: `cachestore/posixfs` + depends on directory `fsync()` being durable on the server, which + requires the NFS export to be `sync` (not `async`). The driver + cannot reliably detect server-side `async` from the client; Phase 2 + ships an operator runbook entry that mandates `sync` exports and a + best-effort warning if `/proc/mounts` reveals an `async` client mount + option. Mitigation: the boot self-test re-`stat`s through the kernel + client cache and catches the most common misconfigurations; persistent + silent corruption requires both server `async` AND a + power-loss-window-sized failure, which is outside v1's correctness + envelope. Document this loudly in `operations.md`. +- **Weka NFS `link()` / `EEXIST` semantics not docs-confirmed**: Weka's + NFS share (`-t nfs4` to a Weka cluster) is verified up to NFSv4.1 + (`NFS4_CREATE_SESSION`, `ATOMIC_FILEOPEN`) but the `link()` no-clobber + return of `EEXIST` is not explicitly documented. The driver treats + this as a "must pass `SelfTestAtomicCommit` to start" case: if Weka + NFS fails the self-test, operators MUST switch to Weka native + (`-t wekafs`), which is a true POSIX FS and a separately-detected + backend. This is not a code change, only a configuration / mount-time + decision; document the matrix in `operations.md`. +- **Alluxio FUSE is a tempting misconfiguration**: Alluxio markets a + shared filesystem mount but provides no `link(2)` and no atomic + no-overwrite rename, which makes it unsafe for `cachestore/posixfs`. + The driver detects Alluxio FUSE explicitly (FUSE_SUPER_MAGIC + + `/proc/mounts` source matches `alluxio`) and refuses to start. The + documented workaround is `cachestore.driver: s3` against the + Alluxio S3 gateway, which is a normal in-DC S3 backend from the + cache layer's perspective. Operators MUST be steered to this in the + runbook to prevent Phase-2 deployments from getting stuck.
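+
+  A minimal Go sketch of the `/proc/mounts` scan behind that refusal (a
+  sketch only: the FUSE_SUPER_MAGIC cross-check is omitted, the
+  mountpoint matching is a simplified prefix test, and the names are
+  hypothetical):
+
+  ```go
+  package posixfssketch
+
+  import (
+  	"bufio"
+  	"os"
+  	"strings"
+  )
+
+  // isAlluxioMount reports whether the mount covering cacheRoot looks
+  // like an Alluxio FUSE mount, judged by the source field of
+  // /proc/mounts containing "alluxio" (case-insensitive). The caller
+  // refuses to start with the documented error message.
+  func isAlluxioMount(procMounts, cacheRoot string) (bool, error) {
+  	f, err := os.Open(procMounts) // normally "/proc/mounts"
+  	if err != nil {
+  		return false, err
+  	}
+  	defer f.Close()
+
+  	alluxio := false
+  	best := -1 // longest matching mountpoint prefix wins
+  	sc := bufio.NewScanner(f)
+  	for sc.Scan() {
+  		// /proc/mounts fields: source mountpoint fstype options dump pass
+  		fields := strings.Fields(sc.Text())
+  		if len(fields) < 3 {
+  			continue
+  		}
+  		source, mountpoint := fields[0], fields[1]
+  		if !strings.HasPrefix(cacheRoot, mountpoint) || len(mountpoint) <= best {
+  			continue
+  		}
+  		best = len(mountpoint)
+  		alluxio = strings.Contains(strings.ToLower(source), "alluxio")
+  	}
+  	return alluxio, sc.Err()
+  }
+  ```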
+- **Spool on a network filesystem degrades joiner-fallback latency**: + with the v1 streaming design (Option D) the spool is no longer on + the client TTFB path, but joiner-fallback reads still benefit + materially from local block storage. A spool placed on NFS / + SMB / CephFS / Lustre / GPFS / FUSE pays a network round-trip + per joiner-fallback read, converting microsecond-class + switchover into milliseconds-class. The cache layer enforces + local placement at boot via `statfs(2)` and refuses to start by + default (`spool.require_local_fs=true`; see + [design.md#104-spool-locality-contract](./design.md#104-spool-locality-contract)). + Operators with unusual placements (e.g., RAM-disk) MAY relax to + `spool.require_local_fs=false`; production deployments are + expected to keep the default. Operators should also pin + `spool.dir` to a hostPath / local-PV pointing at NVMe and avoid + generic-default-storage-class PVCs that may bind to network volumes. +- **Spool exhaustion under sustained burst**: `spool.max_bytes` (default + 8 GiB) and `spool.max_inflight` (default 64) bound the local staging + area. A correlated cold-access burst that exceeds these returns `503 + Slow Down` to clients, which is the intended backpressure but visible + as user-facing errors. Operators should monitor `orca_spool_bytes` + and `orca_spool_evictions_total{reason="full"}` and tune the caps + per node disk capacity. +- **Internal cert rotation**: the internal listener uses per-replica certs + chained to an internal CA. Rotation is delegated to the issuing system + (e.g. cert-manager). The server hot-reloads `cluster.internal_tls.cert_file` + / `key_file` on file change (inotify / periodic stat); the CA bundle is + reloaded the same way. CA rotation requires both old and new CAs to + appear in the bundle for at least one full rolling-restart window; + document this in `operations.md`. Misconfiguration risk: dropping the + old CA too early breaks inter-replica RPCs cluster-wide. +- **Cluster membership during rolling restart**: rendezvous hashing + tolerates membership flux, but a pod restart with a new IP looks like a + new member for up to one refresh interval (default 5s), shifting + ownership for ~1/N keys until the next DNS refresh. Back-to-back + restarts can cause repeated duplicate fills. The + `orca_origin_duplicate_fills_total{result="commit_lost"}` metric + makes this visible. We accept this in v1 and revisit if it proves + material. See + [design.md#14-horizontal-scale](./design.md#14-horizontal-scale). +- **Create-after-404 unavailability window**: clients that hit a missing + key before the operator uploads it will continue to see `404` for up + to `negative_metadata_ttl` per replica that observed the original + `404` (default 60s). Worst case across replicas: round-robin LB can + alternate `404` / `200` during the drain. There is no event-driven + invalidation or admin-invalidation in v1 (the immutable-origin + contract makes them unnecessary). + Mitigations: short default `negative_metadata_ttl=60s`, + `metadata_negative_*` metrics expose drain progress, runbook + instructs operators to wait `negative_metadata_ttl` after uploading + a previously-missing key before announcing it. See + [design.md#12-create-after-404-and-negative-cache-lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle). +- **ChunkCatalog undersizing degrades active eviction quality**: + the optional active eviction loop (s13.2) bases decisions on + per-entry access counters in the ChunkCatalog. 
If + `chunk_catalog.max_entries` is much smaller than the working set, + many chunks live in the CacheStore but are not tracked; they + cannot be considered for active eviction; they live indefinitely + until external lifecycle (if any) cleans them up. Operators MUST + size the catalog to roughly 1.2x the estimated working-set chunk + count + ([design.md s13.3](./design.md#133-chunkcatalog-size-awareness-load-bearing-operational-note)); + metrics `chunk_catalog_hit_rate` and + `chunk_catalog_evict_total{reason="size"}` make undersizing + visible. +- **LIST cache staleness in write-and-immediately-list workloads**: + the per-replica LIST cache (s6.2) defaults to 60s TTL. A key + uploaded mid-window will not appear in `Origin.List` results + served from cache until the entry expires (up to 60s). + Acceptable for the documented FUSE-`ls` read-mostly workload; + operators with write-and-immediately-list patterns should tune + `list_cache.ttl` shorter or disable the cache via + `list_cache.enabled: false`. +- **Mid-stream client aborts on post-commit origin failure**: + the v1 streaming design (Option D) sends response headers and + begins streaming as soon as origin returns a first byte. If the + origin connection breaks mid-chunk after the cache has committed, + the response aborts (HTTP/2 `RST_STREAM` or HTTP/1.1 + `Connection: close`). S3 SDKs handle this via `Content-Length` + mismatch retry; the operational impact is small for the + documented workload but visible in + `responses_aborted_total{phase="mid_stream"}`. Sustained non- + zero rates indicate origin tail-latency issues; the trigger for + considering mid-stream origin resume + ([design.md s15.4](./design.md#154-mid-stream-origin-resume)) + is sustained mid-stream abort rate measurably impacting + end-to-end client latency. +- **Cold-start Stat storm**: a freshly started replica receiving a wide + fan-out of distinct cold keys does one `CacheStore.Stat` per `ChunkKey`. + At in-DC latencies this is cheap but not free. If a deployment routinely + sees wide-fan-out cold starts we may add a bulk-stat path or warm the + `ChunkCatalog` from a CacheStore listing on startup. Defer until + measured. +- **CacheStore lifecycle eviction of hot chunks**: age-based expiration may + evict a chunk that is still hot, forcing a re-fetch from origin. + Operators should tune TTL against `orca_origin_bytes_total`. Phase + 4 may add an in-`chunkcatalog` access-tracking layer if this proves + material. +- **Origin egress cost spikes**: cold-start fan-out can be expensive even + with singleflight if many distinct keys are touched simultaneously. + Origin semaphore + 503 backpressure protects us, but operators should + monitor `orca_origin_bytes_total` and set DC-side egress budgets. +- **Prefetch-induced waste**: sequential read-ahead can fetch chunks the + client never reads. Default depth (4) is conservative; we expose the knob + and the metric. +- **Mid-stream abort detection by clients**: post-first-byte failures abort + the response; standard S3 SDKs (aws-sdk, boto3) detect via + `Content-Length` mismatch and retry. Non-standard or hand-rolled HTTP + clients may silently truncate. Document this in `operations.md`. + +## 11. 
Approval checklist + +Before starting Phase 0 implementation, please confirm: + +- [ ] Repo layout under `cmd/orca/`, `internal/orca/`, + `deploy/orca/`, `images/orca/`, + `design/orca/`, `hack/orca/` is acceptable, + including `internal/orca/fetch/spool/`, + `cmd/orca/orca/server/internal/`, and + `deploy/orca/07-networkpolicy.yaml.tmpl`. +- [ ] Default chunk size of 8 MiB is acceptable. +- [ ] Bearer / mTLS auth on the client edge in v1 is acceptable; SigV4 + is deferred future work. +- [ ] **Separate internal mTLS listener (`:8444`) with an internal CA + distinct from the client mTLS CA, peer-IP-set authorization, + and a NetworkPolicy restricting ingress to `app=orca` pods, + is acceptable.** +- [ ] Azure constraint to Block Blobs only, surfaced as + `502 OriginUnsupported`, is acceptable. +- [ ] No persistent local index in v1; in-memory `ChunkCatalog` + + `CacheStore.Stat` on miss is sufficient. +- [ ] CacheStore lifecycle / TTL is the eviction mechanism in v1; cache + layer ships no eviction code. +- [ ] **Strict `If-Match: ` on every `Origin.GetRange` (no opt-out), + with `412` translated to `OriginETagChangedError`, metadata cache + invalidation, and a non-retryable fill failure, is acceptable.** +- [ ] **Local Spool layer (default 8 GiB) as the universal slow-joiner + fallback, with `503 Slow Down` on exhaustion, is acceptable.** +- [ ] **Atomic-commit model is acceptable: `localfs` uses + `link()` / `renameat2(RENAME_NOREPLACE)` (no plain `rename()`); + `cachestore/s3` uses `PutObject` + `If-None-Match: *` with no + tmp key and no copy hop; `SelfTestAtomicCommit` at startup refuses + to start if the backend doesn't honor the precondition.** +- [ ] **Deferred response headers until first chunk in hand, plus + mid-stream abort (HTTP/2 `RST_STREAM` / HTTP/1.1 `Connection: close`) + on post-first-byte failure, is acceptable.** +- [ ] **Assembler-per-request + per-chunk coordinator routing via + internal fill RPC (rather than whole-request reverse-proxy) is the + right v1 mechanism for strongly correlated cold-access workloads.** +- [ ] Deployment (not StatefulSet) is acceptable for v1 given no per-pod + state, faster rolling updates, and parity with other stateless + components in this repo. +- [ ] Phase 0 deliverable definition (one process serving a Range GET + against real S3 and re-serving from `localfs`) is the right starting + milestone. +- [ ] No cross-cmd imports; shared code lives under `internal/orca/` + per the project's coding standards. +- [ ] **Bounded staleness contract published in design.md s11 with + `metadata_ttl=5m` default; operators are expected to honor the + immutable-origin contract.** +- [ ] **Pre-header origin retry (Option D) ships in Phase 1: the + leader retries `Origin.GetRange` up to + `origin.retry.attempts` (default 3) with exponential backoff + capped by `origin.retry.max_total_duration` (default 5s) + BEFORE response headers are sent to the client; transparent + to the client. The commit boundary is the first byte arrival + from origin: post-commit, bytes stream origin -> client + directly; spool tees in parallel for joiner support and as + the asynchronous CacheStore-commit source. Pre-commit + failures (retry budget exhausted, `OriginETagChangedError`) + return clean HTTP errors; post-commit failures become + mid-stream client aborts (handled by SDK retry). + `origin_retry_total` and `origin_retry_attempts` metrics + exposed; T-pre-header-retry-* test group in Phase 1. 
+ Mid-stream origin resume is deferred future work + ([design.md s15.4](./design.md#154-mid-stream-origin-resume)). + CacheStore commit runs asynchronously after the client + response completes; commit-after-serve failures are reported + as `commit_after_serve_total{result="failed"}` and do NOT + affect client responses.** +- [ ] **`CacheStore` returns typed errors `ErrNotFound|ErrTransient|ErrAuth`; + only `ErrNotFound` triggers refill; `ErrTransient` -> `503 Slow Down` + with `Retry-After`; `ErrAuth` -> `502 Bad Gateway`.** +- [ ] **Per-process CacheStore circuit breaker with defaults + `error_window=30s, error_threshold=10, open_duration=30s, + half_open_probes=3`; state and transitions exported as metrics.** +- [ ] **Origin backpressure is per-replica static cap: + `target_per_replica = floor(origin.target_global / + cluster.target_replicas)` (default 64 slots/replica at + `target_global=192`, `target_replicas=3`); origin throttling + responses (`503 SlowDown` / `429`) are handled by the + pre-header retry loop (`origin.retry.*`); `origin_inflight` + gauge exposes per-replica saturation. Coordinated + cluster-wide limiter and dynamic per-replica recompute are + deferred future work, see + [design.md s15.5](./design.md#155-coordinated-cluster-wide-origin-limiter) + and + [design.md s15.6](./design.md#156-dynamic-per-replica-origin-cap). + Operators MUST update `cluster.target_replicas` after any + sustained scale change.** +- [ ] **`cachestore/localfs` stages inside `/.staging/` (NOT + `/tmp` and NOT spool dir); parent-dir fsync after every link/unlink; + `staging_max_age=1h` orphaned-staging sweeper.** +- [ ] **Internal mTLS dialer pins `tls.Config.ServerName` to the stable + SAN `orca..svc`; per-replica certs MUST include this + SAN; pod-IP SANs are NOT used.** +- [ ] **`/readyz` flips to NotReady after `readyz.errauth_consecutive_threshold=3` + consecutive `ErrAuth` from CacheStore; one non-`ErrAuth` success + resets the counter.** +- [ ] **`server.max_response_bytes` overflow returns + `400 RequestSizeExceedsLimit` (S3-style XML body); `416` is + reserved for true Range vs. object-size violations.** +- [ ] **`cachestore/posixfs` ships in Phase 2 alongside `cachestore/s3`, + sharing `link()`/`EEXIST` + dir-fsync helpers with + `cachestore/localfs` via + `internal/orca/cachestore/internal/posixcommon/`. Supported + backends: NFSv4.1+ (baseline), Weka native (`-t wekafs`), CephFS, + Lustre, GPFS / IBM Spectrum Scale.** +- [ ] **`cachestore/posixfs` runs `SelfTestAtomicCommit` at startup + (link()/`EEXIST` + dir-fsync + size verify); refuses to start on + any failure. 
Never disabled in production + (`require_atomic_link_self_test: true`).** +- [ ] **NFS minimum version is `4.1` + (`cachestore.posixfs.nfs.minimum_version: "4.1"`); NFSv3 is opt-in + only (`cachestore.posixfs.nfs.allow_v3: true`) with a loud WARN + log and `posixfs_nfs_v3_optin_total++`; `allow_v3` MUST stay + `false` in production manifests.** +- [ ] **Backend auto-detection via `statfs(2)` `f_type` + `/proc/mounts` + emits `posixfs_backend{type,version}` info gauge; operator + override allowed via `cachestore.posixfs.backend_type` for + ambiguous magic numbers; override is logged loudly.** +- [ ] **Alluxio FUSE is unsupported: `cachestore/posixfs` detects it + (FUSE_SUPER_MAGIC + `/proc/mounts` source matches `alluxio`) and + refuses to start with a message pointing operators to + `cachestore.driver: s3` against the Alluxio S3 gateway; + `posixfs_alluxio_refusal_total` exposes accidental + misconfigurations.** +- [ ] **`cachestore/posixfs` paths use a 2-character hex fan-out under + `////` by default + (`fanout_chars: 2`); `cachestore/localfs` keeps the flat layout + (`fanout_chars: 0` default) but the helper is shared.** +- [ ] **NFS export hardening is operator-runbook material: exports MUST + be `sync` (not `async`); the driver issues a best-effort warning + from `/proc/mounts` client-side options but does not refuse on + `async` (it cannot reliably detect server-side `async`); document + this in `operations.md`.** +- [ ] **Spool locality is enforced at boot: `spool.require_local_fs: + true` (default) runs `statfs(2)` on `spool.dir` and refuses to + start when the FS magic matches NFS / SMB / CephFS / Lustre / + GPFS / FUSE. With Option D the spool is no longer on the + client TTFB path, so the contract is defense-in-depth for + joiner-fallback latency; operators with unusual placements + (e.g., RAM-disk) MAY relax via `spool.require_local_fs: false` + with the documented operational warning. Production deploys + are expected to keep the default. See + [design.md#104-spool-locality-contract](./design.md#104-spool-locality-contract).** +- [ ] **Negative-cache TTL is independent: `negative_metadata_ttl: 60s` + (default) is distinct from `metadata_ttl: 5m`; bounds the + create-after-404 unavailability window. The + `metadata_negative_entries` / `metadata_negative_hit_total` / + `metadata_negative_age_seconds` metrics are exposed; the + `T-create-after-404a/b/c` test group is in Phase 1. + Event-driven invalidation and admin-invalidation RPC are + out of v1 scope (the immutable-origin contract makes them + unnecessary). See + [design.md#12-create-after-404-and-negative-cache-lifecycle](./design.md#12-create-after-404-and-negative-cache-lifecycle).** +- [ ] **Per-replica LIST cache (FW3) ships in Phase 1 sized for + the FUSE-`ls` workload pattern: default `list_cache.ttl=60s`, + `max_entries=1024`, `max_response_bytes=1MiB`, no negative + caching, optional stale-while-revalidate (`swr_enabled: false` + default); `list_cache_*` metrics exposed; T-list-cache-* test + group in Phase 1; cluster-wide LIST coordinator is a + deferred optimization + ([design.md s15.3](./design.md#153-cluster-wide-list-coordinator)).** +- [ ] **ChunkCatalog access-frequency tracking (FW8) added in + Phase 1: per-entry `AccessCount`, `LastAccessed`, + `LastEntered`. Optional active eviction loop opt-in via + `chunk_catalog.active_eviction.enabled` (default `false`) + with `inactive_threshold=24h`, `access_threshold=5`, + `min_age=5m`, `max_evictions_per_run=1000`. 
New + `CacheStore.Delete` method on the interface; + `cachestore_delete_total` and `chunk_catalog_*` metrics + exposed. Operators MUST size `chunk_catalog.max_entries` to + ~1.2x estimated working-set chunks per the load-bearing + operational note in + [design.md s13.3](./design.md#133-chunkcatalog-size-awareness-load-bearing-operational-note). + `T-active-eviction-*` and `T-catalog-*` test groups in Phase 1.** +- [ ] **Bounded-freshness mode (FW5) opt-in via + `metadata_refresh.enabled` (default `false`) with hot-key + detection via metadata-cache access counters (parallel to + ChunkCatalog tracking from FW8). Defaults: `interval=1m`, + `refresh_ahead_ratio=0.7`, `access_threshold=5`, + `min_age=metadata_ttl/4=75s`, `max_refreshes_per_run=100`, + `refresh_concurrency=8`. Negative entries are NOT refreshed. + `metadata_refresh_*` metrics exposed; `T-metadata-refresh-*` + test group in Phase 1. See + [design.md s11.2](./design.md#112-bounded-freshness-mode-optional).** +- [ ] **`cachestore/s3` versioning gate enforced at boot: drives + `GetBucketVersioning` and refuses to start on `Status: Enabled` + or `Status: Suspended`. Governed by + `cachestore.s3.require_unversioned_bucket: true` (default; + never disabled in production). Required because + `If-None-Match: *` is not honored on versioned buckets across + all S3-compatible backends (notably VAST). Metric + `s3_versioning_check_total{result="ok|refused"}` emitted once + per boot. `T-s3-versioned-bucket-refusal` and + `T-s3-unversioned-bucket-ok` tests in Phase 1. See + [design.md s10.1.3](./design.md#1013-cachestores3) and the + VAST KB citation therein.** +- [ ] **Edge rate limiting documented as v1 gap in + [design.md s15.1](./design.md#151-edge-rate-limiting). Multi- + tenant deployments worried about single-client monopolization + should layer rate limiting at an upstream proxy or LB until + this lands as a future deliverable.** +- [ ] **Dev harness brings up cleanly with `make -C hack/orca up` + against LocalStack (cachestore/s3) and a real Azure storage + account (origin) inside a Kind cluster. End-to-end flow + verified: cold miss -> Azure -> LocalStack -> client; warm + hit served from LocalStack without origin call; 50 parallel + GETs across 3 replicas dedupe to 1 origin GET (cluster-wide + via `/internal/fill`). LocalStack pinned to a community-tier + image; dev disables `cluster.internal_tls.enabled` and + `server.auth.enabled`. NetworkPolicy not applied in dev. 
See + [hack/orca/dev-harness.md](../../hack/orca/dev-harness.md).** diff --git a/go.mod b/go.mod index 9fdc87a3..49794bf4 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,11 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4 github.com/Masterminds/semver/v3 v3.4.0 github.com/Masterminds/sprig/v3 v3.3.0 + github.com/aws/aws-sdk-go-v2 v1.41.7 + github.com/aws/aws-sdk-go-v2/config v1.32.17 + github.com/aws/aws-sdk-go-v2/credentials v1.19.16 + github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 + github.com/aws/smithy-go v1.25.1 github.com/bougou/go-ipmi v0.8.3 github.com/cilium/ebpf v0.21.0 github.com/coder/websocket v1.8.14 @@ -49,6 +54,7 @@ require ( github.com/spf13/cobra v1.10.2 github.com/spf13/pflag v1.0.10 github.com/stretchr/testify v1.11.1 + github.com/testcontainers/testcontainers-go v0.42.0 github.com/vishvananda/netlink v1.3.1 golang.org/x/crypto v0.50.0 golang.org/x/mod v0.35.0 @@ -73,27 +79,51 @@ require ( ) require ( - dario.cat/mergo v1.0.1 // indirect - github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 // indirect + dario.cat/mergo v1.0.2 // indirect + github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect - github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect + github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect + github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apex/log v1.9.0 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 // indirect + github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/containerd/errdefs v1.0.0 // indirect + github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/log v0.1.0 // indirect + github.com/cpuguy83/dockercfg v0.3.2 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/cyphar/filepath-securejoin v0.5.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/go-connections v0.6.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect + github.com/ebitengine/purego v0.10.0 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/felixge/httpsnoop 
v1.0.4 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-errors/errors v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect @@ -110,12 +140,14 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/josharian/native v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.18.0 // indirect + github.com/klauspost/compress v1.18.5 // indirect github.com/klauspost/pgzip v1.2.6 // indirect github.com/kr/pretty v0.3.1 // indirect github.com/kr/text v0.2.0 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/magiconair/properties v1.8.10 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -125,10 +157,16 @@ require ( github.com/mdlayher/socket v0.5.1 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/moby/go-archive v0.2.0 // indirect + github.com/moby/moby/api v1.54.1 // indirect + github.com/moby/moby/client v0.4.0 // indirect + github.com/moby/patternmatcher v0.6.1 // indirect github.com/moby/spdystream v0.5.1 // indirect + github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect github.com/moby/sys/userns v0.1.0 // indirect - github.com/moby/term v0.5.0 // indirect + github.com/moby/term v0.5.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect @@ -145,6 +183,7 @@ require ( github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect @@ -153,10 +192,13 @@ require ( github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/shirou/gopsutil/v4 v4.26.3 // indirect github.com/shopspring/decimal v1.4.0 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect + github.com/sirupsen/logrus v1.9.4 // indirect github.com/sony/gobreaker/v2 v2.4.0 // indirect github.com/spf13/cast v1.7.0 // indirect + github.com/tklauser/go-sysconf v0.3.16 // indirect + github.com/tklauser/numcpus v0.11.0 // indirect github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect github.com/urfave/cli v1.22.12 // indirect github.com/vbatts/go-mtree v0.6.1-0.20250911112631-8307d76bc1b9 // indirect @@ -164,6 +206,12 @@ require ( github.com/x448/float16 v0.8.4 // indirect github.com/xlab/treeprint v1.2.0 // indirect 
github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect + go.opentelemetry.io/otel v1.41.0 // indirect + go.opentelemetry.io/otel/metric v1.41.0 // indirect + go.opentelemetry.io/otel/trace v1.41.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect @@ -172,7 +220,7 @@ require ( golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/telemetry v0.0.0-20260409153401-be6f6cb8b1fa // indirect golang.org/x/text v0.36.0 // indirect - golang.org/x/time v0.9.0 // indirect + golang.org/x/time v0.11.0 // indirect golang.org/x/tools v0.44.0 // indirect golang.org/x/vuln v1.2.0 // indirect golang.zx2c4.com/wireguard v0.0.0-20231211153847-12269c276173 // indirect diff --git a/go.sum b/go.sum index 91bab086..3cf29662 100644 --- a/go.sum +++ b/go.sum @@ -1,9 +1,9 @@ cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= -dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= -dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= -github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 h1:EKPd1INOIyr5hWOWhvpmQpY6tKjeG0hT1s3AMC/9fic= -github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1/go.mod h1:VzwV+t+dZ9j/H867F1M2ziD+yLHtB46oM35FxxMJ4d0= +dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= +dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 h1:jHb/wfvRikGdxMXYV3QG/SzUOPYN9KEUUuC0Yd0/vC0= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1/go.mod h1:pzBXCYn05zvYIrwLgtK8Ap8QcjRg+0i76tMQdWN6wOk= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= @@ -46,8 +46,8 @@ github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1/go.mod h1:Ng3urmn6dYe8gnbCMoHHVl5APYz2txho3koEkV2o2HA= github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4 h1:jWQK1GI+LeGGUKBADtcH2rRqPxYB1Ljwms5gFA2LqrM= github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4/go.mod h1:8mwH4klAm9DUgR2EEHyEEAQlRDvLPyg5fQry3y+cDew= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 
h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= @@ -59,6 +59,8 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs= github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0= +github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= +github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/apex/log v1.9.0 h1:FHtw/xuaM8AgmvDDTI9fiwoAL25Sq2cxojnZICUU8l0= @@ -69,6 +71,42 @@ github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3st github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= +github.com/aws/aws-sdk-go-v2 v1.41.7 h1:DWpAJt66FmnnaRIOT/8ASTucrvuDPZASqhhLey6tLY8= +github.com/aws/aws-sdk-go-v2 v1.41.7/go.mod h1:4LAfZOPHNVNQEckOACQx60Y8pSRjIkNZQz1w92xpMJc= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 h1:gx1AwW1Iyk9Z9dD9F4akX5gnN3QZwUB20GGKH/I+Rho= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10/go.mod h1:qqY157uZoqm5OXq/amuaBJyC9hgBCBQnsaWnPe905GY= +github.com/aws/aws-sdk-go-v2/config v1.32.17 h1:FpL4/758/diKwqbytU0prpuiu60fgXKUWCpDJtApclU= +github.com/aws/aws-sdk-go-v2/config v1.32.17/go.mod h1:OXqUMzgXytfoF9JaKkhrOYsyh72t9G+MJH8mMRaexOE= +github.com/aws/aws-sdk-go-v2/credentials v1.19.16 h1:r3RJBuU7X9ibt8RHbMjWE6y60QbKBiII6wSrXnapxSU= +github.com/aws/aws-sdk-go-v2/credentials v1.19.16/go.mod h1:6cx7zqDENJDbBIIWX6P8s0h6hqHC8Avbjh9Dseo27ug= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 h1:UuSfcORqNSz/ey3VPRS8TcVH2Ikf0/sC+Hdj400QI6U= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23/go.mod h1:+G/OSGiOFnSOkYloKj/9M35s74LgVAdJBSD5lsFfqKg= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 h1:GpT/TrnBYuE5gan2cZbTtvP+JlHsutdmlV2YfEyNde0= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23/go.mod h1:xYWD6BS9ywC5bS3sz9Xh04whO/hzK2plt2Zkyrp4JuA= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 h1:bpd8vxhlQi2r1hiueOw02f/duEPTMK59Q4QMAoTTtTo= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23/go.mod h1:15DfR2nw+CRHIk0tqNyifu3G1YdAOy68RftkhMDDwYk= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 h1:OQqn11BtaYv1WLUowvcA30MpzIu8Ti4pcLPIIyoKZrA= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24/go.mod h1:X5ZJyfwVrWA96GzPmUCWFQaEARPR7gCrpq2E92PJwAE= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 h1:FLudkZLt5ci0ozzgkVo8BJGwvqNaZbTWb3UcucAateA= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9/go.mod h1:w7wZ/s9qK7c8g4al+UyoF1Sp/Z45UwMGcqIzLWVQHWk= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 h1:ieLCO1JxUWuxTZ1cRd0GAaeX7O6cIxnwk7tc1LsQhC4= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15/go.mod h1:e3IzZvQ3kAWNykvE0Tr0RDZCMFInMvhku3qNpcIQXhM= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 h1:pbrxO/kuIwgEsOPLkaHu0O+m4fNgLU8B3vxQ+72jTPw= 
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23/go.mod h1:/CMNUqoj46HpS3MNRDEDIwcgEnrtZlKRaHNaHxIFpNA= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 h1:03xatSQO4+AM1lTAbnRg5OK528EUg744nW7F73U8DKw= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23/go.mod h1:M8l3mwgx5ToK7wot2sBBce/ojzgnPzZXUV445gTSyE8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 h1:etqBTKY581iwLL/H/S2sVgk3C9lAsTJFeXWFDsDcWOU= +github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0/go.mod h1:L2dcoOgS2VSgbPLvpak2NyUPsO1TBN7M45Z4H7DlRc4= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 h1:TdJ+HdzOBhU8+iVAOGUTU63VXopcumCOF1paFulHWZc= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.11/go.mod h1:R82ZRExE/nheo0N+T8zHPcLRTcH8MGsnR3BiVGX0TwI= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 h1:7byT8HUWrgoRp6sXjxtZwgOKfhss5fW6SkLBtqzgRoE= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.17/go.mod h1:xNWknVi4Ezm1vg1QsB/5EWpAJURq22uqd38U8qKvOJc= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 h1:+1Kl1zx6bWi4X7cKi3VYh29h8BvsCoHQEQ6ST9X8w7w= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21/go.mod h1:4vIRDq+CJB2xFAXZ+YgGUTiEft7oAQlhIs71xcSeuVg= +github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 h1:F/M5Y9I3nwr2IEpshZgh1GeHpOItExNM9L1euNuh/fk= +github.com/aws/aws-sdk-go-v2/service/sts v1.42.1/go.mod h1:mTNxImtovCOEEuD65mKW7DCsL+2gjEH+RPEAexAzAio= +github.com/aws/smithy-go v1.25.1 h1:J8ERsGSU7d+aCmdQur5Txg6bVoYelvQJgtZehD12GkI= +github.com/aws/smithy-go v1.25.1/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -84,29 +122,41 @@ github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k= github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ= github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= +github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= +github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= +github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= +github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= github.com/coreos/go-iptables v0.8.0 h1:MPc2P89IhuVpLI7ETL/2tx3XZ61VeICZjYqDEgNsPRc= github.com/coreos/go-iptables v0.8.0/go.mod h1:Qe8Bv2Xik5FyTXwgIbLAnv2sWSBmvWdFETJConOQ//Q= +github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA= +github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= 
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= -github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw= github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= +github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= @@ -128,12 +178,15 @@ github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj2 github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 
h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= @@ -166,6 +219,7 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O github.com/google/go-cmdtest v0.4.1-0.20220921163831-55ab3332a786 h1:rcv+Ippz6RAtvaGgKxc+8FQIpxHgsF+HBzPyYL2cyVU= github.com/google/go-cmdtest v0.4.1-0.20220921163831-55ab3332a786/go.mod h1:apVn/GCasLZUVpAJ6oWAuyP7Ne7CEsQbTnc0plM3m+o= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-configfs-tsm v0.3.3-0.20240919001351-b4b5b84fdcbc h1:SG12DWUUM5igxm+//YX5Yq4vhdoRnOG9HkCodkOn+YU= @@ -220,8 +274,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= -github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= -github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= +github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -239,6 +293,10 @@ github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE= +github.com/magiconair/properties v1.8.10/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= @@ -266,14 +324,26 @@ github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa1 github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec 
v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= +github.com/moby/go-archive v0.2.0 h1:zg5QDUM2mi0JIM9fdQZWC7U8+2ZfixfTYoHL7rWUcP8= +github.com/moby/go-archive v0.2.0/go.mod h1:mNeivT14o8xU+5q1YnNrkQVpK+dnNe/K6fHqnTg4qPU= +github.com/moby/moby/api v1.54.1 h1:TqVzuJkOLsgLDDwNLmYqACUuTehOHRGKiPhvH8V3Nn4= +github.com/moby/moby/api v1.54.1/go.mod h1:+RQ6wluLwtYaTd1WnPLykIDPekkuyD/ROWQClE83pzs= +github.com/moby/moby/client v0.4.0 h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjIw= +github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g= +github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= +github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= +github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= +github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= github.com/moby/sys/user v0.4.0/go.mod h1:bG+tYYYJgaMtRKgEmuueC0hJEAZWwtIbZTB+85uoHjs= github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= +github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -331,6 +401,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= @@ -354,10 +426,12 @@ github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/shirou/gopsutil/v4 v4.26.3 h1:2ESdQt90yU3oXF/CdOlRCJxrP+Am1aBYubTMTfxJ1qc= 
+github.com/shirou/gopsutil/v4 v4.26.3/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM= github.com/smartystreets/gunit v1.0.0/go.mod h1:qwPWnhz6pn0NnRBP++URONOVyNkPyr4SauJk4cUOwJs= @@ -375,8 +449,8 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= -github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4= +github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -385,6 +459,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/testcontainers/testcontainers-go v0.42.0 h1:He3IhTzTZOygSXLJPMX7n44XtK+qhjat1nI9cneBbUY= +github.com/testcontainers/testcontainers-go v0.42.0/go.mod h1:vZjdY1YmUA1qEForxOIOazfsrdyORJAbhi0bp8plN30= github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0= github.com/tj/assert v0.0.3 h1:Df/BlaZ20mq6kuai7f5z2TvPFiwC3xaWJSDQNiIS3Rk= github.com/tj/assert v0.0.3/go.mod h1:Ne6X72Q+TB1AteidzQncjw9PabbMp4PBMZ1k+vd1Pvk= @@ -392,6 +468,10 @@ github.com/tj/go-buffer v1.1.0/go.mod h1:iyiJpfFcR2B9sXu7KvjbT9fpM4mOelRSDTbntVj github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0= github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao= github.com/tj/go-spin v1.1.0/go.mod h1:Mg1mzmePZm4dva8Qz60H2lHwmJ2loum4VIrLgVnKwh4= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= github.com/u-root/uio 
v0.0.0-20230220225925-ffce2a382923 h1:tHNk7XK9GkmKUR6Gh8gVBKXc2MVSZ4G/NnWLtzw4gNA= github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923/go.mod h1:eLL9Nub3yfAho7qB0MzZizFhTU2QkLeoVsWdHtDW264= github.com/urfave/cli v1.22.12 h1:igJgVw1JdKH+trcLWLeLwZjU9fEfPesQ+9/e4MQ44S8= @@ -408,6 +488,8 @@ github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= @@ -460,9 +542,10 @@ golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220622161953-175b2fd9d664/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -477,8 +560,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= -golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= +golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= @@ -526,6 +609,8 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0-20200605160147-a5ece683394c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= +gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA= k8s.io/api v0.35.4 h1:P7nFYKl5vo9AGUp1Z+Pmd3p2tA7bX2wbFWCvDeRv988= k8s.io/api v0.35.4/go.mod h1:yl4lqySWOgYJJf9RERXKUwE9g2y+CkuwG+xmcOK8wXU= k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= @@ -582,6 +667,8 @@ modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= +pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= +pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= diff --git a/hack/cmd/render-manifests/main.go b/hack/cmd/render-manifests/main.go index 475c7129..187676fa 100644 --- a/hack/cmd/render-manifests/main.go +++ b/hack/cmd/render-manifests/main.go @@ -10,19 +10,19 @@ // evaluate to empty strings (text/template's missingkey=zero behaviour for map // data), which lets templates rely on sprig's `default` function to supply // documented fallbacks. +// +// The actual rendering logic lives in the render sub-package so it can be +// invoked programmatically from tests. package main import ( - "bytes" "flag" "fmt" "os" - "path/filepath" "sort" "strings" - "text/template" - "github.com/Masterminds/sprig/v3" + "github.com/Azure/unbounded/hack/cmd/render-manifests/render" ) // setFlags implements flag.Value for repeatable --set key=value arguments. 
@@ -75,60 +75,11 @@ func main() { exitWithError("--output-dir is required") } - if err := renderTemplates(templatesDir, outputDir, data); err != nil { + if err := render.Render(templatesDir, outputDir, data); err != nil { exitWithError(err.Error()) } } -func renderTemplates(templatesDir, outputDir string, data setFlags) error { - return filepath.WalkDir(templatesDir, func(path string, d os.DirEntry, err error) error { - if err != nil { - return err - } - - if d.IsDir() { - return nil - } - - if !strings.HasSuffix(path, ".yaml.tmpl") { - return nil - } - - relPath, err := filepath.Rel(templatesDir, path) - if err != nil { - return err - } - - outputRelPath := strings.TrimSuffix(relPath, ".tmpl") - outputPath := filepath.Join(outputDir, outputRelPath) - - templateBytes, err := os.ReadFile(path) - if err != nil { - return fmt.Errorf("read template %q: %w", path, err) - } - - tmpl, err := template.New(relPath).Funcs(sprig.TxtFuncMap()).Option("missingkey=zero").Parse(string(templateBytes)) - if err != nil { - return fmt.Errorf("parse template %q: %w", path, err) - } - - if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil { - return fmt.Errorf("create output dir for %q: %w", outputPath, err) - } - - var rendered bytes.Buffer - if err := tmpl.Execute(&rendered, map[string]string(data)); err != nil { - return fmt.Errorf("execute template %q: %w", path, err) - } - - if err := os.WriteFile(outputPath, rendered.Bytes(), 0o644); err != nil { - return fmt.Errorf("write rendered manifest %q: %w", outputPath, err) - } - - return nil - }) -} - func exitWithError(message string) { fmt.Fprintln(os.Stderr, message) os.Exit(1) diff --git a/hack/cmd/render-manifests/render/render.go b/hack/cmd/render-manifests/render/render.go new file mode 100644 index 00000000..13d3dce5 --- /dev/null +++ b/hack/cmd/render-manifests/render/render.go @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package render implements the manifest template renderer used by +// the render-manifests CLI. Exposed as a package so tests in other +// packages (e.g. internal/orca/manifests) can render the orca +// templates programmatically without shelling out to `go run`. +package render + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "strings" + "text/template" + + "github.com/Masterminds/sprig/v3" +) + +// Render walks templatesDir for *.yaml.tmpl files, executes each with +// Go's text/template (plus the sprig function library), and writes +// the rendered output under outputDir mirroring the source tree. +// +// Template data is supplied via the data map. Missing keys evaluate +// to empty strings (text/template's missingkey=zero), which lets +// templates rely on sprig's `default` function for fallbacks. 
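// Illustrative usage from a test (a sketch, not part of this change; the
// template directory, data keys, and values below are examples only):
//
//	outDir := t.TempDir()
//	data := map[string]string{"Namespace": "unbounded-kube", "Image": "orca:dev"}
//	if err := render.Render("deploy/orca", outDir, data); err != nil {
//		t.Fatalf("render manifests: %v", err)
//	}
//	// outDir now mirrors deploy/orca, with every *.yaml.tmpl rendered to *.yaml.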
+func Render(templatesDir, outputDir string, data map[string]string) error { + return filepath.WalkDir(templatesDir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + + if d.IsDir() { + return nil + } + + if !strings.HasSuffix(path, ".yaml.tmpl") { + return nil + } + + relPath, err := filepath.Rel(templatesDir, path) + if err != nil { + return err + } + + outputRelPath := strings.TrimSuffix(relPath, ".tmpl") + outputPath := filepath.Join(outputDir, outputRelPath) + + templateBytes, err := os.ReadFile(path) + if err != nil { + return fmt.Errorf("read template %q: %w", path, err) + } + + tmpl, err := template.New(relPath).Funcs(sprig.TxtFuncMap()).Option("missingkey=zero").Parse(string(templateBytes)) + if err != nil { + return fmt.Errorf("parse template %q: %w", path, err) + } + + if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil { + return fmt.Errorf("create output dir for %q: %w", outputPath, err) + } + + var rendered bytes.Buffer + if err := tmpl.Execute(&rendered, data); err != nil { + return fmt.Errorf("execute template %q: %w", path, err) + } + + if err := os.WriteFile(outputPath, rendered.Bytes(), 0o644); err != nil { + return fmt.Errorf("write rendered manifest %q: %w", outputPath, err) + } + + return nil + }) +} diff --git a/images/orca/Containerfile b/images/orca/Containerfile new file mode 100644 index 00000000..6a987546 --- /dev/null +++ b/images/orca/Containerfile @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Build stage +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.26.2-trixie AS builder + +RUN apt-get update && apt-get install -y \ + build-essential \ + make \ + gcc \ + git \ + ca-certificates \ + && apt-get clean + +ENV CGO_ENABLED=0 +ENV GOPATH=/go +ENV GOTOOLCHAIN=auto +ENV PATH=$PATH:/go/bin + +WORKDIR /src + +COPY go.mod go.sum ./ +RUN go mod download + +COPY ../../ . + +ARG TARGETOS +ARG TARGETARCH +ARG VERSION=dev +ARG GIT_COMMIT= +ARG BUILD_TIME= +RUN GOOS=${TARGETOS} GOARCH=${TARGETARCH} \ + make orca-build VERSION=${VERSION} ${GIT_COMMIT:+GIT_COMMIT=${GIT_COMMIT}} ${BUILD_TIME:+BUILD_TIME=${BUILD_TIME}} + +# Runtime stage +FROM ubuntu:noble + +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + ca-certificates \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN mkdir -p /unbounded/bin + +COPY --from=builder /src/bin/orca /unbounded/bin/orca + +ENV PATH="/unbounded/bin:${PATH}" + +WORKDIR /unbounded + +ENTRYPOINT ["/unbounded/bin/orca"] diff --git a/internal/orca/app/app.go b/internal/orca/app/app.go new file mode 100644 index 00000000..12a1d7db --- /dev/null +++ b/internal/orca/app/app.go @@ -0,0 +1,374 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package app wires the Orca runtime: origin + cachestore + cluster + +// fetch coordinator + edge / internal HTTP listeners. +// +// Production callers (cmd/orca/orca/orca.go) drive this from a YAML +// config; integration tests (internal/orca/inttest) drive it from a +// programmatic *config.Config plus options that inject in-memory or +// counting decorators around the origin / cachestore. 
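// Illustrative in-process test wiring (a sketch, not part of this change;
// countingStore and the omitted config sections stand in for whatever the
// test harness supplies):
//
//	cfg := &config.Config{
//		Server: config.Server{Listen: "127.0.0.1:0"},
//		// Origin / Cachestore / Cluster / Chunking filled in by the harness.
//	}
//	a, err := app.Start(ctx, cfg, app.WithCacheStore(countingStore))
//	if err != nil {
//		t.Fatal(err)
//	}
//	defer a.Shutdown(context.Background())
//	// Drive HTTP requests against a.EdgeAddr; inspect a.Cluster for peer state.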
+package app + +import ( + "context" + "errors" + "fmt" + "log/slog" + "net" + "net/http" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + cachestores3 "github.com/Azure/unbounded/internal/orca/cachestore/s3" + "github.com/Azure/unbounded/internal/orca/chunkcatalog" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/fetch" + "github.com/Azure/unbounded/internal/orca/metadata" + "github.com/Azure/unbounded/internal/orca/origin" + "github.com/Azure/unbounded/internal/orca/origin/awss3" + "github.com/Azure/unbounded/internal/orca/origin/azureblob" + "github.com/Azure/unbounded/internal/orca/server" +) + +// App is a running Orca instance. +// +// Construct with Start; tear down with Shutdown. Start is non-blocking: +// the returned App's listeners are accepting connections (via +// net.Listen) before Start returns, so EdgeAddr / InternalAddr are +// resolved (including any :0 ports) by the time the caller sees them. +type App struct { + // EdgeAddr is the resolved client-edge listen address (host:port). + // When the config requested ":0" the port is the OS-assigned one. + EdgeAddr string + + // InternalAddr is the resolved peer-RPC listen address (host:port). + InternalAddr string + + // Cluster is exposed so tests can inspect peer state and call + // Coordinator/Self for assertions. Production callers should treat + // this as read-only. + Cluster *cluster.Cluster + + log *slog.Logger + edgeSrv *http.Server + internalSrv *http.Server + wg sync.WaitGroup + errCh chan error +} + +type options struct { + log *slog.Logger + clusterOpts []cluster.Option + origin origin.Origin + cacheStore cachestore.CacheStore + skipCacheSelfTst bool + internalHandlerWrap func(http.Handler) http.Handler + edgeListener net.Listener + internalListener net.Listener +} + +// Option configures Start. +type Option func(*options) + +// WithLogger overrides the slog.Logger used for the App's output. If +// not provided, a JSON handler writing to stdout at LevelInfo is used. +func WithLogger(log *slog.Logger) Option { + return func(o *options) { o.log = log } +} + +// WithResolver overrides only the DNS resolver inside the default +// peer source. Convenient for tests that want to keep the production +// DNS-discovery shape but substitute the resolver itself. +func WithResolver(r cluster.Resolver) Option { + return func(o *options) { + o.clusterOpts = append(o.clusterOpts, cluster.WithResolver(r)) + } +} + +// WithPeerSource replaces the cluster's entire peer-discovery +// mechanism. Intended for integration tests that need full control +// (e.g. per-replica peer sets with explicit ports). +func WithPeerSource(s cluster.PeerSource) Option { + return func(o *options) { + o.clusterOpts = append(o.clusterOpts, cluster.WithPeerSource(s)) + } +} + +// WithOrigin replaces the origin driver constructed from cfg. Tests use +// this to wire counting / fault-injecting decorators around a real +// awss3 or azureblob client. +func WithOrigin(or origin.Origin) Option { + return func(o *options) { o.origin = or } +} + +// WithCacheStore replaces the cachestore driver constructed from cfg. +// Tests use this to wire a counting / fault-injecting decorator around +// a real s3 client (or to use an in-memory implementation). +func WithCacheStore(cs cachestore.CacheStore) Option { + return func(o *options) { o.cacheStore = cs } +} + +// WithSkipCachestoreSelfTest disables the boot-time atomic-commit +// self-test. 
Useful only in tests that wire a cachestore decorator +// already known to honor If-None-Match: *. +func WithSkipCachestoreSelfTest() Option { + return func(o *options) { o.skipCacheSelfTst = true } +} + +// WithInternalHandlerWrap installs a decorator around the internal +// peer-RPC handler. The wrap function receives the production handler +// and returns one that the http.Server actually serves. Production +// passes nothing -> identity. Tests use this to count 409 responses +// per source IP for the not-coordinator fallback assertion. +func WithInternalHandlerWrap(wrap func(http.Handler) http.Handler) Option { + return func(o *options) { o.internalHandlerWrap = wrap } +} + +// WithEdgeListener supplies a pre-bound listener for the client-edge +// HTTP server, bypassing app.Start's own net.Listen call. Intended +// for integration tests that need to allocate a port before starting +// the app (so peer sets can advertise the captured port from t=0 +// without a close/re-bind race window). +func WithEdgeListener(ln net.Listener) Option { + return func(o *options) { o.edgeListener = ln } +} + +// WithInternalListener supplies a pre-bound listener for the peer-RPC +// internal HTTP server. See WithEdgeListener for rationale. +func WithInternalListener(ln net.Listener) Option { + return func(o *options) { o.internalListener = ln } +} + +// Start wires every dependency and begins serving on the configured +// listeners. It returns once both listeners are accepting connections +// (or returns the error that prevented startup). +// +// The returned App must be Shutdown by the caller; Start does not own +// the parent context's lifetime. +func Start(ctx context.Context, cfg *config.Config, opts ...Option) (*App, error) { + o := options{} + for _, opt := range opts { + opt(&o) + } + + log := o.log + if log == nil { + log = slog.Default() + } + + or, err := buildOrigin(ctx, cfg, o.origin) + if err != nil { + return nil, err + } + + cs, err := buildCacheStore(ctx, cfg, o.cacheStore) + if err != nil { + return nil, err + } + + if !o.skipCacheSelfTst { + if err := cs.SelfTestAtomicCommit(ctx); err != nil { + return nil, fmt.Errorf("cachestore self-test failed: %w", err) + } + + log.Info("cachestore self-test passed") + } + + cl, err := cluster.New(ctx, cfg.Cluster, o.clusterOpts...) 
+ if err != nil { + return nil, fmt.Errorf("init cluster: %w", err) + } + + cat := chunkcatalog.New(cfg.ChunkCatalog.MaxEntries) + mc := metadata.NewCache(cfg.Metadata) + fc := fetch.NewCoordinator(or, cs, cl, cat, mc, cfg) + + edgeHandler := server.NewEdgeHandler(fc, cfg, log) + + var internalHandler http.Handler = server.NewInternalHandler(fc, cl, log) + if o.internalHandlerWrap != nil { + internalHandler = o.internalHandlerWrap(internalHandler) + } + + edgeLn := o.edgeListener + if edgeLn == nil { + ln, err := net.Listen("tcp", cfg.Server.Listen) + if err != nil { + cl.Close() + return nil, fmt.Errorf("edge listener bind %q: %w", cfg.Server.Listen, err) + } + + edgeLn = ln + } + + internalLn := o.internalListener + if internalLn == nil { + ln, err := net.Listen("tcp", cfg.Cluster.InternalListen) + if err != nil { + _ = edgeLn.Close() //nolint:errcheck // best-effort close on bind failure + + cl.Close() + + return nil, fmt.Errorf("internal listener bind %q: %w", cfg.Cluster.InternalListen, err) + } + + internalLn = ln + } + + a := &App{ + EdgeAddr: edgeLn.Addr().String(), + InternalAddr: internalLn.Addr().String(), + Cluster: cl, + log: log, + edgeSrv: &http.Server{ + Handler: edgeHandler, + ReadHeaderTimeout: 10 * time.Second, + }, + internalSrv: &http.Server{ + Handler: internalHandler, + ReadHeaderTimeout: 10 * time.Second, + }, + errCh: make(chan error, 2), + } + + a.wg.Add(1) + + go func() { + defer a.wg.Done() + + log.Info("edge listener", "addr", a.EdgeAddr) + + if err := a.edgeSrv.Serve(edgeLn); err != nil && !errors.Is(err, http.ErrServerClosed) { + a.errCh <- fmt.Errorf("edge listener: %w", err) + } + }() + + a.wg.Add(1) + + go func() { + defer a.wg.Done() + + log.Info("internal listener", + "addr", a.InternalAddr, + "tls_enabled", cfg.Cluster.InternalTLS.Enabled, + ) + + var lerr error + if cfg.Cluster.InternalTLS.Enabled { + lerr = a.internalSrv.ServeTLS(internalLn, + cfg.Cluster.InternalTLS.CertFile, + cfg.Cluster.InternalTLS.KeyFile, + ) + } else { + log.Warn("internal listener TLS DISABLED - unsafe for production", + "addr", a.InternalAddr) + + lerr = a.internalSrv.Serve(internalLn) + } + + if lerr != nil && !errors.Is(lerr, http.ErrServerClosed) { + a.errCh <- fmt.Errorf("internal listener: %w", lerr) + } + }() + + return a, nil +} + +// Wait blocks until either the parent context is canceled or one of +// the listeners exits unexpectedly. It returns the listener error (if +// any) or nil if ctx was canceled. Wait is intended for the production +// "serve until SIGTERM" path; tests typically call Shutdown directly. +func (a *App) Wait(ctx context.Context) error { + select { + case <-ctx.Done(): + return nil + case err := <-a.errCh: + return err + } +} + +// Shutdown gracefully stops both listeners and the cluster goroutine. +// It is safe to call multiple times; subsequent calls are no-ops. 
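// Production-style lifecycle (a sketch; the exact cmd/orca wiring and the
// 10-second shutdown budget are assumptions, not part of this change):
//
//	a, err := app.Start(ctx, cfg)
//	if err != nil {
//		return err
//	}
//	waitErr := a.Wait(ctx) // returns on SIGTERM-driven cancel or listener failure
//	sctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
//	defer cancel()
//	if err := a.Shutdown(sctx); err != nil {
//		return err
//	}
//	return waitErr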
+func (a *App) Shutdown(ctx context.Context) error { + var firstErr error + + if err := a.edgeSrv.Shutdown(ctx); err != nil { + a.log.Warn("edge listener shutdown failed", "err", err) + + firstErr = err + } + + if err := a.internalSrv.Shutdown(ctx); err != nil { + a.log.Warn("internal listener shutdown failed", "err", err) + + if firstErr == nil { + firstErr = err + } + } + + a.Cluster.Close() + a.wg.Wait() + + return firstErr +} + +func buildOrigin(ctx context.Context, cfg *config.Config, override origin.Origin) (origin.Origin, error) { + if override != nil { + return override, nil + } + + switch cfg.Origin.Driver { + case "azureblob": + or, err := azureblob.New(cfg.Origin.Azureblob) + if err != nil { + return nil, fmt.Errorf("init origin/azureblob: %w", err) + } + + return or, nil + case "awss3": + or, err := awss3.New(ctx, awss3.Config{ + Endpoint: cfg.Origin.AWSS3.Endpoint, + Region: cfg.Origin.AWSS3.Region, + Bucket: cfg.Origin.AWSS3.Bucket, + AccessKey: cfg.Origin.AWSS3.AccessKey, + SecretKey: cfg.Origin.AWSS3.SecretKey, + UsePathStyle: cfg.Origin.AWSS3.UsePathStyle, + }) + if err != nil { + return nil, fmt.Errorf("init origin/awss3: %w", err) + } + + return or, nil + default: + return nil, fmt.Errorf("unsupported origin driver: %q", cfg.Origin.Driver) + } +} + +func buildCacheStore(ctx context.Context, cfg *config.Config, override cachestore.CacheStore) (cachestore.CacheStore, error) { + if override != nil { + return override, nil + } + + switch cfg.Cachestore.Driver { + case "s3": + cs, err := cachestores3.New(ctx, cachestores3.Config{ + Endpoint: cfg.Cachestore.S3.Endpoint, + Bucket: cfg.Cachestore.S3.Bucket, + Region: cfg.Cachestore.S3.Region, + AccessKey: cfg.Cachestore.S3.AccessKey, + SecretKey: cfg.Cachestore.S3.SecretKey, + UsePathStyle: cfg.Cachestore.S3.UsePathStyle, + RequireUnversionedBucket: cfg.Cachestore.S3.RequireUnversionedBucket, + }) + if err != nil { + return nil, fmt.Errorf("init cachestore/s3: %w", err) + } + + return cs, nil + default: + return nil, fmt.Errorf("unsupported cachestore driver: %q", cfg.Cachestore.Driver) + } +} diff --git a/internal/orca/cachestore/cachestore.go b/internal/orca/cachestore/cachestore.go new file mode 100644 index 00000000..f51e664f --- /dev/null +++ b/internal/orca/cachestore/cachestore.go @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package cachestore defines the in-DC chunk store interface and shared +// types. Concrete drivers live under cachestore//. +// +// See design/orca/design.md s7 for the full interface and s10.1 for the +// atomic-commit contract. +package cachestore + +import ( + "context" + "errors" + "io" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// CacheStore is where chunk bytes physically live. Source of truth for +// chunk presence; backed by an in-DC S3-like store in production and +// LocalStack in dev (Scope A+B). +type CacheStore interface { + GetChunk(ctx context.Context, k chunk.Key, off, n int64) (io.ReadCloser, error) + PutChunk(ctx context.Context, k chunk.Key, size int64, r io.Reader) error + Stat(ctx context.Context, k chunk.Key) (Info, error) + Delete(ctx context.Context, k chunk.Key) error + SelfTestAtomicCommit(ctx context.Context) error +} + +// Info is the result of a successful Stat. +type Info struct { + Size int64 + Committed time.Time +} + +// Sentinel errors. Wrap with %w so callers use errors.Is. 
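// For example, a caller separating a cache miss from a hard failure
// (sketch; cs is any CacheStore implementation):
//
//	if _, err := cs.Stat(ctx, key); errors.Is(err, cachestore.ErrNotFound) {
//		// miss: fall back to the origin fetch path
//	} else if err != nil {
//		return err
//	}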
+var ( + ErrNotFound = errors.New("cachestore: not found") + ErrTransient = errors.New("cachestore: transient") + ErrAuth = errors.New("cachestore: auth") + ErrCommitLost = errors.New("cachestore: commit lost (no-clobber denied)") +) diff --git a/internal/orca/cachestore/s3/s3.go b/internal/orca/cachestore/s3/s3.go new file mode 100644 index 00000000..fc915642 --- /dev/null +++ b/internal/orca/cachestore/s3/s3.go @@ -0,0 +1,354 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package s3 is the cachestore driver for in-DC S3-compatible stores. +// In production this targets VAST or another S3-compatible object +// store; in dev it targets LocalStack. +// +// Atomic commit is implemented via PutObject + If-None-Match: * (s3 +// conditional writes). The boot SelfTestAtomicCommit verifies the +// backend honors the precondition; the boot versioning gate verifies +// the bucket is not versioned (since If-None-Match is not honored on +// versioned buckets). +// +// See design/orca/design.md s10.1.3. +package s3 + +import ( + "bytes" + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "io" + "net/http" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/aws/smithy-go" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// Driver implements cachestore.CacheStore against an S3-compatible +// endpoint. +type Driver struct { + client *s3.Client + bucket string + + requireUnversionedBucket bool +} + +// Config is the s3-driver configuration. Mirrors config.CachestoreS3 +// but kept package-local so the driver can be unit-tested without +// importing the whole config package. +type Config struct { + Endpoint string + Bucket string + Region string + AccessKey string + SecretKey string + UsePathStyle bool + RequireUnversionedBucket bool +} + +// New constructs a Driver. The boot versioning gate is run here. +// +// SelfTestAtomicCommit is a separate step (called by main after New) +// to keep the constructor side-effect-light. +func New(ctx context.Context, cfg Config) (*Driver, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("cachestore/s3: bucket required") + } + + if cfg.Endpoint == "" { + return nil, fmt.Errorf("cachestore/s3: endpoint required") + } + + awsCfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(cfg.Region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + cfg.AccessKey, cfg.SecretKey, "", + )), + // Opt out of CRC64NVME default introduced in aws-sdk-go-v2 + // 1.32. LocalStack 3.8 returns InvalidRequest for unknown + // algorithms; real AWS S3 still works either way. 
+ awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + return nil, fmt.Errorf("cachestore/s3: aws config: %w", err) + } + + client := s3.NewFromConfig(awsCfg, func(o *s3.Options) { + o.BaseEndpoint = aws.String(cfg.Endpoint) + o.UsePathStyle = cfg.UsePathStyle + }) + + d := &Driver{ + client: client, + bucket: cfg.Bucket, + requireUnversionedBucket: cfg.RequireUnversionedBucket, + } + + if d.requireUnversionedBucket { + if err := d.versioningGate(ctx); err != nil { + return nil, err + } + } + + return d, nil +} + +// versioningGate refuses to start if the bucket has versioning enabled +// or suspended. design.md s10.1.3. +func (d *Driver) versioningGate(ctx context.Context) error { + out, err := d.client.GetBucketVersioning(ctx, &s3.GetBucketVersioningInput{ + Bucket: aws.String(d.bucket), + }) + if err != nil { + return fmt.Errorf("cachestore/s3: GetBucketVersioning failed: %w", err) + } + + return validateBucketVersioning(d.bucket, out.Status) +} + +// validateBucketVersioning returns an error if the bucket's versioning +// status is incompatible with cachestore/s3's atomic-commit primitive. +// Extracted as a pure function so unit tests can cover all branches +// (empty / Enabled / Suspended) without round-tripping to a real or +// emulated S3 backend. +func validateBucketVersioning(bucket string, status s3types.BucketVersioningStatus) error { + switch status { + case s3types.BucketVersioningStatusEnabled, s3types.BucketVersioningStatusSuspended: + return fmt.Errorf( + "cachestore/s3: bucket %s has versioning %s; If-None-Match: * is not "+ + "honored on versioned buckets and the atomic-commit primitive cannot "+ + "guarantee no-clobber; disable bucket versioning to use cachestore/s3", + bucket, status) + } + + return nil +} + +// SelfTestAtomicCommit verifies the backend honors PutObject + +// If-None-Match: *. +func (d *Driver) SelfTestAtomicCommit(ctx context.Context) error { + probeKey := fmt.Sprintf("_orca-selftest/%s", randHex(16)) + body := []byte("orca-selftest") + + // First put: must succeed. + _, err := d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + Body: bytes.NewReader(body), + IfNoneMatch: aws.String("*"), + }) + if err != nil { + return fmt.Errorf("cachestore/s3 self-test: first put failed: %w", err) + } + + // Second put: must fail with 412. + _, err = d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + Body: bytes.NewReader(body), + IfNoneMatch: aws.String("*"), + }) + if err == nil { + // Clean up before returning the failure. + _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return fmt.Errorf( + "cachestore/s3: backend does not honor If-None-Match: *; refusing to start " + + "(second concurrent put returned 200 instead of 412)") + } + + if !isPreconditionFailed(err) { + _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return fmt.Errorf("cachestore/s3 self-test: second put returned unexpected error "+ + "(want 412 PreconditionFailed): %w", err) + } + + // Cleanup probe key. 
+ _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return nil +} + +// GetChunk fetches [off, off+n) of the chunk path from the bucket. +func (d *Driver) GetChunk(ctx context.Context, k chunk.Key, off, n int64) (io.ReadCloser, error) { + rng := fmt.Sprintf("bytes=%d-%d", off, off+n-1) + + out, err := d.client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + Range: aws.String(rng), + }) + if err != nil { + return nil, mapErr(err) + } + + return out.Body, nil +} + +// PutChunk uploads the chunk via PutObject + If-None-Match: *. On +// 412 returns ErrCommitLost (loser of an atomic-commit race). +func (d *Driver) PutChunk(ctx context.Context, k chunk.Key, size int64, r io.Reader) error { + // AWS SDK v2 needs an io.ReadSeeker for unsigned-payload uploads. + // For prototype simplicity we buffer the chunk in memory (chunks + // are 8 MiB by default). + buf, err := io.ReadAll(r) + if err != nil { + return fmt.Errorf("cachestore/s3 put: read body: %w", err) + } + + if int64(len(buf)) != size && size > 0 { + return fmt.Errorf("cachestore/s3 put: short body (got %d want %d)", len(buf), size) + } + + _, err = d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + Body: bytes.NewReader(buf), + ContentLength: aws.Int64(int64(len(buf))), + IfNoneMatch: aws.String("*"), + }) + if err != nil { + if isPreconditionFailed(err) { + return cachestore.ErrCommitLost + } + + return mapErr(err) + } + + return nil +} + +// Stat checks for chunk presence. +func (d *Driver) Stat(ctx context.Context, k chunk.Key) (cachestore.Info, error) { + out, err := d.client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + }) + if err != nil { + return cachestore.Info{}, mapErr(err) + } + + info := cachestore.Info{} + if out.ContentLength != nil { + info.Size = *out.ContentLength + } + + if out.LastModified != nil { + info.Committed = *out.LastModified + } + + return info, nil +} + +// Delete removes the chunk; idempotent. +func (d *Driver) Delete(ctx context.Context, k chunk.Key) error { + _, err := d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + }) + if err != nil { + if isNotFound(err) { + return nil + } + + return mapErr(err) + } + + return nil +} + +func randHex(n int) string { + b := make([]byte, n) + if _, err := rand.Read(b); err != nil { + // Fallback: time-based; only used for boot-test probe key. 
+ return fmt.Sprintf("ts%d", time.Now().UnixNano()) + } + + return hex.EncodeToString(b) +} + +func isPreconditionFailed(err error) bool { + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + code := apiErr.ErrorCode() + if code == "PreconditionFailed" || code == "InvalidArgument" || code == "ConditionalRequestConflict" { + return true + } + } + + return strings.Contains(err.Error(), "PreconditionFailed") || + strings.Contains(err.Error(), "412") +} + +func isNotFound(err error) bool { + var nsk *s3types.NoSuchKey + if errors.As(err, &nsk) { + return true + } + + var nsb *s3types.NoSuchBucket + if errors.As(err, &nsb) { + return true + } + + var notFound *s3types.NotFound + if errors.As(err, ¬Found) { + return true + } + + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "NoSuchKey", "NotFound", "404": + return true + } + } + + return false +} + +func mapErr(err error) error { + if isNotFound(err) { + return cachestore.ErrNotFound + } + + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "AccessDenied", "Unauthorized", "Forbidden", "InvalidAccessKeyId", "SignatureDoesNotMatch": + return cachestore.ErrAuth + } + } + // Treat HTTP 5xx as transient. + if strings.Contains(err.Error(), "StatusCode: 5") { + return cachestore.ErrTransient + } + + _ = http.StatusOK // keep net/http import if not needed otherwise + + return err +} diff --git a/internal/orca/cachestore/s3/s3_test.go b/internal/orca/cachestore/s3/s3_test.go new file mode 100644 index 00000000..b8d28735 --- /dev/null +++ b/internal/orca/cachestore/s3/s3_test.go @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package s3 + +import ( + "strings" + "testing" + + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" +) + +// TestValidateBucketVersioning covers every BucketVersioningStatus +// branch the gate cares about. The integration suite only exercises +// the Enabled case end-to-end; this unit test fills in the empty +// (never-enabled) and Suspended cases. +func TestValidateBucketVersioning(t *testing.T) { + tests := []struct { + name string + status s3types.BucketVersioningStatus + wantErr bool + }{ + {"empty (never enabled)", "", false}, + {"enabled", s3types.BucketVersioningStatusEnabled, true}, + {"suspended", s3types.BucketVersioningStatusSuspended, true}, + } + + const bucket = "test-bucket" + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateBucketVersioning(bucket, tt.status) + + if (err != nil) != tt.wantErr { + t.Fatalf("err=%v, wantErr=%v", err, tt.wantErr) + } + + if !tt.wantErr { + return + } + + if !strings.Contains(err.Error(), bucket) { + t.Errorf("error %q does not include bucket name %q", err, bucket) + } + + if !strings.Contains(err.Error(), string(tt.status)) { + t.Errorf("error %q does not include status %q", err, tt.status) + } + }) + } +} diff --git a/internal/orca/chunk/chunk.go b/internal/orca/chunk/chunk.go new file mode 100644 index 00000000..1a520c87 --- /dev/null +++ b/internal/orca/chunk/chunk.go @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package chunk implements the chunk model: ChunkKey, deterministic +// path encoding, and the range -> chunk-index iterator. +// +// See design/orca/design.md s5 for the full chunk model spec. This +// implementation is a faithful subset. 
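// Concretely, the rendered path has the shape (derived from Key.Path below;
// the digest placeholder is illustrative, not a real value):
//
//	<OriginID>/<64 lowercase hex chars of the sha256 hashKey>/<Index>
//
// e.g. a Key with OriginID "o1" and Index 3 becomes "o1/<hex digest>/3".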
+package chunk + +import ( + "crypto/sha256" + "encoding/binary" + "encoding/hex" + "fmt" + "hash" +) + +// Key is the immutable identifier for a chunk. +// +// Path encoding (design.md s5): +// +// LP(s) = LE64(uint64(len(s))) || s +// hashKey = sha256( +// LP(origin_id) || +// LP(bucket) || +// LP(key) || +// LP(etag) || +// LE64(chunk_size) +// ) +// path = "//" +type Key struct { + OriginID string + Bucket string + ObjectKey string + ETag string + ChunkSize int64 + Index int64 +} + +// Path returns the canonical on-store path for this ChunkKey. +func (k Key) Path() string { + h := sha256.New() + writeLP(h, k.OriginID) + writeLP(h, k.Bucket) + writeLP(h, k.ObjectKey) + writeLP(h, k.ETag) + + var sizeBuf [8]byte + binary.LittleEndian.PutUint64(sizeBuf[:], uint64(k.ChunkSize)) + h.Write(sizeBuf[:]) + sum := h.Sum(nil) + + return fmt.Sprintf("%s/%s/%d", k.OriginID, hex.EncodeToString(sum), k.Index) +} + +// Range returns the byte range [Off, Off+Len) within the origin +// object that this chunk corresponds to. +func (k Key) Range() (off, length int64) { + off = k.Index * k.ChunkSize + length = k.ChunkSize + + return off, length +} + +// String renders the key compactly for logging. +func (k Key) String() string { + if len(k.ETag) > 8 { + return fmt.Sprintf("ChunkKey{%s/%s/%s..@%d#%d}", + k.OriginID, k.Bucket, k.ObjectKey, k.Index, len(k.ETag)) + } + + return fmt.Sprintf("ChunkKey{%s/%s/%s@%d}", k.OriginID, k.Bucket, k.ObjectKey, k.Index) +} + +func writeLP(h hash.Hash, s string) { + var lenBuf [8]byte + binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(s))) + h.Write(lenBuf[:]) + h.Write([]byte(s)) +} + +// IndexRange returns the inclusive [first, last] chunk indices that +// cover the byte range [start, end] of an object whose total size is +// objectSize. +// +// Caller is responsible for clamping start / end against objectSize +// before invoking; if end >= objectSize, end is clamped here. +func IndexRange(start, end, chunkSize, objectSize int64) (first, last int64) { + if end >= objectSize { + end = objectSize - 1 + } + + first = start / chunkSize + last = end / chunkSize + + return first, last +} + +// ChunkSlice returns the [off, len) within a single chunk that +// satisfies the original client byte range [start, end]. +// +// chunkIdx is the chunk index. chunkSize is the configured chunk size. +// objectSize is the total origin-object size (used to clamp the last +// chunk if it is partial). +func ChunkSlice(chunkIdx, chunkSize, start, end, objectSize int64) (off, length int64) { + chunkStart := chunkIdx * chunkSize + + chunkEnd := chunkStart + chunkSize - 1 + if chunkEnd >= objectSize { + chunkEnd = objectSize - 1 + } + + if start > chunkStart { + off = start - chunkStart + } + + sliceEnd := chunkEnd + if end < chunkEnd { + sliceEnd = end + } + + length = sliceEnd - chunkStart - off + 1 + + return off, length +} diff --git a/internal/orca/chunk/chunk_test.go b/internal/orca/chunk/chunk_test.go new file mode 100644 index 00000000..bc53c795 --- /dev/null +++ b/internal/orca/chunk/chunk_test.go @@ -0,0 +1,231 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package chunk + +import ( + "strings" + "testing" +) + +// TestKey_Path_Deterministic verifies that the same inputs always +// produce the same path and that meaningful input differences +// (OriginID, Bucket, ObjectKey, ETag, ChunkSize, Index) produce +// distinct paths. The path encoding is part of orca's design +// contract (design.md s5). 
+func TestKey_Path_Deterministic(t *testing.T) { + t.Parallel() + + base := Key{ + OriginID: "origin-a", + Bucket: "bucket", + ObjectKey: "key", + ETag: "etag1", + ChunkSize: 1024, + Index: 0, + } + // Same inputs -> same path. Compare two equally-constructed Keys + // (calling Path() on the same receiver tautologically passes). + dup := base + if base.Path() != dup.Path() { + t.Fatalf("Path() not deterministic for identical key") + } + + other := base + otherPath := other.Path() + + mutations := []struct { + name string + mut func(k *Key) + }{ + {"different origin", func(k *Key) { k.OriginID = "origin-b" }}, + {"different bucket", func(k *Key) { k.Bucket = "other-bucket" }}, + {"different key", func(k *Key) { k.ObjectKey = "other-key" }}, + {"different etag", func(k *Key) { k.ETag = "etag2" }}, + {"different chunk size", func(k *Key) { k.ChunkSize = 2048 }}, + {"different index", func(k *Key) { k.Index = 1 }}, + } + + for _, m := range mutations { + t.Run(m.name, func(t *testing.T) { + mutated := base + m.mut(&mutated) + + got := mutated.Path() + if got == otherPath { + t.Errorf("path collision after %s mutation: %q", m.name, got) + } + }) + } +} + +// TestKey_Path_Format asserts the documented path shape: +// "//". +func TestKey_Path_Format(t *testing.T) { + t.Parallel() + + k := Key{ + OriginID: "origin-a", + Bucket: "b", + ObjectKey: "k", + ETag: "e", + ChunkSize: 1024, + Index: 7, + } + + path := k.Path() + + parts := strings.Split(path, "/") + if len(parts) != 3 { + t.Fatalf("path %q has %d segments, want 3", path, len(parts)) + } + + if parts[0] != "origin-a" { + t.Errorf("origin segment=%q want %q", parts[0], "origin-a") + } + + if len(parts[1]) != 64 { + t.Errorf("hex segment len=%d want 64 (sha256)", len(parts[1])) + } + + for _, c := range parts[1] { + isDigit := c >= '0' && c <= '9' + isLowerHex := c >= 'a' && c <= 'f' + + if !isDigit && !isLowerHex { + t.Errorf("hex segment contains non-hex char %q", c) + break + } + } + + if parts[2] != "7" { + t.Errorf("index segment=%q want %q", parts[2], "7") + } +} + +// TestKey_Range verifies (off, length) = (Index*ChunkSize, ChunkSize). +func TestKey_Range(t *testing.T) { + t.Parallel() + + k := Key{ChunkSize: 1 << 20, Index: 3} + + off, length := k.Range() + if off != 3<<20 { + t.Errorf("off=%d want %d", off, 3<<20) + } + + if length != 1<<20 { + t.Errorf("length=%d want %d", length, 1<<20) + } +} + +// TestIndexRange covers the chunk-index span computed from a byte +// range plus the end clamping to objectSize. +func TestIndexRange(t *testing.T) { + t.Parallel() + + const chunkSize = int64(1024) + + tests := []struct { + name string + start, end int64 + objectSize int64 + wantFirst int64 + wantLast int64 + }{ + {"aligned full chunk", 0, 1023, 1024, 0, 0}, + {"aligned two chunks", 0, 2047, 4096, 0, 1}, + {"start mid-chunk, end mid-chunk same", 100, 500, 1024, 0, 0}, + {"start mid-chunk, end mid-next-chunk", 100, 1500, 4096, 0, 1}, + {"end clamped to objectSize", 0, 9999, 2048, 0, 1}, + {"single byte", 5, 5, 1024, 0, 0}, + {"last partial chunk", 1024, 1500, 1500, 1, 1}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + first, last := IndexRange(tt.start, tt.end, chunkSize, tt.objectSize) + if first != tt.wantFirst { + t.Errorf("first=%d want %d", first, tt.wantFirst) + } + + if last != tt.wantLast { + t.Errorf("last=%d want %d", last, tt.wantLast) + } + }) + } +} + +// TestChunkSlice covers the (off, length) within a single chunk that +// satisfies the original byte range. 
Critical for cross-chunk +// streamSlice copies. +func TestChunkSlice(t *testing.T) { + t.Parallel() + + const chunkSize = int64(1024) + + tests := []struct { + name string + chunkIdx int64 + start int64 + end int64 + objectSize int64 + wantOff int64 + wantLen int64 + }{ + {"entirely within chunk 0", 0, 100, 199, 4096, 100, 100}, + {"start at chunk 0 boundary", 0, 0, 99, 4096, 0, 100}, + {"end at chunk 0 boundary", 0, 0, 1023, 4096, 0, 1024}, + {"chunk 1, range covers full chunk", 1, 1024, 2047, 4096, 0, 1024}, + {"chunk spans range start", 1, 500, 1500, 4096, 0, 477}, // [1024..1500] + {"chunk spans range end", 1, 1500, 2500, 4096, 476, 548}, + {"last partial chunk", 3, 3000, 3500, 3500, 0, 428}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + off, length := ChunkSlice(tt.chunkIdx, chunkSize, tt.start, tt.end, tt.objectSize) + if off != tt.wantOff { + t.Errorf("off=%d want %d", off, tt.wantOff) + } + + if length != tt.wantLen { + t.Errorf("length=%d want %d", length, tt.wantLen) + } + }) + } +} + +// TestKey_String covers both formatting branches (short ETag + long +// ETag). +func TestKey_String(t *testing.T) { + t.Parallel() + + short := Key{ + OriginID: "o", + Bucket: "b", + ObjectKey: "k", + ETag: "abc", + Index: 5, + } + if s := short.String(); !strings.Contains(s, "@5") { + t.Errorf("short ETag string=%q does not contain @5", s) + } + + long := Key{ + OriginID: "o", + Bucket: "b", + ObjectKey: "k", + ETag: "abcdefghi", // 9 chars > 8 + Index: 5, + } + + s := long.String() + if !strings.Contains(s, "..@") { + t.Errorf("long ETag string=%q does not contain truncation marker '..@'", s) + } + + if !strings.Contains(s, "#9") { + t.Errorf("long ETag string=%q does not contain length suffix '#9'", s) + } +} diff --git a/internal/orca/chunkcatalog/chunkcatalog.go b/internal/orca/chunkcatalog/chunkcatalog.go new file mode 100644 index 00000000..453c8ed8 --- /dev/null +++ b/internal/orca/chunkcatalog/chunkcatalog.go @@ -0,0 +1,130 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package chunkcatalog implements a bounded LRU recording chunks known +// to be present in the CacheStore. Pure hot-path optimization; +// CacheStore is the source of truth. +package chunkcatalog + +import ( + "container/list" + "fmt" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// Catalog is a bounded LRU keyed on chunk.Key.Path(). +type Catalog struct { + mu sync.Mutex + maxEntries int + ll *list.List + idx map[string]*list.Element +} + +type entry struct { + path string + info cachestore.Info + at time.Time +} + +// New constructs a Catalog. +func New(maxEntries int) *Catalog { + if maxEntries <= 0 { + maxEntries = 100_000 + } + + return &Catalog{ + maxEntries: maxEntries, + ll: list.New(), + idx: make(map[string]*list.Element, maxEntries), + } +} + +// Lookup returns the cached Info if present and bumps the LRU position. +func (c *Catalog) Lookup(k chunk.Key) (cachestore.Info, bool, error) { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + el, ok := c.idx[path] + if !ok { + return cachestore.Info{}, false, nil + } + + c.ll.MoveToFront(el) + + e, ok := el.Value.(*entry) + if !ok { + return cachestore.Info{}, false, fmt.Errorf("chunkcatalog: list element is not *entry") + } + + return e.info, true, nil +} + +// Record inserts or updates the entry. 
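// Inserting a new path may evict entries from the LRU tail until the catalog
// is back under maxEntries (see the trim loop below).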
+func (c *Catalog) Record(k chunk.Key, info cachestore.Info) error { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[path]; ok { + c.ll.MoveToFront(el) + + e, ok := el.Value.(*entry) + if !ok { + return fmt.Errorf("chunkcatalog: list element is not *entry") + } + + e.info = info + e.at = time.Now() + + return nil + } + + el := c.ll.PushFront(&entry{path: path, info: info, at: time.Now()}) + + c.idx[path] = el + for c.ll.Len() > c.maxEntries { + oldest := c.ll.Back() + if oldest == nil { + break + } + + c.ll.Remove(oldest) + + oldEntry, ok := oldest.Value.(*entry) + if !ok { + return fmt.Errorf("chunkcatalog: list element is not *entry") + } + + delete(c.idx, oldEntry.path) + } + + return nil +} + +// Forget removes the entry if present. +func (c *Catalog) Forget(k chunk.Key) { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[path]; ok { + c.ll.Remove(el) + delete(c.idx, path) + } +} + +// Len returns the current entry count (test helper). +func (c *Catalog) Len() int { + c.mu.Lock() + defer c.mu.Unlock() + + return c.ll.Len() +} diff --git a/internal/orca/cluster/cluster.go b/internal/orca/cluster/cluster.go new file mode 100644 index 00000000..d3c178c5 --- /dev/null +++ b/internal/orca/cluster/cluster.go @@ -0,0 +1,449 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package cluster handles peer discovery and rendezvous-hash +// coordinator selection. +// +// Peer discovery: the headless Kubernetes Service backing the Orca +// Deployment publishes Pod IPs in its A-record. We poll DNS at +// cluster.membership_refresh interval (default 5s) and snapshot the +// peer set. +// +// Coordinator selection: rendezvous hashing on (peer_ip, ChunkKey) +// picks one coordinator per chunk across the cluster. See +// design.md s8.3. +// +// Internal RPC: each replica runs an HTTP/2 client to dial peers' +// internal listeners (mTLS in production, plain in dev). The +// listener side is in the server/internal handler. +// +// # Test seams +// +// Production constructs a DNS-backed PeerSource implicitly from +// cfg.Cluster.Service + net.DefaultResolver. Tests can substitute the +// entire mechanism with WithPeerSource (typically a mutable +// StaticPeerSource per replica) or just swap the underlying DNS +// resolver with WithResolver. +package cluster + +import ( + "context" + "crypto/sha256" + "encoding/binary" + "fmt" + "io" + "net" + "net/http" + "net/url" + "strconv" + "sync" + "sync/atomic" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/config" +) + +// Peer represents one replica in the current peer-set snapshot. +// +// In production every Peer has Port == 0 because pod IPs are +// addressed on the same internal-listener port across the +// Deployment. Integration tests with multiple replicas sharing +// 127.0.0.1 set Port to the per-replica OS-assigned port; in that +// mode FillFromPeer dials peer.IP:peer.Port instead of falling back +// to cfg.Cluster.InternalListen's port. +type Peer struct { + IP string + Port int // 0 = use cfg.Cluster.InternalListen's port (production) + Self bool // true when this Peer entry represents the local replica +} + +// Cluster manages peer discovery, rendezvous hashing, and the +// internal-RPC client. 
+type Cluster struct { + cfg config.Cluster + + peers atomic.Pointer[[]Peer] + + httpClient *http.Client + source PeerSource + + cancelFn context.CancelFunc + done chan struct{} +} + +// Resolver looks up the host names that back the headless Service. +// Production uses net.DefaultResolver; tests can swap it with +// WithResolver to substitute only the DNS layer while keeping the +// rest of the DNS-based PeerSource behavior. +type Resolver interface { + LookupHost(ctx context.Context, host string) ([]string, error) +} + +// PeerSource produces the current peer-set snapshot. The DNS-backed +// implementation queries the headless Service's A-record. Tests +// substitute a StaticPeerSource that returns a mutable list of peers +// with explicit Port values (so multiple replicas can share an IP). +// +// Each returned Peer.Self must be authoritatively set by the source +// (the source knows the calling replica's identity at construction +// time, so it is the only place that can stamp Self correctly when +// peers share an IP). +type PeerSource interface { + Peers(ctx context.Context) ([]Peer, error) +} + +// Option configures a Cluster at construction time. +type Option func(*Cluster) + +// WithPeerSource replaces the entire peer-discovery mechanism. This +// is the primary test seam; production code constructs the default +// DNS-backed source implicitly from cfg.Cluster.Service. +func WithPeerSource(s PeerSource) Option { + return func(c *Cluster) { c.source = s } +} + +// WithResolver replaces only the DNS resolver inside the default +// DNS-backed PeerSource. Has no effect when WithPeerSource is also +// provided. Useful if production wants a custom resolver (e.g. a +// proxy resolver) without otherwise changing discovery semantics. +func WithResolver(r Resolver) Option { + return func(c *Cluster) { + c.source = newDNSPeerSource(c.cfg.Service, c.cfg.SelfPodIP, r) + } +} + +// NewDNSPeerSource is the production peer source: it polls the +// headless Service via the given resolver. If resolver is nil, it +// uses net.DefaultResolver. Returned peers have Port=0; FillFromPeer +// falls back to cfg.Cluster.InternalListen's port when dialing. +func NewDNSPeerSource(service, selfIP string, resolver Resolver) PeerSource { + return newDNSPeerSource(service, selfIP, resolver) +} + +func newDNSPeerSource(service, selfIP string, resolver Resolver) PeerSource { + if resolver == nil { + resolver = net.DefaultResolver + } + + return &dnsPeerSource{ + service: service, + selfIP: selfIP, + resolver: resolver, + } +} + +type dnsPeerSource struct { + service string + selfIP string + resolver Resolver +} + +func (s *dnsPeerSource) Peers(ctx context.Context) ([]Peer, error) { + rctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + + ips, err := s.resolver.LookupHost(rctx, s.service) + if err != nil { + return nil, err + } + + peers := make([]Peer, 0, len(ips)) + for _, ip := range ips { + peers = append(peers, Peer{IP: ip, Self: ip == s.selfIP}) + } + + return peers, nil +} + +// New returns a Cluster and starts the membership-refresh goroutine. 
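// For example, a test can pin the peer set instead of relying on DNS
// (sketch; peerSourceFunc is a hypothetical adapter for the PeerSource
// interface, not something this package defines):
//
//	src := peerSourceFunc(func(ctx context.Context) ([]cluster.Peer, error) {
//		return []cluster.Peer{
//			{IP: "127.0.0.1", Port: 18444, Self: true},
//			{IP: "127.0.0.1", Port: 18445},
//		}, nil
//	})
//	cl, err := cluster.New(ctx, cfg, cluster.WithPeerSource(src))
//	if err != nil {
//		t.Fatal(err)
//	}
//	defer cl.Close()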
+func New(parent context.Context, cfg config.Cluster, opts ...Option) (*Cluster, error) { + if cfg.Service == "" { + return nil, fmt.Errorf("cluster: service required (headless Service FQDN)") + } + + if cfg.SelfPodIP == "" { + return nil, fmt.Errorf("cluster: self_pod_ip required (set POD_IP env)") + } + + ctx, cancel := context.WithCancel(parent) + c := &Cluster{ + cfg: cfg, + httpClient: newHTTPClient(cfg), + source: newDNSPeerSource(cfg.Service, cfg.SelfPodIP, nil), + cancelFn: cancel, + done: make(chan struct{}), + } + + for _, opt := range opts { + opt(c) + } + // Initial refresh; failure is non-fatal (empty peer-set fallback). + c.refresh(ctx) + + go c.refreshLoop(ctx) + + return c, nil +} + +// Close stops the refresh goroutine and waits for it to exit. +func (c *Cluster) Close() { + c.cancelFn() + <-c.done +} + +// Peers returns the current peer-set snapshot. +func (c *Cluster) Peers() []Peer { + p := c.peers.Load() + if p == nil { + return []Peer{{IP: c.cfg.SelfPodIP, Self: true}} + } + + return *p +} + +// Self returns the Peer for this replica. +func (c *Cluster) Self() Peer { + return Peer{IP: c.cfg.SelfPodIP, Self: true} +} + +// Coordinator selects the rendezvous-hashed coordinator for a chunk. +// +// Returns the Peer with the highest hash(peer || chunk_path) score. +// On empty peer set returns Self (last-replica-standing fallback). +func (c *Cluster) Coordinator(k chunk.Key) Peer { + peers := c.Peers() + if len(peers) == 0 { + return c.Self() + } + + path := []byte(k.Path()) + + var ( + best Peer + bestScore uint64 + ) + + for i, p := range peers { + score := rendezvousScore(p, path) + if i == 0 || score > bestScore { + bestScore = score + best = p + } + } + + return best +} + +// IsCoordinator reports whether this replica is the coordinator for k. +func (c *Cluster) IsCoordinator(k chunk.Key) bool { + coord := c.Coordinator(k) + if coord.Self { + return true + } + // In production peers are addressed by IP only and Self is set + // from cfg.SelfPodIP, so the IP comparison below is the same as + // the Self check above. Tests with shared IPs rely on the Self + // flag being set authoritatively by the PeerSource. + return coord.IP == c.cfg.SelfPodIP && coord.Port == 0 +} + +// FillFromPeer issues GET /internal/fill against the named peer and +// returns the streaming chunk body. Caller closes the returned reader. 
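// A dialed request looks roughly like (sketch; the peer address, port, and
// key values are illustrative):
//
//	GET http://10.0.0.7:8444/internal/fill?bucket=b&chunk_size=8388608&etag=e&index=3&key=obj&origin_id=o1
//	X-Orca-Internal: 1
//
// A 409 from the peer surfaces as ErrPeerNotCoordinator; any other non-2xx
// status is returned as a plain error.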
+func (c *Cluster) FillFromPeer(ctx context.Context, p Peer, k chunk.Key) (io.ReadCloser, error) { + if p.Self { + return nil, fmt.Errorf("cluster: refusing to FillFromPeer for self") + } + + scheme := "http" + if c.cfg.InternalTLS.Enabled { + scheme = "https" + } + + port := strconv.Itoa(p.Port) + if p.Port == 0 { + _, defaultPort, err := net.SplitHostPort(c.cfg.InternalListen) + if err != nil { + defaultPort = "8444" + } + + port = defaultPort + } + + target := url.URL{ + Scheme: scheme, + Host: net.JoinHostPort(p.IP, port), + Path: "/internal/fill", + RawQuery: encodeChunkKey(k), + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil) + if err != nil { + return nil, fmt.Errorf("cluster: build internal-fill request: %w", err) + } + + req.Header.Set("X-Orca-Internal", "1") + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("cluster: internal-fill RPC: %w", err) + } + + if resp.StatusCode == http.StatusConflict { + _ = resp.Body.Close() //nolint:errcheck // best-effort close on error path + return nil, ErrPeerNotCoordinator + } + + if resp.StatusCode/100 != 2 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) //nolint:errcheck // best-effort error body read + _ = resp.Body.Close() //nolint:errcheck // best-effort close on error path + + return nil, fmt.Errorf("cluster: internal-fill RPC returned %d: %s", + resp.StatusCode, string(body)) + } + + return resp.Body, nil +} + +// ErrPeerNotCoordinator is returned by FillFromPeer when the peer +// reports it is not the coordinator (membership disagreement). +var ErrPeerNotCoordinator = fmt.Errorf("cluster: peer is not the coordinator (409 Conflict)") + +func (c *Cluster) refreshLoop(ctx context.Context) { + defer close(c.done) + + t := time.NewTicker(c.cfg.MembershipRefresh) + defer t.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-t.C: + c.refresh(ctx) + } + } +} + +func (c *Cluster) refresh(ctx context.Context) { + peers, err := c.source.Peers(ctx) + if err != nil || len(peers) == 0 { + // Empty-peer-set fallback: treat self as only peer. + self := []Peer{{IP: c.cfg.SelfPodIP, Self: true}} + c.peers.Store(&self) + + return + } + // Ensure self is always in the set even if discovery hasn't + // caught up yet. + hasSelf := false + + for _, p := range peers { + if p.Self { + hasSelf = true + break + } + } + + if !hasSelf { + peers = append(peers, Peer{IP: c.cfg.SelfPodIP, Self: true}) + } + + c.peers.Store(&peers) +} + +func newHTTPClient(cfg config.Cluster) *http.Client { + tr := &http.Transport{ + MaxIdleConns: 16, + MaxIdleConnsPerHost: 4, + IdleConnTimeout: 30 * time.Second, + ForceAttemptHTTP2: true, + } + // TLS configuration deliberately omitted for prototype dev mode + // (cluster.internal_tls.enabled=false). Production will populate + // tr.TLSClientConfig from cfg.InternalTLS. + _ = cfg + + return &http.Client{ + Transport: tr, + Timeout: 60 * time.Second, + } +} + +// Score returns the rendezvous-hash score for (peer, key). Exposed so +// integration tests can craft phantom peers that deterministically +// win or lose against a real peer for a given key (used to induce +// membership disagreement scenarios). 
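+//
+// For example, a test can probe ports until a phantom peer out-scores
+// a real peer for a given key, as TestPeerNotCoordinatorFallback does
+// (sketch):
+//
+//	phantom := Peer{IP: "203.0.113.1"} // unreachable TEST-NET-3 address
+//	for port := 1; port < 65536; port++ {
+//		phantom.Port = port
+//		if Score(phantom, key) > Score(real, key) {
+//			break
+//		}
+//	}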
+func Score(p Peer, key []byte) uint64 { + return rendezvousScore(p, key) +} + +func rendezvousScore(p Peer, key []byte) uint64 { + h := sha256.New() + h.Write([]byte(p.IP)) + h.Write([]byte{0}) + + if p.Port != 0 { + // In production every peer has Port=0 so this branch never + // fires and the score is identical to historical behavior + // (sha256(ip || 0 || key)). Tests with multiple peers sharing + // 127.0.0.1 set distinct Ports so the score differentiates + // replicas. + var pb [4]byte + binary.BigEndian.PutUint32(pb[:], uint32(p.Port)) + h.Write(pb[:]) + h.Write([]byte{0}) + } + + h.Write(key) + sum := h.Sum(nil) + + return binary.BigEndian.Uint64(sum[:8]) +} + +func encodeChunkKey(k chunk.Key) string { + v := url.Values{} + v.Set("origin_id", k.OriginID) + v.Set("bucket", k.Bucket) + v.Set("key", k.ObjectKey) + v.Set("etag", k.ETag) + v.Set("chunk_size", strconv.FormatInt(k.ChunkSize, 10)) + v.Set("index", strconv.FormatInt(k.Index, 10)) + + return v.Encode() +} + +// DecodeChunkKey parses query params into a Key. Used by the internal +// listener (server/internal/fill). +func DecodeChunkKey(values url.Values) (chunk.Key, error) { + chunkSize, err := strconv.ParseInt(values.Get("chunk_size"), 10, 64) + if err != nil { + return chunk.Key{}, fmt.Errorf("invalid chunk_size: %w", err) + } + + idx, err := strconv.ParseInt(values.Get("index"), 10, 64) + if err != nil { + return chunk.Key{}, fmt.Errorf("invalid index: %w", err) + } + + originID := values.Get("origin_id") + bucket := values.Get("bucket") + key := values.Get("key") + etag := values.Get("etag") + + if originID == "" || key == "" { + return chunk.Key{}, fmt.Errorf("missing required key fields") + } + + return chunk.Key{ + OriginID: originID, + Bucket: bucket, + ObjectKey: key, + ETag: etag, + ChunkSize: chunkSize, + Index: idx, + }, nil +} + +// Mu guards external mutation in tests. +var Mu sync.Mutex diff --git a/internal/orca/config/config.go b/internal/orca/config/config.go new file mode 100644 index 00000000..e524611e --- /dev/null +++ b/internal/orca/config/config.go @@ -0,0 +1,364 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package config defines Orca's YAML configuration shape and loading +// helpers. +// +// Only the subset of design.md s5 needed for the prototype (Scope A+B) +// is represented here. The schema is intentionally a subset: extending +// it later is a matter of adding fields and keeping zero-values +// backward-compatible. +package config + +import ( + "fmt" + "os" + "time" + + "gopkg.in/yaml.v3" +) + +// Config is the top-level Orca configuration. +type Config struct { + Server Server `yaml:"server"` + Origin Origin `yaml:"origin"` + Cachestore Cachestore `yaml:"cachestore"` + Cluster Cluster `yaml:"cluster"` + ChunkCatalog ChunkCatalog `yaml:"chunk_catalog"` + Metadata Metadata `yaml:"metadata"` + Chunking Chunking `yaml:"chunking"` +} + +// Server holds the client-edge listener configuration. +type Server struct { + Listen string `yaml:"listen"` + Auth ServerAuth `yaml:"auth"` +} + +// ServerAuth governs the client-edge authentication path. +// +// Production: enabled=true with mode=bearer or mode=mtls. +// Dev: enabled=false disables authentication entirely (no token +// or client cert required). This is a single security knob, not a +// dev_mode flag. 
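+//
+// A dev-mode YAML fragment (illustrative values; keys map to the yaml
+// tags on Server above and ServerAuth below):
+//
+//	server:
+//	  listen: 0.0.0.0:8443
+//	  auth:
+//	    enabled: false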
+type ServerAuth struct {
+	Enabled bool `yaml:"enabled"`
+	Mode string `yaml:"mode"`
+	BearerSecretFile string `yaml:"bearer_secret_file"`
+}
+
+// Origin describes the upstream origin (Azure Blob or AWS S3 in v1).
+type Origin struct {
+	ID string `yaml:"id"`
+	Driver string `yaml:"driver"` // "azureblob" or "awss3"
+	TargetGlobal int `yaml:"target_global"`
+	QueueTimeout time.Duration `yaml:"queue_timeout"`
+	Retry OriginRetry `yaml:"retry"`
+	Azureblob Azureblob `yaml:"azureblob"`
+	AWSS3 AWSS3 `yaml:"awss3"`
+}
+
+// OriginRetry captures the leader-side pre-header retry budget.
+type OriginRetry struct {
+	Attempts int `yaml:"attempts"`
+	BackoffInitial time.Duration `yaml:"backoff_initial"`
+	BackoffMax time.Duration `yaml:"backoff_max"`
+	MaxTotalDuration time.Duration `yaml:"max_total_duration"`
+}
+
+// Azureblob is the azureblob origin adapter configuration.
+type Azureblob struct {
+	Account string `yaml:"account"`
+	AccountKey string `yaml:"account_key"`
+	Container string `yaml:"container"`
+	EnforceBlockBlobOnly bool `yaml:"enforce_block_blob_only"`
+
+	// Endpoint, when set, overrides the default Azure Blob service URL
+	// (https://<account>.blob.core.windows.net/). Used in dev to point
+	// at Azurite (http://azurite:10000/devstoreaccount1) so the
+	// azureblob driver path can be exercised without a real Azure
+	// account.
+	Endpoint string `yaml:"endpoint"`
+}
+
+// AWSS3 is the awss3 origin adapter configuration. In dev this points
+// at LocalStack alongside the cachestore (different bucket); in
+// production it points at real AWS S3 with no Endpoint override.
+type AWSS3 struct {
+	Endpoint string `yaml:"endpoint"` // empty for real AWS S3
+	Region string `yaml:"region"`
+	Bucket string `yaml:"bucket"`
+	AccessKey string `yaml:"access_key"`
+	SecretKey string `yaml:"secret_key"`
+	UsePathStyle bool `yaml:"use_path_style"` // true for LocalStack
+}
+
+// Cachestore is the in-DC chunk store configuration.
+type Cachestore struct {
+	Driver string `yaml:"driver"` // "s3" in v1
+	S3 CachestoreS3 `yaml:"s3"`
+}
+
+// CachestoreS3 is the s3 driver configuration. In dev this points at
+// LocalStack; in production at VAST or another in-DC S3-compatible
+// store.
+type CachestoreS3 struct {
+	Endpoint string `yaml:"endpoint"`
+	Bucket string `yaml:"bucket"`
+	Region string `yaml:"region"`
+	AccessKey string `yaml:"access_key"`
+	SecretKey string `yaml:"secret_key"`
+	UsePathStyle bool `yaml:"use_path_style"` // true for LocalStack
+	RequireUnversionedBucket bool `yaml:"require_unversioned_bucket"`
+}
+
+// Cluster captures peer discovery + internal-listener configuration.
+type Cluster struct {
+	Service string `yaml:"service"` // headless Service FQDN
+	MembershipRefresh time.Duration `yaml:"membership_refresh"` // DNS poll interval
+	InternalListen string `yaml:"internal_listen"`
+	InternalTLS InternalTLS `yaml:"internal_tls"`
+	TargetReplicas int `yaml:"target_replicas"`
+	SelfPodIP string `yaml:"self_pod_ip"` // resolved from POD_IP env
+}
+
+// InternalTLS governs the internal-listener mTLS posture.
+//
+// Production: enabled=true (mTLS required).
+// Dev: enabled=false (plain HTTP/2). The binary logs WARN at startup.
+type InternalTLS struct {
+	Enabled bool `yaml:"enabled"`
+	CertFile string `yaml:"cert_file"`
+	KeyFile string `yaml:"key_file"`
+	CAFile string `yaml:"ca_file"`
+	ServerName string `yaml:"server_name"`
+}
+
+// ChunkCatalog is the in-memory chunk-presence cache configuration.
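+// A zero MaxEntries is replaced by applyDefaults with 100_000.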
+type ChunkCatalog struct { + MaxEntries int `yaml:"max_entries"` +} + +// Metadata is the object-metadata cache configuration. +type Metadata struct { + TTL time.Duration `yaml:"ttl"` + NegativeTTL time.Duration `yaml:"negative_ttl"` + MaxEntries int `yaml:"max_entries"` +} + +// Chunking governs chunk size and prefetch. +type Chunking struct { + Size int64 `yaml:"size"` // bytes per chunk; default 8 MiB +} + +// Load reads the YAML config file at path and returns a populated +// Config. Defaults are applied for fields left at zero-value. +func Load(path string) (*Config, error) { + raw, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read %s: %w", path, err) + } + + cfg := &Config{} + if err := yaml.Unmarshal(raw, cfg); err != nil { + return nil, fmt.Errorf("yaml unmarshal: %w", err) + } + + cfg.applyDefaults() + + if err := cfg.validate(); err != nil { + return nil, fmt.Errorf("config invalid: %w", err) + } + + return cfg, nil +} + +func (c *Config) applyDefaults() { + // Server. + if c.Server.Listen == "" { + c.Server.Listen = "0.0.0.0:8443" + } + // Origin. + if c.Origin.Driver == "" { + c.Origin.Driver = "azureblob" + } + + if c.Origin.TargetGlobal == 0 { + c.Origin.TargetGlobal = 192 + } + + if c.Origin.QueueTimeout == 0 { + c.Origin.QueueTimeout = 5 * time.Second + } + + if c.Origin.Retry.Attempts == 0 { + c.Origin.Retry.Attempts = 3 + } + + if c.Origin.Retry.BackoffInitial == 0 { + c.Origin.Retry.BackoffInitial = 100 * time.Millisecond + } + + if c.Origin.Retry.BackoffMax == 0 { + c.Origin.Retry.BackoffMax = 2 * time.Second + } + + if c.Origin.Retry.MaxTotalDuration == 0 { + c.Origin.Retry.MaxTotalDuration = 5 * time.Second + } + + if !c.Origin.Azureblob.EnforceBlockBlobOnly { + // design.md s9 states this is locked-true. + c.Origin.Azureblob.EnforceBlockBlobOnly = true + } + // Cachestore. + if c.Cachestore.Driver == "" { + c.Cachestore.Driver = "s3" + } + + if c.Cachestore.S3.Region == "" { + c.Cachestore.S3.Region = "us-east-1" + } + + if !c.Cachestore.S3.RequireUnversionedBucket { + c.Cachestore.S3.RequireUnversionedBucket = true + } + // Cluster. + if c.Cluster.MembershipRefresh == 0 { + c.Cluster.MembershipRefresh = 5 * time.Second + } + + if c.Cluster.InternalListen == "" { + c.Cluster.InternalListen = "0.0.0.0:8444" + } + + if c.Cluster.TargetReplicas == 0 { + c.Cluster.TargetReplicas = 3 + } + + if c.Cluster.InternalTLS.ServerName == "" { + c.Cluster.InternalTLS.ServerName = "orca..svc" + } + // Resolve self pod IP from env if not set in YAML. + if c.Cluster.SelfPodIP == "" { + c.Cluster.SelfPodIP = os.Getenv("POD_IP") + } + // Resolve credentials from env if not set in YAML. This lets the + // non-secret config live in a ConfigMap while credentials come from + // a Kubernetes Secret mounted as env vars (envFrom: secretRef). + if c.Origin.Azureblob.AccountKey == "" { + c.Origin.Azureblob.AccountKey = os.Getenv("ORCA_AZUREBLOB_ACCOUNT_KEY") + } + + if c.Origin.AWSS3.AccessKey == "" { + c.Origin.AWSS3.AccessKey = os.Getenv("ORCA_AWSS3_ACCESS_KEY") + } + + if c.Origin.AWSS3.SecretKey == "" { + c.Origin.AWSS3.SecretKey = os.Getenv("ORCA_AWSS3_SECRET_KEY") + } + + if c.Cachestore.S3.AccessKey == "" { + c.Cachestore.S3.AccessKey = os.Getenv("ORCA_CACHESTORE_S3_ACCESS_KEY") + } + + if c.Cachestore.S3.SecretKey == "" { + c.Cachestore.S3.SecretKey = os.Getenv("ORCA_CACHESTORE_S3_SECRET_KEY") + } + // awss3 region default. + if c.Origin.AWSS3.Region == "" { + c.Origin.AWSS3.Region = "us-east-1" + } + // Chunk catalog. 
+ if c.ChunkCatalog.MaxEntries == 0 { + c.ChunkCatalog.MaxEntries = 100_000 + } + // Metadata. + if c.Metadata.TTL == 0 { + c.Metadata.TTL = 5 * time.Minute + } + + if c.Metadata.NegativeTTL == 0 { + c.Metadata.NegativeTTL = 60 * time.Second + } + + if c.Metadata.MaxEntries == 0 { + c.Metadata.MaxEntries = 10_000 + } + // Chunking. + if c.Chunking.Size == 0 { + c.Chunking.Size = 8 * 1024 * 1024 + } +} + +func (c *Config) validate() error { + if c.Origin.ID == "" { + return fmt.Errorf("origin.id is required") + } + + switch c.Origin.Driver { + case "azureblob": + if c.Origin.Azureblob.Account == "" { + return fmt.Errorf("origin.azureblob.account is required") + } + + if c.Origin.Azureblob.Container == "" { + return fmt.Errorf("origin.azureblob.container is required") + } + case "awss3": + if c.Origin.AWSS3.Bucket == "" { + return fmt.Errorf("origin.awss3.bucket is required") + } + default: + return fmt.Errorf("origin.driver %q unsupported; supported: azureblob, awss3", + c.Origin.Driver) + } + + if c.Cachestore.Driver != "s3" { + return fmt.Errorf("cachestore.driver %q unsupported; only s3 in v1", c.Cachestore.Driver) + } + + if c.Cachestore.S3.Endpoint == "" { + return fmt.Errorf("cachestore.s3.endpoint is required") + } + + if c.Cachestore.S3.Bucket == "" { + return fmt.Errorf("cachestore.s3.bucket is required") + } + + if c.Cluster.Service == "" { + return fmt.Errorf("cluster.service is required (headless Service FQDN)") + } + + if c.Cluster.SelfPodIP == "" { + return fmt.Errorf("cluster.self_pod_ip is required (typically resolved from POD_IP env)") + } + + if c.Cluster.TargetReplicas < 1 { + return fmt.Errorf("cluster.target_replicas must be >= 1") + } + + if c.Origin.TargetGlobal < c.Cluster.TargetReplicas { + return fmt.Errorf( + "origin.target_global=%d must be >= cluster.target_replicas=%d", + c.Origin.TargetGlobal, c.Cluster.TargetReplicas, + ) + } + + if c.Chunking.Size < 1024*1024 { + return fmt.Errorf("chunking.size %d too small; minimum 1 MiB", c.Chunking.Size) + } + + return nil +} + +// TargetPerReplica returns the per-replica origin concurrency cap derived +// from origin.target_global and cluster.target_replicas +// (design.md s8.4). +func (c *Config) TargetPerReplica() int { + if c.Cluster.TargetReplicas <= 0 { + return c.Origin.TargetGlobal + } + + return c.Origin.TargetGlobal / c.Cluster.TargetReplicas +} diff --git a/internal/orca/config/config_test.go b/internal/orca/config/config_test.go new file mode 100644 index 00000000..28a734bf --- /dev/null +++ b/internal/orca/config/config_test.go @@ -0,0 +1,339 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +// TestApplyDefaults_EnvFallback verifies that applyDefaults populates +// credential / pod-identity fields from environment variables when +// the YAML omits them. This is the path used in production where the +// Kubernetes Secret is mounted via envFrom and the ConfigMap holds +// only the non-secret config. +// +// Each subtest sets one env var and checks that: +// - env-set, yaml-empty -> field populated from env. +// - env-unset, yaml-set -> field keeps yaml value. +// - env-set, yaml-set -> field keeps yaml value (yaml wins). +// - env-unset, yaml-empty -> field stays empty. 
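+//
+// In a deployment this corresponds to wiring along the lines of the
+// snippet below (illustrative; the Secret name is hypothetical and the
+// real manifests live under deploy/orca):
+//
+//	envFrom:
+//	  - secretRef:
+//	      name: orca-credentials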
+func TestApplyDefaults_EnvFallback(t *testing.T) { + tests := []struct { + envVar string + setVal func(c *Config, v string) + getVal func(c *Config) string + }{ + { + envVar: "POD_IP", + setVal: func(c *Config, v string) { c.Cluster.SelfPodIP = v }, + getVal: func(c *Config) string { return c.Cluster.SelfPodIP }, + }, + { + envVar: "ORCA_AZUREBLOB_ACCOUNT_KEY", + setVal: func(c *Config, v string) { c.Origin.Azureblob.AccountKey = v }, + getVal: func(c *Config) string { return c.Origin.Azureblob.AccountKey }, + }, + { + envVar: "ORCA_AWSS3_ACCESS_KEY", + setVal: func(c *Config, v string) { c.Origin.AWSS3.AccessKey = v }, + getVal: func(c *Config) string { return c.Origin.AWSS3.AccessKey }, + }, + { + envVar: "ORCA_AWSS3_SECRET_KEY", + setVal: func(c *Config, v string) { c.Origin.AWSS3.SecretKey = v }, + getVal: func(c *Config) string { return c.Origin.AWSS3.SecretKey }, + }, + { + envVar: "ORCA_CACHESTORE_S3_ACCESS_KEY", + setVal: func(c *Config, v string) { c.Cachestore.S3.AccessKey = v }, + getVal: func(c *Config) string { return c.Cachestore.S3.AccessKey }, + }, + { + envVar: "ORCA_CACHESTORE_S3_SECRET_KEY", + setVal: func(c *Config, v string) { c.Cachestore.S3.SecretKey = v }, + getVal: func(c *Config) string { return c.Cachestore.S3.SecretKey }, + }, + } + + for _, tt := range tests { + t.Run(tt.envVar, func(t *testing.T) { + t.Run("env_set/yaml_empty", func(t *testing.T) { + t.Setenv(tt.envVar, "from-env") + + c := &Config{} + c.applyDefaults() + + if got := tt.getVal(c); got != "from-env" { + t.Errorf("got %q want %q", got, "from-env") + } + }) + + t.Run("env_unset/yaml_set", func(t *testing.T) { + _ = os.Unsetenv(tt.envVar) //nolint:errcheck // best-effort + + c := &Config{} + tt.setVal(c, "from-yaml") + c.applyDefaults() + + if got := tt.getVal(c); got != "from-yaml" { + t.Errorf("got %q want %q", got, "from-yaml") + } + }) + + t.Run("env_set/yaml_set_yaml_wins", func(t *testing.T) { + t.Setenv(tt.envVar, "from-env") + + c := &Config{} + tt.setVal(c, "from-yaml") + c.applyDefaults() + + if got := tt.getVal(c); got != "from-yaml" { + t.Errorf("got %q want %q (yaml should win)", got, "from-yaml") + } + }) + + t.Run("env_unset/yaml_empty", func(t *testing.T) { + _ = os.Unsetenv(tt.envVar) //nolint:errcheck // best-effort + + c := &Config{} + c.applyDefaults() + + if got := tt.getVal(c); got != "" { + t.Errorf("got %q want empty", got) + } + }) + }) + } +} + +// TestApplyDefaults_FieldDefaults verifies that the hard-coded +// fallback values fire for every field whose zero value is replaced. 
+func TestApplyDefaults_FieldDefaults(t *testing.T) { + t.Parallel() + + c := &Config{} + c.applyDefaults() + + checks := []struct { + name string + got any + want any + }{ + {"server.listen", c.Server.Listen, "0.0.0.0:8443"}, + {"origin.driver", c.Origin.Driver, "azureblob"}, + {"origin.target_global", c.Origin.TargetGlobal, 192}, + {"origin.queue_timeout", c.Origin.QueueTimeout, 5 * time.Second}, + {"origin.retry.attempts", c.Origin.Retry.Attempts, 3}, + {"origin.retry.backoff_initial", c.Origin.Retry.BackoffInitial, 100 * time.Millisecond}, + {"origin.retry.backoff_max", c.Origin.Retry.BackoffMax, 2 * time.Second}, + {"origin.retry.max_total_duration", c.Origin.Retry.MaxTotalDuration, 5 * time.Second}, + {"origin.azureblob.enforce_block_blob_only", c.Origin.Azureblob.EnforceBlockBlobOnly, true}, + {"cachestore.driver", c.Cachestore.Driver, "s3"}, + {"cachestore.s3.region", c.Cachestore.S3.Region, "us-east-1"}, + {"cachestore.s3.require_unversioned_bucket", c.Cachestore.S3.RequireUnversionedBucket, true}, + {"cluster.membership_refresh", c.Cluster.MembershipRefresh, 5 * time.Second}, + {"cluster.internal_listen", c.Cluster.InternalListen, "0.0.0.0:8444"}, + {"cluster.target_replicas", c.Cluster.TargetReplicas, 3}, + {"cluster.internal_tls.server_name", c.Cluster.InternalTLS.ServerName, "orca..svc"}, + {"chunk_catalog.max_entries", c.ChunkCatalog.MaxEntries, 100_000}, + {"metadata.ttl", c.Metadata.TTL, 5 * time.Minute}, + {"metadata.negative_ttl", c.Metadata.NegativeTTL, 60 * time.Second}, + {"metadata.max_entries", c.Metadata.MaxEntries, 10_000}, + {"chunking.size", c.Chunking.Size, int64(8 * 1024 * 1024)}, + {"origin.awss3.region", c.Origin.AWSS3.Region, "us-east-1"}, + } + + for _, ch := range checks { + if ch.got != ch.want { + t.Errorf("%s: got %v want %v", ch.name, ch.got, ch.want) + } + } +} + +// TestApplyDefaults_PreservesExplicitValues verifies that explicit +// non-zero values are not overwritten by applyDefaults. 
+func TestApplyDefaults_PreservesExplicitValues(t *testing.T) { + t.Parallel() + + c := &Config{ + Server: Server{Listen: "1.2.3.4:9000"}, + Origin: Origin{ + Driver: "awss3", + TargetGlobal: 64, + }, + Cachestore: Cachestore{S3: CachestoreS3{Region: "eu-west-1"}}, + Cluster: Cluster{TargetReplicas: 7, MembershipRefresh: 10 * time.Second}, + ChunkCatalog: ChunkCatalog{MaxEntries: 50}, + Metadata: Metadata{TTL: time.Hour, MaxEntries: 99}, + Chunking: Chunking{Size: 16 << 20}, + } + + c.applyDefaults() + + if c.Server.Listen != "1.2.3.4:9000" { + t.Errorf("Server.Listen overwritten: %q", c.Server.Listen) + } + + if c.Origin.Driver != "awss3" { + t.Errorf("Origin.Driver overwritten: %q", c.Origin.Driver) + } + + if c.Origin.TargetGlobal != 64 { + t.Errorf("Origin.TargetGlobal overwritten: %d", c.Origin.TargetGlobal) + } + + if c.Cachestore.S3.Region != "eu-west-1" { + t.Errorf("Cachestore.S3.Region overwritten: %q", c.Cachestore.S3.Region) + } + + if c.Cluster.TargetReplicas != 7 { + t.Errorf("Cluster.TargetReplicas overwritten: %d", c.Cluster.TargetReplicas) + } + + if c.Cluster.MembershipRefresh != 10*time.Second { + t.Errorf("Cluster.MembershipRefresh overwritten: %v", c.Cluster.MembershipRefresh) + } + + if c.ChunkCatalog.MaxEntries != 50 { + t.Errorf("ChunkCatalog.MaxEntries overwritten: %d", c.ChunkCatalog.MaxEntries) + } + + if c.Metadata.TTL != time.Hour { + t.Errorf("Metadata.TTL overwritten: %v", c.Metadata.TTL) + } + + if c.Chunking.Size != 16<<20 { + t.Errorf("Chunking.Size overwritten: %d", c.Chunking.Size) + } +} + +// TestLoad_Validate covers the validate() error paths. +func TestLoad_Validate(t *testing.T) { + // No t.Parallel: subtests use t.Setenv to neutralize POD_IP. + tests := []struct { + name string + yaml string + wantErr string + wantOK bool + }{ + { + name: "valid awss3 config", + yaml: validAwss3YAML, + wantOK: true, + }, + { + name: "missing origin.id", + yaml: strings.ReplaceAll(validAwss3YAML, "id: test-origin", "id: \"\""), + wantErr: "origin.id is required", + }, + { + name: "unsupported driver", + yaml: strings.ReplaceAll(validAwss3YAML, "driver: awss3", "driver: ftp"), + wantErr: "origin.driver", + }, + { + name: "missing awss3 bucket", + yaml: strings.ReplaceAll(validAwss3YAML, "bucket: orca-origin", "bucket: \"\""), + wantErr: "origin.awss3.bucket is required", + }, + { + name: "missing cachestore endpoint", + yaml: strings.ReplaceAll(validAwss3YAML, "endpoint: http://localstack:4566", "endpoint: \"\""), + wantErr: "cachestore.s3.endpoint is required", + }, + { + name: "missing cluster service", + yaml: strings.ReplaceAll(validAwss3YAML, "service: orca-peers.svc", "service: \"\""), + wantErr: "cluster.service is required", + }, + { + name: "missing self_pod_ip when POD_IP unset", + yaml: strings.ReplaceAll(validAwss3YAML, "self_pod_ip: 10.0.0.1", "self_pod_ip: \"\""), + wantErr: "self_pod_ip is required", + }, + { + name: "target_replicas negative", + yaml: strings.ReplaceAll(validAwss3YAML, "target_replicas: 3", "target_replicas: -1"), + wantErr: "target_replicas", + }, + { + name: "chunking size below minimum", + yaml: strings.ReplaceAll(validAwss3YAML, "size: 8388608", "size: 4096"), + wantErr: "chunking.size", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Ensure no leakage of POD_IP from the test process env. 
+ t.Setenv("POD_IP", "") + + path := writeTempYAML(t, tt.yaml) + + _, err := Load(path) + if tt.wantOK { + if err != nil { + t.Fatalf("expected nil error, got %v", err) + } + + return + } + + if err == nil { + t.Fatalf("expected error containing %q, got nil", tt.wantErr) + } + + if !strings.Contains(err.Error(), tt.wantErr) { + t.Errorf("error %q does not contain %q", err.Error(), tt.wantErr) + } + }) + } +} + +func writeTempYAML(t *testing.T, content string) string { + t.Helper() + + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + + if err := os.WriteFile(path, []byte(content), 0o600); err != nil { + t.Fatalf("write temp yaml: %v", err) + } + + return path +} + +const validAwss3YAML = ` +server: + listen: 0.0.0.0:8443 +origin: + id: test-origin + driver: awss3 + awss3: + endpoint: http://localstack:4566 + region: us-east-1 + bucket: orca-origin + access_key: test + secret_key: test + use_path_style: true +cachestore: + driver: s3 + s3: + endpoint: http://localstack:4566 + bucket: orca-cache + region: us-east-1 + access_key: test + secret_key: test + use_path_style: true +cluster: + service: orca-peers.svc + self_pod_ip: 10.0.0.1 + target_replicas: 3 +chunking: + size: 8388608 +` diff --git a/internal/orca/fetch/fetch.go b/internal/orca/fetch/fetch.go new file mode 100644 index 00000000..8bc2cd51 --- /dev/null +++ b/internal/orca/fetch/fetch.go @@ -0,0 +1,333 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package fetch is the per-replica fill orchestrator: per-ChunkKey +// singleflight, pre-header origin retry (Option D), per-replica origin +// concurrency cap, and cross-replica fill via the cluster's internal +// RPC (s8.3). +// +// Scope A+B per the design: per-replica singleflight + cluster-wide +// dedup via rendezvous-hashed coordinator. No disk spool; joiner +// streams from the leader's in-memory ring buffer. +package fetch + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "log/slog" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/chunkcatalog" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/metadata" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Coordinator orchestrates per-replica chunk fills. +type Coordinator struct { + or origin.Origin + cs cachestore.CacheStore + cl *cluster.Cluster + cat *chunkcatalog.Catalog + mc *metadata.Cache + cfg *config.Config + + // Per-replica origin concurrency cap (s8.4 simplified). + originSem chan struct{} + + // Per-ChunkKey singleflight (s8.1). + mu sync.Mutex + inflight map[string]*fill +} + +type fill struct { + done chan struct{} + bodyBuf *bytes.Buffer // buffered chunk after fetch (in-memory, bounded by chunk size) + err error +} + +// NewCoordinator wires up the fetch coordinator. +func NewCoordinator( + or origin.Origin, + cs cachestore.CacheStore, + cl *cluster.Cluster, + cat *chunkcatalog.Catalog, + mc *metadata.Cache, + cfg *config.Config, +) *Coordinator { + tpr := cfg.TargetPerReplica() + if tpr < 1 { + tpr = 1 + } + + return &Coordinator{ + or: or, + cs: cs, + cl: cl, + cat: cat, + mc: mc, + cfg: cfg, + originSem: make(chan struct{}, tpr), + inflight: make(map[string]*fill), + } +} + +// Origin returns the underlying origin (used by the LIST passthrough). 
+func (c *Coordinator) Origin() origin.Origin { return c.or } + +// HeadObject returns object metadata, satisfying client HEAD requests. +func (c *Coordinator) HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + return c.mc.LookupOrFetch(ctx, c.cfg.Origin.ID, bucket, key, + func(ctx context.Context) (origin.ObjectInfo, error) { + return c.or.Head(ctx, bucket, key) + }) +} + +// GetChunk returns a reader over the chunk's bytes, fulfilling either +// from CacheStore (hit) or by orchestrating a cluster-wide +// dedup'd fill (miss). +// +// On miss: +// - If self is the coordinator: run local fill (origin GET via retry, +// atomic commit to CacheStore, populate buffer for joiners). +// - If a peer is the coordinator: send /internal/fill to that peer; +// stream from peer's response. On 409 Conflict, fall back to local +// fill. +func (c *Coordinator) GetChunk(ctx context.Context, k chunk.Key) (io.ReadCloser, error) { + // Hot path: catalog hit -> direct CacheStore read. + _, ok, err := c.cat.Lookup(k) + if err != nil { + return nil, fmt.Errorf("chunkcatalog lookup: %w", err) + } + + if ok { + rc, err := c.cs.GetChunk(ctx, k, 0, k.ChunkSize) + if err == nil { + return rc, nil + } + + if errors.Is(err, cachestore.ErrNotFound) { + c.cat.Forget(k) + // fall through to miss path + } else { + return nil, err + } + } + + // Stat to confirm presence. + if info, err := c.cs.Stat(ctx, k); err == nil { + if recErr := c.cat.Record(k, info); recErr != nil { + return nil, fmt.Errorf("chunkcatalog record: %w", recErr) + } + + return c.cs.GetChunk(ctx, k, 0, info.Size) + } else if !errors.Is(err, cachestore.ErrNotFound) { + return nil, err + } + + // Cluster-wide dedup: route to coordinator. + coord := c.cl.Coordinator(k) + if !coord.Self { + rc, err := c.cl.FillFromPeer(ctx, coord, k) + if err == nil { + return rc, nil + } + + if errors.Is(err, cluster.ErrPeerNotCoordinator) { + slog.Default().Warn("peer reported not-coordinator; falling back to local fill", + "chunk", k.String(), "peer", coord.IP) + // fall through to local fill + } else { + slog.Default().Warn("internal-fill RPC failed; falling back to local fill", + "chunk", k.String(), "peer", coord.IP, "err", err) + } + } + + return c.fillLocal(ctx, k) +} + +// FillForPeer is the path taken by the /internal/fill handler. +// +// The receiver becomes the leader for this fill (or joins an in-flight +// fill for the same key). Returns a streaming body of the entire chunk. +func (c *Coordinator) FillForPeer(ctx context.Context, k chunk.Key) (io.ReadCloser, error) { + // Hot path: catalog hit -> direct read. The catalog can be stale + // (e.g. cachestore pruned out-of-band, or operator clear-cache); + // on ErrNotFound we forget and fall through to a fresh fill. + _, ok, err := c.cat.Lookup(k) + if err != nil { + return nil, fmt.Errorf("chunkcatalog lookup: %w", err) + } + + if ok { + rc, err := c.cs.GetChunk(ctx, k, 0, k.ChunkSize) + if err == nil { + return rc, nil + } + + if errors.Is(err, cachestore.ErrNotFound) { + c.cat.Forget(k) + } else { + return nil, err + } + } + + if info, err := c.cs.Stat(ctx, k); err == nil { + if recErr := c.cat.Record(k, info); recErr != nil { + return nil, fmt.Errorf("chunkcatalog record: %w", recErr) + } + + return c.cs.GetChunk(ctx, k, 0, info.Size) + } else if !errors.Is(err, cachestore.ErrNotFound) { + return nil, err + } + + return c.fillLocal(ctx, k) +} + +// fillLocal runs (or joins) the singleflight for k on this replica. 
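+// The first caller for a key becomes the leader and runs runFill;
+// later callers join by blocking on f.done and then reading the fully
+// buffered chunk, so every caller gets an independent reader over the
+// same bytes.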
+func (c *Coordinator) fillLocal(ctx context.Context, k chunk.Key) (io.ReadCloser, error) { + path := k.Path() + + c.mu.Lock() + + f, ok := c.inflight[path] + if !ok { + f = &fill{done: make(chan struct{})} + c.inflight[path] = f + c.mu.Unlock() + + go c.runFill(k, f) + } else { + c.mu.Unlock() + } + + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-f.done: + } + + if f.err != nil { + return nil, f.err + } + + return io.NopCloser(bytes.NewReader(f.bodyBuf.Bytes())), nil +} + +func (c *Coordinator) runFill(k chunk.Key, f *fill) { + // Use a fill-scoped context to outlive any single requester. + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + defer func() { + close(f.done) + c.mu.Lock() + delete(c.inflight, k.Path()) + c.mu.Unlock() + }() + + // Acquire per-replica origin slot. + queueCtx, queueCancel := context.WithTimeout(ctx, c.cfg.Origin.QueueTimeout) + defer queueCancel() + + select { + case c.originSem <- struct{}{}: + case <-queueCtx.Done(): + f.err = fmt.Errorf("origin: queue timeout (cap=%d)", cap(c.originSem)) + return + } + + defer func() { <-c.originSem }() + + // Pre-header retry loop. + off, length := k.Range() + + body, err := c.fetchWithRetry(ctx, k, off, length) + if err != nil { + f.err = err + return + } + defer body.Close() //nolint:errcheck // origin body close best-effort + + buf := &bytes.Buffer{} + if _, err := io.Copy(buf, body); err != nil { + f.err = fmt.Errorf("fill copy: %w", err) + return + } + + f.bodyBuf = buf + + // Atomic commit to CacheStore. + commitErr := c.cs.PutChunk(ctx, k, int64(buf.Len()), bytes.NewReader(buf.Bytes())) + if commitErr == nil { + if recErr := c.cat.Record(k, cachestore.Info{Size: int64(buf.Len()), Committed: time.Now()}); recErr != nil { + slog.Default().Warn("chunkcatalog record failed", + "chunk", k.String(), "err", recErr) + } + } else if errors.Is(commitErr, cachestore.ErrCommitLost) { + // Another replica won; treat existing CacheStore entry as truth. + if info, err := c.cs.Stat(ctx, k); err == nil { + if recErr := c.cat.Record(k, info); recErr != nil { + slog.Default().Warn("chunkcatalog record failed", + "chunk", k.String(), "err", recErr) + } + } + } else { + slog.Default().Warn("commit-after-serve failed", + "chunk", k.String(), "err", commitErr) + // Don't record in catalog; next request refills. + } +} + +func (c *Coordinator) fetchWithRetry(ctx context.Context, k chunk.Key, off, length int64) (io.ReadCloser, error) { + deadline := time.Now().Add(c.cfg.Origin.Retry.MaxTotalDuration) + backoff := c.cfg.Origin.Retry.BackoffInitial + + var lastErr error + + for attempt := 1; attempt <= c.cfg.Origin.Retry.Attempts; attempt++ { + if time.Now().After(deadline) { + return nil, fmt.Errorf("origin retry exhausted (duration); last err: %w", lastErr) + } + + body, err := c.or.GetRange(ctx, k.Bucket, k.ObjectKey, k.ETag, off, length) + if err == nil { + return body, nil + } + + lastErr = err + // Non-retryable: ETag changed. + var etagChanged *origin.OriginETagChangedError + if errors.As(err, &etagChanged) { + c.mc.Invalidate(c.cfg.Origin.ID, k.Bucket, k.ObjectKey) + return nil, err + } + // Non-retryable: not found. + if errors.Is(err, origin.ErrNotFound) { + return nil, err + } + // Backoff. 
+ if attempt < c.cfg.Origin.Retry.Attempts { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(backoff): + } + + backoff *= 2 + if backoff > c.cfg.Origin.Retry.BackoffMax { + backoff = c.cfg.Origin.Retry.BackoffMax + } + } + } + + return nil, fmt.Errorf("origin retry exhausted (attempts); last err: %w", lastErr) +} diff --git a/internal/orca/inttest/azure_test.go b/internal/orca/inttest/azure_test.go new file mode 100644 index 00000000..5c9ab1dd --- /dev/null +++ b/internal/orca/inttest/azure_test.go @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "net/http" + "testing" + "time" +) + +// TestAzureBlobOrigin_ColdGet verifies the azureblob origin driver +// works against Azurite end-to-end on a 3-replica cluster. The +// MediumBlob spans 2 chunks so rendezvous-hashed routing typically +// exercises both fillLocal and FillFromPeer in a single run. +func TestAzureBlobOrigin_ColdGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + ctr := pkgAzurite.NewContainer(ctx, t, "orca-origin") + blob := MediumBlob() + SeedAzure(ctx, t, pkgAzurite, ctr, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + Azurite: pkgAzurite, + OriginDriver: "azureblob", + AzureContainer: ctr, + }) + + resp := cl.Get(1).HTTP.Get(ctx, t, ctr, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes want %d", len(resp.Body), len(blob.Data)) + } +} diff --git a/internal/orca/inttest/azurite.go b/internal/orca/inttest/azurite.go new file mode 100644 index 00000000..e80134ab --- /dev/null +++ b/internal/orca/inttest/azurite.go @@ -0,0 +1,167 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "testing" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/pageblob" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" +) + +// Azurite is a running Azurite container with helper accessors for +// constructing azblob clients pointed at the well-known dev account. +type Azurite struct { + container testcontainers.Container + endpoint string // http://host:port/devstoreaccount1 +} + +// Endpoint returns the Azurite blob-service URL including the +// devstoreaccount1 path segment. +func (az *Azurite) Endpoint() string { return az.endpoint } + +// AccountName returns the well-known Azurite dev account name. +func (az *Azurite) AccountName() string { return azuriteAccountName } + +// AccountKey returns the well-known Azurite dev account key. +func (az *Azurite) AccountKey() string { return azuriteAccountKey } + +// StartAzurite launches an Azurite container and returns once the +// blob-service port is reachable. Caller terminates via Terminate or +// t.Cleanup. 
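+//
+// Typical package-level wiring, mirroring the TestMain described in
+// doc.go (sketch; error handling elided):
+//
+//	az, _ := StartAzurite(ctx)
+//	defer az.Terminate(ctx)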
+func StartAzurite(ctx context.Context) (*Azurite, error) { + req := testcontainers.ContainerRequest{ + Image: azuriteImage, + ExposedPorts: []string{azuritePort + "/tcp"}, + // `azurite-blob` listens on 0.0.0.0 by default; --skipApiVersionCheck + // keeps the SDK happy for newer client versions. + Cmd: []string{"azurite-blob", "--blobHost", "0.0.0.0", "--skipApiVersionCheck"}, + WaitingFor: wait.ForListeningPort(azuritePort + "/tcp"), + } + + c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: req, + Started: true, + }) + if err != nil { + return nil, fmt.Errorf("start azurite: %w", err) + } + + host, err := c.Host(ctx) + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("azurite host: %w", err) + } + + port, err := c.MappedPort(ctx, azuritePort+"/tcp") + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("azurite port: %w", err) + } + + endpoint := fmt.Sprintf("http://%s:%s/%s", host, port.Port(), azuriteAccountName) + + return &Azurite{ + container: c, + endpoint: endpoint, + }, nil +} + +// Terminate stops and removes the Azurite container. +func (az *Azurite) Terminate(ctx context.Context) error { + return az.container.Terminate(ctx) +} + +// NewServiceClient returns an azblob.Client authenticated with the +// well-known Azurite dev creds. +func (az *Azurite) NewServiceClient(t *testing.T) *azblob.Client { + t.Helper() + + cred, err := azblob.NewSharedKeyCredential(az.AccountName(), az.AccountKey()) + if err != nil { + t.Fatalf("azurite shared key cred: %v", err) + } + + cli, err := azblob.NewClientWithSharedKeyCredential(az.endpoint, cred, nil) + if err != nil { + t.Fatalf("azurite client: %v", err) + } + + return cli +} + +// NewContainer creates a fresh container and registers a cleanup. The +// container name is returned. +func (az *Azurite) NewContainer(ctx context.Context, t *testing.T, prefix string) string { + t.Helper() + + cli := az.NewServiceClient(t) + name := uniqueName(prefix) + + if _, err := cli.CreateContainer(ctx, name, nil); err != nil { + t.Fatalf("create container %s: %v", name, err) + } + + t.Cleanup(func() { + _, _ = cli.DeleteContainer(context.Background(), name, nil) //nolint:errcheck // best-effort cleanup + }) + + return name +} + +// UploadBlockBlob uploads bytes as a block blob to (container, name). +func (az *Azurite) UploadBlockBlob(ctx context.Context, t *testing.T, ctr, name string, data []byte) { + t.Helper() + + cli := az.NewServiceClient(t) + if _, err := cli.UploadBuffer(ctx, ctr, name, data, nil); err != nil { + t.Fatalf("upload block blob %s/%s: %v", ctr, name, err) + } +} + +// UploadPageBlob uploads bytes as a page blob (used to exercise the +// EnforceBlockBlobOnly negative path). Size must be a multiple of 512. 
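+//
+// Example (sketch; the blob name and size are illustrative):
+//
+//	az.UploadPageBlob(ctx, t, ctr, "page.bin", 4096)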
+func (az *Azurite) UploadPageBlob(ctx context.Context, t *testing.T, ctr, name string, size int64) { + t.Helper() + + cred, err := azblob.NewSharedKeyCredential(az.AccountName(), az.AccountKey()) + if err != nil { + t.Fatalf("azurite shared key cred: %v", err) + } + + containerCli, err := container.NewClientWithSharedKeyCredential( + fmt.Sprintf("%s/%s", az.endpoint, ctr), cred, nil) + if err != nil { + t.Fatalf("container client: %v", err) + } + + pbCli := containerCli.NewPageBlobClient(name) + if _, err := pbCli.Create(ctx, size, &pageblob.CreateOptions{ + HTTPHeaders: &blob.HTTPHeaders{}, + }); err != nil { + t.Fatalf("create page blob: %v", err) + } + // Page blobs created here are zero-filled; tests don't read content + // because EnforceBlockBlobOnly should reject the GET first. +} + +// uniqueName returns a short random-suffixed name suitable for +// LocalStack buckets and Azurite containers. +func uniqueName(prefix string) string { + var b [4]byte + + _, _ = rand.Read(b[:]) //nolint:errcheck // crypto/rand never fails on linux + + return fmt.Sprintf("%s-%s", prefix, hex.EncodeToString(b[:])) +} diff --git a/internal/orca/inttest/client.go b/internal/orca/inttest/client.go new file mode 100644 index 00000000..78543451 --- /dev/null +++ b/internal/orca/inttest/client.go @@ -0,0 +1,127 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "encoding/xml" + "fmt" + "io" + "net/http" + "testing" +) + +// Client is a thin HTTP wrapper that targets a single replica's edge +// listener and provides typed helpers (GET, GET-Range, HEAD, LIST) for +// test assertions. +type Client struct { + BaseURL string + HTTP *http.Client +} + +// NewClient returns a Client targeting baseURL (e.g. http://127.0.0.1:34567). +func NewClient(baseURL string) *Client { + return &Client{ + BaseURL: baseURL, + HTTP: &http.Client{}, + } +} + +// GetResponse is the result of a GET / HEAD request. +type GetResponse struct { + Status int + Header http.Header + Body []byte +} + +// Get fetches the full body of /bucket/key. +func (c *Client) Get(ctx context.Context, t *testing.T, bucket, key string) GetResponse { + t.Helper() + + return c.do(ctx, t, http.MethodGet, fmt.Sprintf("/%s/%s", bucket, key), nil) +} + +// GetRange fetches a byte range from /bucket/key. +func (c *Client) GetRange(ctx context.Context, t *testing.T, bucket, key string, start, end int64) GetResponse { + t.Helper() + + hdr := http.Header{} + hdr.Set("Range", fmt.Sprintf("bytes=%d-%d", start, end)) + + return c.do(ctx, t, http.MethodGet, fmt.Sprintf("/%s/%s", bucket, key), hdr) +} + +// Head issues a HEAD against /bucket/key. +func (c *Client) Head(ctx context.Context, t *testing.T, bucket, key string) GetResponse { + t.Helper() + + return c.do(ctx, t, http.MethodHead, fmt.Sprintf("/%s/%s", bucket, key), nil) +} + +// ListBucketResult mirrors the (subset) S3 ListObjectsV2 XML response +// shape produced by the orca edge handler. +type ListBucketResult struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + Contents []struct { + Key string `xml:"Key"` + Size int64 `xml:"Size"` + ETag string `xml:"ETag"` + } `xml:"Contents"` +} + +// List issues a LIST against /bucket/?list-type=2&prefix=. 
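+//
+// Usage sketch (the prefix is illustrative):
+//
+//	out := c.List(ctx, t, bucket, "logs/")
+//	for _, obj := range out.Contents {
+//		t.Logf("%s (%d bytes)", obj.Key, obj.Size)
+//	}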
+func (c *Client) List(ctx context.Context, t *testing.T, bucket, prefix string) ListBucketResult { + t.Helper() + + resp := c.do(ctx, t, http.MethodGet, + fmt.Sprintf("/%s/?list-type=2&prefix=%s", bucket, prefix), nil) + if resp.Status != http.StatusOK { + t.Fatalf("LIST status=%d body=%s", resp.Status, string(resp.Body)) + } + + var out ListBucketResult + if err := xml.Unmarshal(resp.Body, &out); err != nil { + t.Fatalf("LIST decode: %v body=%s", err, string(resp.Body)) + } + + return out +} + +func (c *Client) do(ctx context.Context, t *testing.T, method, path string, hdr http.Header) GetResponse { + t.Helper() + + req, err := http.NewRequestWithContext(ctx, method, c.BaseURL+path, nil) + if err != nil { + t.Fatalf("build request: %v", err) + } + + for k, vs := range hdr { + for _, v := range vs { + req.Header.Add(k, v) + } + } + + resp, err := c.HTTP.Do(req) + if err != nil { + t.Fatalf("%s %s: %v", method, path, err) + } + + defer func() { _ = resp.Body.Close() }() //nolint:errcheck // body close best-effort in tests + + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + + return GetResponse{ + Status: resp.StatusCode, + Header: resp.Header, + Body: body, + } +} diff --git a/internal/orca/inttest/doc.go b/internal/orca/inttest/doc.go new file mode 100644 index 00000000..ac83f611 --- /dev/null +++ b/internal/orca/inttest/doc.go @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +// Package inttest contains integration tests for the Orca cache. +// +// Build tag `integrationtest` gates these tests; run via: +// +// make orca-inttest +// +// Equivalent to: +// +// go test -tags=integrationtest -race -timeout 15m \ +// ./internal/orca/inttest/... +// +// # Architecture +// +// The harness brings up real LocalStack and Azurite containers via +// testcontainers-go and constructs N in-process *app.App instances +// wired to those containers. By default StartCluster runs 3 replicas, +// matching the production deploy/orca topology. +// +// Every replica binds to 127.0.0.1 with an OS-assigned distinct +// internal port; the cluster.Peer struct now carries an explicit Port +// (zero in production, set in tests) and FillFromPeer dials peer.IP + +// peer.Port. This lets multi-replica tests run on every platform +// (Linux, macOS, Windows / WSL) without loopback-alias setup. +// +// Each replica owns its own StaticPeerSource (cluster.PeerSource). +// Tests that need to induce membership disagreement mutate one +// replica's source; the cluster's refresh goroutine picks up the +// change within MembershipRefresh (250 ms in tests). +// +// # Container lifecycle +// +// TestMain starts one LocalStack and one Azurite container per +// `go test` invocation; per-test buckets/containers prevent +// cross-test interference. +// +// # File layout +// +// - e2e_test.go - the canonical end-to-end suite (3 replicas). +// Boot-self-test, cold/warm GET, ranged GET, multi-chunk GET, +// LIST, HEAD, NotFound, rendezvous coordinator routing, +// singleflight collapse, peer-not-coordinator fallback (real). +// - azure_test.go - azureblob origin driver smoke against Azurite +// (3 replicas). +// +// Driver-level branch coverage (versioning gate, blob-type +// rejection) lives as fast unit tests in the respective driver +// packages (cachestore/s3, origin/azureblob), not here. +// +// # Adding a scenario +// +// 1. Pick the right entry point: StartCluster (3-replica default). 
+// Tests that need to assert on a boot-time failure mode that +// surfaces before any chunk fetch (versioning gate, blob-type +// rejection, etc.) should live as unit tests in the respective +// driver package. +// 2. Seed the origin: SeedS3 or SeedAzure. +// 3. Issue requests via cl.Get(i).HTTP.Get / GetRange / Head / List. +// 4. Assert byte-exact body, status code, and (where relevant) origin +// RPC counts via the optional CountingOrigin or peer 409 counts via +// CountingInternalHandlerWrap. +// +// # TODO (genuinely future work) +// +// - TestEtagChange (mid-fill mutation): requires a deterministic +// test seam in fetch.Coordinator (e.g. a hook that pauses between +// chunk fetches) so the test can rewrite the origin object +// between chunk 0 and chunk 1 of the same fill. +// - Fault-injection origin / cachestore decorators: useful for +// timeout, throttle, and 5xx retry-budget assertions. +package inttest diff --git a/internal/orca/inttest/e2e_test.go b/internal/orca/inttest/e2e_test.go new file mode 100644 index 00000000..c384fc61 --- /dev/null +++ b/internal/orca/inttest/e2e_test.go @@ -0,0 +1,496 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "net/http" + "strconv" + "sync" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/cluster" +) + +// e2e_test.go is the canonical end-to-end suite for orca: every +// scenario runs against a 3-replica in-process cluster pointed at +// LocalStack. Tests that exercise chunk fetching naturally exercise +// both the local-fill path (when self happens to win rendezvous for +// a chunk) and the cross-replica /internal/fill path (when a peer +// wins). +// +// Driver-level branch coverage (versioning gate, blob-type rejection, +// HTTP error mapping, range parsing, chunk arithmetic, config env +// fallback) lives as fast unit tests in the respective driver / server +// / chunk / config packages. The scenarios here are reserved for +// behavior that can only be verified end-to-end against real +// LocalStack (or Azurite, in azure_test.go) plus a real cluster of +// in-process orca instances. + +// TestColdAndWarmGet exercises GET twice for the same single-chunk +// blob: cold (origin fetch + cache commit) and warm (cachestore hit). +// The warm phase deletes the origin object first to prove the cache +// hit really happened. 
+func TestColdAndWarmGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 60*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + cold := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if cold.Status != http.StatusOK { + t.Fatalf("cold status=%d body=%s", cold.Status, string(cold.Body)) + } + + if !bytes.Equal(cold.Body, blob.Data) { + t.Fatalf("cold body mismatch: got %d bytes, want %d", len(cold.Body), len(blob.Data)) + } + + if cold.Header.Get("ETag") == "" { + t.Errorf("expected ETag header on cold GET") + } + + DeleteS3Object(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, blob.Key) + + warm := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if warm.Status != http.StatusOK { + t.Fatalf("warm status=%d body=%s", warm.Status, string(warm.Body)) + } + + if !bytes.Equal(warm.Body, blob.Data) { + t.Fatalf("warm body mismatch: got %d bytes, want %d", len(warm.Body), len(blob.Data)) + } +} + +// TestRangedGet verifies byte-range requests return 206 + +// Content-Range + the requested slice. Covers within-chunk, +// cross-chunk, and (against a 64-chunk blob) various boundary edge +// cases. The chunk-arithmetic branches are unit-tested separately in +// internal/orca/chunk; this verifies the end-to-end HTTP Range +// round-trip with real chunk bodies. +func TestRangedGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + medium := MediumBlob() // 1.5 MiB == 2 chunks at 1 MiB + huge := HugeBlob() // 64 MiB == 64 chunks at 1 MiB + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{medium, huge}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + resp := cl.Get(1).HTTP.GetRange(ctx, t, bucket, medium.Key, 100, 199) + if resp.Status != http.StatusPartialContent { + t.Fatalf("status=%d (want 206)", resp.Status) + } + + if cr := resp.Header.Get("Content-Range"); cr == "" { + t.Errorf("expected Content-Range header") + } + + want := medium.Data[100:200] + if !bytes.Equal(resp.Body, want) { + t.Fatalf("range body mismatch: got %d bytes, want %d", len(resp.Body), len(want)) + } + + chunkSize := int64(1024 * 1024) + resp2 := cl.Get(1).HTTP.GetRange(ctx, t, bucket, medium.Key, chunkSize-50, chunkSize+49) + + if resp2.Status != http.StatusPartialContent { + t.Fatalf("cross-chunk status=%d (want 206)", resp2.Status) + } + + want2 := medium.Data[chunkSize-50 : chunkSize+50] + if !bytes.Equal(resp2.Body, want2) { + t.Fatalf("cross-chunk range mismatch: got %d bytes, want %d", len(resp2.Body), len(want2)) + } + + t.Run("huge blob boundary cases", func(t *testing.T) { + const chunk = int64(1024 * 1024) + + cases := []struct { + name string + start, end int64 + }{ + {"starts exactly at chunk boundary 32", 32 * chunk, 32*chunk + 100}, + {"ends exactly at chunk boundary 47", 48*chunk - 100, 48*chunk - 1}, + {"covers chunks 10-12 (3 contiguous full chunks)", 10 * chunk, 13*chunk - 1}, + {"straddles 5 consecutive boundaries (chunks 20-25)", 20*chunk + 100, 25*chunk + 200}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + rr := cl.Get(1).HTTP.GetRange(ctx, t, bucket, huge.Key, tc.start, tc.end) + if rr.Status 
!= http.StatusPartialContent { + t.Fatalf("status=%d (want 206)", rr.Status) + } + + expected := huge.Data[tc.start : tc.end+1] + if !bytes.Equal(rr.Body, expected) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(rr.Body), len(expected)) + } + }) + } + }) +} + +// TestMultiChunkGet verifies a full GET of a 64-chunk blob assembles +// correctly across chunk boundaries. With 3 replicas and 64 chunks, +// rendezvous-hashed coordinator selection statistically guarantees +// every replica is the coordinator for many chunks, so this test +// exercises both fillLocal and FillFromPeer paths thoroughly in a +// single run. +func TestMultiChunkGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := HugeBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + resp := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } +} + +// TestRendezvousCoordinatorRouting verifies that a GET against a +// non-coordinator replica routes through /internal/fill to the +// coordinator and still returns the body. The CountingOrigin +// decorator confirms exactly one origin GetRange happened across the +// cluster (the coordinator's). +func TestRendezvousCoordinatorRouting(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + count := newCountingOriginForLocalStack(ctx, t, bucket) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + OriginOverride: count, + }) + + headResp := cl.Get(1).HTTP.Head(ctx, t, bucket, blob.Key) + + etag := stripQuotes(headResp.Header.Get("ETag")) + if etag == "" { + t.Fatalf("HEAD returned empty ETag: %+v", headResp.Header) + } + + k := chunk.Key{ + OriginID: "inttest-origin", + Bucket: bucket, + ObjectKey: blob.Key, + ETag: etag, + ChunkSize: int64(1024 * 1024), + Index: 0, + } + coord := cl.Get(1).App.Cluster.Coordinator(k) + + var nonCoord *Replica + + for _, r := range cl.Replicas { + if r.SelfIP != coord.IP || r.InternalPort != coord.Port { + nonCoord = r + break + } + } + + if nonCoord == nil { + t.Fatalf("could not find a non-coordinator replica; coord=%+v peers=%+v", + coord, cl.Get(1).App.Cluster.Peers()) + } + + count.Reset() + + resp := nonCoord.HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } + // Exactly one HEAD (HeadObject metadata cache) plus one GetRange + // (single chunk fetch). Cluster-wide dedup must not produce more. 
+ if got := count.GetRanges(); got != 1 { + t.Errorf("origin GetRange count=%d (want 1)", got) + } +} + +// TestSingleflightCollapse fires N concurrent GETs (one per replica) +// for the same key and asserts the origin saw exactly one GetRange +// per chunk (cluster-wide singleflight collapse). +func TestSingleflightCollapse(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := HugeBlob() // 64 chunks + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + count := newCountingOriginForLocalStack(ctx, t, bucket) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + OriginOverride: count, + }) + + count.Reset() + + var wg sync.WaitGroup + + wg.Add(cl.Len()) + + results := make([][]byte, cl.Len()) + statuses := make([]int, cl.Len()) + + for i := 1; i <= cl.Len(); i++ { + go func(i int) { + defer wg.Done() + + r := cl.Get(i).HTTP.Get(ctx, t, bucket, blob.Key) + results[i-1] = r.Body + statuses[i-1] = r.Status + }(i) + } + + wg.Wait() + + for i, s := range statuses { + if s != http.StatusOK { + t.Fatalf("replica %d status=%d", i+1, s) + } + + if !bytes.Equal(results[i], blob.Data) { + t.Fatalf("replica %d body mismatch: got %d bytes want %d", i+1, len(results[i]), len(blob.Data)) + } + } + // HugeBlob spans 64 chunks; cluster-wide singleflight should + // dedupe each chunk to exactly one origin GetRange. Allow up to + // 76 (~20% slack) to absorb timing-dependent races where a + // joiner arrives during in-flight commit. + if got := count.GetRanges(); got > 76 { + t.Errorf("origin GetRange count=%d (want <= 76 for 64-chunk blob)", got) + } + + if got := count.GetRanges(); got < 64 { + t.Errorf("origin GetRange count=%d (want >= 64 for 64-chunk cold fill)", got) + } +} + +// TestPeerNotCoordinatorFallback induces real membership disagreement +// and asserts the coordinator's /internal/fill returns 409 and the +// requesting replica's local-fill fallback succeeds. +// +// Setup: +// +// - 3-replica cluster with shared CountingInternalHandlerWrap so we +// can read 409 counts per receiving replica. +// - HEAD the seeded blob to learn ETag; compute Coordinator(k) for +// chunk 0 from replica 1's view (call it C). +// - Craft a phantom peer P (an unreachable IP/Port pair) whose +// rendezvous score for k is higher than C's. Mutate C's peer +// source to include P plus C itself; now C.IsCoordinator(k) +// returns false because P wins. +// - Find another replica R whose view still says C is the +// coordinator. GET via R. +// +// Expected: +// +// - R issues /internal/fill to C. +// - C responds 409 (its IsCoordinator returns false because P wins). +// - R falls through to fillLocal, fetches the origin, serves the +// body. +// - counter.Count(C, 409) >= 1. 
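+//
+// The phantom trick relies only on coordinator selection being
+// "highest rendezvous score wins". A minimal sketch of that selection
+// (assuming cluster.Score is the same scorer Coordinator uses
+// internally; the production selection may differ in detail):
+//
+//	pathBytes := []byte(k.Path())
+//	coord := peers[0]
+//	for _, p := range peers[1:] {
+//		if cluster.Score(p, pathBytes) > cluster.Score(coord, pathBytes) {
+//			coord = p
+//		}
+//	}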
+func TestPeerNotCoordinatorFallback(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + wrap := NewCountingInternalHandlerWrap() + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + InternalHandlerWrap: wrap, + }) + + headResp := cl.Get(1).HTTP.Head(ctx, t, bucket, blob.Key) + + etag := stripQuotes(headResp.Header.Get("ETag")) + if etag == "" { + t.Fatalf("HEAD returned empty ETag: %+v", headResp.Header) + } + + k := chunk.Key{ + OriginID: "inttest-origin", + Bucket: bucket, + ObjectKey: blob.Key, + ETag: etag, + ChunkSize: int64(1024 * 1024), + Index: 0, + } + coord := cl.Get(1).App.Cluster.Coordinator(k) + + coordReplica := cl.FindBySelfIPPort(coord.IP, coord.Port) + if coordReplica == nil { + t.Fatalf("coord %+v not found among replicas", coord) + } + + // Craft a phantom peer whose rendezvous score beats coord's for k. + // The phantom's IP/Port don't need to be reachable; it's never + // dialed, only used to skew rendezvous on coord's view. + pathBytes := []byte(k.Path()) + coordScore := cluster.Score(coord, pathBytes) + phantom := cluster.Peer{IP: "203.0.113.1"} // TEST-NET-3, unreachable + + for port := 1; port < 65536; port++ { + phantom.Port = port + if cluster.Score(phantom, pathBytes) > coordScore { + break + } + } + + if cluster.Score(phantom, pathBytes) <= coordScore { + t.Fatalf("could not find a phantom peer beating coord rendezvous score") + } + + // Build coord's new peer-set: original real peers plus the + // phantom. The StaticPeerSource will stamp Self=true only on the + // peer matching coord's (selfIP, selfPort), so coord still + // recognizes itself; but the phantom wins rendezvous, so + // coord.IsCoordinator(k) flips to false. + newPeers := make([]cluster.Peer, 0, cl.Len()+1) + for _, r := range cl.Replicas { + newPeers = append(newPeers, cluster.Peer{IP: r.SelfIP, Port: r.InternalPort}) + } + + newPeers = append(newPeers, phantom) + coordReplica.PeerSource.SetPeers(newPeers) + + if err := waitForCondition(ctx, 2*time.Second, func() bool { + return !coordReplica.App.Cluster.IsCoordinator(k) + }); err != nil { + t.Fatalf("coord did not relinquish coordinator status: %v", err) + } + // Find a replica R whose view still says coord is the coordinator. 
+ var requester *Replica + + for _, r := range cl.Replicas { + if r == coordReplica { + continue + } + + rc := r.App.Cluster.Coordinator(k) + if rc.IP == coord.IP && rc.Port == coord.Port { + requester = r + break + } + } + + if requester == nil { + t.Fatalf("no non-coord replica still views coord %+v as coordinator", coord) + } + + resp := requester.HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } + + coordKey := coord.IP + ":" + strconv.Itoa(coord.Port) + if got := wrap.Count(coordKey, http.StatusConflict); got < 1 { + t.Fatalf("expected at least one 409 from coord %s; got %d", + coordKey, got) + } +} + +func newCountingOriginForLocalStack(ctx context.Context, t *testing.T, bucket string) *CountingOrigin { + t.Helper() + + or, err := localStackOrigin(ctx, t, bucket) + if err != nil { + t.Fatalf("localStackOrigin: %v", err) + } + + return NewCountingOrigin(or) +} + +func stripQuotes(s string) string { + if len(s) >= 2 && s[0] == '"' && s[len(s)-1] == '"' { + return s[1 : len(s)-1] + } + + return s +} + +func waitForCondition(ctx context.Context, dl time.Duration, cond func() bool) error { + deadline := time.Now().Add(dl) + for time.Now().Before(deadline) { + if cond() { + return nil + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(25 * time.Millisecond): + } + } + + if cond() { + return nil + } + + return context.DeadlineExceeded +} diff --git a/internal/orca/inttest/harness.go b/internal/orca/inttest/harness.go new file mode 100644 index 00000000..ee4fd291 --- /dev/null +++ b/internal/orca/inttest/harness.go @@ -0,0 +1,366 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "io" + "log/slog" + "net" + "strconv" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/app" + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// ClusterOptions controls Harness.StartCluster. +type ClusterOptions struct { + // Replicas is the number of in-process orca instances. Defaults + // to 3 when zero, matching the production deploy/orca topology. + Replicas int + + // ChunkSize is the per-chunk byte count. The orca config validator + // enforces a 1 MiB minimum; tests typically use 1 MiB to keep test + // blob sizes manageable while still spanning multiple chunks. + ChunkSize int64 + + // OriginID is the logical origin identifier (echoed in chunk paths). + OriginID string + + // OriginBucket is the bucket on the origin LocalStack/Azurite. + OriginBucket string + + // OriginDriver is "awss3" (default) or "azureblob". + OriginDriver string + + // LocalStack is the LocalStack handle used for origin (when + // OriginDriver=="awss3") and always for cachestore. + LocalStack *LocalStack + + // Azurite is required when OriginDriver=="azureblob". + Azurite *Azurite + + // AzureContainer is the Azurite container name for the origin. + AzureContainer string + + // CachestoreBucket is the bucket on LocalStack used as the orca + // cachestore. If empty, a fresh bucket is allocated. + CachestoreBucket string + + // OriginOverride, when set, replaces the constructed origin driver. 
+ // Used to wire CountingOrigin around the real client. + OriginOverride origin.Origin + + // CacheStoreOverride, when set, replaces the constructed cachestore + // driver. + CacheStoreOverride cachestore.CacheStore + + // InternalHandlerWrap, when set, is registered with each replica's + // app.WithInternalHandlerWrap. Tests use this to install a 409 + // counter (CountingInternalHandlerWrap.WrapFor). + InternalHandlerWrap *CountingInternalHandlerWrap +} + +// Replica represents one running *app.App in the harness. +type Replica struct { + App *app.App + SelfIP string + InternalPort int + PeerSource *StaticPeerSource + HTTP *Client // pre-built client targeting this replica's edge +} + +// Cluster is a collection of Replicas plus the harness-owned context. +type Cluster struct { + Replicas []*Replica +} + +// Get returns replica i (1-indexed). +func (c *Cluster) Get(i int) *Replica { return c.Replicas[i-1] } + +// Len returns the replica count. +func (c *Cluster) Len() int { return len(c.Replicas) } + +// FindBySelfIPPort returns the replica whose (SelfIP, InternalPort) +// matches the given peer; nil if none. +func (c *Cluster) FindBySelfIPPort(ip string, port int) *Replica { + for _, r := range c.Replicas { + if r.SelfIP == ip && r.InternalPort == port { + return r + } + } + + return nil +} + +// StartCluster brings up `opts.Replicas` orca instances (default 3) +// pointed at the origin/cachestore described in opts. Every replica +// binds to 127.0.0.1 with an OS-assigned distinct internal port; one +// StaticPeerSource per replica is initialized with the full peer set +// (with explicit ports). Tests can mutate any replica's PeerSource +// independently. +// +// Cleanup (Shutdown of each app) is registered with t.Cleanup. +func StartCluster(ctx context.Context, t *testing.T, opts ClusterOptions) *Cluster { + t.Helper() + + if opts.Replicas == 0 { + opts.Replicas = 3 + } + + if opts.Replicas < 1 { + t.Fatalf("StartCluster: Replicas must be >= 1, got %d", opts.Replicas) + } + + if opts.ChunkSize == 0 { + opts.ChunkSize = 1024 * 1024 + } + + if opts.OriginDriver == "" { + opts.OriginDriver = "awss3" + } + + if opts.OriginID == "" { + opts.OriginID = "inttest-origin" + } + + if opts.LocalStack == nil { + t.Fatal("StartCluster: LocalStack handle required") + } + + if opts.OriginDriver == "azureblob" { + if opts.Azurite == nil { + t.Fatal("StartCluster: Azurite handle required for azureblob driver") + } + + if opts.AzureContainer == "" { + t.Fatal("StartCluster: AzureContainer required for azureblob driver") + } + } + + if opts.OriginBucket == "" && opts.OriginDriver == "awss3" { + t.Fatal("StartCluster: OriginBucket required for awss3 driver") + } + + cacheBucket := opts.CachestoreBucket + if cacheBucket == "" { + cacheBucket = opts.LocalStack.NewBucket(ctx, t, "orca-cache") + } + + // Allocate per-replica internal listeners up front (open) so each + // replica's peer source can advertise the full set with explicit + // ports from t=0. We hand the open listeners to app.Start via + // WithInternalListener/WithEdgeListener so there is no + // close-and-rebind window for races with concurrent tests. 
+ internalListeners := make([]net.Listener, opts.Replicas) + internalPorts := make([]int, opts.Replicas) + edgeListeners := make([]net.Listener, opts.Replicas) + + for i := range internalListeners { + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + closeListeners(internalListeners) + closeListeners(edgeListeners) + t.Fatalf("alloc internal port for replica %d: %v", i+1, err) + } + + internalListeners[i] = ln + internalPorts[i] = ln.Addr().(*net.TCPAddr).Port //nolint:errcheck // *net.TCPAddr from net.Listen + + eln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + closeListeners(internalListeners) + closeListeners(edgeListeners) + t.Fatalf("alloc edge port for replica %d: %v", i+1, err) + } + + edgeListeners[i] = eln + } + + allPeers := make([]cluster.Peer, opts.Replicas) + for i := range allPeers { + allPeers[i] = cluster.Peer{ + IP: "127.0.0.1", + Port: internalPorts[i], + } + } + + cl := &Cluster{} + + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + + for i := 0; i < opts.Replicas; i++ { + selfIP := "127.0.0.1" + selfPort := internalPorts[i] + ps := NewStaticPeerSource(selfIP, selfPort, allPeers) + + cfg := buildConfig(opts, cacheBucket) + cfg.Cluster.SelfPodIP = selfIP + cfg.Cluster.InternalListen = net.JoinHostPort(selfIP, strconv.Itoa(selfPort)) + cfg.Server.Listen = edgeListeners[i].Addr().String() + + appOpts := []app.Option{ + app.WithLogger(logger), + app.WithPeerSource(ps), + app.WithEdgeListener(edgeListeners[i]), + app.WithInternalListener(internalListeners[i]), + } + + if opts.OriginOverride != nil { + appOpts = append(appOpts, app.WithOrigin(opts.OriginOverride)) + } + + if opts.CacheStoreOverride != nil { + appOpts = append(appOpts, app.WithCacheStore(opts.CacheStoreOverride)) + } + + if opts.InternalHandlerWrap != nil { + appOpts = append(appOpts, app.WithInternalHandlerWrap(opts.InternalHandlerWrap.WrapFor(selfIP+":"+strconv.Itoa(selfPort)))) + } + + a, err := app.Start(ctx, cfg, appOpts...) + if err != nil { + t.Fatalf("app.Start replica %d: %v", i+1, err) + } + + r := &Replica{ + App: a, + SelfIP: selfIP, + InternalPort: selfPort, + PeerSource: ps, + HTTP: NewClient("http://" + a.EdgeAddr), + } + cl.Replicas = append(cl.Replicas, r) + + t.Cleanup(func() { + ctxShut, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + _ = a.Shutdown(ctxShut) //nolint:errcheck // shutdown logs already emitted + }) + } + // Wait for every replica's Cluster.Peers() to converge to the + // full set. 
+ if err := waitForPeers(ctx, cl, opts.Replicas, 2*time.Second); err != nil { + t.Fatalf("waitForPeers: %v", err) + } + + return cl +} + +func buildConfig(opts ClusterOptions, cacheBucket string) *config.Config { + cfg := &config.Config{ + Server: config.Server{ + Listen: "127.0.0.1:0", + Auth: config.ServerAuth{Enabled: false}, + }, + Origin: config.Origin{ + ID: opts.OriginID, + Driver: opts.OriginDriver, + TargetGlobal: 32, + QueueTimeout: 5 * time.Second, + Retry: config.OriginRetry{ + Attempts: 2, + BackoffInitial: 10 * time.Millisecond, + BackoffMax: 50 * time.Millisecond, + MaxTotalDuration: 2 * time.Second, + }, + }, + Cachestore: config.Cachestore{ + Driver: "s3", + S3: config.CachestoreS3{ + Endpoint: opts.LocalStack.Endpoint(), + Bucket: cacheBucket, + Region: opts.LocalStack.Region(), + AccessKey: opts.LocalStack.AccessKey(), + SecretKey: opts.LocalStack.SecretKey(), + UsePathStyle: true, + RequireUnversionedBucket: true, + }, + }, + Cluster: config.Cluster{ + Service: "orca-peers.test.svc.cluster.local", + MembershipRefresh: 250 * time.Millisecond, + InternalListen: "127.0.0.1:0", // overridden per replica + InternalTLS: config.InternalTLS{Enabled: false}, + TargetReplicas: opts.Replicas, + SelfPodIP: "127.0.0.1", // overridden per replica + }, + ChunkCatalog: config.ChunkCatalog{MaxEntries: 1024}, + Metadata: config.Metadata{ + TTL: 5 * time.Minute, + NegativeTTL: 5 * time.Second, + MaxEntries: 1024, + }, + Chunking: config.Chunking{Size: opts.ChunkSize}, + } + + switch opts.OriginDriver { + case "awss3": + cfg.Origin.AWSS3 = config.AWSS3{ + Endpoint: opts.LocalStack.Endpoint(), + Region: opts.LocalStack.Region(), + Bucket: opts.OriginBucket, + AccessKey: opts.LocalStack.AccessKey(), + SecretKey: opts.LocalStack.SecretKey(), + UsePathStyle: true, + } + case "azureblob": + cfg.Origin.Azureblob = config.Azureblob{ + Account: opts.Azurite.AccountName(), + AccountKey: opts.Azurite.AccountKey(), + Container: opts.AzureContainer, + EnforceBlockBlobOnly: true, + Endpoint: opts.Azurite.Endpoint(), + } + } + + return cfg +} + +// waitForPeers polls each replica's cluster.Peers() until every +// replica has at least the expected count or the deadline elapses. +func waitForPeers(ctx context.Context, cl *Cluster, want int, dl time.Duration) error { + deadline := time.Now().Add(dl) + + for time.Now().Before(deadline) { + ok := true + + for _, r := range cl.Replicas { + if len(r.App.Cluster.Peers()) < want { + ok = false + break + } + } + + if ok { + return nil + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(50 * time.Millisecond): + } + } + + return fmt.Errorf("peer-set did not converge to %d on all %d replicas within %s", + want, len(cl.Replicas), dl) +} + +func closeListeners(lns []net.Listener) { + for _, ln := range lns { + if ln != nil { + _ = ln.Close() //nolint:errcheck // best-effort cleanup + } + } +} diff --git a/internal/orca/inttest/images.go b/internal/orca/inttest/images.go new file mode 100644 index 00000000..9eb3c729 --- /dev/null +++ b/internal/orca/inttest/images.go @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +// Pinned container image tags. Bump centrally when upgrading. +const ( + // localstackImage is the LocalStack image used for both the origin + // (awss3) and cachestore (s3) backends. 3.8 matches the version + // referenced in design.md and the dev harness's awareness of the + // CRC64NVME checksum quirk. 
+ localstackImage = "localstack/localstack:3.8" + + // azuriteImage is the Azurite (Azure Blob emulator) image. We pin + // to a specific minor for reproducibility. + azuriteImage = "mcr.microsoft.com/azure-storage/azurite:3.34.0" + + // azuritePort is the blob-service port published by Azurite. + azuritePort = "10000" + + // azuriteAccountName is the well-known Azurite dev account. + azuriteAccountName = "devstoreaccount1" + + // azuriteAccountKey is the well-known Azurite dev account key. It + // is hard-coded by the emulator; not a secret. + azuriteAccountKey = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" +) diff --git a/internal/orca/inttest/internalwrap.go b/internal/orca/inttest/internalwrap.go new file mode 100644 index 00000000..67197393 --- /dev/null +++ b/internal/orca/inttest/internalwrap.go @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "net/http" + "sync" + "sync/atomic" +) + +// CountingInternalHandlerWrap is an http.Handler decorator factory +// that counts response status codes per receiving replica IP. Used +// by TestPeerNotCoordinatorFallback to assert a peer's +// /internal/fill handler returned 409 (proving the cluster.go 409 +// fallback path actually fired on the requesting replica). +// +// One CountingInternalHandlerWrap is shared across all replicas in +// the harness; each replica's wrapped handler stamps its self IP +// onto the response writer so counts can be attributed back. +type CountingInternalHandlerWrap struct { + mu sync.Mutex + counts map[string]map[int]*atomic.Int64 // selfIP -> status -> count + defined map[string]struct{} +} + +// NewCountingInternalHandlerWrap returns an empty wrapper. +func NewCountingInternalHandlerWrap() *CountingInternalHandlerWrap { + return &CountingInternalHandlerWrap{ + counts: make(map[string]map[int]*atomic.Int64), + defined: make(map[string]struct{}), + } +} + +// WrapFor returns a wrap function suitable for app.WithInternalHandlerWrap +// that attributes status-code counts back to the named selfIP. +func (w *CountingInternalHandlerWrap) WrapFor(selfIP string) func(http.Handler) http.Handler { + w.mu.Lock() + if _, ok := w.counts[selfIP]; !ok { + w.counts[selfIP] = make(map[int]*atomic.Int64) + } + + w.defined[selfIP] = struct{}{} + w.mu.Unlock() + + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + cw := &countingResponseWriter{ResponseWriter: rw, status: http.StatusOK} + next.ServeHTTP(cw, req) + w.record(selfIP, cw.status) + }) + } +} + +// Count returns the number of responses with the given status code +// observed at the named selfIP. +func (w *CountingInternalHandlerWrap) Count(selfIP string, status int) int64 { + w.mu.Lock() + defer w.mu.Unlock() + + byStatus, ok := w.counts[selfIP] + if !ok { + return 0 + } + + c, ok := byStatus[status] + if !ok { + return 0 + } + + return c.Load() +} + +// CountAcross returns the count summed across all known selfIPs. 
+func (w *CountingInternalHandlerWrap) CountAcross(status int) int64 { + w.mu.Lock() + defer w.mu.Unlock() + + var total int64 + + for _, byStatus := range w.counts { + if c, ok := byStatus[status]; ok { + total += c.Load() + } + } + + return total +} + +func (w *CountingInternalHandlerWrap) record(selfIP string, status int) { + w.mu.Lock() + + byStatus, ok := w.counts[selfIP] + if !ok { + byStatus = make(map[int]*atomic.Int64) + w.counts[selfIP] = byStatus + } + + c, ok := byStatus[status] + if !ok { + c = &atomic.Int64{} + byStatus[status] = c + } + + w.mu.Unlock() + c.Add(1) +} + +// countingResponseWriter records the first WriteHeader status; if no +// WriteHeader is ever called, http.StatusOK is recorded (matching the +// net/http default). +type countingResponseWriter struct { + http.ResponseWriter + status int + wroteHeader bool +} + +func (c *countingResponseWriter) WriteHeader(status int) { + if !c.wroteHeader { + c.status = status + c.wroteHeader = true + } + + c.ResponseWriter.WriteHeader(status) +} + +func (c *countingResponseWriter) Write(p []byte) (int, error) { + if !c.wroteHeader { + c.wroteHeader = true + } + + return c.ResponseWriter.Write(p) +} diff --git a/internal/orca/inttest/localstack.go b/internal/orca/inttest/localstack.go new file mode 100644 index 00000000..5abb404d --- /dev/null +++ b/internal/orca/inttest/localstack.go @@ -0,0 +1,180 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" +) + +// LocalStack is a running LocalStack container with helper accessors +// for constructing AWS S3 clients pointed at it. Use NewS3Client to +// get a configured client; use NewBucket to allocate a fresh bucket +// for a single test. +type LocalStack struct { + container testcontainers.Container + endpoint string + region string +} + +// AccessKey returns the LocalStack-default access key. LocalStack does +// not validate credentials but the AWS SDK requires non-empty values. +func (ls *LocalStack) AccessKey() string { return "test" } + +// SecretKey returns the LocalStack-default secret key. +func (ls *LocalStack) SecretKey() string { return "test" } + +// Endpoint returns the http:// URL of the LocalStack edge port. +func (ls *LocalStack) Endpoint() string { return ls.endpoint } + +// Region returns the static region the harness uses with LocalStack. +func (ls *LocalStack) Region() string { return ls.region } + +// StartLocalStack launches a LocalStack container and returns a handle +// once the edge port is healthy. Caller is responsible for terminating +// the container (via container.Terminate or t.Cleanup). +func StartLocalStack(ctx context.Context) (*LocalStack, error) { + req := testcontainers.ContainerRequest{ + Image: localstackImage, + ExposedPorts: []string{"4566/tcp"}, + Env: map[string]string{ + "SERVICES": "s3", + // LocalStack 3.8 returns InvalidRequest on the SDK's + // CRC64NVME default checksum. The orca s3 driver opts out + // at the SDK config level, but seeding clients in tests + // must do the same. 
We set the variables both in the + // container env (for any in-container tooling) and on the + // SDK config in NewS3Client. + "S3_SKIP_SIGNATURE_VALIDATION": "1", + }, + WaitingFor: wait.ForHTTP("/_localstack/health"). + WithPort("4566/tcp"). + WithStatusCodeMatcher(func(status int) bool { return status == 200 }), + } + + c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: req, + Started: true, + }) + if err != nil { + return nil, fmt.Errorf("start localstack: %w", err) + } + + host, err := c.Host(ctx) + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("localstack host: %w", err) + } + + port, err := c.MappedPort(ctx, "4566/tcp") + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("localstack port: %w", err) + } + + return &LocalStack{ + container: c, + endpoint: fmt.Sprintf("http://%s:%s", host, port.Port()), + region: "us-east-1", + }, nil +} + +// Terminate stops and removes the LocalStack container. +func (ls *LocalStack) Terminate(ctx context.Context) error { + return ls.container.Terminate(ctx) +} + +// NewS3Client returns an AWS S3 client with LocalStack-friendly +// settings (path-style addressing, dummy credentials, checksum quirks +// disabled). +func (ls *LocalStack) NewS3Client(ctx context.Context, t *testing.T) *s3.Client { + t.Helper() + + cfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(ls.region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + ls.AccessKey(), ls.SecretKey(), "", + )), + awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + t.Fatalf("aws config: %v", err) + } + + return s3.NewFromConfig(cfg, func(o *s3.Options) { + o.BaseEndpoint = aws.String(ls.endpoint) + o.UsePathStyle = true + }) +} + +// NewBucket creates a fresh bucket and registers a t.Cleanup hook to +// best-effort delete it. Returns the bucket name. +func (ls *LocalStack) NewBucket(ctx context.Context, t *testing.T, prefix string) string { + t.Helper() + + cli := ls.NewS3Client(ctx, t) + name := uniqueName(prefix) + + if _, err := cli.CreateBucket(ctx, &s3.CreateBucketInput{ + Bucket: aws.String(name), + }); err != nil { + t.Fatalf("create bucket %s: %v", name, err) + } + + t.Cleanup(func() { + emptyBucket(context.Background(), cli, name) + + _, _ = cli.DeleteBucket(context.Background(), &s3.DeleteBucketInput{ //nolint:errcheck // best-effort cleanup + Bucket: aws.String(name), + }) + }) + + return name +} + +// EnableVersioning toggles versioning on a bucket. Used by the +// versioning-gate negative test. +func (ls *LocalStack) EnableVersioning(ctx context.Context, t *testing.T, bucket string) { + t.Helper() + + cli := ls.NewS3Client(ctx, t) + if _, err := cli.PutBucketVersioning(ctx, &s3.PutBucketVersioningInput{ + Bucket: aws.String(bucket), + VersioningConfiguration: &s3types.VersioningConfiguration{ + Status: s3types.BucketVersioningStatusEnabled, + }, + }); err != nil { + t.Fatalf("enable versioning on %s: %v", bucket, err) + } +} + +// emptyBucket deletes every object in the bucket. Best-effort; errors +// are ignored. 
+func emptyBucket(ctx context.Context, cli *s3.Client, bucket string) { + out, err := cli.ListObjectsV2(ctx, &s3.ListObjectsV2Input{ + Bucket: aws.String(bucket), + }) + if err != nil { + return + } + + for _, obj := range out.Contents { + _, _ = cli.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort cleanup + Bucket: aws.String(bucket), + Key: obj.Key, + }) + } +} diff --git a/internal/orca/inttest/main_test.go b/internal/orca/inttest/main_test.go new file mode 100644 index 00000000..f793abd6 --- /dev/null +++ b/internal/orca/inttest/main_test.go @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "os" + "testing" + "time" +) + +// Package-level container handles shared across tests in this package. +// TestMain brings them up once and tears them down at the end. +var ( + pkgLocalStack *LocalStack + pkgAzurite *Azurite +) + +// TestMain provisions LocalStack + Azurite once per `go test` run. +// Per-test buckets / containers are allocated inside individual tests +// to avoid cross-test interference. +func TestMain(m *testing.M) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + ls, err := StartLocalStack(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "TestMain: start localstack: %v\n", err) + os.Exit(1) + } + + pkgLocalStack = ls + + az, err := StartAzurite(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "TestMain: start azurite: %v\n", err) + + _ = ls.Terminate(ctx) //nolint:errcheck // best-effort cleanup + + os.Exit(1) + } + + pkgAzurite = az + + code := m.Run() + + termCtx, termCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer termCancel() + + _ = pkgAzurite.Terminate(termCtx) //nolint:errcheck // best-effort + _ = pkgLocalStack.Terminate(termCtx) //nolint:errcheck // best-effort + + os.Exit(code) +} diff --git a/internal/orca/inttest/origins_test.go b/internal/orca/inttest/origins_test.go new file mode 100644 index 00000000..594b7596 --- /dev/null +++ b/internal/orca/inttest/origins_test.go @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "testing" + + "github.com/Azure/unbounded/internal/orca/origin" + "github.com/Azure/unbounded/internal/orca/origin/awss3" +) + +// localStackOrigin builds an awss3.Origin pointed at the package-level +// LocalStack with the given bucket. Used by tests that need to wrap +// the origin in a CountingOrigin decorator. +func localStackOrigin(ctx context.Context, t *testing.T, bucket string) (origin.Origin, error) { + t.Helper() + + return awss3.New(ctx, awss3.Config{ + Endpoint: pkgLocalStack.Endpoint(), + Region: pkgLocalStack.Region(), + Bucket: bucket, + AccessKey: pkgLocalStack.AccessKey(), + SecretKey: pkgLocalStack.SecretKey(), + UsePathStyle: true, + }) +} diff --git a/internal/orca/inttest/originwrap.go b/internal/orca/inttest/originwrap.go new file mode 100644 index 00000000..c215d9e8 --- /dev/null +++ b/internal/orca/inttest/originwrap.go @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "io" + "sync/atomic" + + "github.com/Azure/unbounded/internal/orca/origin" +) + +// CountingOrigin is an origin.Origin decorator that counts Head and +// GetRange calls. 
It is used by tests that need to assert +// singleflight collapse and coordinator routing. +type CountingOrigin struct { + inner origin.Origin + + heads atomic.Int64 + getRanges atomic.Int64 + lists atomic.Int64 +} + +// NewCountingOrigin wraps inner with call counters. +func NewCountingOrigin(inner origin.Origin) *CountingOrigin { + return &CountingOrigin{inner: inner} +} + +// Heads returns the number of Head() calls observed. +func (c *CountingOrigin) Heads() int64 { return c.heads.Load() } + +// GetRanges returns the number of GetRange() calls observed. +func (c *CountingOrigin) GetRanges() int64 { return c.getRanges.Load() } + +// Lists returns the number of List() calls observed. +func (c *CountingOrigin) Lists() int64 { return c.lists.Load() } + +// Reset zeroes all counters. +func (c *CountingOrigin) Reset() { + c.heads.Store(0) + c.getRanges.Store(0) + c.lists.Store(0) +} + +// Head implements origin.Origin. +func (c *CountingOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + c.heads.Add(1) + + return c.inner.Head(ctx, bucket, key) +} + +// GetRange implements origin.Origin. +func (c *CountingOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, length int64) (io.ReadCloser, error) { + c.getRanges.Add(1) + + return c.inner.GetRange(ctx, bucket, key, etag, off, length) +} + +// List implements origin.Origin. +func (c *CountingOrigin) List(ctx context.Context, bucket, prefix, marker string, maxKeys int) (origin.ListResult, error) { + c.lists.Add(1) + + return c.inner.List(ctx, bucket, prefix, marker, maxKeys) +} diff --git a/internal/orca/inttest/peersource.go b/internal/orca/inttest/peersource.go new file mode 100644 index 00000000..c349f601 --- /dev/null +++ b/internal/orca/inttest/peersource.go @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "sync" + + "github.com/Azure/unbounded/internal/orca/cluster" +) + +// StaticPeerSource implements cluster.PeerSource with a mutable peer +// list. Each replica in the harness owns its own StaticPeerSource so +// tests can mutate one replica's view of the cluster independently +// (used by TestPeerNotCoordinatorFallback to induce membership +// disagreement). +// +// The source knows its calling replica's identity (selfIP, selfPort) +// so it can stamp Peer.Self correctly even when multiple peers share +// an IP (the case in tests where every replica is on 127.0.0.1). +type StaticPeerSource struct { + mu sync.Mutex + selfIP string + selfPort int + peers []cluster.Peer +} + +// NewStaticPeerSource returns a peer source that stamps Self=true on +// any peer whose (IP, Port) matches the constructor arguments. +func NewStaticPeerSource(selfIP string, selfPort int, peers []cluster.Peer) *StaticPeerSource { + s := &StaticPeerSource{ + selfIP: selfIP, + selfPort: selfPort, + } + s.SetPeers(peers) + + return s +} + +// SetPeers replaces the current peer list. Each peer's Self bit is +// recomputed against the source's stored (selfIP, selfPort). +func (s *StaticPeerSource) SetPeers(peers []cluster.Peer) { + out := make([]cluster.Peer, len(peers)) + for i, p := range peers { + p.Self = p.IP == s.selfIP && p.Port == s.selfPort + out[i] = p + } + + s.mu.Lock() + defer s.mu.Unlock() + + s.peers = out +} + +// Peers satisfies cluster.PeerSource. 
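+// It returns a copy of the stored slice, so callers cannot mutate the
+// source's view in place; tests change membership only through
+// SetPeers, e.g. (hypothetical values):
+//
+//	ps := cl.Get(2).PeerSource
+//	ps.SetPeers([]cluster.Peer{{IP: "127.0.0.1", Port: 9001}})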
+func (s *StaticPeerSource) Peers(_ context.Context) ([]cluster.Peer, error) { + s.mu.Lock() + defer s.mu.Unlock() + + out := make([]cluster.Peer, len(s.peers)) + copy(out, s.peers) + + return out, nil +} diff --git a/internal/orca/inttest/seed.go b/internal/orca/inttest/seed.go new file mode 100644 index 00000000..c286bcdc --- /dev/null +++ b/internal/orca/inttest/seed.go @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/s3" +) + +// SeedBlob describes a single blob seeded into the origin. +type SeedBlob struct { + Key string + Data []byte +} + +// SmallBlob is one chunk's-worth (1 KiB). +func SmallBlob() SeedBlob { + return SeedBlob{Key: "sample-1k", Data: deterministicBytes(1024, 0xa1)} +} + +// MediumBlob spans two 1 MiB chunks. +func MediumBlob() SeedBlob { + return SeedBlob{Key: "sample-2chunk", Data: deterministicBytes(1024*1024+512*1024, 0xb2)} +} + +// HugeBlob spans 64 chunks at the harness's 1 MiB chunk size. With 3 +// replicas, rendezvous-hashed coordinator selection statistically +// covers every replica many times over (~21 chunks per replica), +// so any test using HugeBlob exercises the full local-fill + +// cross-replica /internal/fill matrix in a single run. +func HugeBlob() SeedBlob { + return SeedBlob{Key: "sample-64chunk", Data: deterministicBytes(64*1024*1024, 0xd4)} +} + +// AllBlobs returns the canonical seed set used across most tests. +func AllBlobs() []SeedBlob { + return []SeedBlob{SmallBlob(), MediumBlob(), HugeBlob()} +} + +// SeedS3 uploads each blob to the named bucket via the provided +// LocalStack-friendly S3 client. +func SeedS3(ctx context.Context, t *testing.T, cli *s3.Client, bucket string, blobs []SeedBlob) { + t.Helper() + + for _, b := range blobs { + if _, err := cli.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(b.Key), + Body: bytes.NewReader(b.Data), + }); err != nil { + t.Fatalf("seed %s/%s: %v", bucket, b.Key, err) + } + } +} + +// DeleteS3Object removes a blob from a LocalStack bucket. Used by +// warm-cache tests to prove that subsequent GETs are served from the +// cachestore and not refetched from the origin. +func DeleteS3Object(ctx context.Context, t *testing.T, cli *s3.Client, bucket, key string) { + t.Helper() + + if _, err := cli.DeleteObject(ctx, &s3.DeleteObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(key), + }); err != nil { + t.Fatalf("delete origin %s/%s: %v", bucket, key, err) + } +} + +// SeedAzure uploads each blob to the named container as block blobs. +func SeedAzure(ctx context.Context, t *testing.T, az *Azurite, ctr string, blobs []SeedBlob) { + t.Helper() + + for _, b := range blobs { + az.UploadBlockBlob(ctx, t, ctr, b.Key, b.Data) + } +} + +// deterministicBytes returns n bytes filled with a repeating pattern +// derived from seed. Useful for byte-exact assertions without random +// flakiness. +func deterministicBytes(n int, seed byte) []byte { + out := make([]byte, n) + for i := range out { + out[i] = seed ^ byte(i*31+17) + } + + return out +} diff --git a/internal/orca/manifests/doc.go b/internal/orca/manifests/doc.go new file mode 100644 index 00000000..a629d147 --- /dev/null +++ b/internal/orca/manifests/doc.go @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +// Package manifests holds tests that validate the orca deployment +// manifest templates render to syntactically correct, structurally +// reasonable Kubernetes YAML. +// +// These tests catch typos, missing required fields, and template +// regressions at compile time without needing a Kind cluster. They +// complement (but do not replace) hack/orca's actual `kubectl apply` +// validation. +package manifests diff --git a/internal/orca/manifests/manifests_test.go b/internal/orca/manifests/manifests_test.go new file mode 100644 index 00000000..bbab6cab --- /dev/null +++ b/internal/orca/manifests/manifests_test.go @@ -0,0 +1,307 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package manifests + +import ( + "bytes" + "errors" + "io" + "os" + "path/filepath" + "runtime" + "sort" + "strings" + "testing" + + "gopkg.in/yaml.v3" + + "github.com/Azure/unbounded/hack/cmd/render-manifests/render" +) + +// TestProductionManifestsRender renders every *.yaml.tmpl under +// deploy/orca/ (excluding the dev/ subdirectory which contains the +// in-Kind LocalStack/Azurite manifests) with realistic inputs and +// asserts the output is structurally valid Kubernetes YAML. +func TestProductionManifestsRender(t *testing.T) { + t.Parallel() + + root := repoRoot(t) + templatesDir := filepath.Join(root, "deploy", "orca") + + renderAndValidate(t, templatesDir, productionData(), + // One file at a time: walking the dev/ subdirectory is the dev + // suite's job, so we render-then-skip it here. + skipDir("dev"), + // Required kinds that MUST appear at least once across the + // rendered manifests. + expectKindsAtLeastOnce("Namespace", "Deployment", "Service", "ConfigMap"), + ) +} + +// TestDevManifestsRender renders the LocalStack + Azurite + init-Job +// manifests used by the Kind dev harness. +func TestDevManifestsRender(t *testing.T) { + t.Parallel() + + root := repoRoot(t) + templatesDir := filepath.Join(root, "deploy", "orca", "dev") + + renderAndValidate(t, templatesDir, devData(), + expectKindsAtLeastOnce("Deployment", "Service", "Job"), + ) +} + +// productionData supplies realistic template variables for the +// production-shape templates. Templates use sprig's `default` for +// missing keys; we set values that exercise the non-default paths +// where it matters. +func productionData() map[string]string { + return map[string]string{ + "Namespace": "orca-test", + "Image": "ghcr.io/example/orca:test", + "ImagePullPolicy": "IfNotPresent", + "TargetReplicas": "3", + "OriginID": "test-origin", + "OriginDriver": "awss3", + "OriginAWSS3Endpoint": "http://localstack:4566", + "OriginAWSS3Region": "us-east-1", + "OriginAWSS3Bucket": "orca-origin", + "OriginAWSS3UsePathStyle": "true", + "CachestoreEndpoint": "http://localstack:4566", + "CachestoreBucket": "orca-cache", + "CachestoreRegion": "us-east-1", + "ClusterService": "orca-peers.orca-test.svc.cluster.local", + "ServerAuthEnabled": "false", + "InternalTLSEnabled": "false", + "AzureAccount": "", + "AzureContainer": "", + "AzureEndpoint": "", + } +} + +func devData() map[string]string { + return map[string]string{ + "Namespace": "orca-test", + "CachestoreBucket": "orca-cache", + "OriginBucket": "orca-origin", + "AzuriteContainer": "orca-test", + } +} + +// renderAndValidate renders every template under templatesDir into a +// t.TempDir, then walks the output and applies each Validator. 
+func renderAndValidate(t *testing.T, templatesDir string, data map[string]string, validators ...Validator) { + t.Helper() + + outputDir := t.TempDir() + + if err := render.Render(templatesDir, outputDir, data); err != nil { + t.Fatalf("render.Render: %v", err) + } + // Collect every rendered .yaml file. Skip directories filtered + // by the validators. + skipDirs := skipDirsOf(validators) + + var renderedFiles []string + + walkErr := filepath.WalkDir(outputDir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + + if d.IsDir() { + rel, _ := filepath.Rel(outputDir, path) + if _, skip := skipDirs[rel]; skip { + return filepath.SkipDir + } + + return nil + } + + if strings.HasSuffix(path, ".yaml") { + renderedFiles = append(renderedFiles, path) + } + + return nil + }) + if walkErr != nil { + t.Fatalf("walk rendered output: %v", walkErr) + } + + if len(renderedFiles) == 0 { + t.Fatalf("no rendered manifests found under %s", outputDir) + } + + sort.Strings(renderedFiles) + + docs := parseRenderedDocs(t, renderedFiles) + + // Always-on basic structural validation. + for _, d := range docs { + validateBasicStructure(t, d) + } + + for _, v := range validators { + v.Validate(t, docs) + } +} + +// renderedDoc is one logical YAML document plus the source file it +// came from (multi-doc files split into multiple renderedDocs). +type renderedDoc struct { + SourcePath string + Index int + Doc map[string]any +} + +func parseRenderedDocs(t *testing.T, files []string) []renderedDoc { + t.Helper() + + var docs []renderedDoc + + for _, f := range files { + raw, err := os.ReadFile(f) + if err != nil { + t.Fatalf("read %s: %v", f, err) + } + + dec := yaml.NewDecoder(bytes.NewReader(raw)) + + for i := 0; ; i++ { + var doc map[string]any + if derr := dec.Decode(&doc); derr != nil { + if errors.Is(derr, io.EOF) { + break + } + + t.Fatalf("yaml decode %s doc %d: %v", f, i, derr) + } + + if doc == nil { + continue + } + + docs = append(docs, renderedDoc{SourcePath: f, Index: i, Doc: doc}) + } + } + + return docs +} + +func validateBasicStructure(t *testing.T, d renderedDoc) { + t.Helper() + + apiVersion, _ := d.Doc["apiVersion"].(string) + kind, _ := d.Doc["kind"].(string) + + if apiVersion == "" { + t.Errorf("%s doc %d: missing apiVersion", d.SourcePath, d.Index) + } + + if kind == "" { + t.Errorf("%s doc %d: missing kind", d.SourcePath, d.Index) + } + + meta, _ := d.Doc["metadata"].(map[string]any) + if meta == nil { + t.Errorf("%s doc %d (kind=%s): missing metadata", d.SourcePath, d.Index, kind) + return + } + + name, _ := meta["name"].(string) + if name == "" { + t.Errorf("%s doc %d (kind=%s): missing metadata.name", d.SourcePath, d.Index, kind) + } +} + +// Validator is a test-time check applied to the full set of +// rendered docs. 
+type Validator interface { + Validate(t *testing.T, docs []renderedDoc) + skipDir() string // empty when not a dir filter +} + +type kindsAtLeastOnce struct{ kinds []string } + +func (v kindsAtLeastOnce) Validate(t *testing.T, docs []renderedDoc) { + t.Helper() + + seen := map[string]bool{} + + for _, d := range docs { + if k, _ := d.Doc["kind"].(string); k != "" { + seen[k] = true + } + } + + for _, want := range v.kinds { + if !seen[want] { + t.Errorf("expected at least one document of kind %q, got kinds %v", want, sortedKeys(seen)) + } + } +} + +func (v kindsAtLeastOnce) skipDir() string { return "" } + +func expectKindsAtLeastOnce(kinds ...string) Validator { + return kindsAtLeastOnce{kinds: kinds} +} + +type dirSkipper struct{ name string } + +func (d dirSkipper) Validate(*testing.T, []renderedDoc) {} + +func (d dirSkipper) skipDir() string { return d.name } + +func skipDir(name string) Validator { + return dirSkipper{name: name} +} + +func skipDirsOf(vs []Validator) map[string]struct{} { + out := map[string]struct{}{} + + for _, v := range vs { + if d := v.skipDir(); d != "" { + out[d] = struct{}{} + } + } + + return out +} + +func sortedKeys(m map[string]bool) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + + sort.Strings(out) + + return out +} + +// repoRoot returns the absolute path to the repo root by walking up +// from this test file's directory until it finds a go.mod. +func repoRoot(t *testing.T) string { + t.Helper() + + _, file, _, ok := runtime.Caller(0) + if !ok { + t.Fatal("runtime.Caller(0) failed") + } + + dir := filepath.Dir(file) + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + + parent := filepath.Dir(dir) + if parent == dir { + t.Fatalf("reached filesystem root without finding go.mod (started at %s)", filepath.Dir(file)) + } + + dir = parent + } +} diff --git a/internal/orca/metadata/metadata.go b/internal/orca/metadata/metadata.go new file mode 100644 index 00000000..be7e3dd5 --- /dev/null +++ b/internal/orca/metadata/metadata.go @@ -0,0 +1,231 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package metadata is the per-replica object-metadata cache. +// +// Responsibilities: +// - bounded TTL'd cache of ObjectInfo keyed on (origin_id, bucket, +// key) +// - separate negative-TTL handling for 404 / unsupported-blob-type +// entries (design.md s12) +// - per-replica HEAD singleflight (s8.7) so concurrent misses +// collapse to one Origin.Head +package metadata + +import ( + "container/list" + "context" + "errors" + "fmt" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Cache is the per-replica metadata cache. +type Cache struct { + cfg config.Metadata + + mu sync.Mutex + ll *list.List + idx map[string]*list.Element + + sf sync.Map // map[string]*sfEntry +} + +type cacheEntry struct { + key string + info origin.ObjectInfo + negative bool + negErr error + expiresAt time.Time +} + +type sfEntry struct { + once sync.Once + done chan struct{} + info origin.ObjectInfo + err error +} + +// NewCache builds a Cache from config. 
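+//
+// A minimal usage sketch (org stands in for whatever origin.Origin
+// driver the caller configured; it is not part of this package):
+//
+//	c := NewCache(cfg.Metadata)
+//	info, err := c.LookupOrFetch(ctx, "origin-1", bucket, key,
+//		func(ctx context.Context) (origin.ObjectInfo, error) {
+//			return org.Head(ctx, bucket, key)
+//		})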
+func NewCache(cfg config.Metadata) *Cache { + if cfg.MaxEntries <= 0 { + cfg.MaxEntries = 10_000 + } + + if cfg.TTL <= 0 { + cfg.TTL = 5 * time.Minute + } + + if cfg.NegativeTTL <= 0 { + cfg.NegativeTTL = 60 * time.Second + } + + return &Cache{ + cfg: cfg, + ll: list.New(), + idx: make(map[string]*list.Element, cfg.MaxEntries), + } +} + +// Lookup returns the cached ObjectInfo if present and unexpired. +// +// Returns: +// - info, true, nil -> positive cache hit +// - {}, true, err -> negative cache hit (err is the cached error) +// - {}, false, nil -> miss; caller should LookupOrFetch +func (c *Cache) Lookup(originID, bucket, key string) (origin.ObjectInfo, bool, error) { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + el, ok := c.idx[k] + if !ok { + return origin.ObjectInfo{}, false, nil + } + + e, ok := el.Value.(*cacheEntry) + if !ok { + return origin.ObjectInfo{}, false, fmt.Errorf("metadata: list element is not *cacheEntry") + } + + if time.Now().After(e.expiresAt) { + c.ll.Remove(el) + delete(c.idx, k) + + return origin.ObjectInfo{}, false, nil + } + + c.ll.MoveToFront(el) + + if e.negative { + return origin.ObjectInfo{}, true, e.negErr + } + + return e.info, true, nil +} + +// LookupOrFetch returns the cached ObjectInfo on hit (positive or +// negative); on miss, runs the per-replica HEAD singleflight against +// fetch and caches the result with the appropriate TTL. +func (c *Cache) LookupOrFetch( + ctx context.Context, + originID, bucket, key string, + fetch func(ctx context.Context) (origin.ObjectInfo, error), +) (origin.ObjectInfo, error) { + if info, ok, err := c.Lookup(originID, bucket, key); ok { + return info, err + } + + k := mkKey(originID, bucket, key) + v, _ := c.sf.LoadOrStore(k, &sfEntry{done: make(chan struct{})}) + + sfe, ok := v.(*sfEntry) + if !ok { + return origin.ObjectInfo{}, fmt.Errorf("metadata: singleflight value is not *sfEntry") + } + + first := false + + sfe.once.Do(func() { + first = true + }) + + if first { + defer func() { + close(sfe.done) + c.sf.Delete(k) + }() + + info, err := fetch(ctx) + sfe.info = info + sfe.err = err + + if recErr := c.recordResult(originID, bucket, key, info, err); recErr != nil { + err = errors.Join(err, recErr) + } + + return info, err + } + // Joiner: wait for the leader. + select { + case <-ctx.Done(): + return origin.ObjectInfo{}, ctx.Err() + case <-sfe.done: + } + + return sfe.info, sfe.err +} + +// Invalidate drops the entry. +func (c *Cache) Invalidate(originID, bucket, key string) { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[k]; ok { + c.ll.Remove(el) + delete(c.idx, k) + } +} + +func (c *Cache) recordResult(originID, bucket, key string, info origin.ObjectInfo, err error) error { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + now := time.Now() + + var e *cacheEntry + + switch { + case err == nil: + e = &cacheEntry{key: k, info: info, expiresAt: now.Add(c.cfg.TTL)} + case errors.Is(err, origin.ErrNotFound): + e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)} + default: + var ube *origin.UnsupportedBlobTypeError + if errors.As(err, &ube) { + e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)} + } else { + // Other transient errors not cached. 
+ return nil + } + } + + if existing, ok := c.idx[k]; ok { + c.ll.Remove(existing) + delete(c.idx, k) + } + + el := c.ll.PushFront(e) + + c.idx[k] = el + for c.ll.Len() > c.cfg.MaxEntries { + oldest := c.ll.Back() + if oldest == nil { + break + } + + c.ll.Remove(oldest) + + oldEntry, ok := oldest.Value.(*cacheEntry) + if !ok { + return fmt.Errorf("metadata: list element is not *cacheEntry") + } + + delete(c.idx, oldEntry.key) + } + + return nil +} + +func mkKey(originID, bucket, key string) string { + return originID + "|" + bucket + "|" + key +} diff --git a/internal/orca/origin/awss3/awss3.go b/internal/orca/origin/awss3/awss3.go new file mode 100644 index 00000000..6d7e842c --- /dev/null +++ b/internal/orca/origin/awss3/awss3.go @@ -0,0 +1,291 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package awss3 is the AWS S3 (and S3-compatible) origin driver. It +// targets either real AWS S3 or a local S3-compatible endpoint such as +// LocalStack. Useful as a credential-free origin for the dev harness: +// LocalStack acts as both origin and cachestore (different buckets). +// +// This driver is read-only from Orca's perspective (Head, GetRange, +// List). The seed step that uploads test objects to the origin bucket +// happens out-of-band via aws-cli or similar. +package awss3 + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "strings" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/aws/smithy-go" + + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Adapter implements origin.Origin against an S3-compatible endpoint. +type Adapter struct { + cfg Config + client *s3.Client +} + +// Config is the awss3-driver configuration. Mirrors config.AWSS3 but +// kept package-local so the driver can be unit-tested without +// importing the whole config package. +type Config struct { + // Endpoint, when set, overrides the regional default and routes + // requests at a custom URL (LocalStack uses + // http://localstack:4566). Leave empty for real AWS S3. + Endpoint string + + // Region is the AWS region. LocalStack ignores this; the SDK + // requires a value. + Region string + + // Bucket is the source bucket holding origin objects. + Bucket string + + // AccessKey / SecretKey are static credentials. For LocalStack + // these are "test"/"test"; for real AWS, supply real creds. + AccessKey string + SecretKey string + + // UsePathStyle: true for LocalStack (host-based addressing + // requires DNS wildcards LocalStack does not provide). + UsePathStyle bool +} + +// New constructs an Adapter. +func New(ctx context.Context, cfg Config) (*Adapter, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("origin/awss3: bucket required") + } + + if cfg.Region == "" { + cfg.Region = "us-east-1" + } + + awsCfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(cfg.Region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + cfg.AccessKey, cfg.SecretKey, "", + )), + // Opt out of CRC64NVME default introduced in aws-sdk-go-v2 + // 1.32. LocalStack 3.8 returns InvalidRequest for unknown + // algorithms; real AWS S3 still works either way. 
+ awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + return nil, fmt.Errorf("origin/awss3: aws config: %w", err) + } + + client := s3.NewFromConfig(awsCfg, func(o *s3.Options) { + if cfg.Endpoint != "" { + o.BaseEndpoint = aws.String(cfg.Endpoint) + } + + o.UsePathStyle = cfg.UsePathStyle + }) + + return &Adapter{cfg: cfg, client: client}, nil +} + +// Head returns ObjectInfo for the named object. The bucket arg lets +// callers override the configured bucket; if empty, the configured +// bucket is used. +func (a *Adapter) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + out, err := a.client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(b), + Key: aws.String(key), + }) + if err != nil { + if isNotFound(err) { + return origin.ObjectInfo{LastStatus: http.StatusNotFound}, origin.ErrNotFound + } + + if isAuth(err) { + return origin.ObjectInfo{}, origin.ErrAuth + } + + return origin.ObjectInfo{}, fmt.Errorf("awss3 head: %w", err) + } + + info := origin.ObjectInfo{LastStatus: http.StatusOK} + if out.ContentLength != nil { + info.Size = *out.ContentLength + } + + if out.ETag != nil { + info.ETag = strings.Trim(*out.ETag, "\"") + } + + if out.ContentType != nil { + info.ContentType = *out.ContentType + } + + if out.LastModified != nil { + info.LastValidated = *out.LastModified + } + + return info, nil +} + +// GetRange fetches [off, off+n) of the object, sending If-Match: . +func (a *Adapter) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + rng := fmt.Sprintf("bytes=%d-%d", off, off+n-1) + + in := &s3.GetObjectInput{ + Bucket: aws.String(b), + Key: aws.String(key), + Range: aws.String(rng), + } + if etag != "" { + // S3 expects the etag wrapped in double quotes. + in.IfMatch = aws.String("\"" + etag + "\"") + } + + out, err := a.client.GetObject(ctx, in) + if err != nil { + if isPreconditionFailed(err) { + return nil, &origin.OriginETagChangedError{ + Bucket: b, Key: key, Want: etag, + } + } + + if isNotFound(err) { + return nil, origin.ErrNotFound + } + + if isAuth(err) { + return nil, origin.ErrAuth + } + + return nil, fmt.Errorf("awss3 get-range: %w", err) + } + + return out.Body, nil +} + +// List enumerates objects under prefix. 
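+// When ListResult.IsTruncated is set, callers feed NextMarker back in
+// as marker on the next call. A sketch (the "chunks/" prefix is
+// illustrative only):
+//
+//	var marker string
+//	for {
+//		res, err := a.List(ctx, "", "chunks/", marker, 1000)
+//		if err != nil {
+//			return err
+//		}
+//		// ... consume res.Entries ...
+//		if !res.IsTruncated {
+//			break
+//		}
+//		marker = res.NextMarker
+//	}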
+func (a *Adapter) List(ctx context.Context, bucket, prefix, marker string, maxResults int) (origin.ListResult, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + in := &s3.ListObjectsV2Input{ + Bucket: aws.String(b), + Prefix: aws.String(prefix), + MaxKeys: aws.Int32(int32(maxResults)), + } + if marker != "" { + in.ContinuationToken = aws.String(marker) + } + + out, err := a.client.ListObjectsV2(ctx, in) + if err != nil { + if isAuth(err) { + return origin.ListResult{}, origin.ErrAuth + } + + return origin.ListResult{}, fmt.Errorf("awss3 list: %w", err) + } + + res := origin.ListResult{} + + for _, item := range out.Contents { + entry := origin.ObjectEntry{} + if item.Key != nil { + entry.Key = *item.Key + } + + if item.Size != nil { + entry.Size = *item.Size + } + + if item.ETag != nil { + entry.ETag = strings.Trim(*item.ETag, "\"") + } + + res.Entries = append(res.Entries, entry) + } + + if out.IsTruncated != nil { + res.IsTruncated = *out.IsTruncated + } + + if out.NextContinuationToken != nil { + res.NextMarker = *out.NextContinuationToken + } + + return res, nil +} + +func isNotFound(err error) bool { + var nsk *s3types.NoSuchKey + if errors.As(err, &nsk) { + return true + } + + var nsb *s3types.NoSuchBucket + if errors.As(err, &nsb) { + return true + } + + var notFound *s3types.NotFound + if errors.As(err, ¬Found) { + return true + } + + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "NoSuchKey", "NotFound", "404": + return true + } + } + + return false +} + +func isAuth(err error) bool { + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "AccessDenied", "Unauthorized", "Forbidden", "InvalidAccessKeyId", "SignatureDoesNotMatch": + return true + } + } + + return false +} + +func isPreconditionFailed(err error) bool { + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "PreconditionFailed", "ConditionalRequestConflict": + return true + } + } + + return strings.Contains(err.Error(), "PreconditionFailed") || + strings.Contains(err.Error(), "412") +} diff --git a/internal/orca/origin/azureblob/azureblob.go b/internal/orca/origin/azureblob/azureblob.go new file mode 100644 index 00000000..ab17d422 --- /dev/null +++ b/internal/orca/origin/azureblob/azureblob.go @@ -0,0 +1,265 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package azureblob is the Azure Blob Storage adapter for the Origin +// interface. Block Blobs only (design.md s9). +package azureblob + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "strings" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" + + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Adapter implements origin.Origin against Azure Blob Storage. +type Adapter struct { + cfg config.Azureblob + client *azblob.Client +} + +// New builds an Adapter from config. 
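+// A minimal construction sketch (illustrative; the account, key,
+// endpoint, and container values below are placeholders, with
+// Azurite's well-known dev account shown as one common choice for the
+// dev harness):
+//
+//	a, err := azureblob.New(config.Azureblob{
+//		Account:    "devstoreaccount1",
+//		AccountKey: "<azurite well-known key>",
+//		Endpoint:   "http://azurite:10000/devstoreaccount1/",
+//		Container:  "origin",
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	info, err := a.Head(ctx, "", "path/to/blob")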
+func New(cfg config.Azureblob) (*Adapter, error) { + if cfg.Account == "" { + return nil, fmt.Errorf("azureblob: account required") + } + + if cfg.AccountKey == "" { + return nil, fmt.Errorf("azureblob: account_key required") + } + + cred, err := azblob.NewSharedKeyCredential(cfg.Account, cfg.AccountKey) + if err != nil { + return nil, fmt.Errorf("azureblob: shared-key credential: %w", err) + } + + endpoint := cfg.Endpoint + if endpoint == "" { + endpoint = fmt.Sprintf("https://%s.blob.core.windows.net/", cfg.Account) + } + + client, err := azblob.NewClientWithSharedKeyCredential(endpoint, cred, nil) + if err != nil { + return nil, fmt.Errorf("azureblob: client: %w", err) + } + + return &Adapter{cfg: cfg, client: client}, nil +} + +// Head returns ObjectInfo for the named blob. +// +// "bucket" maps to the configured container; the bucket arg is honored +// only if non-empty (allowing single-container deployments to use the +// configured container as the default). +func (a *Adapter) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + props, err := a.client.ServiceClient().NewContainerClient(cName). + NewBlobClient(key).GetProperties(ctx, nil) + if err != nil { + if isNotFound(err) { + return origin.ObjectInfo{LastStatus: http.StatusNotFound}, origin.ErrNotFound + } + + if isAuth(err) { + return origin.ObjectInfo{}, origin.ErrAuth + } + + return origin.ObjectInfo{}, fmt.Errorf("azureblob head: %w", err) + } + + if err := validateBlobType(a.cfg.EnforceBlockBlobOnly, cName, key, props.BlobType); err != nil { + return origin.ObjectInfo{}, err + } + + info := origin.ObjectInfo{LastStatus: http.StatusOK} + if props.ContentLength != nil { + info.Size = *props.ContentLength + } + + if props.ETag != nil { + info.ETag = strings.Trim(string(*props.ETag), "\"") + } + + if props.ContentType != nil { + info.ContentType = *props.ContentType + } + + if props.LastModified != nil { + info.LastValidated = *props.LastModified + } + + return info, nil +} + +// GetRange fetches [off, off+n) of the blob, sending If-Match: . +func (a *Adapter) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + bc := a.client.ServiceClient().NewContainerClient(cName).NewBlobClient(key) + opts := &azblob.DownloadStreamOptions{ + Range: blob.HTTPRange{Offset: off, Count: n}, + } + + if etag != "" { + etagVal := azcore.ETag(etag) + opts.AccessConditions = &blob.AccessConditions{ + ModifiedAccessConditions: &blob.ModifiedAccessConditions{ + IfMatch: to.Ptr(etagVal), + }, + } + } + + resp, err := bc.DownloadStream(ctx, opts) + if err != nil { + if isPreconditionFailed(err) { + return nil, &origin.OriginETagChangedError{ + Bucket: cName, Key: key, Want: etag, + } + } + + if isNotFound(err) { + return nil, origin.ErrNotFound + } + + if isAuth(err) { + return nil, origin.ErrAuth + } + + return nil, fmt.Errorf("azureblob get-range: %w", err) + } + + return resp.Body, nil +} + +// List enumerates blobs in the container matching prefix. 
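+// Only a single page is fetched per call: callers continue with the
+// returned NextMarker while IsTruncated is set, mirroring the awss3
+// driver's pagination contract.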
+func (a *Adapter) List(ctx context.Context, bucket, prefix, marker string, maxResults int) (origin.ListResult, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + cc := a.client.ServiceClient().NewContainerClient(cName) + max := int32(maxResults) + pager := cc.NewListBlobsFlatPager(&container.ListBlobsFlatOptions{ + Prefix: &prefix, + MaxResults: &max, + Marker: stringOrNil(marker), + }) + out := origin.ListResult{} + + if pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + if isAuth(err) { + return origin.ListResult{}, origin.ErrAuth + } + + return origin.ListResult{}, fmt.Errorf("azureblob list: %w", err) + } + + for _, item := range page.Segment.BlobItems { + entry := origin.ObjectEntry{} + if item.Name != nil { + entry.Key = *item.Name + } + + if item.Properties != nil { + if item.Properties.ContentLength != nil { + entry.Size = *item.Properties.ContentLength + } + + if item.Properties.ETag != nil { + entry.ETag = strings.Trim(string(*item.Properties.ETag), "\"") + } + + if item.Properties.BlobType != nil { + entry.BlobType = string(*item.Properties.BlobType) + } + } + + out.Entries = append(out.Entries, entry) + } + + if page.NextMarker != nil { + out.NextMarker = *page.NextMarker + out.IsTruncated = *page.NextMarker != "" + } + } + + return out, nil +} + +func stringOrNil(s string) *string { + if s == "" { + return nil + } + + return &s +} + +func isNotFound(err error) bool { + return bloberror.HasCode(err, bloberror.BlobNotFound) || + bloberror.HasCode(err, bloberror.ContainerNotFound) || + errors.Is(err, origin.ErrNotFound) +} + +func isAuth(err error) bool { + var rerr *azcore.ResponseError + if errors.As(err, &rerr) { + if rerr.StatusCode == http.StatusUnauthorized || rerr.StatusCode == http.StatusForbidden { + return true + } + } + + return bloberror.HasCode(err, bloberror.AuthenticationFailed) || + bloberror.HasCode(err, bloberror.AuthorizationFailure) +} + +func isPreconditionFailed(err error) bool { + var rerr *azcore.ResponseError + if errors.As(err, &rerr) && rerr.StatusCode == http.StatusPreconditionFailed { + return true + } + + return bloberror.HasCode(err, bloberror.ConditionNotMet) +} + +// validateBlobType returns an UnsupportedBlobTypeError when +// enforceBlockBlobOnly is set and the blob is a non-Block-Blob type +// (Page or Append). Returns nil for Block Blobs and when the gate is +// disabled. Extracted as a pure function so unit tests can cover all +// branches without an Azurite round-trip. +func validateBlobType(enforceBlockBlobOnly bool, container, key string, blobType *blob.BlobType) error { + if !enforceBlockBlobOnly || blobType == nil { + return nil + } + + if *blobType == blob.BlobTypeBlockBlob { + return nil + } + + return &origin.UnsupportedBlobTypeError{ + Bucket: container, + Key: key, + BlobType: string(*blobType), + } +} diff --git a/internal/orca/origin/azureblob/azureblob_test.go b/internal/orca/origin/azureblob/azureblob_test.go new file mode 100644 index 00000000..debfef96 --- /dev/null +++ b/internal/orca/origin/azureblob/azureblob_test.go @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package azureblob + +import ( + "errors" + "testing" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + + "github.com/Azure/unbounded/internal/orca/origin" +) + +// TestValidateBlobType covers every branch of the EnforceBlockBlobOnly +// gate. 
The integration suite previously only exercised the +// PageBlob-refused case; this unit test fills in disabled, nil, +// BlockBlob, and AppendBlob. +func TestValidateBlobType(t *testing.T) { + pageBlob := blob.BlobTypePageBlob + appendBlob := blob.BlobTypeAppendBlob + blockBlob := blob.BlobTypeBlockBlob + + tests := []struct { + name string + enforce bool + blobType *blob.BlobType + wantUnsupported bool + }{ + {"enforce off accepts any type", false, &pageBlob, false}, + {"nil blob type passes when enforced (no info)", true, nil, false}, + {"block blob accepted", true, &blockBlob, false}, + {"page blob refused", true, &pageBlob, true}, + {"append blob refused", true, &appendBlob, true}, + } + + const ( + container = "ctr" + key = "key" + ) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateBlobType(tt.enforce, container, key, tt.blobType) + + if (err != nil) != tt.wantUnsupported { + t.Fatalf("err=%v, wantUnsupported=%v", err, tt.wantUnsupported) + } + + if !tt.wantUnsupported { + return + } + + var ube *origin.UnsupportedBlobTypeError + if !errors.As(err, &ube) { + t.Fatalf("err type=%T (want *origin.UnsupportedBlobTypeError): %v", err, err) + } + + if ube.Bucket != container { + t.Errorf("Bucket=%q want %q", ube.Bucket, container) + } + + if ube.Key != key { + t.Errorf("Key=%q want %q", ube.Key, key) + } + + if tt.blobType != nil && ube.BlobType != string(*tt.blobType) { + t.Errorf("BlobType=%q want %q", ube.BlobType, string(*tt.blobType)) + } + }) + } +} diff --git a/internal/orca/origin/origin.go b/internal/orca/origin/origin.go new file mode 100644 index 00000000..06e53b32 --- /dev/null +++ b/internal/orca/origin/origin.go @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package origin defines the upstream-blob-store interface and shared +// types. Concrete adapters live under origin//. +// +// See design/orca/design.md s7 for the full interface. +package origin + +import ( + "context" + "errors" + "fmt" + "io" + "time" +) + +// Origin is a read-only view of an upstream blob store. +type Origin interface { + // Head returns object metadata. If the blob does not exist, returns + // ErrNotFound. If the blob is an unsupported type (e.g., azureblob + // non-BlockBlob), returns UnsupportedBlobTypeError. + Head(ctx context.Context, bucket, key string) (ObjectInfo, error) + + // GetRange fetches [off, off+n) bytes of the object. The etag is + // passed as `If-Match: ` so a mid-flight overwrite is detected + // at the wire (returns OriginETagChangedError). + GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) + + // List enumerates objects under prefix. Pagination via marker. + List(ctx context.Context, bucket, prefix, marker string, max int) (ListResult, error) +} + +// ObjectInfo is the result of a successful Head. +type ObjectInfo struct { + Size int64 + ETag string + ContentType string + LastValidated time.Time + LastStatus int +} + +// ListResult is the paginated result of List. +type ListResult struct { + Entries []ObjectEntry + NextMarker string + IsTruncated bool +} + +// ObjectEntry is one item in a ListResult. +type ObjectEntry struct { + Key string + Size int64 + ETag string + BlobType string // "" for s3; "BlockBlob" / "PageBlob" / "AppendBlob" for azureblob +} + +// Sentinel errors. Wrap with %w so callers use errors.Is. 
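+// A wrap/check sketch (illustrative):
+//
+//	// adapter side
+//	return origin.ObjectInfo{}, fmt.Errorf("awss3 head %q: %w", key, origin.ErrNotFound)
+//
+//	// caller side
+//	if errors.Is(err, origin.ErrNotFound) {
+//		http.Error(w, "NoSuchKey", http.StatusNotFound)
+//	}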
+var ( + ErrNotFound = errors.New("origin: not found") + ErrAuth = errors.New("origin: auth") + ErrThrottle = errors.New("origin: throttle") +) + +// OriginETagChangedError is returned by GetRange when the origin +// rejects the If-Match precondition. +type OriginETagChangedError struct { + Bucket string + Key string + Want string + Got string +} + +func (e *OriginETagChangedError) Error() string { + return fmt.Sprintf("origin etag changed for %s/%s: want=%q got=%q", + e.Bucket, e.Key, e.Want, e.Got) +} + +// UnsupportedBlobTypeError is returned by azureblob.Head when the +// target is a Page or Append blob (design.md s9). +type UnsupportedBlobTypeError struct { + Bucket string + Key string + BlobType string +} + +func (e *UnsupportedBlobTypeError) Error() string { + return fmt.Sprintf("origin unsupported blob type %s for %s/%s", + e.BlobType, e.Bucket, e.Key) +} diff --git a/internal/orca/server/server.go b/internal/orca/server/server.go new file mode 100644 index 00000000..2a1f5546 --- /dev/null +++ b/internal/orca/server/server.go @@ -0,0 +1,434 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package server holds the HTTP handlers for the client edge and the +// internal-listener. +// +// Client edge (8443): GET /{bucket}/{key} (with optional Range), HEAD, +// LIST. No auth in dev (server.auth.enabled=false). +// +// Internal listener (8444): GET /internal/fill?. No mTLS in +// dev (cluster.internal_tls.enabled=false). +package server + +import ( + "context" + "encoding/xml" + "errors" + "fmt" + "io" + "log/slog" + "net/http" + "strconv" + "strings" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// EdgeHandler implements the client-edge S3 surface. +type EdgeHandler struct { + fc edgeFetchAPI + cfg *config.Config + log *slog.Logger +} + +// edgeFetchAPI is the surface area EdgeHandler depends on. The real +// *fetch.Coordinator satisfies it; tests substitute small fakes for +// deterministic unit-level coverage. +type edgeFetchAPI interface { + HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetChunk(ctx context.Context, k chunk.Key) (io.ReadCloser, error) + Origin() origin.Origin +} + +// NewEdgeHandler wires the edge handler. +func NewEdgeHandler(fc edgeFetchAPI, cfg *config.Config, log *slog.Logger) *EdgeHandler { + return &EdgeHandler{fc: fc, cfg: cfg, log: log} +} + +// ServeHTTP routes incoming client requests. +// +// Routing (path-style only, since LocalStack and most dev clients +// use path-style): +// +// GET / -> ListBuckets (not supported; 405) +// GET /{bucket}/?list-type=2&prefix=... -> ListObjectsV2 +// GET /{bucket}/ -> ListObjectsV2 (default) +// GET /{bucket}/{key} -> GetObject (with optional Range) +// HEAD /{bucket}/{key} -> HeadObject +// HEAD /{bucket}/ -> HeadBucket (not supported; 405) +func (h *EdgeHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if h.cfg.Server.Auth.Enabled { + // Stub: production would dispatch to bearer/mTLS validation. + // In dev (auth.enabled=false) we skip entirely. 
+ http.Error(w, "auth required (server.auth.enabled=true) but not implemented in MVP", + http.StatusUnauthorized) + + return + } + + bucket, key := splitPath(r.URL.Path) + + switch r.Method { + case http.MethodHead: + if key == "" { + h.notImplemented(w, "HeadBucket") + return + } + + h.handleHead(w, r, bucket, key) + case http.MethodGet: + if key == "" { + h.handleList(w, r, bucket) + return + } + + h.handleGet(w, r, bucket, key) + default: + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + } +} + +func (h *EdgeHandler) handleHead(w http.ResponseWriter, r *http.Request, bucket, key string) { + info, err := h.fc.HeadObject(r.Context(), bucket, key) + if err != nil { + h.writeOriginError(w, err) + return + } + + setObjectHeaders(w, info) + // HEAD must report the Content-Length the GET response would carry. + w.Header().Set("Content-Length", strconv.FormatInt(info.Size, 10)) + w.WriteHeader(http.StatusOK) +} + +func (h *EdgeHandler) handleGet(w http.ResponseWriter, r *http.Request, bucket, key string) { + info, err := h.fc.HeadObject(r.Context(), bucket, key) + if err != nil { + h.writeOriginError(w, err) + return + } + + // Determine byte range. + var ( + rangeStart int64 + rangeEnd = info.Size - 1 + hasRange bool + statusCode = http.StatusOK + ) + if rh := r.Header.Get("Range"); rh != "" { + s, e, ok := parseSimpleByteRange(rh, info.Size) + if !ok { + http.Error(w, "invalid Range", http.StatusRequestedRangeNotSatisfiable) + return + } + + rangeStart, rangeEnd = s, e + hasRange = true + statusCode = http.StatusPartialContent + } + + if rangeStart > rangeEnd { + http.Error(w, "range not satisfiable", http.StatusRequestedRangeNotSatisfiable) + return + } + + chunkSize := h.cfg.Chunking.Size + firstChunk, lastChunk := chunk.IndexRange(rangeStart, rangeEnd, chunkSize, info.Size) + + // Set headers eagerly (Option D commit boundary == first byte from + // origin; for cache hit, immediate). + setObjectHeaders(w, info) + w.Header().Set("Content-Length", strconv.FormatInt(rangeEnd-rangeStart+1, 10)) + + if hasRange { + w.Header().Set("Content-Range", + fmt.Sprintf("bytes %d-%d/%d", rangeStart, rangeEnd, info.Size)) + } + + // Write status now; subsequent failures become mid-stream aborts. + w.WriteHeader(statusCode) + + for ci := firstChunk; ci <= lastChunk; ci++ { + ckey := chunk.Key{ + OriginID: h.cfg.Origin.ID, + Bucket: bucket, + ObjectKey: key, + ETag: info.ETag, + ChunkSize: chunkSize, + Index: ci, + } + + body, err := h.fc.GetChunk(r.Context(), ckey) + if err != nil { + // We've already sent headers; abort the response. + h.log.Warn("mid-stream chunk fetch failed", + "bucket", bucket, "key", key, "chunk", ci, "err", err) + + return + } + + off, length := chunk.ChunkSlice(ci, chunkSize, rangeStart, rangeEnd, info.Size) + if err := streamSlice(w, body, off, length); err != nil { + body.Close() //nolint:errcheck // chunk body close best-effort, response already streaming + h.log.Warn("mid-stream copy failed", + "bucket", bucket, "key", key, "chunk", ci, "err", err) + + return + } + + body.Close() //nolint:errcheck // chunk body close best-effort, response already streaming + + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + } +} + +// streamSlice copies length bytes starting at off from src to dst. 
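+// The offset skip exists because the first and last chunks of a ranged
+// GET may be only partially covered by the request. Worked example
+// (assuming 4 MiB chunks and zero-based chunk indexing, as handleGet's
+// use of chunk.IndexRange suggests): a request for bytes 5 MiB through
+// 6 MiB-1 falls entirely inside chunk 1, so streamSlice discards the
+// first 1 MiB of that chunk and copies the next 1 MiB.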
+func streamSlice(dst io.Writer, src io.Reader, off, length int64) error { + if off > 0 { + if _, err := io.CopyN(io.Discard, src, off); err != nil { + return err + } + } + + if length > 0 { + if _, err := io.CopyN(dst, src, length); err != nil { + return err + } + } + + return nil +} + +// handleList is a thin pass-through to Origin.List for v1 prototype. +func (h *EdgeHandler) handleList(w http.ResponseWriter, r *http.Request, bucket string) { + // Pass-through; very minimal S3 ListObjectsV2 shape. Reviewers can + // curl this for sanity but full S3 list semantics are not in MVP. + prefix := r.URL.Query().Get("prefix") + marker := r.URL.Query().Get("continuation-token") + maxStr := r.URL.Query().Get("max-keys") + maxKeys := 1000 + + if maxStr != "" { + if v, err := strconv.Atoi(maxStr); err == nil && v > 0 { + maxKeys = v + } + } + + type listEntry struct { + Key string `xml:"Key"` + Size int64 `xml:"Size"` + ETag string `xml:"ETag"` + } + + type listResult struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + MaxKeys int `xml:"MaxKeys"` + IsTruncated bool `xml:"IsTruncated"` + NextMarker string `xml:"NextContinuationToken,omitempty"` + Contents []listEntry `xml:"Contents"` + } + + or := h.fc.Origin() + + res, err := or.List(r.Context(), bucket, prefix, marker, maxKeys) + if err != nil { + h.writeOriginError(w, err) + return + } + + body := listResult{ + Name: bucket, + Prefix: prefix, + KeyCount: len(res.Entries), + MaxKeys: maxKeys, + IsTruncated: res.IsTruncated, + NextMarker: res.NextMarker, + } + for _, e := range res.Entries { + body.Contents = append(body.Contents, listEntry{Key: e.Key, Size: e.Size, ETag: e.ETag}) + } + + w.Header().Set("Content-Type", "application/xml") + w.WriteHeader(http.StatusOK) + enc := xml.NewEncoder(w) + _ = enc.Encode(body) //nolint:errcheck // headers already sent; mid-stream encode error not actionable +} + +func (h *EdgeHandler) notImplemented(w http.ResponseWriter, op string) { + http.Error(w, op+" not implemented in MVP", http.StatusNotImplemented) +} + +func (h *EdgeHandler) writeOriginError(w http.ResponseWriter, err error) { + switch { + case errors.Is(err, origin.ErrNotFound): + http.Error(w, "NoSuchKey", http.StatusNotFound) + case errors.Is(err, origin.ErrAuth): + http.Error(w, "Unauthorized origin", http.StatusBadGateway) + default: + var ( + ube *origin.UnsupportedBlobTypeError + ec *origin.OriginETagChangedError + ) + + switch { + case errors.As(err, &ube): + http.Error(w, "OriginUnsupported: "+ube.Error(), http.StatusBadGateway) + case errors.As(err, &ec): + http.Error(w, "OriginETagChanged", http.StatusBadGateway) + default: + h.log.Warn("origin error", "err", err) + http.Error(w, "OriginUnreachable", http.StatusBadGateway) + } + } +} + +func setObjectHeaders(w http.ResponseWriter, info origin.ObjectInfo) { + if info.ContentType != "" { + w.Header().Set("Content-Type", info.ContentType) + } + + if info.ETag != "" { + w.Header().Set("ETag", "\""+info.ETag+"\"") + } + + w.Header().Set("Accept-Ranges", "bytes") +} + +func splitPath(p string) (bucket, key string) { + p = strings.TrimPrefix(p, "/") + if p == "" { + return "", "" + } + + idx := strings.IndexByte(p, '/') + if idx < 0 { + return p, "" + } + + return p[:idx], p[idx+1:] +} + +func parseSimpleByteRange(h string, size int64) (start, end int64, ok bool) { + if !strings.HasPrefix(h, "bytes=") { + return 0, 0, false + } + + spec := strings.TrimPrefix(h, "bytes=") + + parts := strings.Split(spec, 
"-") + if len(parts) != 2 { + return 0, 0, false + } + + if parts[0] == "" { + // Suffix: -N (last N bytes) + n, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil || n <= 0 || n > size { + return 0, 0, false + } + + return size - n, size - 1, true + } + + s, err := strconv.ParseInt(parts[0], 10, 64) + if err != nil || s < 0 { + return 0, 0, false + } + + if parts[1] == "" { + return s, size - 1, true + } + + e, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil || e < s { + return 0, 0, false + } + + if e >= size { + e = size - 1 + } + + return s, e, true +} + +// InternalHandler implements GET /internal/fill on the internal +// listener. Plain HTTP/2 (no mTLS) in dev. +type InternalHandler struct { + fc internalFetchAPI + cl *cluster.Cluster + log *slog.Logger +} + +// internalFetchAPI is the surface area InternalHandler depends on. The +// real *fetch.Coordinator satisfies it; tests substitute small fakes. +type internalFetchAPI interface { + FillForPeer(ctx context.Context, k chunk.Key) (io.ReadCloser, error) +} + +// NewInternalHandler wires the internal handler. +func NewInternalHandler(fc internalFetchAPI, cl *cluster.Cluster, log *slog.Logger) *InternalHandler { + return &InternalHandler{fc: fc, cl: cl, log: log} +} + +// ServeHTTP handles GET /internal/fill?. +func (h *InternalHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/internal/fill" { + http.NotFound(w, r) + return + } + + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + if r.Header.Get("X-Orca-Internal") != "1" { + http.Error(w, "missing X-Orca-Internal header", http.StatusBadRequest) + return + } + + k, err := cluster.DecodeChunkKey(r.URL.Query()) + if err != nil { + http.Error(w, "invalid chunk key: "+err.Error(), http.StatusBadRequest) + return + } + + if !h.cl.IsCoordinator(k) { + http.Error(w, `{"reason":"not_coordinator"}`, http.StatusConflict) + return + } + + body, err := h.fc.FillForPeer(r.Context(), k) + if err != nil { + h.log.Warn("internal fill failed", "chunk", k.String(), "err", err) + http.Error(w, "fill failed", http.StatusBadGateway) + + return + } + defer body.Close() //nolint:errcheck // internal-fill body close best-effort + + w.Header().Set("Content-Type", "application/octet-stream") + w.WriteHeader(http.StatusOK) + + if _, copyErr := io.Copy(w, body); copyErr != nil { + h.log.Warn("internal fill copy failed", "chunk", k.String(), "err", copyErr) + } +} + +// Compile-time check that the cachestore.ErrNotFound mapping survives +// dead-code elimination across handlers (used only via errors.Is in +// production code paths). +var ( + _ = cachestore.ErrNotFound + _ = context.Canceled +) diff --git a/internal/orca/server/server_test.go b/internal/orca/server/server_test.go new file mode 100644 index 00000000..64999464 --- /dev/null +++ b/internal/orca/server/server_test.go @@ -0,0 +1,482 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package server + +import ( + "context" + "encoding/xml" + "errors" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// fakeEdgeAPI satisfies edgeFetchAPI with canned responses for unit +// tests. Only the field for the call you want to mock needs to be +// set; an unset *Func panics if the test invokes the corresponding +// method. 
+type fakeEdgeAPI struct { + HeadObjectFunc func(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetChunkFunc func(ctx context.Context, k chunk.Key) (io.ReadCloser, error) + OriginVal origin.Origin +} + +func (f *fakeEdgeAPI) HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + return f.HeadObjectFunc(ctx, bucket, key) +} + +func (f *fakeEdgeAPI) GetChunk(ctx context.Context, k chunk.Key) (io.ReadCloser, error) { + return f.GetChunkFunc(ctx, k) +} + +func (f *fakeEdgeAPI) Origin() origin.Origin { return f.OriginVal } + +// fakeOrigin satisfies origin.Origin for handler tests. Only the +// fields used in the test need to be populated. +type fakeOrigin struct { + HeadFunc func(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetRangeFunc func(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) + ListFunc func(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) +} + +func (f *fakeOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + return f.HeadFunc(ctx, bucket, key) +} + +func (f *fakeOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + return f.GetRangeFunc(ctx, bucket, key, etag, off, n) +} + +func (f *fakeOrigin) List(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) { + return f.ListFunc(ctx, bucket, prefix, marker, max) +} + +// TestWriteOriginError covers all five branches of the error mapping. +// Previously only ErrNotFound was exercised (via integration test). +func TestWriteOriginError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + wantStatus int + wantBody string + }{ + { + name: "not found", + err: origin.ErrNotFound, + wantStatus: http.StatusNotFound, + wantBody: "NoSuchKey", + }, + { + name: "auth", + err: origin.ErrAuth, + wantStatus: http.StatusBadGateway, + wantBody: "Unauthorized origin", + }, + { + name: "unsupported blob type", + err: &origin.UnsupportedBlobTypeError{ + Bucket: "ctr", + Key: "page-blob", + BlobType: "PageBlob", + }, + wantStatus: http.StatusBadGateway, + wantBody: "OriginUnsupported", + }, + { + name: "etag changed", + err: &origin.OriginETagChangedError{ + Bucket: "b", Key: "k", Want: "old", Got: "new", + }, + wantStatus: http.StatusBadGateway, + wantBody: "OriginETagChanged", + }, + { + name: "generic error", + err: errors.New("unexpected"), + wantStatus: http.StatusBadGateway, + wantBody: "OriginUnreachable", + }, + } + + h := &EdgeHandler{log: discardLogger()} + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rr := httptest.NewRecorder() + h.writeOriginError(rr, tt.err) + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d", rr.Code, tt.wantStatus) + } + + if !strings.Contains(rr.Body.String(), tt.wantBody) { + t.Errorf("body %q does not contain %q", rr.Body.String(), tt.wantBody) + } + }) + } +} + +// TestHandleHead covers metadata propagation and the not-found error +// path on HEAD requests. 
+func TestHandleHead(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + info origin.ObjectInfo + err error + wantStatus int + wantHdrs map[string]string + }{ + { + name: "normal blob", + info: origin.ObjectInfo{ + Size: 1024, + ETag: "abc123", + ContentType: "application/octet-stream", + }, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "1024", + "ETag": `"abc123"`, + "Content-Type": "application/octet-stream", + }, + }, + { + name: "missing content type omits header", + info: origin.ObjectInfo{Size: 99, ETag: "x"}, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "99", + "ETag": `"x"`, + }, + }, + { + name: "missing etag omits header", + info: origin.ObjectInfo{Size: 7}, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "7", + }, + }, + { + name: "origin not found yields 404", + err: origin.ErrNotFound, + wantStatus: http.StatusNotFound, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return tt.info, tt.err + }, + } + h := NewEdgeHandler(fc, &config.Config{}, discardLogger()) + + req := httptest.NewRequest(http.MethodHead, "/bucket/key", nil) + rr := httptest.NewRecorder() + h.handleHead(rr, req, "bucket", "key") + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d", rr.Code, tt.wantStatus) + } + + for k, want := range tt.wantHdrs { + got := rr.Header().Get(k) + if got != want { + t.Errorf("header %s=%q want %q", k, got, want) + } + } + + if rr.Body.Len() != 0 && tt.wantStatus == http.StatusOK { + t.Errorf("HEAD body should be empty; got %d bytes", rr.Body.Len()) + } + }) + } +} + +// TestHandleList covers the XML pass-through, prefix propagation, +// truncation, and empty-list handling. 
+func TestHandleList(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + prefix string + listResult origin.ListResult + listErr error + wantStatus int + wantKeys []string + wantTrunc bool + wantNextTok string + }{ + { + name: "normal list", + prefix: "alpha/", + listResult: origin.ListResult{ + Entries: []origin.ObjectEntry{ + {Key: "alpha/one", Size: 3, ETag: "e1"}, + {Key: "alpha/two", Size: 5, ETag: "e2"}, + }, + }, + wantStatus: http.StatusOK, + wantKeys: []string{"alpha/one", "alpha/two"}, + }, + { + name: "empty list", + prefix: "missing/", + listResult: origin.ListResult{}, + wantStatus: http.StatusOK, + wantKeys: nil, + }, + { + name: "truncated list", + listResult: origin.ListResult{ + Entries: []origin.ObjectEntry{{Key: "k1"}}, + IsTruncated: true, + NextMarker: "next-page", + }, + wantStatus: http.StatusOK, + wantKeys: []string{"k1"}, + wantTrunc: true, + wantNextTok: "next-page", + }, + { + name: "origin error yields 502", + listErr: errors.New("upstream broken"), + wantStatus: http.StatusBadGateway, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + or := &fakeOrigin{ + ListFunc: func(_ context.Context, bucket, prefix, _ string, _ int) (origin.ListResult, error) { + if bucket != "b" { + t.Errorf("bucket=%q want %q", bucket, "b") + } + + if prefix != tt.prefix { + t.Errorf("prefix=%q want %q", prefix, tt.prefix) + } + + return tt.listResult, tt.listErr + }, + } + fc := &fakeEdgeAPI{OriginVal: or} + h := NewEdgeHandler(fc, &config.Config{}, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, + "/b/?list-type=2&prefix="+tt.prefix, nil) + rr := httptest.NewRecorder() + h.handleList(rr, req, "b") + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d body=%s", rr.Code, tt.wantStatus, rr.Body.String()) + } + + if tt.wantStatus != http.StatusOK { + return + } + + var got struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + IsTruncated bool `xml:"IsTruncated"` + NextMarker string `xml:"NextContinuationToken"` + Contents []struct { + Key string `xml:"Key"` + } `xml:"Contents"` + } + if err := xml.Unmarshal(rr.Body.Bytes(), &got); err != nil { + t.Fatalf("xml decode: %v body=%s", err, rr.Body.String()) + } + + if got.Name != "b" { + t.Errorf("Name=%q want %q", got.Name, "b") + } + + if got.Prefix != tt.prefix { + t.Errorf("Prefix=%q want %q", got.Prefix, tt.prefix) + } + + if got.KeyCount != len(tt.wantKeys) { + t.Errorf("KeyCount=%d want %d", got.KeyCount, len(tt.wantKeys)) + } + + if got.IsTruncated != tt.wantTrunc { + t.Errorf("IsTruncated=%v want %v", got.IsTruncated, tt.wantTrunc) + } + + if got.NextMarker != tt.wantNextTok { + t.Errorf("NextMarker=%q want %q", got.NextMarker, tt.wantNextTok) + } + + gotKeys := make([]string, 0, len(got.Contents)) + for _, c := range got.Contents { + gotKeys = append(gotKeys, c.Key) + } + + if !equalStrings(gotKeys, tt.wantKeys) { + t.Errorf("keys=%v want %v", gotKeys, tt.wantKeys) + } + }) + } +} + +// TestParseSimpleByteRange covers all parser branches. 
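+// The parser accepts only the single-range forms of the HTTP Range
+// grammar ("bytes=start-end", "bytes=start-", "bytes=-suffix");
+// multi-range specs are rejected.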
+func TestParseSimpleByteRange(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + header string + size int64 + wantStart int64 + wantEnd int64 + wantOK bool + }{ + {"normal range", "bytes=0-99", 1024, 0, 99, true}, + {"suffix range", "bytes=-100", 1024, 924, 1023, true}, + {"open-ended", "bytes=100-", 1024, 100, 1023, true}, + {"end clamped to size", "bytes=0-9999", 1024, 0, 1023, true}, + {"start > end rejected", "bytes=100-50", 1024, 0, 0, false}, + {"missing prefix rejected", "0-99", 1024, 0, 0, false}, + {"multi-range rejected", "bytes=0-99,200-299", 1024, 0, 0, false}, + {"empty rejected", "", 1024, 0, 0, false}, + {"bytes= alone rejected", "bytes=", 1024, 0, 0, false}, + {"suffix larger than size rejected", "bytes=-9999", 1024, 0, 0, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s, e, ok := parseSimpleByteRange(tt.header, tt.size) + if ok != tt.wantOK { + t.Fatalf("ok=%v want %v (s=%d e=%d)", ok, tt.wantOK, s, e) + } + + if !ok { + return + } + + if s != tt.wantStart || e != tt.wantEnd { + t.Errorf("(s,e)=(%d,%d) want (%d,%d)", s, e, tt.wantStart, tt.wantEnd) + } + }) + } +} + +// TestSplitPath covers path splitting edge cases. +func TestSplitPath(t *testing.T) { + t.Parallel() + + tests := []struct { + in string + wantBucket string + wantKey string + }{ + {"", "", ""}, + {"/", "", ""}, + {"/bucket", "bucket", ""}, + {"/bucket/", "bucket", ""}, + {"/bucket/key", "bucket", "key"}, + {"/bucket/path/to/key", "bucket", "path/to/key"}, + } + + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + b, k := splitPath(tt.in) + if b != tt.wantBucket || k != tt.wantKey { + t.Errorf("splitPath(%q)=(%q,%q) want (%q,%q)", + tt.in, b, k, tt.wantBucket, tt.wantKey) + } + }) + } +} + +// TestSetObjectHeaders covers header propagation including the +// always-set Accept-Ranges and the conditionally-set fields. +func TestSetObjectHeaders(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + info origin.ObjectInfo + want map[string]string + }{ + { + name: "all fields set", + info: origin.ObjectInfo{ETag: "abc", ContentType: "text/plain"}, + want: map[string]string{ + "ETag": `"abc"`, + "Content-Type": "text/plain", + "Accept-Ranges": "bytes", + }, + }, + { + name: "missing content type", + info: origin.ObjectInfo{ETag: "abc"}, + want: map[string]string{ + "ETag": `"abc"`, + "Content-Type": "", + "Accept-Ranges": "bytes", + }, + }, + { + name: "missing etag", + info: origin.ObjectInfo{ContentType: "text/plain"}, + want: map[string]string{ + "ETag": "", + "Content-Type": "text/plain", + "Accept-Ranges": "bytes", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rr := httptest.NewRecorder() + setObjectHeaders(rr, tt.info) + + for k, want := range tt.want { + if got := rr.Header().Get(k); got != want { + t.Errorf("header %s=%q want %q", k, got, want) + } + } + }) + } +} + +// helpers + +func discardLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +func equalStrings(a, b []string) bool { + if len(a) != len(b) { + return false + } + + for i := range a { + if a[i] != b[i] { + return false + } + } + + return true +}
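+
+// TestHandleGetRangeSketch is an illustrative sketch of how a ranged
+// GET flows through handleGet with the in-package fakes. It assumes
+// config.Config.Chunking is an addressable value struct (so Size is
+// settable on a zero value) and that chunk.IndexRange tolerates a
+// chunk size larger than the object; the assertions therefore stick to
+// the headers handleGet writes itself and do not depend on
+// chunk-slicing internals.
+func TestHandleGetRangeSketch(t *testing.T) {
+	t.Parallel()
+
+	const payload = "0123456789abcdef" // 16-byte object
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return origin.ObjectInfo{Size: int64(len(payload)), ETag: "e"}, nil
+		},
+		GetChunkFunc: func(_ context.Context, _ chunk.Key) (io.ReadCloser, error) {
+			return io.NopCloser(strings.NewReader(payload)), nil
+		},
+	}
+
+	cfg := &config.Config{}
+	cfg.Chunking.Size = 1 << 20 // hypothetical 1 MiB chunk size
+
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	req := httptest.NewRequest(http.MethodGet, "/b/k", nil)
+	req.Header.Set("Range", "bytes=4-7")
+	rr := httptest.NewRecorder()
+	h.handleGet(rr, req, "b", "k")
+
+	if rr.Code != http.StatusPartialContent {
+		t.Fatalf("status=%d want %d", rr.Code, http.StatusPartialContent)
+	}
+
+	if got := rr.Header().Get("Content-Range"); got != "bytes 4-7/16" {
+		t.Errorf("Content-Range=%q want %q", got, "bytes 4-7/16")
+	}
+
+	if got := rr.Header().Get("Content-Length"); got != "4" {
+		t.Errorf("Content-Length=%q want %q", got, "4")
+	}
+}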