Allow the sidecar to sample from a list of prefill host ports

smarterclayton · smarterclayton · commit 1143dac58aa5 · 2025-10-04T16:22:45.000-04:00
In some benchmarking and test environments, dynamic prefill selection
may be difficult, and random selection among a set of hosts is
sufficient.

Add a new `--enable-prefiller-sampling` flag that instructs the
sidecar to select a random prefill host from the provided list
instead of the first one. Make the behavior opt-in to prevent
users from accidentally depending on the new behavior, and
keep the existing default behavior (first header value) consistent.

E.g.:

    curl -H 'x-prefiller-host-port: server1:8000' -H 'x-prefiller-host-port: server2:8000'

will randomly choose one of the two values when `--enable-prefiller-sampling`
is set; otherwise the first value is used.

Signed-off-by: Clayton Coleman <smarterclayton@gmail.com>
diff --git a/cmd/llm-d-routing-sidecar/main.go b/cmd/llm-d-routing-sidecar/main.go
@@ -20,6 +20,7 @@ import (
 	"flag"
 	"net/url"
 	"os"
+	"strconv"
 
 	"k8s.io/klog/v2"
 
@@ -43,6 +44,7 @@ func main() {
 	enableSSRFProtection := flag.Bool("enable-ssrf-protection", false, "enable SSRF protection using InferencePool allowlisting")
 	inferencePoolNamespace := flag.String("inference-pool-namespace", os.Getenv("INFERENCE_POOL_NAMESPACE"), "the Kubernetes namespace to watch for InferencePool resources (defaults to INFERENCE_POOL_NAMESPACE env var)")
 	inferencePoolName := flag.String("inference-pool-name", os.Getenv("INFERENCE_POOL_NAME"), "the specific InferencePool name to watch (defaults to INFERENCE_POOL_NAME env var)")
+	enablePrefillerSampling := flag.Bool("enable-prefiller-sampling", func() bool { b, _ := strconv.ParseBool(os.Getenv("ENABLE_PREFILLER_SAMPLING")); return b }(), "if true, the target prefill instance will be selected randomly from among the provided prefill host values")
 
 	klog.InitFlags(nil)
 	flag.Parse()
@@ -97,6 +99,7 @@ func main() {
 		EnableSSRFProtection:        *enableSSRFProtection,
 		InferencePoolNamespace:      *inferencePoolNamespace,
 		InferencePoolName:           *inferencePoolName,
+		EnablePrefillerSampling:     *enablePrefillerSampling,
 	}
 
 	proxy, err := proxy.NewProxy(*port, targetURL, config)
diff --git a/internal/proxy/chat_completions.go b/internal/proxy/chat_completions.go
@@ -17,7 +17,9 @@ limitations under the License.
 package proxy
 
 import (
+	"math/rand"
 	"net/http"
+	"strings"
 )
 
 var (
@@ -29,30 +31,63 @@ var (
 )
 
+// chatCompletionsHandler serves chat completion requests. When the request
+// names at least one prefill target (requestHeaderPrefillHostPort, or the
+// legacy requestHeaderPrefillURL), one target is chosen, checked against the
+// allowlist validator, and handed to the connector protocol. Without a usable
+// target the request goes straight to the decoder proxy.
 func (s *Server) chatCompletionsHandler(w http.ResponseWriter, r *http.Request) {
-	prefillPodHostPort := r.Header.Get(requestHeaderPrefillHostPort)
+	prefillHostPorts := splitHeaderValues(r.Header.Values(requestHeaderPrefillHostPort))
 
-	if prefillPodHostPort == "" {
+	if len(prefillHostPorts) == 0 {
 		// backward compatible behavior: to remove in next release
-		prefillPodHostPort = r.Header.Get(requestHeaderPrefillURL)
+		prefillHostPorts = splitHeaderValues(r.Header.Values(requestHeaderPrefillURL))
 	}
 
-	if prefillPodHostPort == "" {
+	var prefillHostPort string
+	if numHosts := len(prefillHostPorts); numHosts > 0 {
+		// Use the first value unless sampling is enabled, consistent with
+		// previous behavior.
+		selected := 0
+		if s.config.EnablePrefillerSampling {
+			selected = rand.Intn(numHosts)
+		}
+		prefillHostPort = prefillHostPorts[selected]
+	}
+
+	if prefillHostPort == "" {
 		s.logger.V(4).Info("skip disaggregated prefill")
 		s.decoderProxy.ServeHTTP(w, r)
 		return
 	}
 
 	// SSRF Protection: Check if the prefill target is allowed
-	if !s.allowlistValidator.IsAllowed(prefillPodHostPort) {
+	if !s.allowlistValidator.IsAllowed(prefillHostPort) {
 		s.logger.Error(nil, "SSRF protection: prefill target not in allowlist",
-			"target", prefillPodHostPort,
+			"target", prefillHostPort,
 			"clientIP", r.RemoteAddr,
 			"userAgent", r.Header.Get("User-Agent"),
 			"requestPath", r.URL.Path)
 		http.Error(w, "Forbidden: prefill target not allowed by SSRF protection", http.StatusForbidden)
 		return
 	}
 
-	s.logger.V(4).Info("SSRF protection: prefill target allowed", "target", prefillPodHostPort)
-	s.runConnectorProtocol(w, r, prefillPodHostPort)
+	s.logger.V(4).Info("SSRF protection: prefill target allowed", "target", prefillHostPort)
+	s.runConnectorProtocol(w, r, prefillHostPort)
 }
+
+// splitHeaderValues flattens raw header values into trimmed, non-empty
+// entries. https://datatracker.ietf.org/doc/html/rfc7230#section-3.2.2 lets
+// proxies combine repeated header lines into one comma-separated line, so
+// every value is split on commas. Blank entries are dropped so that a stray
+// or trailing comma can never be selected as a prefill target.
+func splitHeaderValues(values []string) []string {
+	var entries []string
+	for _, value := range values {
+		for _, entry := range strings.Split(value, ",") {
+			if entry = strings.TrimSpace(entry); entry != "" {
+				entries = append(entries, entry)
+			}
+		}
+	}
+	return entries
+}
diff --git a/internal/proxy/proxy.go b/internal/proxy/proxy.go
@@ -88,6 +88,10 @@ type Config struct {
 
 	// InferencePoolName InferencePool object name.
 	InferencePoolName string
+
+	// EnablePrefillerSampling configures the proxy to randomly choose from the set
+	// of provided prefill hosts instead of always using the first one.
+	EnablePrefillerSampling bool
 }
 
 type protocolRunner func(http.ResponseWriter, *http.Request, string)
@@ -265,7 +269,7 @@ func (s *Server) createRoutes() *http.ServeMux {
 		// Log errors from the decoder proxy
 		switch {
 		case errors.Is(err, syscall.ECONNREFUSED):
-			s.logger.Error(err, "waiting for vLLM to be ready")
+			s.logger.Error(err, "waiting for model server to be ready")
 		default:
 			s.logger.Error(err, "http: proxy error")
 		}