Update YAML manifests for jetstream-pytorch v0.2.4 (#1544)

Co-authored-by: Nim Jayawardena <[email protected]>
GoogleCloudPlatform · Dec 4, 2024 · 32872d6
1 parent 0ce99b8
commit 32872d6
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 28 deletions.
diff --git a/ai-ml/llm-serving-tpus-jetstream/pytorch/jetstream-pytorch-gemma-7b-it-2x4.yaml b/ai-ml/llm-serving-tpus-jetstream/pytorch/jetstream-pytorch-gemma-7b-it-2x4.yaml
@@ -34,39 +34,65 @@ spec:
         cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
       containers:
       - name: jetstream-pytorch-server
-        image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.3
+        image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.4
         args:
-        - --size=7b
-        - --model_name=gemma
-        - --batch_size=32
-        - --max_cache_length=2048
-        - --quantize_weights=False
-        - --quantize_kv_cache=False
-        - --tokenizer_path=/models/pytorch/gemma-7b-it/final/bf16/tokenizer.model
-        - --checkpoint_path=/models/pytorch/gemma-7b-it/final/bf16/model.safetensors
-        ports:
-        - containerPort: 9000
+        - --model_id=google/gemma-7b-it
+        - --override_batch_size=32
+        - --working_dir=/models/pytorch/
+        - --enable_model_warmup=True
         volumeMounts:
         - name: gcs-fuse-checkpoint
           mountPath: /models
+        - name: huggingface-credentials
+          mountPath: /huggingface
           readOnly: true
+        ports:
+        - containerPort: 9000
         resources:
           requests:
             google.com/tpu: 8
           limits:
             google.com/tpu: 8
+        startupProbe:
+          httpGet:
+            path: /healthcheck
+            port: 8000
+            scheme: HTTP
+          periodSeconds: 60
+          initialDelaySeconds: 90
+          failureThreshold: 50
+        livenessProbe:
+          httpGet:
+            path: /healthcheck
+            port: 8000
+            scheme: HTTP
+          periodSeconds: 60
+          failureThreshold: 30
+        readinessProbe:
+          httpGet:
+            path: /healthcheck
+            port: 8000
+            scheme: HTTP
+          periodSeconds: 60
+          failureThreshold: 30
       - name: jetstream-http
-        image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2
+        image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.3
         ports:
         - containerPort: 8000
       volumes:
+      - name: huggingface-credentials
+        secret:
+          defaultMode: 0400
+          secretName: huggingface-secret
+      - name: gke-gcsfuse-cache
+        emptyDir:
+          medium: Memory
       - name: gcs-fuse-checkpoint
         csi:
           driver: gcsfuse.csi.storage.gke.io
-          readOnly: true
           volumeAttributes:
             bucketName: BUCKET_NAME
-            mountOptions: "implicit-dirs"
+            mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
 ---
 apiVersion: v1
 kind: Service

diff --git a/ai-ml/llm-serving-tpus-jetstream/pytorch/jetstream-pytorch-llama-3-8b-2x4.yaml b/ai-ml/llm-serving-tpus-jetstream/pytorch/jetstream-pytorch-llama-3-8b-2x4.yaml
@@ -34,39 +34,65 @@ spec:
         cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
       containers:
       - name: jetstream-pytorch-server
-        image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.3
+        image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.4
         args:
-        - --size=8b
-        - --model_name=llama-3
-        - --batch_size=32
-        - --max_cache_length=2048
-        - --quantize_weights=False
-        - --quantize_kv_cache=False
-        - --tokenizer_path=/models/pytorch/llama-3-8b/final/bf16/tokenizer.model
-        - --checkpoint_path=/models/pytorch/llama-3-8b/final/bf16/model.safetensors
-        ports:
-        - containerPort: 9000
+        - --model_id=meta-llama/Meta-Llama-3-8B
+        - --override_batch_size=32
+        - --working_dir=/models/pytorch/
+        - --enable_model_warmup=True
         volumeMounts:
         - name: gcs-fuse-checkpoint
           mountPath: /models
+        - name: huggingface-credentials
+          mountPath: /huggingface
           readOnly: true
+        ports:
+        - containerPort: 9000
         resources:
           requests:
             google.com/tpu: 8
           limits:
             google.com/tpu: 8
+        startupProbe:
+          httpGet:
+            path: /healthcheck
+            port: 8000
+            scheme: HTTP
+          periodSeconds: 60
+          initialDelaySeconds: 90
+          failureThreshold: 50
+        livenessProbe:
+          httpGet:
+            path: /healthcheck
+            port: 8000
+            scheme: HTTP
+          periodSeconds: 60
+          failureThreshold: 30
+        readinessProbe:
+          httpGet:
+            path: /healthcheck
+            port: 8000
+            scheme: HTTP
+          periodSeconds: 60
+          failureThreshold: 30
       - name: jetstream-http
-        image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2
+        image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.3
         ports:
         - containerPort: 8000
       volumes:
+      - name: huggingface-credentials
+        secret:
+          defaultMode: 0400
+          secretName: huggingface-secret
+      - name: gke-gcsfuse-cache
+        emptyDir:
+          medium: Memory
       - name: gcs-fuse-checkpoint
         csi:
           driver: gcsfuse.csi.storage.gke.io
-          readOnly: true
           volumeAttributes:
             bucketName: BUCKET_NAME
-            mountOptions: "implicit-dirs"
+            mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
 ---
 apiVersion: v1
 kind: Service