Skip to content

Commit

Permalink
Update yamls for jetstream-pytorch-v0.2.4 (#1544)
Browse files: browse the repository at this point in the history
Co-authored-by: Nim Jayawardena <[email protected]>
  • Loading branch information
vivianrwu and NimJay authored Dec 4, 2024
1 parent 0ce99b8 commit 32872d6
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 28 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,39 +34,65 @@ spec:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
containers:
- name: jetstream-pytorch-server
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.3
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.4
args:
- --size=7b
- --model_name=gemma
- --batch_size=32
- --max_cache_length=2048
- --quantize_weights=False
- --quantize_kv_cache=False
- --tokenizer_path=/models/pytorch/gemma-7b-it/final/bf16/tokenizer.model
- --checkpoint_path=/models/pytorch/gemma-7b-it/final/bf16/model.safetensors
ports:
- containerPort: 9000
- --model_id=google/gemma-7b-it
- --override_batch_size=32
- --working_dir=/models/pytorch/
- --enable_model_warmup=True
volumeMounts:
- name: gcs-fuse-checkpoint
mountPath: /models
- name: huggingface-credentials
mountPath: /huggingface
readOnly: true
ports:
- containerPort: 9000
resources:
requests:
google.com/tpu: 8
limits:
google.com/tpu: 8
startupProbe:
httpGet:
path: /healthcheck
port: 8000
scheme: HTTP
periodSeconds: 60
initialDelaySeconds: 90
failureThreshold: 50
livenessProbe:
httpGet:
path: /healthcheck
port: 8000
scheme: HTTP
periodSeconds: 60
failureThreshold: 30
readinessProbe:
httpGet:
path: /healthcheck
port: 8000
scheme: HTTP
periodSeconds: 60
failureThreshold: 30
- name: jetstream-http
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.3
ports:
- containerPort: 8000
volumes:
- name: huggingface-credentials
secret:
defaultMode: 0400
secretName: huggingface-secret
- name: gke-gcsfuse-cache
emptyDir:
medium: Memory
- name: gcs-fuse-checkpoint
csi:
driver: gcsfuse.csi.storage.gke.io
readOnly: true
volumeAttributes:
bucketName: BUCKET_NAME
mountOptions: "implicit-dirs"
mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
---
apiVersion: v1
kind: Service
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,39 +34,65 @@ spec:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
containers:
- name: jetstream-pytorch-server
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.3
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.4
args:
- --size=8b
- --model_name=llama-3
- --batch_size=32
- --max_cache_length=2048
- --quantize_weights=False
- --quantize_kv_cache=False
- --tokenizer_path=/models/pytorch/llama-3-8b/final/bf16/tokenizer.model
- --checkpoint_path=/models/pytorch/llama-3-8b/final/bf16/model.safetensors
ports:
- containerPort: 9000
- --model_id=meta-llama/Meta-Llama-3-8B
- --override_batch_size=32
- --working_dir=/models/pytorch/
- --enable_model_warmup=True
volumeMounts:
- name: gcs-fuse-checkpoint
mountPath: /models
- name: huggingface-credentials
mountPath: /huggingface
readOnly: true
ports:
- containerPort: 9000
resources:
requests:
google.com/tpu: 8
limits:
google.com/tpu: 8
startupProbe:
httpGet:
path: /healthcheck
port: 8000
scheme: HTTP
periodSeconds: 60
initialDelaySeconds: 90
failureThreshold: 50
livenessProbe:
httpGet:
path: /healthcheck
port: 8000
scheme: HTTP
periodSeconds: 60
failureThreshold: 30
readinessProbe:
httpGet:
path: /healthcheck
port: 8000
scheme: HTTP
periodSeconds: 60
failureThreshold: 30
- name: jetstream-http
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.3
ports:
- containerPort: 8000
volumes:
- name: huggingface-credentials
secret:
defaultMode: 0400
secretName: huggingface-secret
- name: gke-gcsfuse-cache
emptyDir:
medium: Memory
- name: gcs-fuse-checkpoint
csi:
driver: gcsfuse.csi.storage.gke.io
readOnly: true
volumeAttributes:
bucketName: BUCKET_NAME
mountOptions: "implicit-dirs"
mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
---
apiVersion: v1
kind: Service
Expand Down

0 comments on commit 32872d6

Please sign in to comment.