-
Notifications
You must be signed in to change notification settings - Fork 2
/
Whisper-RayService.yaml
147 lines (146 loc) · 4.2 KB
/
Whisper-RayService.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
---
# RayService running a Whisper-based streaming transcription app on KubeRay.
# Deploys a Ray cluster (1 autoscaling head + GPU worker group) and the
# Ray Serve application graph defined under serveConfigV2.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: whisper-streaming
  namespace: default
spec:
  # Seconds a Serve deployment may be unhealthy before KubeRay restarts it.
  deploymentUnhealthySecondThreshold: 300
  rayClusterConfig:
    enableInTreeAutoscaling: true
    headGroupSpec:
      # rayStartParams values are strings (map[string]string in the CRD) — quote them.
      rayStartParams:
        dashboard-host: "0.0.0.0"
      template:
        spec:
          containers:
            - env:
                # Grafana/Prometheus endpoints let the Ray dashboard embed metrics panels.
                - name: RAY_GRAFANA_HOST
                  value: "http://kube-prometheus-stack-grafana.kube-prometheus-stack.svc:80"
                - name: RAY_PROMETHEUS_HOST
                  value: "http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc:9090"
                # Hugging Face token for the pyannote models, sourced from a Secret.
                - name: PYANNOTE_AUTH_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token
                      key: token
              image: public.ecr.aws/darrenlin/ray-whisper-streaming:latest
              name: ray-head
              ports:
                - containerPort: 6379
                  name: gcs
                  protocol: TCP
                - containerPort: 8265
                  name: dashboard
                  protocol: TCP
                - containerPort: 10001
                  name: client
                  protocol: TCP
                - containerPort: 8000
                  name: serve
                  protocol: TCP
              resources:
                limits:
                  cpu: "2"
                  memory: 8G
                requests:
                  cpu: "2"
                  memory: 8G
              securityContext:
                capabilities:
                  add:
                    # Needed by Ray's debugging/profiling tools (e.g. py-spy).
                    - SYS_PTRACE
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
          volumes:
            - emptyDir: {}
              name: ray-logs
    # Quoted so no YAML parser can mistake a version string for a number.
    rayVersion: "2.9.2"
    workerGroupSpecs:
      - groupName: gpu-group
        maxReplicas: 20
        minReplicas: 1
        rayStartParams: {}
        replicas: 2
        template:
          spec:
            containers:
              - env:
                  - name: PYANNOTE_AUTH_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token
                        key: token
                image: public.ecr.aws/darrenlin/ray-whisper-streaming:latest
                name: ray-worker
                resources:
                  limits:
                    cpu: "4"
                    memory: 16G
                    nvidia.com/gpu: 1
                  requests:
                    cpu: "3"
                    memory: 12G
                    nvidia.com/gpu: 1
            tolerations:
              # Allow scheduling onto GPU nodes tainted for Ray workers.
              - effect: NoSchedule
                key: ray.io/node-type
                operator: Equal
                value: worker
  # Ray Serve application config (parsed by Ray, not by Kubernetes) —
  # kept as a literal block scalar so its own YAML indentation is preserved.
  serveConfigV2: |
    applications:
      - name: whisper_app
        import_path: src.voice_stream_ai_server:entrypoint
        runtime_env:
          working_dir: "https://github.com/lindarr915/VoiceStreamAI/archive/49732a6034c2fe4f6f2dfb0f81ac2c0e1da6e3fe.zip"
        deployments:
          - name: TranscriptionServer
            max_concurrent_queries: 100
            autoscaling_config:
              target_num_ongoing_requests_per_replica: 5
              min_replicas: 1
              max_replicas: 5
              initial_replicas: 3
          - name: FasterWhisperASR
            max_concurrent_queries: 10
            autoscaling_config:
              target_num_ongoing_requests_per_replica: 2
              min_replicas: 1
              max_replicas: 20
              initial_replicas: 3
          - name: PyannoteVAD
            max_concurrent_queries: 10
            autoscaling_config:
              target_num_ongoing_requests_per_replica: 3
              min_replicas: 1
              max_replicas: 20
              initial_replicas: 3
  # Seconds the Serve app may be unhealthy before KubeRay recreates the cluster.
  serviceUnhealthySecondThreshold: 900
# Optional Ingress exposing the Ray dashboard and Serve endpoint — uncomment to use.
# ---
# apiVersion: networking.k8s.io/v1
# kind: Ingress
# metadata:
#   name: whisper-streaming
#   annotations:
#     nginx.ingress.kubernetes.io/rewrite-target: "/$1"
# spec:
#   ingressClassName: nginx
#   rules:
#     - http:
#         paths:
#           # Ray Dashboard
#           - path: /dashboard/(.*)
#             pathType: ImplementationSpecific
#             backend:
#               service:
#                 name: whisper-streaming-head-svc
#                 port:
#                   number: 8265
#           # Ray Serve
#           - path: /serve/(.*)
#             pathType: ImplementationSpecific
#             backend:
#               service:
#                 name: whisper-streaming-serve-svc
#                 port:
#                   number: 8000