deterministic_vllm_inference.py
# Requires PyTorch 2.9.0 or higher as well as https://github.com/vllm-project/vllm/pull/24583
#
# vllm serve Qwen/Qwen3-8B --enforce-eager
import asyncio

import httpx


async def main():
    url = "http://localhost:8000/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
    }
    # Identical request body for every call: greedy decoding (temperature 0.0),
    # so a deterministic server must return the same completion every time.
    data = {
        # "model": "Qwen/Qwen3-30B-A3B",
        "model": "Qwen/Qwen3-8B",
        "messages": [
            {
                "role": "user",
                "content": "Generate 1000 random numbers. Go directly into it, don't say Sure and don't say here are numbers. Just start with a number. /no_think",
            }
        ],
        "chat_template_kwargs": {"thinking": False},
        "temperature": 0.0,
        "max_tokens": 100,
    }

    outs = []
    async with httpx.AsyncClient() as client:
        # Build 1000 request coroutines without awaiting them, then fire them
        # all at once so the server batches overlapping requests together.
        requests = [
            client.post(url, headers=headers, json=data, timeout=120)
            for _ in range(1000)
        ]
        responses = await asyncio.gather(*requests)
    for response in responses:
        outs.append(response.json()["choices"][0]["message"]["content"])

    # One completion per line; newlines flattened for easy visual diffing.
    for out in outs:
        print(out.replace("\n", " "))

    # With deterministic inference, the unique-sample count should be 1.
    print(f"Total samples: {len(outs)}, Unique samples: {len(set(outs))}")


asyncio.run(main())
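
# ---------------------------------------------------------------------------
# A minimal follow-up sketch, not part of the original script: it assumes the
# same server, endpoint, model, and request body as above. It sends the
# identical greedy request a few times sequentially, awaiting each response
# before issuing the next, so these requests cannot be batched with one
# another (assuming no other traffic hits the server). Comparing the returned
# set with the concurrent run above is one way to probe batch invariance: if
# decoding is deterministic and batch-invariant, both runs should yield the
# same single completion.
# ---------------------------------------------------------------------------
# async def sequential_baseline(n: int = 5) -> set[str]:
#     url = "http://localhost:8000/v1/chat/completions"
#     headers = {"Content-Type": "application/json"}
#     data = {
#         "model": "Qwen/Qwen3-8B",
#         "messages": [
#             {
#                 "role": "user",
#                 "content": "Generate 1000 random numbers. Go directly into it, "
#                 "don't say Sure and don't say here are numbers. "
#                 "Just start with a number. /no_think",
#             }
#         ],
#         "chat_template_kwargs": {"thinking": False},
#         "temperature": 0.0,
#         "max_tokens": 100,
#     }
#     outs = set()
#     async with httpx.AsyncClient() as client:
#         for _ in range(n):
#             # Each request is awaited before the next is sent, keeping them
#             # out of a shared batch.
#             response = await client.post(url, headers=headers, json=data, timeout=120)
#             outs.add(response.json()["choices"][0]["message"]["content"])
#     return outs
#
# Example usage: print(asyncio.run(sequential_baseline()))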