generation_utils.py
import openai


def get_openai_response(prompts, max_len, model):
    """Get a single output from an OpenAI model."""
    response = openai.Completion.create(
        model=model,
        prompt=prompts,
        temperature=0,
        max_tokens=max_len,
        frequency_penalty=0,
        presence_penalty=0,
        best_of=1
    )
    return response["choices"]
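
# Example usage (sketch): assumes OPENAI_API_KEY is set in the environment and
# the legacy (pre-1.0) openai Completion API used above; "text-davinci-003" is
# an illustrative model name, not one fixed by this file.
# completions = get_openai_response(["def add(a, b):"], max_len=64, model="text-davinci-003")
# print(completions[0]["text"])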


def get_openai_response_chatmodels(prompt, max_len, model):
    """
    Get a single response from an OpenAI chat model
    (e.g., GPT-3.5 Turbo or GPT-4).
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=max_len,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response["choices"]
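
# Example usage (sketch): "gpt-3.5-turbo" is an illustrative chat model name;
# with the legacy ChatCompletion API used above, the generated text lives under
# choice["message"]["content"] for each returned choice.
# choices = get_openai_response_chatmodels("Explain Python decorators.", max_len=128, model="gpt-3.5-turbo")
# print(choices[0]["message"]["content"])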


def load_llama(model_name):
    """Set up and load (Code)Llama tokenizer and model."""
    import torch

    if model_name.startswith("meta-llama/Llama-2"):
        from transformers import LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig
        llama_tokenizer = LlamaTokenizer.from_pretrained(model_name)
        bnb_config = BitsAndBytesConfig(  # use 4-bit quantization to fit the model on a single GPU
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        llama_model = LlamaForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            quantization_config=bnb_config
        )
    elif model_name.startswith("codellama/"):
        from transformers import CodeLlamaTokenizer, AutoModelForCausalLM
        llama_tokenizer = CodeLlamaTokenizer.from_pretrained(model_name)
        llama_model = AutoModelForCausalLM.from_pretrained(  # load in half precision
            model_name,
            torch_dtype=torch.bfloat16
        ).to("cuda")
    else:
        raise ValueError(f"Unsupported model: {model_name}")
    return llama_tokenizer, llama_model
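
# Example usage (sketch): requires a CUDA GPU, the bitsandbytes dependency for
# 4-bit loading, and access to the gated meta-llama weights on the Hugging Face
# Hub; the model name below is illustrative.
# tokenizer, model = load_llama("meta-llama/Llama-2-7b-hf")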


def get_llama_response(tokenizer, model, prompts, max_len):
    """Get a single response for a (Code)Llama model."""
    input_ids = tokenizer(prompts, return_tensors="pt").input_ids.to("cuda")
    outputs = model.generate(
        input_ids,
        top_p=0.9,
        temperature=0.1,
        max_new_tokens=max_len,
        pad_token_id=tokenizer.eos_token_id
    )
    strings = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return strings
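
# Example usage (sketch), combining load_llama with get_llama_response; the
# model name, prompt, and max_len value are illustrative.
# tokenizer, model = load_llama("codellama/CodeLlama-7b-hf")
# print(get_llama_response(tokenizer, model, "def fibonacci(n):", max_len=128)[0])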