2 changes: 1 addition & 1 deletion fms_mo/dq.py
@@ -287,7 +287,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
eval_llm_1GPU(qcfg, model, test_dataset)
else:
model.to(torch.device("cuda:0"))
n_samples = int(test_dataset.input_ids.shape[1] / block_size)
n_samples = int(test_dataset["input_ids"].shape[1] / block_size)
evaluator = Evaluator(test_dataset, "cuda", n_samples=n_samples)
with patch_torch_bmm(qcfg):
ppl = evaluator.evaluate(model, block_size=block_size)
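This change switches from attribute access to subscript access because the tokenized evaluation set is now a plain dict rather than a transformers BatchEncoding. A minimal sketch of the difference, using the small "gpt2" tokenizer purely as an assumed stand-in:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
enc = tok("hello world", return_tensors="pt")        # BatchEncoding
as_dict = {"input_ids": enc["input_ids"],
           "attention_mask": enc["attention_mask"]}  # plain dict

enc.input_ids            # works: BatchEncoding allows attribute access
enc["input_ids"]         # works: subscript access
as_dict["input_ids"]     # works: subscript access is the common denominator
# as_dict.input_ids      # would raise AttributeError on a plain dict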
172 changes: 116 additions & 56 deletions fms_mo/utils/calib_data.py
@@ -26,36 +26,46 @@

# Third Party
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, BatchEncoding
from transformers import BatchEncoding
import datasets
import torch


def return_tokenized_samples(nsamples, trainenc, seqlen, sequential=False):
def return_tokenized_samples(
nsamples: int, trainenc: list, seqlen: int, sequential: bool = False
) -> dict:
"""Randomly crop nsamples sequence from trainenc, each with the length of seqlen.
see below functions, e.g. get_wikitext2() for more details.
"""
traindataset = []
traindataset = {
"input_ids": torch.zeros(size=(nsamples, seqlen), dtype=torch.int),
"attention_mask": torch.zeros(size=(nsamples, seqlen), dtype=torch.int),
}
i = 0

for _ in range(nsamples):
for k in range(nsamples):
if not sequential:
i = random.randint(0, len(trainenc.input_ids) - seqlen - 1)

j = i + seqlen
inp = trainenc.input_ids[i:j]
mask = trainenc.attention_mask[i:j]
traindataset.append(
{"input_ids": torch.tensor(inp), "attention_mask": torch.tensor(mask)}
)
traindataset["input_ids"][k] = torch.tensor(inp)
traindataset["attention_mask"][k] = torch.tensor(mask)

i = j

return traindataset
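
Because return_tokenized_samples now returns one dict of stacked tensors instead of a list of per-sample dicts, callers index rows rather than iterating a list. A self-contained sketch, where the "gpt2" tokenizer and the repeated sample text are assumptions for illustration only:

import torch
from transformers import AutoTokenizer
from fms_mo.utils.calib_data import return_tokenized_samples

tok = AutoTokenizer.from_pretrained("gpt2")
trainenc = tok("calibration text " * 5000)            # flat BatchEncoding of token ids
calib = return_tokenized_samples(nsamples=4, trainenc=trainenc, seqlen=128)

print(calib["input_ids"].shape)                       # torch.Size([4, 128])
sample0 = {k: v[0] for k, v in calib.items()}         # a single calibration example
assert sample0["input_ids"].dtype == torch.int32      # buffers pre-allocated as torch.int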


def get_wikitext2(
nsamples, seed, seqlen, tokenizer, sequential=False, gptq_style=False
):
nsamples: int,
seed: int,
seqlen: int,
tokenizer: str,
sequential: bool = False,
gptq_style: bool = False,
) -> tuple[dict, dict]:
"""Prepare data for GPTQ using wikitext2 dataset.

Args:
@@ -83,11 +93,22 @@ def get_wikitext2(
traindataset = return_tokenized_samples(
nsamples, trainenc, seqlen, sequential=sequential
)
testenc = {
"input_ids": testenc["input_ids"],
"attention_mask": testenc["attention_mask"],
}

return traindataset, testenc


def get_ptb(nsamples, seed, seqlen, model, sequential=False, gptq_style=False):
def get_ptb(
nsamples: int,
seed: int,
seqlen: int,
tokenizer: str,
sequential: bool = False,
gptq_style: bool = False,
) -> tuple[dict, dict]:
"""Prepare data for GPTQ using PTB dataset.

Args:
@@ -102,8 +123,6 @@ def get_ptb(nsamples, seed, seqlen, model, sequential=False, gptq_style=False):
"""
random.seed(seed)

tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)

traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
valdata = load_dataset("ptb_text_only", "penn_treebank", split="validation")
if gptq_style:
@@ -112,14 +131,20 @@ def get_ptb(nsamples, seed, seqlen, model, sequential=False, gptq_style=False):
traindata = "\n\n".join(traindata["sentence"])

trainenc = tokenizer(traindata)
testenc = tokenizer("\n\n".join(valdata["sentence"]))
testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt")

traindataset = return_tokenized_samples(nsamples, trainenc, seqlen, sequential)
testenc = {
"input_ids": testenc["input_ids"],
"attention_mask": testenc["attention_mask"],
}

return traindataset, testenc


def get_c4_train(nsamples, seed, seqlen, tokenizer, sequential=False):
def get_c4_train(
nsamples: int, seed: int, seqlen: int, tokenizer: str, sequential: bool = False
) -> tuple[dict, dict]:
"""Prepare data for GPTQ using C4 dataset.

Args:
@@ -144,8 +169,13 @@ def get_c4_train(nsamples, seed, seqlen, tokenizer, sequential=False):
split="validation",
)

trainloader = []
for _ in range(nsamples):
testenc = tokenizer("\n\n".join(valdata["text"]), return_tensors="pt")

trainloader = {
"input_ids": torch.zeros(size=(nsamples, seqlen), dtype=torch.int),
"attention_mask": torch.zeros(size=(nsamples, seqlen), dtype=torch.int),
}
for k in range(nsamples):
while True:
i = random.randint(0, len(traindata) - 1)
trainenc = tokenizer(traindata[i]["text"])
@@ -156,19 +186,19 @@ def get_c4_train(nsamples, seed, seqlen, tokenizer, sequential=False):
j = i + seqlen
inp = trainenc.input_ids[i:j]
mask = trainenc.attention_mask[i:j]
trainloader.append({"input_ids": inp, "attention_mask": mask})
trainloader["input_ids"][k] = torch.tensor(inp)
trainloader["attention_mask"][k] = torch.tensor(mask)
j = i
testdataset = [
{
"input_ids": torch.tensor(valdata.input_ids),
"attention_mask": torch.tensor(valdata.attention_mask),
}
]

testdataset = {
"input_ids": testenc["input_ids"],
"attention_mask": testenc["attention_mask"],
}

return trainloader, testdataset
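
The loop above keeps redrawing random C4 documents until one tokenizes to more than seqlen tokens, then crops a random window of exactly seqlen tokens. A standalone sketch of that sampling idea (the helper name and its arguments are hypothetical, not part of the module):

import random

def sample_window(texts, tok, seqlen):
    """Redraw random documents until one is long enough, then crop seqlen tokens."""
    while True:
        enc = tok(texts[random.randint(0, len(texts) - 1)])
        if len(enc.input_ids) > seqlen:
            break
    i = random.randint(0, len(enc.input_ids) - seqlen - 1)
    j = i + seqlen
    return enc.input_ids[i:j], enc.attention_mask[i:j]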


def get_c4_new(nsamples, seed, seqlen, tokenizer):
def get_c4_new(nsamples: int, seed: int, seqlen: int, tokenizer: str):
"""Prepare data for GPTQ using C4 dataset.

Args:
@@ -213,8 +243,8 @@ def get_c4_new(nsamples, seed, seqlen, tokenizer):


def get_self_instruct_starcoder(
nsamples, seed, seqlen, tokenizer, split_name="curated"
): # pylint: disable=unused-argument
nsamples: int, seed: int, seqlen: int, tokenizer: str, split_name: str = "curated"
) -> tuple[dict, dict]: # pylint: disable=unused-argument
"""Prepare data for GPTQ using starcoder dataset.

Args:
@@ -229,23 +259,42 @@ def get_self_instruct_starcoder(
cr_dataset = load_dataset("codeparrot/self-instruct-starcoder", split=split_name)

eval_dataset = tokenizer(" ".join(cr_dataset[:]["output"]), return_tensors="pt")
eval_dataset = {
"input_ids": eval_dataset["input_ids"],
"attention_mask": eval_dataset["attention_mask"],
}

cr_dataset.shuffle(seed)
nsamples = min(nsamples, len(cr_dataset))
trainloader = []
for i in range(nsamples):
tokenized = tokenizer(cr_dataset[i]["output"], return_tensors="pt")
trainloader.append(
{
"input_ids": tokenized.input_ids.squeeze(0),
"attention_mask": tokenized.attention_mask.squeeze(0),
}

if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token

trainloader = {
"input_ids": torch.zeros(size=(nsamples, seqlen), dtype=torch.int),
"attention_mask": torch.zeros(size=(nsamples, seqlen), dtype=torch.int),
}
for k in range(nsamples):
tokenized = tokenizer(
cr_dataset[k]["output"],
return_tensors="pt",
padding="max_length",
max_length=seqlen,
)
trainloader["input_ids"][k] = tokenized.input_ids.squeeze(0)
trainloader["attention_mask"][k] = tokenized.attention_mask.squeeze(0)

return trainloader, eval_dataset
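
The pad_token fallback above matters because GPT-style tokenizers often ship without a pad token, and padding="max_length" is what gives every row the fixed seqlen width the pre-allocated tensors expect. A short sketch, again assuming the "gpt2" tokenizer only for illustration:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
if tok.pad_token is None:
    tok.pad_token = tok.eos_token             # reuse EOS as the padding token

enc = tok("def add(a, b): return a + b",
          return_tensors="pt", padding="max_length", max_length=64)
print(enc.input_ids.shape)                    # torch.Size([1, 64])
print(int(enc.attention_mask.sum()))          # number of real (non-pad) tokens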


def get_cobol_java_supervised(
nsamples, seed, model, seqlen=8192, split_name="both", file_path=None
):
nsamples: int,
seed: int,
seqlen: int = 8192,
tokenizer: str = "",
split_name: str = "both",
file_path: str = None,
) -> tuple[dict, dict]:
"""Prepare data for GPTQ using cobol/java dataset.

Args:
@@ -265,13 +314,21 @@ def get_cobol_java_supervised(
raw_data = f.readlines()

data_dict_array = [json.loads(line) for line in raw_data]
random.shuffle(data_dict_array)

tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
# data_dict_array is a list of dicts, so join the "content" fields before tokenizing.
eval_dataset = tokenizer(" ".join(d["content"] for d in data_dict_array), return_tensors="pt")
eval_dataset = {
"input_ids": eval_dataset["input_ids"],
"attention_mask": eval_dataset["attention_mask"],
}

random.shuffle(data_dict_array)

nsamples = min(nsamples, len(data_dict_array))

trainloader = []
trainloader = {
"input_ids": torch.zeros(size=(nsamples, seqlen), dtype=torch.int),
"attention_mask": torch.zeros(size=(nsamples, seqlen), dtype=torch.int),
}
added_ex = 0

while added_ex < nsamples:
@@ -300,28 +357,24 @@ def get_cobol_java_supervised(
inputs = inputs[i:j]

tokenized = tokenizer(inputs, return_tensors="pt")
trainloader.append(
{
"input_ids": tokenized.input_ids,
"attention_mask": tokenized.attention_mask,
}
)
trainloader["input_ids"][added_ex] = tokenized.input_ids.squeeze(0)
trainloader["attention_mask"][added_ex] = tokenized.attention_mask.squeeze(0)

added_ex += 1

return trainloader, None
return trainloader, eval_dataset


def get_tokenized_data(
name,
nsamples=128,
seqlen=2048,
tokenizer="",
seed=0,
gptq_style=False,
path_to_save=None,
field_name=None,
):
name: str,
nsamples: int = 128,
seqlen: int = 2048,
tokenizer: str = "",
seed: int = 0,
gptq_style: bool = False,
path_to_save: str = None,
field_name: str = None,
) -> tuple[dict, dict]:
"""Convenient function to get data. Default to get_wikitext2."""

# Option 1: User provide a dataset from disk, only need to tokenize and format it.
@@ -390,14 +443,21 @@ def get_tokenized_data(
traindataset, testdataset = get_self_instruct_starcoder(
nsamples, seed, seqlen, tokenizer, split_name="curated"
)
elif "java" in name:
traindataset, testdataset = get_cobol_java_supervised(
nsamples,
seed,
seqlen,
tokenizer,
)
else:
raise NotImplementedError(
f"Dataset {name} is not implemented yet. Please refer to get_wikitext2() and implement"
"for your own dataset if needed."
)

if path_to_save:
datasets.Dataset.from_list(traindataset).save_to_disk(path_to_save + "_train")
datasets.Dataset.from_dict(traindataset).save_to_disk(path_to_save + "_train")
if isinstance(testdataset, BatchEncoding):
if not os.path.exists(path_to_save + "_test"):
os.mkdir(path_to_save + "_test")
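
Taken together, a hedged usage sketch of get_tokenized_data with the new dict-of-tensors output; the "wikitext2" dataset key, the tokenizer, and the save path are assumptions for illustration:

from datasets import load_from_disk
from transformers import AutoTokenizer
from fms_mo.utils.calib_data import get_tokenized_data

tok = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
train, test = get_tokenized_data(
    "wikitext2", nsamples=16, seqlen=512, tokenizer=tok, path_to_save="wikitext2_calib"
)
print(train["input_ids"].shape)               # torch.Size([16, 512])
print(test["input_ids"].shape)                # (1, total_eval_tokens)

reloaded = load_from_disk("wikitext2_calib_train")   # written via Dataset.from_dict(...)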
10 changes: 5 additions & 5 deletions fms_mo/utils/eval_utils.py
@@ -45,7 +45,7 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): #
qcfg["batch_size"] = 1 # for dataloading, always use batch_size of 1
qcfg["dtype"] = next(iter(model.parameters())).dtype
seq_len = qcfg["seq_len"]
qcfg["n_samples"] = int(test_dataset.input_ids.shape[1] / seq_len)
qcfg["n_samples"] = int(test_dataset["input_ids"].shape[1] / seq_len)
# --- Phase 0 cache the inputs of the block0---
use_cache = model.config.use_cache
model.config.use_cache = False
@@ -116,9 +116,9 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): #

# Shift so that tokens < n predict n
shift_logits = lm_logits[:, :-1, :].contiguous().float()
shift_labels = test_dataset.input_ids[:, (i * seq_len) : ((i + 1) * seq_len)][
:, 1:
].to(dev)
shift_labels = test_dataset["input_ids"][
:, (i * seq_len) : ((i + 1) * seq_len)
][:, 1:].to(dev)
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
@@ -144,7 +144,7 @@ def __init__(self, dataset, device, n_samples=160):
self.dataset = dataset
self.device = device
# loading tokenized dataset.
self.dataset = dataset.input_ids.to(device)
self.dataset = dataset["input_ids"].to(device)
self.n_samples = n_samples

@torch.no_grad()
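
For context on the indexing change in eval_llm_1GPU, a tiny dummy-data sketch of how the i-th block of labels is sliced from the dict-style test set (sizes here are arbitrary):

import torch

seq_len = 8
ids = torch.randint(0, 50257, (1, 4 * seq_len))   # stands in for test_dataset["input_ids"]
i = 2                                             # block index
block = ids[:, i * seq_len : (i + 1) * seq_len]   # i-th evaluation block
shift_labels = block[:, 1:]                       # tokens < n predict token n
print(block.shape, shift_labels.shape)            # torch.Size([1, 8]) torch.Size([1, 7])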