You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I am using the following example to run NVIDIA's domain classifier with CrossFit:
from dataclasses import dataclass
import cudf
import dask_cudf
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoConfig, AutoModel
import crossfit as cf
from crossfit import op
from crossfit.backend.torch.hf.model import HFModel
@dataclass
class DomainConfig:
    """Settings used to run the NVIDIA domain classifier.

    Each attribute is type-annotated so that ``@dataclass`` registers it as a
    field: without annotations the decorator sees no fields and the generated
    ``__init__`` accepts no arguments. The defaults remain readable directly
    on the class (``DomainConfig.identifier``), so passing the class itself
    where only attribute access is needed keeps working, and individual
    values can now be overridden per instance, e.g. ``DomainConfig(max_len=128)``.
    """

    # Hub id handed to ``from_pretrained`` when loading the classifier
    # weights and its configuration.
    identifier: str = "nvidia/domain-classifier"
    # Read as ``config.model`` by DomainModel.__init__ and forwarded to the
    # HFModel base initializer.
    model: str = "microsoft/deberta-v3-base"
    # NOTE(review): not read through this class in the visible code; presumably
    # mirrors the dropout stored in the published model config - confirm.
    fc_dropout: float = 0.2
    # NOTE(review): not read in the visible code; presumably the maximum token
    # sequence length - confirm against the tokenizer settings.
    max_len: int = 512
class CustomModel(nn.Module, PyTorchModelHubMixin):
    """Pretrained encoder followed by dropout and a linear classification layer.

    ``forward`` returns a softmax over the classes, computed from the output
    at sequence position 0.
    """

    def __init__(self, config: dataclass):
        """Build the encoder, dropout and linear head from ``config``.

        ``config`` is indexed by key and must provide ``"base_model"``,
        ``"fc_dropout"`` and ``"id2label"``.
        """
        # NOTE(review): ``config`` is used like a mapping although it is
        # annotated as ``dataclass``; the annotation is left untouched because
        # the hub mixin may inspect __init__ annotations - confirm before changing.
        super().__init__()
        # Attribute names are kept as-is: they determine the parameter names
        # under which the module's weights are registered.
        self.model = AutoModel.from_pretrained(config["base_model"])
        self.dropout = nn.Dropout(config["fc_dropout"])
        hidden_size = self.model.config.hidden_size
        num_labels = len(config["id2label"])
        self.fc = nn.Linear(hidden_size, num_labels)

    def _forward(self, batch):
        """Return class probabilities for ``batch``.

        ``batch`` must provide ``"input_ids"`` and ``"attention_mask"``.
        """
        encoded = self.model(batch["input_ids"], batch["attention_mask"])
        logits = self.fc(self.dropout(encoded.last_hidden_state))
        # Only the logits at sequence position 0 are turned into probabilities.
        first_position_logits = logits[:, 0, :]
        return torch.softmax(first_position_logits, dim=1)

    def forward(self, batch):
        """Run ``_forward`` under CUDA autocast and return its result."""
        with torch.autocast(device_type="cuda"):
            probabilities = self._forward(batch)
        return probabilities
class DomainModel(HFModel):
    """HFModel subclass that loads the domain classifier described by a config object."""

    def __init__(self, config):
        """Store ``config`` and initialise the base class with ``config.model``."""
        # The config is stored before the base initializer runs, as in the
        # original ordering, so it is already available on ``self`` by then.
        self.config = config
        super().__init__(config.model)

    def load_model(self, device="cuda"):
        """Load the classifier from ``config.identifier``, move it to ``device``, return it in eval mode."""
        return CustomModel.from_pretrained(self.config.identifier).to(device).eval()

    def load_config(self):
        """Return the configuration loaded via ``AutoConfig`` for ``config.identifier``."""
        hub_id = self.config.identifier
        return AutoConfig.from_pretrained(hub_id)
def main():
    """Classify five sample sentences by domain and print the resulting frame and its columns."""
    samples = [
        "Quantum computing is set to revolutionize the field of cryptography.",
        "Investing in index funds is a popular strategy for long-term financial growth.",
        "Recent advancements in gene therapy offer new hope for treating genetic disorders.",
        "Online learning platforms have transformed the way students access educational resources.",
        "Traveling to Europe during the off-season can be a more budget-friendly option.",
    ]
    ddf = dask_cudf.from_cudf(cudf.DataFrame({"text": samples}), npartitions=1)

    domain_names = [
        "Adult",
        "Arts_and_Entertainment",
        "Autos_and_Vehicles",
        "Beauty_and_Fitness",
        "Books_and_Literature",
        "Business_and_Industrial",
        "Computers_and_Electronics",
        "Finance",
        "Food_and_Drink",
        "Games",
        "Health",
        "Hobbies_and_Leisure",
        "Home_and_Garden",
        "Internet_and_Telecom",
        "Jobs_and_Education",
        "Law_and_Government",
        "News",
        "Online_Communities",
        "People_and_Society",
        "Pets_and_Animals",
        "Real_Estate",
        "Science",
        "Sensitive_Subjects",
        "Shopping",
        "Sports",
        "Travel_and_Transportation",
    ]
    prediction_suffix = "domain_pred"
    probability_column = "domain_prob"

    with cf.Distributed(rmm_pool_size="12GB", n_workers=1):
        classifier = DomainModel(DomainConfig)

        # TODO: The user should not have to do this
        # The placeholder column must exist before the keep list is captured;
        # otherwise the probability column is missing from the final result.
        ddf[probability_column] = 0
        kept_columns = ddf.columns.to_list()

        tokenize = op.Tokenizer(classifier, cols=["text"], tokenizer_type="default")
        predict = op.Predictor(
            classifier,
            sorted_data_loader=True,
            batch_size=256,
            pred_output_col=probability_column,
        )
        assign_label = op.Labeler(
            domain_names, cols=[probability_column], suffix=prediction_suffix
        )
        pipeline = op.Sequential(
            tokenize,
            predict,
            assign_label,
            repartition=ddf.npartitions,
            keep_cols=kept_columns,
        )

        result = pipeline(ddf).compute()
        print(result)
        print(result.columns)


# Run the example only when the file is executed as a script.
if __name__ == "__main__":
    main()
The code works as expected, however I have to initialize the probability column with df[prob_col] = 0 in order to keep it as part of the resulting DataFrame. (If df[prob_col] = 0 is removed then only the "text" and "domain_pred" columns are returned in the result). I would prefer if that is handled by CrossFit instead of the user having to create the new column themselves.
This should also depend on whether the user wants to keep the prob_col in their final result.
The text was updated successfully, but these errors were encountered:
I am using the following example to run NVIDIA's domain classifier with CrossFit:
The code works as expected; however, I have to initialize the probability column with `df[prob_col] = 0` in order to keep it as part of the resulting DataFrame. (If `df[prob_col] = 0` is removed, then only the "text" and "domain_pred" columns are returned in the result.) I would prefer that this be handled by CrossFit instead of the user having to create the new column themselves.
This should also depend on whether the user wants to keep the `prob_col` in their final result.
The text was updated successfully, but these errors were encountered: