Create `prob_col` on the CrossFit side #108

sarahyurick · 2025-01-03T22:56:43Z

I am using the following example to run NVIDIA's domain classifier with CrossFit:

from dataclasses import dataclass

import cudf
import dask_cudf
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoConfig, AutoModel

import crossfit as cf
from crossfit import op
from crossfit.backend.torch.hf.model import HFModel


@dataclass
class DomainConfig:
    """Settings for the domain-classifier example.

    Each attribute is annotated so that ``@dataclass`` registers it as a real
    field. Without annotations the decorator generates no fields and
    ``DomainConfig(max_len=...)`` raises ``TypeError``. The defaults remain
    readable on the class itself, so passing the class (rather than an
    instance) keeps working.
    """

    # Passed to ``from_pretrained`` when loading the classifier and its config.
    identifier: str = "nvidia/domain-classifier"
    # Passed to the ``HFModel`` constructor by ``DomainModel``.
    model: str = "microsoft/deberta-v3-base"
    # Not read by the code in this example -- presumably mirrors the hub config.
    fc_dropout: float = 0.2
    # Not read by the code in this example -- presumably the max token length.
    max_len: int = 512


class CustomModel(nn.Module, PyTorchModelHubMixin):
    """Pretrained backbone followed by a dropout + linear classification head.

    ``forward`` returns softmax scores computed from the first sequence
    position of the backbone's last hidden state.
    """

    def __init__(self, config: dict):
        """Build the backbone and the classification head.

        Args:
            config: Mapping subscripted with the keys ``"base_model"``,
                ``"fc_dropout"`` and ``"id2label"``. It was previously
                annotated as ``dataclass``, which is a decorator rather than
                a type and did not match how the value is used here.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(config["base_model"])
        self.dropout = nn.Dropout(config["fc_dropout"])
        # One output unit per entry in the id2label mapping.
        self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"]))

    def _forward(self, batch):
        """Return softmax scores for ``batch``.

        Args:
            batch: Mapping providing ``"input_ids"`` and ``"attention_mask"``.
        """
        features = self.model(
            batch["input_ids"], batch["attention_mask"]
        ).last_hidden_state
        dropped = self.dropout(features)
        outputs = self.fc(dropped)
        # Keep only index 0 along the second axis, then normalise over the
        # remaining class axis.
        return torch.softmax(outputs[:, 0, :], dim=1)

    def forward(self, batch):
        """Run ``_forward`` inside a CUDA autocast context."""
        with torch.autocast(device_type="cuda"):
            return self._forward(batch)


class DomainModel(HFModel):
    """``HFModel`` subclass that loads the classifier named by a config object."""

    def __init__(self, config):
        """Store ``config`` and initialise the parent with ``config.model``."""
        # The attribute is set before the parent constructor runs, as in the
        # original ordering.
        self.config = config
        super().__init__(self.config.model)

    def load_model(self, device="cuda"):
        """Load ``CustomModel`` weights, move them to ``device``, return in eval mode."""
        return CustomModel.from_pretrained(self.config.identifier).to(device).eval()

    def load_config(self):
        """Return the ``AutoConfig`` loaded from ``config.identifier``."""
        hub_id = self.config.identifier
        return AutoConfig.from_pretrained(hub_id)


def main() -> None:
    """Classify five sample sentences and print the resulting DataFrame.

    Builds a one-partition ``dask_cudf`` DataFrame with a ``"text"`` column,
    runs it through a Tokenizer -> Predictor -> Labeler pipeline inside a
    ``cf.Distributed`` context, then computes and prints the result and its
    columns.
    """
    text = [
        "Quantum computing is set to revolutionize the field of cryptography.",
        "Investing in index funds is a popular strategy for long-term financial growth.",
        "Recent advancements in gene therapy offer new hope for treating genetic disorders.",
        "Online learning platforms have transformed the way students access educational resources.",
        "Traveling to Europe during the off-season can be a more budget-friendly option.",
    ]
    df = cudf.DataFrame({"text": text})
    df = dask_cudf.from_cudf(df, npartitions=1)

    # Class names handed to op.Labeler below.
    # NOTE(review): presumably ordered to match the model's output indices -- confirm.
    labels = [
        "Adult",
        "Arts_and_Entertainment",
        "Autos_and_Vehicles",
        "Beauty_and_Fitness",
        "Books_and_Literature",
        "Business_and_Industrial",
        "Computers_and_Electronics",
        "Finance",
        "Food_and_Drink",
        "Games",
        "Health",
        "Hobbies_and_Leisure",
        "Home_and_Garden",
        "Internet_and_Telecom",
        "Jobs_and_Education",
        "Law_and_Government",
        "News",
        "Online_Communities",
        "People_and_Society",
        "Pets_and_Animals",
        "Real_Estate",
        "Science",
        "Sensitive_Subjects",
        "Shopping",
        "Sports",
        "Travel_and_Transportation",
    ]
    label_col = "domain_pred"  # passed to op.Labeler as its suffix
    prob_col = "domain_prob"  # column the Predictor writes its output to

    with cf.Distributed(rmm_pool_size="12GB", n_workers=1):
        # The config class itself (not an instance) is passed in.
        model = DomainModel(DomainConfig)

        # TODO: The user should not have to do this
        # Workaround: the placeholder column is created up front so that
        # prob_col is part of the keep_cols list captured below.
        df[prob_col] = 0

        # Captured after the placeholder is added, so it includes prob_col.
        columns_to_keep_list = df.columns.to_list()

        classifier_pipe = op.Sequential(
            op.Tokenizer(model, cols=["text"], tokenizer_type="default"),
            op.Predictor(
                model,
                sorted_data_loader=True,
                batch_size=256,
                pred_output_col=prob_col,
            ),
            op.Labeler(labels, cols=[prob_col], suffix=label_col),
            repartition=df.npartitions,
            keep_cols=columns_to_keep_list,
        )
        df = classifier_pipe(df)

        # Materialise the lazy dask_cudf result before printing.
        df = df.compute()
        print(df)
        print(df.columns)


# Run the example only when the file is executed as a script.
if __name__ == "__main__":
    main()

The code works as expected; however, I have to initialize the probability column with `df[prob_col] = 0` in order to keep it as part of the resulting DataFrame. (If `df[prob_col] = 0` is removed, then only the "text" and "domain_pred" columns are returned in the result.) I would prefer that CrossFit handle this instead of the user having to create the new column themselves.

This should also depend on whether the user wants to keep the `prob_col` in their final result.

The text was updated successfully, but these errors were encountered:

sarahyurick added the enhancement New feature or request label Jan 3, 2025

sarahyurick mentioned this issue Jan 3, 2025

Clean up internal column logic in _run_classifier_helper function NVIDIA/NeMo-Curator#457

Merged

This was referenced Jan 16, 2025

Adjust keep_cols logic #109

Open

Minor CrossFit improvements NVIDIA/NeMo-Curator#483

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Create `prob_col` on the CrossFit side #108

Create `prob_col` on the CrossFit side #108

sarahyurick commented Jan 3, 2025

Create prob_col on the CrossFit side #108

Create prob_col on the CrossFit side #108

Comments

sarahyurick commented Jan 3, 2025

Create `prob_col` on the CrossFit side #108

Create `prob_col` on the CrossFit side #108