Skip to content

TokensLoader segmentation fault #756

@Aceticia

Description

@Aceticia

🐛 Bug

A segmentation fault occurs when streaming a dataset that was optimized with TokensLoader and zstd compression. I'd like to know what I should check in order to find a fix for this.

I suspect this might be related to past issues such as #459?

To Reproduce

I build the DCLM 1.0 dataset with this:

import json
import time
from functools import partial
from pathlib import Path

import zstandard as zstd
from litdata import TokensLoader, optimize
from litgpt.tokenizer import Tokenizer
from litgpt.utils import CLI, extend_checkpoint_dir


def get_files(input_dir, val_set):
    """Collect all ``*.zst`` shard files for either the train or val split.

    Global shards are numbered 01-10 while local shards use 0-9. The
    validation split is exactly ``global-shard_10_of_10/local-shard_9_of_10``;
    every other (global, local) pair belongs to the training split.

    Args:
        input_dir: Root directory containing the ``global-shard_*`` folders.
        val_set: If True, return only the validation shard's files.

    Returns:
        List of file paths (as strings) found recursively under the
        selected shard directories.
    """
    root = Path(input_dir)

    if val_set:
        shard_dirs = [root / "global-shard_10_of_10" / "local-shard_9_of_10"]
    else:
        # All (global, local) combinations except the one reserved for val.
        shard_dirs = [
            root / f"global-shard_{g:02}_of_10" / f"local-shard_{l}_of_10"
            for g in range(1, 11)
            for l in range(10)
            if not (g == 10 and l == 9)
        ]

    return [str(f) for d in shard_dirs for f in d.rglob("*.zst")]


def process_items(filepath, tokenizer):
    """Yield token-id sequences for each JSON line of a zstd-compressed file.

    Args:
        filepath: Path to a ``*.zst`` file with one JSON object per line,
            each containing a ``"text"`` key.
        tokenizer: litgpt ``Tokenizer``; ``encode`` is called with
            ``bos=False, eos=True`` so every document ends with an EOS token.

    Yields:
        The token ids for one document at a time (whatever type
        ``tokenizer.encode`` returns — presumably a tensor; confirm
        against litgpt).
    """
    # NOTE(review): the inner ``open(filepath, "rb")`` handle may not be
    # closed by the zstd wrapper's context manager — verify against the
    # zstandard documentation before relying on this in long runs.
    with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
        for row in f:
            text = json.loads(row)["text"]
            text_ids = tokenizer.encode(string=text, bos=False, eos=True)
            yield text_ids


def prepare(
    input_dir: Path = None,
    output_dir: Path = None,
    tokenizer_path: Path = None,
    chunk_size: int = (2049 * 16384),
    fast_dev_run: bool = False,
    split: str = "train",
    workers: int = 1,
) -> None:
    """Tokenize the DCLM shards under ``input_dir`` into a litdata dataset.

    Args:
        input_dir: Root of the raw DCLM ``*.zst`` shards.
        output_dir: Where ``optimize`` writes the optimized dataset.
        tokenizer_path: Checkpoint directory for the litgpt tokenizer.
        chunk_size: Tokens per output chunk.
        fast_dev_run: Forwarded to ``optimize`` for quick smoke runs.
        split: ``"train"`` or ``"val"``; selects which shards are used.
        workers: Number of ``optimize`` worker processes.
    """
    tokenizer = Tokenizer(extend_checkpoint_dir(tokenizer_path))
    input_files = get_files(input_dir, val_set=(split == "val"))

    t0 = time.time()
    optimize(
        partial(process_items, tokenizer=tokenizer),
        inputs=input_files,
        output_dir=output_dir,
        chunk_size=chunk_size,
        num_workers=workers,
        fast_dev_run=fast_dev_run,
        item_loader=TokensLoader(),
        compression="zstd",
        keep_data_ordered=False,
        mode="overwrite",
    )
    print(f"Time taken: {time.time() - t0:.2f} seconds")


if __name__ == "__main__":
    CLI(prepare)

Then stream with this:

from tqdm import tqdm
from litdata import StreamingDataset, StreamingDataLoader, TokensLoader


# Repro: stream the zstd-compressed, TokensLoader-optimized dataset.
ds = StreamingDataset(
    "/gpfs/data/oermannlab/project_data/weaver/pretrain_data/dclm/train",
    item_loader=TokensLoader(1025),  # block size — presumably context length + 1 target token; confirm
)
dl = StreamingDataLoader(ds, batch_size=4, num_workers=0)
# The segfault occurs while iterating the dataloader; iterating ``ds``
# directly does not trigger it (see report below).
for _ in tqdm(dl):
    pass

The same error happens for different combinations of block size, batch size, and number of workers. Directly iterating over the dataset works without any problem.

Expected behavior

Normal streaming should work.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug — Something isn't working; help wanted — Extra attention is needed; waiting on author — Waiting for user input or feedback.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions