Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ s3 = ["s3fs>=2024.0.0"]
gcs = ["gcsfs>=2024.0.0"]
sentencepiece = ["sentencepiece>=0.2.0"]
xenna = ["cosmos-xenna"]
curator = ["nemo-curator[text-cpu]"]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nemo-curator[text-cuda12] should cover all the steps merged in so far

dev = [
"pytest>=7.0.0",
"pytest-cov>=4.0.0",
Expand All @@ -62,6 +63,7 @@ all = [
"gcsfs>=2024.0.0",
"sentencepiece>=0.2.0",
"cosmos-xenna",
"nemo-curator[text-cpu]",
]

# Note: megatron-bridge is required for training but not listed as a dependency
Expand Down
2 changes: 2 additions & 0 deletions src/nemotron/cli/bin/nemotron.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,11 @@ def main_callback(
# Import and register recipe groups
def _register_groups() -> None:
    """Attach each recipe group's sub-app to the main CLI app.

    The group apps are imported inside the function body rather than at
    module import time, and are registered in a fixed order: ``data``,
    ``nano3``, ``kit``.
    """
    from nemotron.cli.commands.data import data_app
    from nemotron.cli.commands.nano3 import nano3_app
    from nemotron.cli.kit import kit_app

    # (sub-command name, sub-app) pairs, listed in registration order.
    groups = (
        ("data", data_app),
        ("nano3", nano3_app),
        ("kit", kit_app),
    )
    for group_name, group_app in groups:
        app.add_typer(group_app, name=group_name)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# NOTE(review): this looks like the default config for the download-extract
# step (it has the same keys as the "tiny" download-extract config) — confirm.
run:
env:
container: anyscale/ray:2.49.2-py312

# Snapshot range
start_snapshot: "2024-46"
end_snapshot: "2024-51"

# Paths
# ${oc.env:PWD} is OmegaConf-style interpolation of the PWD environment
# variable, so both directories are relative to the launch directory.
output_dir: ${oc.env:PWD}/../output/nemotron-cc/cleaned_extracted
cache_dir: ${oc.env:PWD}/../output/nemotron-cc/cache

# Common Crawl options
# NOTE(review): null presumably means "no limit" — confirm against the consumer.
url_limit: null
record_limit: null

# Language filtering (null = all languages)
languages: null

# Cloud storage (JSON string of fsspec options, or null)
storage_options: null

# Ray cluster
# NOTE(review): null presumably lets the cluster choose the CPU count — confirm.
num_cpus: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
run:
env:
container: anyscale/ray:2.49.2-py312

# Tiny config for download-extract
#
# Usage:
# nemotron data curate nemotron-cc download-extract -c tiny

# Snapshot range (start and end are the same snapshot)
start_snapshot: "2024-51"
end_snapshot: "2024-51"
# Paths (written under nemotron-cc-tiny)
output_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/cleaned_extracted
cache_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/cache
# Common Crawl options (small explicit limits for this tiny config)
url_limit: 2
record_limit: 100
# Language filtering
languages: null
# Cloud storage
storage_options: null
# Ray cluster
num_cpus: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
run:
env:
container: nvcr.io/nvidia/nemo:25.02

# Operation flags
# Per review feedback: identify is a GPU job and remove is CPU-only;
# both can be run on a GPU node.
identify: true
remove: true
Comment on lines +6 to +7
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small note:
Identify is a GPU job.
Remove is CPU-only. We can do both on a GPU node.


# Paths
# input_dir is the same path as output_dir in the download-extract config
# (.../output/nemotron-cc/cleaned_extracted).
input_dir: ${oc.env:PWD}/../output/nemotron-cc/cleaned_extracted
cache_dir: ${oc.env:PWD}/../output/nemotron-cc/exact_dedup_cache
output_dir: ${oc.env:PWD}/../output/nemotron-cc/exact_deduplicated

# Input/output format
input_filetype: jsonl
text_field: text
output_filetype: jsonl

# Identification settings
# NOTE(review): null / "auto" presumably defer the choice to the library — confirm.
input_blocksize: "256MiB"
identification_batchsize: 12
total_nparts: null
rmm_pool_size: "auto"
spill_memory_limit: "auto"

# Cloud storage
storage_options: null

# Ray cluster
# NOTE(review): null presumably means "use whatever the cluster provides" — confirm.
num_gpus: null
num_cpus: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
run:
env:
container: nvcr.io/nvidia/nemo:25.02

# Tiny config for exact dedup
#
# Usage:
# nemotron data curate nemotron-cc exact-dedup -c tiny

# Operation flags
# NOTE(review): reviewers noted identify is a GPU job and remove is CPU-only.
identify: true
remove: true
# Paths (input_dir is the output_dir of the tiny download-extract config)
input_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/cleaned_extracted
cache_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/exact_dedup_cache
output_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/exact_deduplicated
# Input/output format
input_filetype: jsonl
text_field: text
output_filetype: jsonl
# Identification settings (smaller than the default config's "256MiB" / 12)
input_blocksize: "64MiB"
identification_batchsize: 4
total_nparts: null
rmm_pool_size: "auto"
spill_memory_limit: "auto"
# Cloud storage
storage_options: null
# Ray cluster
num_gpus: null
num_cpus: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
run:
env:
container: nvcr.io/nvidia/nemo:25.02

# Operation flags
# NOTE(review): per review feedback, identify is presumably a GPU job and
# remove CPU-only, as for exact dedup — confirm.
identify: true
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

remove: true

# Paths
# input_dir is the "exact_deduplicated" subdirectory inside the exact-dedup
# output_dir (.../output/nemotron-cc/exact_deduplicated).
input_dir: ${oc.env:PWD}/../output/nemotron-cc/exact_deduplicated/exact_deduplicated
cache_dir: ${oc.env:PWD}/../output/nemotron-cc/fuzzy_dedup_cache
output_dir: ${oc.env:PWD}/../output/nemotron-cc/fuzzy_deduplicated

# Input/output format
input_filetype: jsonl
text_field: text
output_filetype: jsonl

# Fuzzy dedup settings
# NOTE(review): total_nparts: null presumably lets the library choose — confirm.
input_blocksize: "256MiB"
bands_per_iteration: 5
total_nparts: null

# Cloud storage
storage_options: null

# Ray cluster
# NOTE(review): null presumably means "use whatever the cluster provides" — confirm.
num_gpus: null
num_cpus: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
run:
env:
container: nvcr.io/nvidia/nemo:25.02

# Tiny config for fuzzy dedup
#
# Usage:
# nemotron data curate nemotron-cc fuzzy-dedup -c tiny

# Operation flags
identify: true
remove: true
# Paths (input_dir is the "exact_deduplicated" subdirectory inside the tiny
# exact-dedup output_dir)
input_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/exact_deduplicated/exact_deduplicated
cache_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/fuzzy_dedup_cache
output_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/fuzzy_deduplicated
# Input/output format
input_filetype: jsonl
text_field: text
output_filetype: jsonl
# Fuzzy dedup settings (block size is smaller than the default config's "256MiB")
input_blocksize: "64MiB"
bands_per_iteration: 5
total_nparts: null
# Cloud storage
storage_options: null
# Ray cluster
num_gpus: null
num_cpus: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
run:
env:
container: nvcr.io/nvidia/nemo:25.02

# Operation flags
# Per review feedback: classify is a GPU job and ensemble is CPU-only.
classify: true
ensemble: true
Comment on lines +6 to +7
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Classify is a GPU job; Ensemble is CPU-only.


# Paths
# input_dir is the "fuzzy_deduplicated" subdirectory inside the fuzzy-dedup
# output_dir (.../output/nemotron-cc/fuzzy_deduplicated).
input_dir: ${oc.env:PWD}/../output/nemotron-cc/fuzzy_deduplicated/fuzzy_deduplicated
output_dir: ${oc.env:PWD}/../output/nemotron-cc/quality_labeling

# Threshold sampling
# NOTE(review): presumably the fraction of records sampled when computing
# thresholds (0.01 = 1%) — confirm.
threshold_sample_frac: 0.01

# Ray cluster
# NOTE(review): null presumably means "use whatever the cluster provides" — confirm.
num_gpus: null
num_cpus: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
run:
env:
container: nvcr.io/nvidia/nemo:25.02

# Tiny config for quality classification
#
# Usage:
# nemotron data curate nemotron-cc quality-classify -c tiny

# Operation flags
classify: true
ensemble: true
# Paths (input_dir is the "fuzzy_deduplicated" subdirectory inside the tiny
# fuzzy-dedup output_dir)
input_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/fuzzy_deduplicated/fuzzy_deduplicated
output_dir: ${oc.env:PWD}/../output/nemotron-cc-tiny/quality_labeling
# Threshold sampling (1.0 here, versus 0.01 in the default config)
threshold_sample_frac: 1.0
# Ray cluster
num_gpus: null
num_cpus: null
Loading