Changes from 8 commits
2 changes: 1 addition & 1 deletion docs/curate-video/process-data/dedup.md
@@ -30,7 +30,7 @@ Duplicate removal operates on clip-level embeddings produced during processing:
## Before You Start

- Verify local paths or configure S3-compatible credentials. Provide `storage_options` in read/write keyword arguments when reading or writing cloud paths.
- Create output directories for `KMeansStage`, `PairwiseStage`, and `IdentifyDuplicatesStage`.
- Create output directories for the stages you'll use (`KMeansStage`, `PairwiseStage`, and `IdentifyDuplicatesStage`).

---

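As context for the docs change above: a minimal sketch of the "Before You Start" steps, assuming local output paths and fsspec/s3fs-style credential keys. The directory names, credential keys, and endpoint below are illustrative assumptions, not taken from this diff; only the pattern of passing storage_options inside the read/write keyword arguments comes from the docs.

from pathlib import Path

# Create an output directory for each stage you'll use (names are illustrative).
base = Path("./dedup_output")
for subdir in ("kmeans", "pairwise", "duplicate_ids"):
    (base / subdir).mkdir(parents=True, exist_ok=True)

# For S3-compatible paths, credentials are passed via "storage_options" inside
# the read/write keyword arguments, as the updated docs describe. The key names
# below follow the fsspec/s3fs convention and are assumptions here.
storage_options = {
    "key": "YOUR_ACCESS_KEY",
    "secret": "YOUR_SECRET_KEY",
    "client_kwargs": {"endpoint_url": "https://s3.example.com"},
}
read_kwargs = {"storage_options": storage_options}
write_kwargs = {"storage_options": storage_options}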
18 changes: 17 additions & 1 deletion nemo_curator/stages/base.py
@@ -14,6 +14,7 @@

from __future__ import annotations

import re
from abc import ABC, ABCMeta, abstractmethod
from typing import TYPE_CHECKING, Any, Generic, TypeVar

@@ -31,6 +32,20 @@
_STAGE_REGISTRY: dict[str, type[ProcessingStage]] = {}


def _validate_stage_name(name: str) -> None:
"""Validate that stage name follows snake_case convention.

Args:
name: The stage name to validate

Raises:
ValueError: If stage name is not in snake_case format
"""
if not re.fullmatch(r"[a-z][a-z0-9_]*", name):
msg = f"Stage name must be snake_case, got '{name}'"
raise ValueError(msg)


class StageMeta(ABCMeta):
"""Metaclass that automatically registers concrete Stage subclasses.
A class is considered *concrete* if it directly inherits from
@@ -80,7 +95,7 @@ class ProcessingStage(ABC, Generic[X, Y], metaclass=StageMeta):
"""

_is_abstract_root = True # prevent base from registering itself
_name = "ProcessingStage"
_name = "processing_stage" # Changed to snake_case
_resources = Resources(cpus=1.0)
_batch_size = 1

@@ -251,6 +266,7 @@ def with_(

# Override the instance attributes directly
if name is not None:
_validate_stage_name(name) # Validate the new name
new_instance._name = name
if resources is not None:
new_instance._resources = resources
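For illustration, a small standalone sketch of the snake_case rule that the new _validate_stage_name helper enforces; it mirrors the regex from the diff rather than importing NeMo Curator, and the example names are illustrative.

import re

# Same pattern as _validate_stage_name above: one lowercase letter followed by
# lowercase letters, digits, or underscores.
SNAKE_CASE = re.compile(r"[a-z][a-z0-9_]*")

def is_valid_stage_name(name: str) -> bool:
    return SNAKE_CASE.fullmatch(name) is not None

# Names following the new convention pass:
assert is_valid_stage_name("minhash_stage")
assert is_valid_stage_name("kmeans_stage")
# Old CamelCase names and other non-conforming names are rejected, so
# ProcessingStage.with_(name=...) would now raise ValueError for them:
assert not is_valid_stage_name("LSHStage")
assert not is_valid_stage_name("_hidden_stage")  # must start with a letter
assert not is_valid_stage_name("stage-2")        # hyphens are not allowed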
2 changes: 1 addition & 1 deletion nemo_curator/stages/deduplication/exact/identification.py
@@ -65,7 +65,7 @@ class ExactDuplicateIdentification(DeduplicationIO, ShuffleStage):
Whether the underlying rapidsmpf shuffler should collect shuffle statistics.
"""

_name = "ExactDuplicateIds"
_name = "exact_duplicate_ids"

def __init__( # noqa: PLR0913
self,
@@ -41,7 +41,7 @@ class BucketsToEdgesStage(ProcessingStage[FileGroupTask, FileGroupTask]):
Only the storage_options key is supported for now.
"""

_name = "BucketsToEdgesStage"
_name = "buckets_to_edges_stage"
_resources = Resources(cpus=1.0)

def __init__(
@@ -59,7 +59,7 @@ def __init__(
self.read_kwargs = read_kwargs if read_kwargs is not None else {}
self.write_kwargs = write_kwargs if write_kwargs is not None else {}

self._name = self.__class__.__name__
self._name = "connected_components_stage"
self._resources = Resources(cpus=1.0, gpus=1.0)
self._batch_size = None

@@ -58,7 +58,7 @@ class IdentifyDuplicatesStage(ShuffleStage):
Whether the underlying rapidsmpf shuffler should collect shuffle statistics.
"""

_name = "IdentifyDuplicates"
_name = "identify_duplicates"

def __init__( # noqa: PLR0913
self,
2 changes: 1 addition & 1 deletion nemo_curator/stages/deduplication/fuzzy/lsh/stage.py
@@ -67,7 +67,7 @@ class LSHStage(ProcessingStage[FileGroupTask, FileGroupTask]):
If None, the number of partitions will be decided automatically by the executor as the closest power of 2 <= number of input tasks.
"""

_name = "LSHStage"
_name = "lsh_stage"
_resources = Resources(gpus=1.0)

# Core Algo objects
2 changes: 1 addition & 1 deletion nemo_curator/stages/deduplication/fuzzy/minhash.py
@@ -240,7 +240,7 @@ def __init__( # noqa: PLR0913
pool: bool = True,
):
# Set ProcessingStage attributes
self._name = self.__class__.__name__
self._name = "min_hash_stage"
Contributor:

Suggested change: self._name = "min_hash_stage" -> self._name = "minhash_stage"

?

Contributor:

@copilot, please address

Contributor Author:

Fixed in commit 0274d6d. Changed the minhash stage name from "min_hash_stage" to "minhash_stage" as suggested.

self._resources = Resources(gpus=1.0) # Requires 1 GPU

self.text_field = text_field
@@ -45,7 +45,7 @@ class IdentifyDuplicatesStage(ProcessingStage[FileGroupTask, FileGroupTask]):
def __post_init__(self):
"""Initialize parent class after dataclass initialization."""
super().__init__()
self._name = "RemovalStage"
self._name = "removal_stage"

self._batch_size = 10 # We want to load multiple clusters at once

2 changes: 1 addition & 1 deletion nemo_curator/stages/deduplication/semantic/kmeans.py
@@ -112,7 +112,7 @@ def __init__( # noqa: PLR0913
self.input_storage_options = self.read_kwargs.pop("storage_options", None)
self.output_storage_options = self.write_kwargs.pop("storage_options", None)

self._name = "KMeansStage"
self._name = "kmeans_stage"
self._resources = Resources(cpus=1.0, gpus=1.0)

def process(self, task: FileGroupTask) -> _EmptyTask:
2 changes: 1 addition & 1 deletion nemo_curator/stages/deduplication/semantic/pairwise.py
@@ -112,7 +112,7 @@ def __init__( # noqa: PLR0913
check_disallowed_kwargs(self.write_kwargs, ["index"])
self.input_storage_options = self.read_kwargs.pop("storage_options", None) if self.read_kwargs else None
self.output_storage_options = self.write_kwargs.pop("storage_options", None) if self.write_kwargs else None
self._name = "PairwiseCosineSimilarityStage"
self._name = "pairwise_cosine_similarity_stage"
self._resources = Resources(cpus=1.0, gpus=1.0)

def process(self, task: FileGroupTask) -> FileGroupTask:
2 changes: 1 addition & 1 deletion nemo_curator/stages/deduplication/shuffle_utils/stage.py
@@ -51,7 +51,7 @@ class ShuffleStage(ProcessingStage[FileGroupTask, FileGroupTask]):
Whether the underlying rapidsmpf shuffler should collect shuffle statistics.
"""

_name = "ShuffleStage"
_name = "shuffle_stage"
_resources = Resources(gpus=1.0)

# Use BulkRapidsMPFShuffler directly
2 changes: 1 addition & 1 deletion nemo_curator/stages/text/deduplication/removal.py
@@ -55,7 +55,7 @@ class TextDuplicatesRemovalStage(ProcessingStage[DocumentBatch, DocumentBatch]):
def __post_init__(self):
"""Initialize parent class after dataclass initialization."""
super().__init__()
self._name = "DuplicatesRemovalStage"
self._name = "text_duplicates_removal_stage"
self.read_kwargs = self.read_kwargs.copy() if self.read_kwargs else {}

def process(self, task: DocumentBatch) -> DocumentBatch:
10 changes: 9 additions & 1 deletion nemo_curator/stages/text/filters/doc_filter.py
@@ -12,9 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from abc import ABC, abstractmethod


def _camel_to_snake(name: str) -> str:
"""Convert CamelCase to snake_case."""
# Insert an underscore before any uppercase letter that follows a lowercase letter or digit
s1 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)
return s1.lower()
Contributor:

@copilot, we should use this across the classes, please.

Contributor Author:

Implemented in commit 28305e9. The _camel_to_snake function is now centralized in base.py and used consistently across all filter and modifier classes instead of being duplicated.



class DocumentFilter(ABC):
"""
An abstract base class for text-based document filters.
@@ -26,7 +34,7 @@ class DocumentFilter(ABC):

def __init__(self):
super().__init__()
self._name = self.__class__.__name__
self._name = _camel_to_snake(self.__class__.__name__)
self._sentences = None
self._paragraphs = None
self._ngrams = None
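To show what the new default names look like, a short sketch of the conversion _camel_to_snake performs; the class names below are illustrative examples, not an exhaustive list of NeMo Curator filters.

import re

# Mirrors the _camel_to_snake helper added above: insert "_" between a
# lowercase letter or digit and the uppercase letter that follows it, then
# lowercase the result.
def camel_to_snake(name: str) -> str:
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()

# DocumentFilter subclasses now get a snake_case default _name derived from
# the class name:
assert camel_to_snake("WordCountFilter") == "word_count_filter"
assert camel_to_snake("NonAlphaNumericFilter") == "non_alpha_numeric_filter"
# Runs of capitals are not split, because only lowercase-to-uppercase
# boundaries match the pattern:
assert camel_to_snake("URLFilter") == "urlfilter"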
10 changes: 9 additions & 1 deletion nemo_curator/stages/text/modifiers/doc_modifier.py
@@ -12,9 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from abc import ABC, abstractmethod


def _camel_to_snake(name: str) -> str:
"""Convert CamelCase to snake_case."""
# Insert an underscore before any uppercase letter that follows a lowercase letter or digit
s1 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)
return s1.lower()
Contributor:

Repeated function.

Contributor Author:

Fixed in commit 28305e9. Removed the repeated function; both DocumentFilter and DocumentModifier now import a single centralized _camel_to_snake function from base.py.



class DocumentModifier(ABC):
"""
Abstract base class for text-based document modifiers.
@@ -29,7 +37,7 @@ class DocumentModifier(ABC):

def __init__(self) -> None:
super().__init__()
self._name = self.__class__.__name__
self._name = _camel_to_snake(self.__class__.__name__)
self._sentences = None
self._paragraphs = None
self._ngrams = None
@@ -300,7 +300,7 @@ def test_output_directory_cleanup(self, input_task: FileGroupTask, tmp_path: Pat
"""Test that existing output directory is cleaned up."""
output_dir = tmp_path / "output"

existing_dir = output_dir / "BucketsToEdgesStage"
existing_dir = output_dir / "buckets_to_edges_stage"
existing_dir.mkdir(parents=True)
existing_file = existing_dir / "existing.txt"
existing_file.write_text("This should be deleted")
6 changes: 3 additions & 3 deletions tests/stages/deduplication/fuzzy/test_fuzzy_workflow.py
@@ -245,11 +245,11 @@ def test_fuzzy_dedup_no_duplicates(

workflow.run(initial_tasks=tasks)

assert not (cache_path / "ConnectedComponentsStage").exists()
assert not (cache_path / "BucketsToEdgesStage").exists()
assert not (cache_path / "connected_components_stage").exists()
assert not (cache_path / "buckets_to_edges_stage").exists()
Comment on lines +248 to +249

Contributor:

@ayushdg, can you check this test please?

assert not (output_path / DUPLICATE_IDS_SUBDIR).exists()

lsh_df = cudf.read_parquet(cache_path / "LSHStage")
lsh_df = cudf.read_parquet(cache_path / "lsh_stage")
assert len(lsh_df) == 0

def test_bad_inputs(self, tmp_path: Path) -> None: