
single Numpy array option #503

Draft · wants to merge 36 commits into base: main
Changes from all commits (36 commits)
08ef851
Change RowFeatureIndex and RowFeatureIndex tests to use a list of dic…
Nov 20, 2024
c9ff683
Update load_h5ad to append features in dict format to to the row feat…
Nov 20, 2024
5f86da4
Modify Single Cell Memmap Dataset unit tests to reflect changes
Nov 20, 2024
896bad0
remove conversion to np.array in get_row for now
Nov 20, 2024
eb17845
Convert values and col indices to np array so that we're not returnin…
Nov 21, 2024
1663903
Revert conversion to np array, and refactor num_vars_at_row to use in…
Nov 22, 2024
0497c98
Merge branch 'main' into savitha/scdl-performance-improvements
savitha-eng Nov 22, 2024
7a43706
Made changes requested in review.
Nov 25, 2024
da395b4
Merge branch 'savitha/scdl-performance-improvements' of github.com:NV…
Nov 25, 2024
9e11ab8
Integrate SCDL into Geneformer, rebased on the latest changes in main
Nov 26, 2024
37de5d1
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
savitha-eng Nov 26, 2024
546f84e
Tests for Geneformer SingleCellDataset
Nov 26, 2024
0dcc56b
Merge branch 'savitha/integrate-scdl-geneformer-rebased' of github.co…
Nov 26, 2024
9d4c6a4
Data directory fixtures needed for pytest
Nov 26, 2024
eea6b42
Add bypass_tokenize_vocab to the arguments for this script
Nov 26, 2024
e642bc9
Changes to Inference tutorial notebook to support SCDL integrated Gen…
Dec 2, 2024
d755901
modify dataset dir creation
Dec 2, 2024
507e31b
all scdl integration changes
Dec 2, 2024
f6d9380
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
savitha-eng Dec 2, 2024
9846894
Updated documentation, removed refs to sc_memmap, & made changes requ…
Dec 4, 2024
6afed04
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
savitha-eng Dec 4, 2024
c63f8d9
single array
polinabinder1 Dec 5, 2024
c1491bd
better fix
polinabinder1 Dec 5, 2024
31bd648
remove notebook for tests
polinabinder1 Dec 5, 2024
84d53f2
more correct
polinabinder1 Dec 5, 2024
c4efa70
string formatting
polinabinder1 Dec 5, 2024
2e18cbb
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
savitha-eng Dec 5, 2024
2971338
wandb version
polinabinder1 Dec 5, 2024
e780e66
Merge branch 'savitha/integrate-scdl-geneformer-rebased' into polinab…
polinabinder1 Dec 5, 2024
64f6ead
not using row feature index
polinabinder1 Dec 5, 2024
1a5a284
subclasses
polinabinder1 Dec 5, 2024
7fe0409
adding the length caching more
polinabinder1 Dec 6, 2024
d7940fd
cast as int
polinabinder1 Dec 6, 2024
73dddc9
profiling
polinabinder1 Dec 9, 2024
fed2c0c
full loading
polinabinder1 Dec 10, 2024
5e8c3bd
whole dataset
polinabinder1 Dec 10, 2024
1 change: 0 additions & 1 deletion .gitignore
@@ -2,7 +2,6 @@
docs/site/
*.nemo
protein/
singlecell/
results/

# Local configs
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM
Submodule Megatron-LM updated 187 files
16 changes: 8 additions & 8 deletions README.md
@@ -279,10 +279,10 @@ type, and then pass in the config type to the training recipe.
Similar to ESM-2, you can download the dataset and checkpoint through our utility function.

```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
GENEFORMER_10M_CKPT=$(download_bionemo_data geneformer/10M_240530:2.0 --source $MY_DATA_SOURCE); \
train_geneformer \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results \
--restore-from-checkpoint-path ${GENEFORMER_10M_CKPT} \
--experiment-name test_experiment \
@@ -305,9 +305,9 @@ copy the `sub-projects/bionemo-geneformer/geneformer/scripts/train_geneformer.py
Simple fine-tuning example (**NOTE**: please change `--restore-from-checkpoint-path` to be the checkpoint directory path that was output last
by the previous train run)
```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
train_geneformer \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results \
--experiment-name test_finettune_experiment \
--num-gpus 1 \
@@ -331,11 +331,11 @@ customizations for your task.


```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
bionemo-geneformer-recipe \
--recipe geneformer_10m_pretrain_recipe \
--dest my_config.yaml \
--data-path ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--recipe 10m-pretrain \
--dest my_config.json \
--data-path ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results
```
> ⚠️ **IMPORTANT:** Inspect and edit the contents of the outputted my_config.yaml as you see fit

This file was deleted.

1 change: 1 addition & 0 deletions requirements-test.txt
@@ -8,3 +8,4 @@ awscli==1.33.33
nbval==0.11.0
# For NvFaidx equivalence tests
pyfaidx==0.8.1.3
wandb<0.19.0 # temporary pin: https://nvidia.slack.com/archives/C074Z808N05/p1733418209959769
@@ -105,16 +105,18 @@ class MultiEpochDatasetResampler(Dataset[T_co]):

def __post_init__(self):
"""Pre-shuffle each epoch's samples."""
self._num_samples_in_dataset = len(self.dataset)

if self.num_epochs is None and self.num_samples is None:
self.num_epochs = 1
elif self.num_epochs is not None and self.num_samples is not None:
raise ValueError("Only one of num_epochs and num_samples should be provided.")

if self.num_epochs is None and self.num_samples is not None:
self.num_epochs = math.ceil(self.num_samples / len(self.dataset))
self.num_epochs = math.ceil(self.num_samples / self._num_samples_in_dataset)

elif self.num_samples is None and self.num_epochs is not None:
self.num_samples = len(self.dataset) * self.num_epochs
self.num_samples = self._num_samples_in_dataset * self.num_epochs

# Type guard statements, the above if/elif block should ensure these are not None.
assert self.num_epochs is not None
@@ -140,10 +142,10 @@ def __len__(self) -> int:

def _global_index_to_permuted_local_index(self, index: int) -> EpochIndex:
"""Convert a global index to an epoch index."""
epoch = index // len(self.dataset)
idx = index % len(self.dataset)
epoch = index // self._num_samples_in_dataset
idx = index % self._num_samples_in_dataset
if self.shuffle:
idx = permute(idx, len(self.dataset), self.epoch_seeds[epoch])
idx = permute(idx, self._num_samples_in_dataset, self.epoch_seeds[epoch])
return EpochIndex(epoch, idx)


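The hunks above cache `len(self.dataset)` once as `_num_samples_in_dataset` instead of recomputing it on every lookup; the index arithmetic itself is unchanged. A minimal standalone sketch of that global-index-to-epoch-index mapping, where `EpochIndex` mirrors the diff and the seeded shuffle is an illustrative stand-in for the library's `permute(idx, n, seed)` (not the actual implementation):

```python
import random
from typing import NamedTuple, Sequence


class EpochIndex(NamedTuple):
    epoch: int
    idx: int


def global_to_epoch_index(
    index: int,
    num_samples_in_dataset: int,
    epoch_seeds: Sequence[int],
    shuffle: bool = True,
) -> EpochIndex:
    """Map a global sample index onto (epoch, within-epoch index).

    The dataset length is passed in precomputed, mirroring the diff's
    switch from repeated len(dataset) calls to a cached value.
    """
    epoch = index // num_samples_in_dataset
    idx = index % num_samples_in_dataset
    if shuffle:
        # Stand-in for the library's permute(idx, n, seed): build a seeded
        # permutation of range(n) and index into it.
        rng = random.Random(epoch_seeds[epoch])
        perm = list(range(num_samples_in_dataset))
        rng.shuffle(perm)
        idx = perm[idx]
    return EpochIndex(epoch, idx)
```

For example, with a 3-sample dataset, global index 7 lands in epoch 2 at within-epoch position 1 before shuffling.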
@@ -5,3 +5,11 @@
sha256: 7a4237537bf535dfa00301ce8cc7073e0a23d5bc8aa902ad65db9f51b57a6df9 # pragma: allowlist secret
owner: Polina Binder <[email protected]>
description: Sample test data for SCDL.

- tag: sample_scdl_feature_ids
ngc: nvidia/clara/scdl_sample_test_feature_ids:1.0
ngc_registry: resource
pbss: s3://bionemo-ci/test-data/scdl_sample_test_feat_ids.tar.gz
sha256: 9020ba336dbfe33bddadba26ca0cde49958cbd73c5ad44f0960a5a4837c9db26 # pragma: allowlist secret
owner: Savitha Srinivasan <[email protected]>
description: Sample test data for SCDL with feature IDs appended.
@@ -21,3 +21,11 @@
sha256: ab038b184de52e53ff7bcea5e01d97d55944c507db88c0495bdf9e5e9e0303a4 # pragma: allowlist secret
owner: John St John <[email protected]>
description: Golden values for geneformer QA model.

- tag: testdata-20241203
ngc: nvidia/clara/singlecell-testdata:2.0
ngc_registry: resource
pbss: "s3://bionemo-ci/test-data/singlecell/singlecell-scdltestdata-20241203.tar.gz"
sha256: d8e3ea569bc43768c24aa651aff77722df202078415528497c22394046b08cc3 # pragma: allowlist secret
owner: Savitha Srinivasan <[email protected]>
description: Test data for single cell models in SCDL Memmap format.
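Each resource entry above records a `sha256` checksum so a downloaded tarball can be verified before use. A hedged sketch of that check (the helper name and file path are illustrative, not part of the repository's download utility):

```python
import hashlib
from pathlib import Path


def sha256_of_file(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream a file through SHA-256 so large tarballs need not fit in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


# Compare against the checksum recorded in the resource yaml, e.g.:
# expected = "d8e3ea569bc43768c24aa651aff77722df202078415528497c22394046b08cc3"
# assert sha256_of_file(Path("singlecell-scdltestdata-20241203.tar.gz")) == expected
```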
2 changes: 1 addition & 1 deletion sub-packages/bionemo-geneformer/README.md
@@ -16,7 +16,7 @@ pytest -v .


## Acquiring Data
Datasets are expected to be in the form of AnnData (.h5ad) objects such as those downloaded from [Cell x Gene | CZI](https://chanzuckerberg.github.io/cellxgene-census/). They are then pre-processed with either `bionemo-geneformer/src/bionemo/geneformer/data/singlecell/sc_memmap.py` or with sc-DL.
Datasets are expected to be in the form of AnnData (.h5ad) objects such as those downloaded from [Cell x Gene | CZI](https://chanzuckerberg.github.io/cellxgene-census/). They are then pre-processed with `sub-packages/bionemo-scdl/src/bionemo/scdl/scripts/convert_h5ad_to_scdl.py`.

## Geneformer-nv 10M and 106M
Refer to the Dataset cards and Model cards to learn more about the pre-trained checkpoints provided for both 10M and 106M of Geneformer-nv.
1 change: 0 additions & 1 deletion sub-packages/bionemo-geneformer/pyproject.toml
@@ -21,7 +21,6 @@ dependencies = [
[project.scripts]
bionemo-geneformer-train= "bionemo.geneformer.run.main:main"
bionemo-geneformer-recipe= "bionemo.geneformer.run.recipes:main"
sc_memmap = "bionemo.geneformer.scripts.sc_memmap:main_cli"
infer_geneformer = "bionemo.geneformer.scripts.infer_geneformer:geneformer_infer_entrypoint"
train_geneformer = "bionemo.geneformer.scripts.train_geneformer:entrypoint"
geneformer_mlm_loss_eval = "bionemo.geneformer.scripts.geneformer_mlm_loss_eval:entrypoint"
@@ -128,6 +128,7 @@ def main(
seq_len_nv: int = 2048,
seq_len_hf: int = 2048,
seed: int = 513,
skip_unrecognized_vocab_in_dataset: bool = True,
):
"""Inference function (requires DDP and only training data that fits in memory)."""
# This is just used to get the tokenizer :(
@@ -185,6 +186,7 @@
max_len=seq_len_nv,
mask_prob=mask_prob,
seed=seed,
skip_unrecognized_vocab_in_dataset=skip_unrecognized_vocab_in_dataset,
)
ds_hf_nvfilt = SingleCellDataset(
dataset_path,
@@ -194,6 +196,7 @@
mask_prob=mask_prob,
eos_token=hf_tokenizer.token_to_id(hf_tokenizer.sep_token), # Stored in the special token
seed=seed,
skip_unrecognized_vocab_in_dataset=skip_unrecognized_vocab_in_dataset,
)
print(f"Loaded dataset of length (NV): {len(ds_nv)}, (HF): {len(ds_hf_nvfilt)}")

@@ -299,6 +302,11 @@ def entrypoint():
)
parser.add_argument("--hf-model-path", type=str, default="ctheodoris/Geneformer", help="HF model path")
parser.add_argument("--dataset-path", type=Path, help="Path to dataset directory", required=True)
parser.add_argument(
"--skip-unrecognized-vocab-in-dataset",
action="store_false",
help="Set to False to verify whether all gene identifiers are in the user-supplied tokenizer vocab. Defaults to True, which means any gene identifier not in the user-supplied tokenizer vocab will be excluded.",
)

args = parser.parse_args()
main(
@@ -307,6 +315,7 @@
args.dataset_path,
args.hf_token_dictionary_path,
args.hf_medians_dictionary_path,
args.skip_unrecognized_vocab_in_dataset,
)


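The `--skip-unrecognized-vocab-in-dataset` flag added above uses `action="store_false"`, which has an inverted feel: the destination defaults to True (unrecognized gene IDs are skipped), and supplying the flag flips it to False (strict vocab checking). A standalone sketch of those semantics:

```python
import argparse

# With action="store_false" the destination defaults to True; passing the
# flag on the command line sets it to False.
parser = argparse.ArgumentParser()
parser.add_argument("--skip-unrecognized-vocab-in-dataset", action="store_false")

defaults = parser.parse_args([])   # skip_unrecognized_vocab_in_dataset == True
strict = parser.parse_args(["--skip-unrecognized-vocab-in-dataset"])  # == False
```

As an aside (an observation, not something this PR does), `argparse.BooleanOptionalAction` would make the on/off sense explicit via paired `--flag/--no-flag` options.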
@@ -51,6 +51,7 @@ class SingleCellDataModule(MegatronDataModule):
num_mask_per_sample (int): Number of masked versions of a single sample to be returned by each worker
train_batch_size (int): Batch size for training
val_batch_size (int): Batch size for validation
skip_unrecognized_vocab_in_dataset (bool, optional): Set to False to verify whether all gene identifiers are in the user-supplied tokenizer vocab. Defaults to True, which means any gene identifier not in the user-supplied tokenizer vocab will be excluded.

Attributes:
cfg (Config): Configuration object
@@ -82,6 +83,7 @@ def __init__(  # noqa: D107
num_workers: int = 10, # TODO can this be automatically set?
persistent_workers: bool = True,
pin_memory: bool = True,
skip_unrecognized_vocab_in_dataset: bool = True,
) -> None:
super().__init__()
if predict_dataset_path is None:
@@ -122,6 +124,7 @@ def __init__(  # noqa: D107
mask_token_prob=self.mask_token_prob,
random_token_prob=self.random_token_prob,
seed=random_utils.get_seed_from_rng(rng),
skip_unrecognized_vocab_in_dataset=skip_unrecognized_vocab_in_dataset,
)
self._val_dataset_ori = SingleCellDataset(
self.data_path_val,
@@ -132,6 +135,7 @@
mask_token_prob=self.mask_token_prob,
random_token_prob=self.random_token_prob,
seed=random_utils.get_seed_from_rng(rng),
skip_unrecognized_vocab_in_dataset=skip_unrecognized_vocab_in_dataset,
)
self._test_dataset_ori = SingleCellDataset(
self.data_path_test,
@@ -142,6 +146,7 @@
mask_token_prob=self.mask_token_prob,
random_token_prob=self.random_token_prob,
seed=random_utils.get_seed_from_rng(rng),
skip_unrecognized_vocab_in_dataset=skip_unrecognized_vocab_in_dataset,
)
self._predict_dataset_ori = None
else:
@@ -155,6 +160,7 @@
mask_token_prob=self.mask_token_prob,
random_token_prob=self.random_token_prob,
seed=random_utils.get_seed_from_rng(rng),
skip_unrecognized_vocab_in_dataset=skip_unrecognized_vocab_in_dataset,
)
self._train_dataset_ori = None
self._val_dataset_ori = None
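The datamodule hunks thread the new `skip_unrecognized_vocab_in_dataset` kwarg through four near-identical `SingleCellDataset` constructions. One way to keep such repeated call sites in sync is to collect the shared kwargs once; this is a sketch under the assumption of the constructor signature shown in the diff, and the helper name is illustrative:

```python
def shared_dataset_kwargs(
    tokenizer,
    median_dict,
    max_len=2048,
    mask_prob=0.15,
    mask_token_prob=0.8,
    random_token_prob=0.1,
    skip_unrecognized_vocab_in_dataset=True,
):
    """Kwargs common to the train/val/test/predict dataset splits, so a new
    flag only needs to be threaded through in one place."""
    return dict(
        tokenizer=tokenizer,
        median_dict=median_dict,
        max_len=max_len,
        mask_prob=mask_prob,
        mask_token_prob=mask_token_prob,
        random_token_prob=random_token_prob,
        skip_unrecognized_vocab_in_dataset=skip_unrecognized_vocab_in_dataset,
    )

# Usage sketch: SingleCellDataset(self.data_path_train,
#     seed=random_utils.get_seed_from_rng(rng),
#     **shared_dataset_kwargs(tokenizer, median_dict))
```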