diff --git a/training/configs/curriculum/graphs_curriculum.yaml b/training/configs/curriculum/graphs_curriculum.yaml
new file mode 100644
index 00000000..4efc526d
--- /dev/null
+++ b/training/configs/curriculum/graphs_curriculum.yaml
@@ -0,0 +1,225 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+curriculum:
+  enabled: True
+  schedule:
+    automatic: False
+    update_steps: 30
+  last_k: 2560
+  success_threshold: 0.70
+  failure_threshold: 0.20
+  curricula:
+    course_schedule:
+      attribute_levels:
+        num_courses: 0
+        num_prerequisites: 0
+        cycle_length: 0
+    family_relationships:
+      attribute_levels:
+        family_size: 0
+    largest_island:
+      attribute_levels:
+        rows: 0
+        cols: 0
+        num_islands: 0
+        island_size: 0
+    quantum_lock:
+      attribute_levels:
+        difficulty: 0
+    shortest_path:
+      attribute_levels:
+        rows: 0
+        cols: 0
+
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 8
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 2
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 34816 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+      min_lr_ratio: null # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 400 # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+      model_dtype: bfloat16
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      model_dtype: bfloat16
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length} # not used for opensource
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8 # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+verbose: True
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 400
+  project_name: curriculum
+  experiment_name: graphs_curriculum
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume. If can't find, start from scratch
+  resume_mode: auto # or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+    min_lr_ratio: null # only useful for warmup with cosine
+    warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
diff --git a/training/configs/curriculum/graphs_noncurriculum.yaml b/training/configs/curriculum/graphs_noncurriculum.yaml
new file mode 100644
index 00000000..0a8dc8bd
--- /dev/null
+++ b/training/configs/curriculum/graphs_noncurriculum.yaml
@@ -0,0 +1,233 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+    course_schedule:
+      weight: 1
+    family_relationships:
+      weight: 1
+    largest_island:
+      weight: 1
+    quantum_lock:
+      weight: 1
+    shortest_path:
+      weight: 1
+curriculum:
+  enabled: False
+  schedule:
+    automatic: False
+    update_steps: 30
+  last_k: 2560
+  success_threshold: 0.70
+  failure_threshold: 0.20
+  curricula:
+    course_schedule:
+      attribute_levels:
+        num_courses: 0
+        num_prerequisites: 0
+        cycle_length: 0
+    family_relationships:
+      attribute_levels:
+        family_size: 0
+    largest_island:
+      attribute_levels:
+        rows: 0
+        cols: 0
+        num_islands: 0
+        island_size: 0
+    quantum_lock:
+      attribute_levels:
+        difficulty: 0
+    shortest_path:
+      attribute_levels:
+        rows: 0
+        cols: 0
+
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 8
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 2
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 34816 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+      min_lr_ratio: null # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 400 # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length} # not used for opensource
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8 # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+verbose: True
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 400
+  project_name: curriculum
+  experiment_name: graphs_noncurriculum
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume. If can't find, start from scratch
+  resume_mode: auto # or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+    min_lr_ratio: null # only useful for warmup with cosine
+    warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
diff --git a/training/evaluations/curriculum/graphs.yml b/training/evaluations/curriculum/graphs.yml
new file mode 100644
index 00000000..c9df6310
--- /dev/null
+++ b/training/evaluations/curriculum/graphs.yml
@@ -0,0 +1,40 @@
+# Config used for evaluating curriculum experiment models on graphs composite data
+
+# Models evaluated on this config:
+# Qwen/Qwen2.5-3B-Instruct (original model)
+# qwen3b_graphs_noncurriculum_300 (original + 300 GRPO steps on non-curriculum graphs data)
+# qwen3b_graphs_curriculum_300 (original + 300 GRPO steps on curriculum graphs data)
+
+model_path: Qwen/Qwen2.5-3B-Instruct # Default model path
+
+max_tokens: 2048 # From max_response_length in training config
+top_p: 1.0
+temperature: 1.0 # Matches the rollout temperature used during training
+dtype: bfloat16
+
+developer_prompt: DeepSeekZero
+developer_role: system
+
+output_dir: results
+save_metadata: true
+save_full_results: true
+eval_repeats: 3
+
+categories:
+  - category: graphs
+    datasets:
+      - dataset: course_schedule
+        size: 50
+        seed: 42
+      - dataset: family_relationships
+        size: 50
+        seed: 42
+      - dataset: largest_island
+        size: 50
+        seed: 42
+      - dataset: quantum_lock
+        size: 50
+        seed: 42
+      - dataset: shortest_path
+        size: 50
+        seed: 42
diff --git a/training/evaluations/evaluate_model.py b/training/evaluations/evaluate_model.py
index 5b51ee48..2714eeaa 100644
--- a/training/evaluations/evaluate_model.py
+++ b/training/evaluations/evaluate_model.py
@@ -45,6 +45,7 @@ class EvalConfig:
     model_path: str
     max_tokens: int
     temperature: float
+    dtype: str
     top_p: float
     output_dir: str
     save_metadata: bool
@@ -82,7 +83,7 @@ def __init__(
         self.verbose = verbose

         # Load model and tokenizer
-        self.llm = LLM(model=model_path)
+        self.llm = LLM(model=model_path, dtype=config.dtype)
         self.tokenizer = self.llm.get_tokenizer()
         self.sampling_params = SamplingParams(
             temperature=config.temperature,
@@ -214,6 +215,7 @@ def evaluate_all(self) -> Dict[str, Any]:
                 "duration_seconds": (datetime.now() - self.start_time).total_seconds(),
                 "max_tokens": self.config.max_tokens,
                 "temperature": self.config.temperature,
+                "dtype": self.config.dtype,
                 "top_p": self.config.top_p,
                 "eval_repeats": self.config.eval_repeats,
             },
diff --git a/training/trainers/ray_grpo_trainer.py b/training/trainers/ray_grpo_trainer.py
index acbaa740..b8c6bb15 100644
--- a/training/trainers/ray_grpo_trainer.py
+++ b/training/trainers/ray_grpo_trainer.py
@@ -365,13 +365,23 @@ def fit(self):
                 if self.config.curriculum.schedule.automatic:
                     for dataset_name in grouped_scores.keys():
                         if self.global_steps % self.config.curriculum.schedule.update_steps == 0:
-                            self.train_dataset.experiment.update_difficulty(dataset_name, method="increment")
+                            self.train_dataset.update_experiment_difficulty(dataset_name, method="increment")
                 else:
                     for dataset_name in grouped_scores.keys():
                         if (
                             grouped_scores[dataset_name]["results"] > self.config.curriculum.success_threshold
                         ) and (grouped_scores[dataset_name]["total_samples"] >= self.config.curriculum.last_k):
-                            self.train_dataset.experiment.update_difficulty(dataset_name, method="increment")
+                            print(
+                                f"Increasing difficulty for dataset: {dataset_name} (success rate: {grouped_scores[dataset_name]['results']:.2f}, samples: {grouped_scores[dataset_name]['total_samples']})"
+                            )
+                            self.train_dataset.update_experiment_difficulty(dataset_name, method="increment")
+                        elif (
+                            grouped_scores[dataset_name]["results"] < self.config.curriculum.failure_threshold
+                        ) and (grouped_scores[dataset_name]["total_samples"] >= self.config.curriculum.last_k):
+                            print(
+                                f"Decreasing difficulty for dataset: {dataset_name} (success rate: {grouped_scores[dataset_name]['results']:.2f}, samples: {grouped_scores[dataset_name]['total_samples']})"
+                            )
+                            self.train_dataset.update_experiment_difficulty(dataset_name, method="decrement")

                 metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
                 metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
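
Note on the trainer hunk above: under the manual schedule, a dataset's difficulty is incremented when its recent success rate clears the success threshold and decremented when it falls below the failure threshold, in both cases only after at least last_k samples have been scored. The standalone Python sketch below restates that decision rule with the threshold values from the curriculum config (0.70 / 0.20 / 2560); the function name and signature are illustrative and are not part of the repository's API.

from typing import Optional


def decide_difficulty_update(
    success_rate: float,
    total_samples: int,
    success_threshold: float = 0.70,
    failure_threshold: float = 0.20,
    last_k: int = 2560,
) -> Optional[str]:
    """Return "increment", "decrement", or None for a single dataset."""
    if total_samples < last_k:
        # Too few recent samples to judge difficulty reliably.
        return None
    if success_rate > success_threshold:
        return "increment"
    if success_rate < failure_threshold:
        return "decrement"
    return None


if __name__ == "__main__":
    print(decide_difficulty_update(0.85, 3000))  # increment
    print(decide_difficulty_update(0.10, 3000))  # decrement
    print(decide_difficulty_update(0.50, 3000))  # None (between thresholds)
    print(decide_difficulty_update(0.85, 100))   # None (fewer than last_k samples)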
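
For context on the reward section shared by both training configs, one plausible reading is that the accuracy reward is combined linearly with the scaled secondary rewards (cosine at 0.3, format at 0.2). The sketch below only illustrates that assumed combination; the actual reward composition lives in the training code, which is not part of this diff, and combined_reward is a hypothetical helper, not a repository function.

def combined_reward(accuracy: float, secondary_scores: dict) -> float:
    """Assumed linear combination of accuracy with scaled secondary rewards."""
    # Scaling factors mirror reward.secondary_rewards in the configs above.
    scaling_factors = {"cosine": 0.3, "format": 0.2}
    total = accuracy  # use_accuracy: True makes accuracy the primary term
    for name, score in secondary_scores.items():
        total += scaling_factors.get(name, 0.0) * score
    return total


# Example: a correct answer with a concise, well-formatted response.
print(combined_reward(1.0, {"cosine": 0.8, "format": 1.0}))  # 1.0 + 0.24 + 0.2 = 1.44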