diff --git a/training/configs/curriculum/graphs_curriculum.yaml b/training/configs/curriculum/graphs_curriculum.yaml
new file mode 100644
index 00000000..4efc526d
--- /dev/null
+++ b/training/configs/curriculum/graphs_curriculum.yaml
@@ -0,0 +1,225 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+curriculum:
+  enabled: True
+  schedule:
+    automatic: False
+    update_steps: 30
+  last_k: 2560
+  success_threshold: 0.70
+  failure_threshold: 0.20
+  curricula:
+    course_schedule:
+      attribute_levels:
+        num_courses: 0
+        num_prerequisites: 0
+        cycle_length: 0
+    family_relationships:
+      attribute_levels:
+        family_size: 0
+    largest_island:
+      attribute_levels:
+        rows: 0
+        cols: 0
+        num_islands: 0
+        island_size: 0
+    quantum_lock:
+      attribute_levels:
+        difficulty: 0
+    shortest_path:
+      attribute_levels:
+        rows: 0
+        cols: 0
+
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 8
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 2
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 34816 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+      min_lr_ratio: null # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 400 # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+      model_dtype: bfloat16
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      model_dtype: bfloat16
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length} # not used for opensource
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8 # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+verbose: True
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 400
+  project_name: curriculum
+  experiment_name: graphs_curriculum
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume. If can't find, start from scratch
+  resume_mode: auto # or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+    min_lr_ratio: null # only useful for warmup with cosine
+    warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
diff --git a/training/configs/curriculum/graphs_noncurriculum.yaml b/training/configs/curriculum/graphs_noncurriculum.yaml
new file mode 100644
index 00000000..0a8dc8bd
--- /dev/null
+++ b/training/configs/curriculum/graphs_noncurriculum.yaml
@@ -0,0 +1,233 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+    course_schedule:
+      weight: 1
+    family_relationships:
+      weight: 1
+    largest_island:
+      weight: 1
+    quantum_lock:
+      weight: 1
+    shortest_path:
+      weight: 1
+curriculum:
+  enabled: False
+  schedule:
+    automatic: False
+    update_steps: 30
+  last_k: 2560
+  success_threshold: 0.70
+  failure_threshold: 0.20
+  curricula:
+    course_schedule:
+      attribute_levels:
+        num_courses: 0
+        num_prerequisites: 0
+        cycle_length: 0
+    family_relationships:
+      attribute_levels:
+        family_size: 0
+    largest_island:
+      attribute_levels:
+        rows: 0
+        cols: 0
+        num_islands: 0
+        island_size: 0
+    quantum_lock:
+      attribute_levels:
+        difficulty: 0
+    shortest_path:
+      attribute_levels:
+        rows: 0
+        cols: 0
+
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 8
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 2
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 34816 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+      min_lr_ratio: null # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 400 # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length} # not used for opensource
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8 # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+verbose: True
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 400
+  project_name: curriculum
+  experiment_name: graphs_noncurriculum
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume. If can't find, start from scratch
+  resume_mode: auto # or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+    min_lr_ratio: null # only useful for warmup with cosine
+    warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
diff --git a/training/evaluations/curriculum/graphs.yml b/training/evaluations/curriculum/graphs.yml
new file mode 100644
index 00000000..c9df6310
--- /dev/null
+++ b/training/evaluations/curriculum/graphs.yml
@@ -0,0 +1,40 @@
+# Config used for evaluating curriculum experiment models on graphs composite data
+
+# Models evaluated on this config:
+# Qwen/Qwen2.5-3B-Instruct (original model)
+# qwen3b_graphs_noncurriculum_300 (original + 300 GRPO steps on non-curriculum graphs data)
+# qwen3b_graphs_curriculum_300 (original + 300 GRPO steps on curriculum graphs data)
+
+model_path: Qwen/Qwen2.5-3B-Instruct # Default model path
+
+max_tokens: 2048 # From max_response_length in training config
+top_p: 1.0
+temperature: 1.0 # Matches the rollout temperature used during training
+dtype: bfloat16
+
+developer_prompt: DeepSeekZero
+developer_role: system
+
+output_dir: results
+save_metadata: true
+save_full_results: true
+eval_repeats: 3
+
+categories:
+  - category: graphs
+    datasets:
+      - dataset: course_schedule
+        size: 50
+        seed: 42
+      - dataset: family_relationships
+        size: 50
+        seed: 42
+      - dataset: largest_island
+        size: 50
+        seed: 42
+      - dataset: quantum_lock
+        size: 50
+        seed: 42
+      - dataset: shortest_path
+        size: 50
+        seed: 42
diff --git a/training/evaluations/evaluate_model.py b/training/evaluations/evaluate_model.py
index 5b51ee48..2714eeaa 100644
--- a/training/evaluations/evaluate_model.py
+++ b/training/evaluations/evaluate_model.py
@@ -45,6 +45,7 @@ class EvalConfig:
     model_path: str
     max_tokens: int
     temperature: float
+    dtype: str
     top_p: float
     output_dir: str
     save_metadata: bool
@@ -82,7 +83,7 @@ def __init__(
         self.verbose = verbose

         # Load model and tokenizer
-        self.llm = LLM(model=model_path)
+        self.llm = LLM(model=model_path, dtype=config.dtype)
         self.tokenizer = self.llm.get_tokenizer()
         self.sampling_params = SamplingParams(
             temperature=config.temperature,
@@ -214,6 +215,7 @@ def evaluate_all(self) -> Dict[str, Any]:
                 "duration_seconds": (datetime.now() - self.start_time).total_seconds(),
                 "max_tokens": self.config.max_tokens,
                 "temperature": self.config.temperature,
+                "dtype": self.config.dtype,
                 "top_p": self.config.top_p,
                 "eval_repeats": self.config.eval_repeats,
             },
diff --git a/training/trainers/ray_grpo_trainer.py b/training/trainers/ray_grpo_trainer.py
index acbaa740..b8c6bb15 100644
--- a/training/trainers/ray_grpo_trainer.py
+++ b/training/trainers/ray_grpo_trainer.py
@@ -365,13 +365,23 @@ def fit(self):
                 if self.config.curriculum.schedule.automatic:
                     for dataset_name in grouped_scores.keys():
                         if self.global_steps % self.config.curriculum.schedule.update_steps == 0:
-                            self.train_dataset.experiment.update_difficulty(dataset_name, method="increment")
+                            self.train_dataset.update_experiment_difficulty(dataset_name, method="increment")
                 else:
                     for dataset_name in grouped_scores.keys():
                         if (
                             grouped_scores[dataset_name]["results"] > self.config.curriculum.success_threshold
                         ) and (grouped_scores[dataset_name]["total_samples"] >= self.config.curriculum.last_k):
-                            self.train_dataset.experiment.update_difficulty(dataset_name, method="increment")
+                            print(
+                                f"Increasing difficulty for dataset: {dataset_name} (success rate: {grouped_scores[dataset_name]['results']:.2f}, samples: {grouped_scores[dataset_name]['total_samples']})"
+                            )
+                            self.train_dataset.update_experiment_difficulty(dataset_name, method="increment")
+                        elif (
+                            grouped_scores[dataset_name]["results"] < self.config.curriculum.failure_threshold
+                        ) and (grouped_scores[dataset_name]["total_samples"] >= self.config.curriculum.last_k):
+                            print(
+                                f"Decreasing difficulty for dataset: {dataset_name} (success rate: {grouped_scores[dataset_name]['results']:.2f}, samples: {grouped_scores[dataset_name]['total_samples']})"
+                            )
+                            self.train_dataset.update_experiment_difficulty(dataset_name, method="decrement")

                 metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
                 metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
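
Note on the trainer hunk above: under the manual schedule, a dataset's difficulty is incremented when its recent success rate clears the success threshold and decremented when it falls below the failure threshold, in both cases only after at least last_k samples have been scored. The standalone Python sketch below restates that decision rule with the threshold values from the curriculum config (0.70 / 0.20 / 2560); the function name and signature are illustrative and are not part of the repository's API.

from typing import Optional


def decide_difficulty_update(
    success_rate: float,
    total_samples: int,
    success_threshold: float = 0.70,
    failure_threshold: float = 0.20,
    last_k: int = 2560,
) -> Optional[str]:
    """Return "increment", "decrement", or None for a single dataset."""
    if total_samples < last_k:
        # Too few recent samples to judge difficulty reliably.
        return None
    if success_rate > success_threshold:
        return "increment"
    if success_rate < failure_threshold:
        return "decrement"
    return None


if __name__ == "__main__":
    print(decide_difficulty_update(0.85, 3000))  # increment
    print(decide_difficulty_update(0.10, 3000))  # decrement
    print(decide_difficulty_update(0.50, 3000))  # None (between thresholds)
    print(decide_difficulty_update(0.85, 100))   # None (fewer than last_k samples)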
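
For context on the reward section shared by both training configs, one plausible reading is that the accuracy reward is combined linearly with the scaled secondary rewards (cosine at 0.3, format at 0.2). The sketch below only illustrates that assumed combination; the actual reward composition lives in the training code, which is not part of this diff, and combined_reward is a hypothetical helper, not a repository function.

def combined_reward(accuracy: float, secondary_scores: dict) -> float:
    """Assumed linear combination of accuracy with scaled secondary rewards."""
    # Scaling factors mirror reward.secondary_rewards in the configs above.
    scaling_factors = {"cosine": 0.3, "format": 0.2}
    total = accuracy  # use_accuracy: True makes accuracy the primary term
    for name, score in secondary_scores.items():
        total += scaling_factors.get(name, 0.0) * score
    return total


# Example: a correct answer with a concise, well-formatted response.
print(combined_reward(1.0, {"cosine": 0.8, "format": 1.0}))  # 1.0 + 0.24 + 0.2 = 1.44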