diff --git a/reasoning_gym/algorithmic/count_primes.py b/reasoning_gym/algorithmic/count_primes.py index 1fcacd05..df7c173d 100644 --- a/reasoning_gym/algorithmic/count_primes.py +++ b/reasoning_gym/algorithmic/count_primes.py @@ -84,7 +84,7 @@ def __init__(self): self._define_attributes( RangeAttributeDefinition( name="n", - levels=[10, 1000, 10_000, 50_000, 100_000], + levels=[100, 500, 1000, 5000], description="Up to which number to consider the primes", lower_field_name="min_n", upper_field_name="max_n", diff --git a/training/configs/curriculum-experiments/count_primes/curriculum.yaml b/training/configs/curriculum-experiments/count_primes/curriculum.yaml new file mode 100644 index 00000000..2689daf8 --- /dev/null +++ b/training/configs/curriculum-experiments/count_primes/curriculum.yaml @@ -0,0 +1,212 @@ +hydra: + searchpath: + - file:///home/ubuntu/verl/verl/trainer/config + +defaults: + - ppo_trainer + - _self_ + +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: +curriculum: + enabled: True + schedule: + automatic: False + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 5120 # num_generations * batch_size * num_training_steps + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + count_primes: + attribute_levels: + n: 0 + +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 300 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 300 + project_name: reasoning_gym + experiment_name: count_primes + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/curriculum-experiments/count_primes/non_curriculum.yaml b/training/configs/curriculum-experiments/count_primes/non_curriculum.yaml new file mode 100644 index 00000000..e75274d1 --- /dev/null +++ b/training/configs/curriculum-experiments/count_primes/non_curriculum.yaml @@ -0,0 +1,217 @@ +hydra: + searchpath: + - file:///home/ubuntu/verl/verl/trainer/config + +defaults: + - ppo_trainer + - _self_ + +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + count_primes: + weight: 1 + config: + min_n: 100 + max_n: 5000 +curriculum: + enabled: False + schedule: + automatic: False + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 5120 # num_generations * batch_size * num_training_steps + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + count_primes: + attribute_levels: + n: 0 + +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will 
be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 300 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 300 + project_name: reasoning_gym + experiment_name: count_primes_non_curriculum + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/curriculum-experiments/mini_sukoku/curriculum.yaml b/training/configs/curriculum-experiments/mini_sukoku/curriculum.yaml new file mode 100644 index 00000000..4e768dfe --- /dev/null +++ b/training/configs/curriculum-experiments/mini_sukoku/curriculum.yaml @@ -0,0 +1,211 @@ +hydra: + searchpath: + - file:///home/ubuntu/verl/verl/trainer/config + +defaults: + - ppo_trainer + - _self_ + +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: +curriculum: + enabled: True + schedule: + automatic: False + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 5120 # num_generations * batch_size * num_training_steps + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + mini_sudoku: + attribute_levels: + empty: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 
4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 300 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 300 + project_name: reasoning_gym + experiment_name: mini_sudoku_curriculum + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/curriculum-experiments/mini_sukoku/non_curriculum.yaml b/training/configs/curriculum-experiments/mini_sukoku/non_curriculum.yaml new file mode 100644 index 00000000..5f7dfb6c --- /dev/null +++ b/training/configs/curriculum-experiments/mini_sukoku/non_curriculum.yaml @@ -0,0 +1,216 @@ +hydra: + searchpath: + - file:///home/ubuntu/verl/verl/trainer/config + +defaults: + - ppo_trainer + - _self_ + +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + mini_sudoku: + weight: 1 + config: + min_empty: 4 + max_empty: 12 +curriculum: + enabled: False + schedule: + automatic: False + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 5120 # num_generations * batch_size * num_training_steps + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + mini_sudoku: + attribute_levels: + empty: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be
deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 300 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 300 + project_name: reasoning_gym + experiment_name: mini_sudoku_non_curriculum + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/curriculum-experiments/spell_backward/curriculum.yaml b/training/configs/curriculum-experiments/spell_backward/curriculum.yaml new file mode 100644 index 00000000..5b262c99 --- /dev/null +++ b/training/configs/curriculum-experiments/spell_backward/curriculum.yaml @@ -0,0 +1,211 @@ +hydra: + searchpath: + - file:///home/ubuntu/verl/verl/trainer/config + +defaults: + - ppo_trainer + - _self_ + +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: +curriculum: + enabled: True + schedule: + automatic: False + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 5120 # num_generations * batch_size * num_training_steps + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + 
ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 300 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 300 + project_name: reasoning_gym + experiment_name: spell_backward_curriculum + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/curriculum-experiments/spell_backward/non_curriculum.yaml b/training/configs/curriculum-experiments/spell_backward/non_curriculum.yaml new file mode 100644 index 00000000..d8c525eb --- /dev/null +++ b/training/configs/curriculum-experiments/spell_backward/non_curriculum.yaml @@ -0,0 +1,216 @@ +hydra: + searchpath: + - file:///home/ubuntu/verl/verl/trainer/config + +defaults: + - ppo_trainer + - _self_ + +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + spell_backward: + weight: 0.33 + config: + min_word_len: 3 + max_word_len: 10 +curriculum: + enabled: False + schedule: + automatic: False + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use 
ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 400 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 400 + project_name: reasoning_gym + experiment_name: spell_backward_non_curriculum + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/evaluations/curriculum/count_primes.yaml b/training/evaluations/curriculum/count_primes.yaml new file mode 100644 index 00000000..de4a14c4 --- /dev/null +++ b/training/evaluations/curriculum/count_primes.yaml @@ -0,0 +1,25 @@ +model_path: Qwen/Qwen2.5-3B-Instruct # Default model path +# model_path: joesharratt29/count_prime_curriculum +# model_path: joesharratt29/count_primes_non_curriculum + +max_tokens: 2048 # From max_response_length in training config +top_p: 1.0 +temperature: 1.0 # + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: algorithmic + datasets: + - dataset: count_primes + size: 100 + seed: 42 + params: + min_n: 100 + max_n: 5000 diff --git a/training/evaluations/curriculum/mini_sudoku.yaml b/training/evaluations/curriculum/mini_sudoku.yaml new file mode 100644 index 00000000..4f7437e0 --- /dev/null +++ b/training/evaluations/curriculum/mini_sudoku.yaml @@ -0,0 +1,25 @@ +model_path: joesharratt29/mini_sudoku_non_curriculum # Default model path +# model_path: joesharratt29/mini_sudoku_non_curriculum +# model_path: joesharratt29/mini_sudoku_curriculum + +max_tokens: 2048 # From max_response_length in training config +top_p: 1.0 +temperature: 1.0 # + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: algorithmic + datasets: + - dataset: mini_sudoku + size: 100 + seed: 42 + params: + min_empty: 4 + max_empty: 12 diff --git 
a/training/evaluations/curriculum/spell_backward.yaml b/training/evaluations/curriculum/spell_backward.yaml new file mode 100644 index 00000000..2b0e7f1b --- /dev/null +++ b/training/evaluations/curriculum/spell_backward.yaml @@ -0,0 +1,26 @@ +model_path: Qwen/Qwen2.5-3B-Instruct # Default model path +# model_path: joesharratt29/spell_backward_non_curriculum +# model_path: joesharratt29/spell_backward_curriculum + +max_tokens: 2048 # From max_response_length in training config +top_p: 1.0 +temperature: 1.0 # + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: algorithmic + datasets: + - dataset: spell_backward + size: 100 + seed: 42 + params: + min_word_len: 3 # Minimum word length + max_word_len: 10 + data_file: holdout_words.txt diff --git a/training/trainers/ray_grpo_trainer.py b/training/trainers/ray_grpo_trainer.py index 7c414ccb..533c4ecd 100644 --- a/training/trainers/ray_grpo_trainer.py +++ b/training/trainers/ray_grpo_trainer.py @@ -365,9 +365,8 @@ def fit(self): if self.config.curriculum.schedule.automatic: for dataset_name in grouped_scores.keys(): if self.global_steps % self.config.curriculum.schedule.update_steps == 0: - self.train_dataset.experiment.update_difficulty(dataset_name, method="increment") + self.train_dataset.update_experiment_difficulty(dataset_name, method="increment") else: - print(grouped_scores) for dataset_name in grouped_scores.keys(): if ( grouped_scores[dataset_name]["results"] > self.config.curriculum.success_threshold diff --git a/training/utils/load_fsdp_to_hf.py b/training/utils/load_fsdp_to_hf.py index 495266c3..809c30ad 100644 --- a/training/utils/load_fsdp_to_hf.py +++ b/training/utils/load_fsdp_to_hf.py @@ -5,10 +5,11 @@ import fire import torch +from huggingface_hub import HfApi, create_repo from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -def main(fsdp_checkpoint_path, huggingface_model_path, output_path): +def main(fsdp_checkpoint_path, huggingface_model_path, output_path, push_to_hub=True, hub_token="", private=True): state_dict = defaultdict(list) world_size = 4 @@ -31,6 +32,27 @@ def main(fsdp_checkpoint_path, huggingface_model_path, output_path): tokenizer = AutoTokenizer.from_pretrained(huggingface_model_path) tokenizer.save_pretrained(output_path) + # Push to hub if requested + if push_to_hub: + if not output_path: + raise ValueError("output path must be provided when push_to_hub=True") + + print(f"Pushing model to Hugging Face Hub: {output_path}") + + # Create repository if it doesn't exist + api = HfApi(token=hub_token) + try: + create_repo(repo_id=output_path, private=private, exist_ok=True, token=hub_token) + print(f"Repository {output_path} created or already exists") + except Exception as e: + print(f"Repository creation info: {e}") + + # Push model and tokenizer to hub + model.push_to_hub(output_path, token=hub_token, private=private) + tokenizer.push_to_hub(output_path, token=hub_token, private=private) + + print(f"✅ Model successfully pushed to https://huggingface.co/{output_path}") + if __name__ == "__main__": fire.Fire(main)
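The curriculum schedule that the configs above parameterise (update_steps, last_k, success_threshold, failure_threshold) is applied by the hunk in training/trainers/ray_grpo_trainer.py. A minimal sketch of that logic follows; the hunk is truncated after the success_threshold comparison, so the failure/decrement branch and the exact shape of grouped_scores below are assumptions, not the verbatim trainer code.

# Hedged sketch of the curriculum update driven by the YAML settings above.
# apply_curriculum_schedule, its arguments, and the decrement branch are illustrative;
# only update_experiment_difficulty(dataset_name, method="increment") and the
# success_threshold comparison appear in the diff itself.
def apply_curriculum_schedule(train_dataset, grouped_scores, curriculum_cfg, global_steps):
    if curriculum_cfg.schedule.automatic:
        # automatic mode: raise difficulty on a fixed step cadence (update_steps)
        if global_steps % curriculum_cfg.schedule.update_steps == 0:
            for dataset_name in grouped_scores.keys():
                train_dataset.update_experiment_difficulty(dataset_name, method="increment")
    else:
        # threshold mode: adjust difficulty from the score aggregated over the last_k samples
        for dataset_name in grouped_scores.keys():
            results = grouped_scores[dataset_name]["results"]
            if results > curriculum_cfg.success_threshold:
                train_dataset.update_experiment_difficulty(dataset_name, method="increment")
            elif results < curriculum_cfg.failure_threshold:  # assumed, mirrors the success branch
                train_dataset.update_experiment_difficulty(dataset_name, method="decrement")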
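The updated training/utils/load_fsdp_to_hf.py can additionally push the merged checkpoint to the Hugging Face Hub. A hypothetical invocation is sketched below; every path, repo id and token is a placeholder, and only the function signature (fsdp_checkpoint_path, huggingface_model_path, output_path, push_to_hub, hub_token, private) comes from the diff. Note that output_path is reused as the Hub repo id by the new push logic.

# Hypothetical usage of the updated conversion script; all values are placeholders.
from load_fsdp_to_hf import main  # assuming the script's directory is on PYTHONPATH;
                                  # via fire: python training/utils/load_fsdp_to_hf.py <args>

main(
    fsdp_checkpoint_path="checkpoints/reasoning_gym/count_primes/global_step_300/actor",  # placeholder
    huggingface_model_path="Qwen/Qwen2.5-3B-Instruct",     # base model providing config/tokenizer
    output_path="your-username/count_primes_curriculum",   # local save dir; also used as the Hub repo id
    push_to_hub=True,
    hub_token="hf_xxx",  # placeholder token with write access
    private=True,
)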