Add transformers backend (Dense model only) #1
base: main
Changes from 115 commits
@@ -0,0 +1 @@
```
transformers==4.57.1
```
@@ -0,0 +1,53 @@
```yaml
name: Transformers Backend 8 GPU Integration Tests

on:
  push:
    branches: [ main ]
    paths:
      - 'torchtitan/experiments/transformers_backend/**'
  pull_request:
    paths:
      - 'torchtitan/experiments/transformers_backend/**'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'

concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded
        python -m torchtitan.experiments.transformers_backend.tests.integration_tests artifacts-to-be-uploaded --ngpu 8
```
```diff
@@ -12,5 +12,6 @@
         "vlm",
         "compiler_toolkit.deepseek_v3",
         "compiler_toolkit.llama3",
+        "transformers_backend",
     ]
 )
```
@@ -0,0 +1,51 @@
# Huggingface Transformers backend

## Quick start

- Requirements: `transformers==4.57.1`

- Config: `torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml`

```diff
...
[model]
- name = "llama3"
+ name = "Qwen/Qwen3-4B-Instruct-2507"
flavor = "debugmodel"
hf_assets_path = "./tests/assets/tokenizer"
...
```

Inline review comments on `hf_assets_path`:

> **Member:** I just realized the naming of this field might be a little confusing in the current use case. The user still needs to specify their own tokenizer, or download it from HF, before kicking off a run, right? We need to make this clear in the README, telling users they need to download or prepare their tokenizer. Otherwise some users might be confused about whether the tokenizer is downloaded from […]
>
> **Author:** I forgot the reason why we switched to having a […]
>
> **Member:** We want to keep modules being dynamically imported and follow the current practice in torchtitan, because we used to find that importing torchtitan is super slow. By default the folder/path name is the same as the model name in most use cases. In our case, we need to set the model name to […] I also agree the […]
>
> **Author:** What do you actually think of the proposed solution? I think it is a reasonable one while making the user experience very smooth, because in our case this is not a model but a backend. @tianyu-l @wwwjn
>
> **Member:** I think the proposed solution works for now, with the assumption: any model name containing "/" is automatically recognized as a HuggingFace model ID and will use the […] However, if we need to add other job configs only related to […]
**Note:** Any model name containing "/" is automatically recognized as a HuggingFace model ID and will use the `transformers_backend`.
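The note above describes a simple dispatch rule; the sketch below only illustrates that rule with a hypothetical `resolve_backend` helper and is not the actual torchtitan code:

```python
# Hypothetical helper illustrating the "/" dispatch rule described above;
# not the actual torchtitan implementation.
def resolve_backend(model_name: str) -> str:
    # "Qwen/Qwen3-4B-Instruct-2507" -> HF model ID, routed to transformers_backend
    # "llama3"                      -> a built-in torchtitan model name
    return "transformers_backend" if "/" in model_name else model_name

assert resolve_backend("Qwen/Qwen3-4B-Instruct-2507") == "transformers_backend"
assert resolve_backend("llama3") == "llama3"
```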
- Train: `LOG_RANK=7 CONFIG_FILE=<YOUR_PATH>/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml ./run_train.sh --compile.enable`
  - Make sure you have prepared the tokenizer beforehand (one possible approach is sketched below)
<img width="1334" height="453" alt="image" src="https://github.com/user-attachments/assets/da459448-027b-4af9-8176-6a3e433a272c" />
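As raised in the review thread above, the tokenizer has to be in place before launching a run. One possible way to prepare it is sketched here; the model ID and target directory are taken from the example config, and using `transformers`' `AutoTokenizer` for the download is an assumption, not the method this PR prescribes:

```python
# Illustrative only: download a tokenizer from the Hugging Face Hub and save it
# where hf_assets_path points. Assumes `transformers` is installed and the Hub
# is reachable (or the model is already cached locally).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
tokenizer.save_pretrained("./tests/assets/tokenizer")  # writes tokenizer.json, tokenizer_config.json, ...
```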
## Supported Features

- The following models were tested:
  - Dense (FSDP/CP/TP/PP/`torch.compile`)
    - `meta-llama/Llama-3.2-1B`
    - `microsoft/phi-2`
    - `Qwen/Qwen2.5-7B`
    - `mistralai/Mistral-7B-v0.1`
    - `ByteDance-Seed/Seed-Coder-8B-Instruct`
    - `Qwen/Qwen3-4B-Instruct-2507`
    - `arcee-ai/AFM-4.5B`
    - `ibm-granite/granite-3b-code-base-2k`
    - `baidu/ERNIE-4.5-0.3B-Base-PT`
    - `kyutai/helium-1-preview-2b`
    - `allenai/OLMo-7B-hf`
    - `mistralai/Ministral-8B-Instruct-2410`
  - MoE (upcoming)
## Known issues to address later

- When using the HF modeling, in the `FSDP=2 vs FSDP=2 + PP=2` test the `loss` and `grad_norm` are not bitwise matching (though they converge), whereas they do match with the TorchTitan modeling. This will be addressed in another PR; the likely culprit is `register_buffer` when loading the `seed_checkpoint`.
- The HF modeling has lower MFU than the TorchTitan modeling.
## Further work

- Missing `build_optimizers_with_moe_load_balancing` support for MoE
- Missing TP/PP/EP support for MoE
- Load HF weights
- Add LoRA support
@@ -0,0 +1,73 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from dataclasses import dataclass

from torchtitan.components.loss import build_cross_entropy_loss
from torchtitan.components.lr_scheduler import build_lr_schedulers
from torchtitan.components.optimizer import build_optimizers
from torchtitan.components.tokenizer import build_hf_tokenizer
from torchtitan.hf_datasets.text_datasets import build_text_dataloader
from torchtitan.protocols.train_spec import TrainSpec

from .infra.parallelize import parallelize_hf_transformers
from .infra.pipeline import pipeline_hf_transformers
from .model.args import HFTransformerModelArgs
from .model.model import HFTransformerModel


__all__ = [
    "HFTransformerModelArgs",
    "HFTransformerModel",
]


@dataclass
class TitanDenseModelArgs:
    """Arguments for the base TorchTitan model."""

    dim: int = 4096
    n_layers: int = 32
    n_heads: int = 32
    n_kv_heads: int | None = None
    vocab_size: int | None = None
    multiple_of: int = 256
    ffn_dim_multiplier: float | None = None
    norm_eps: float = 1e-5
    rope_theta: float = 10000
    max_seq_len: int = 2048
    depth_init: bool = True
    use_flex_attn: bool = False
    attn_mask_type: str = "causal"


flavors = {
    "debugmodel": HFTransformerModelArgs(
        titan_dense_args=TitanDenseModelArgs(
            dim=256,
            n_layers=2,
            n_heads=16,
            n_kv_heads=16,
        ),
    ),
    "full": HFTransformerModelArgs(
        titan_dense_args=TitanDenseModelArgs(),
    ),
}


def get_train_spec() -> TrainSpec:
    return TrainSpec(
        model_cls=HFTransformerModel,
        model_args=flavors,
        parallelize_fn=parallelize_hf_transformers,
        pipelining_fn=pipeline_hf_transformers,
        build_optimizers_fn=build_optimizers,
        build_lr_schedulers_fn=build_lr_schedulers,
        build_dataloader_fn=build_text_dataloader,
        build_tokenizer_fn=build_hf_tokenizer,
        build_loss_fn=build_cross_entropy_loss,
    )
```
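As a quick illustration of how the flavors and train spec above fit together, the sketch below looks up the `debugmodel` flavor. It assumes this module is importable as `torchtitan.experiments.transformers_backend` and that `titan_dense_args` is kept as an attribute on `HFTransformerModelArgs`; it is not part of the PR itself.

```python
# Illustrative only: inspect the debug flavor exposed by the train spec above.
from torchtitan.experiments.transformers_backend import get_train_spec

spec = get_train_spec()
debug_args = spec.model_args["debugmodel"]
print(debug_args.titan_dense_args.dim)       # 256
print(debug_args.titan_dense_args.n_layers)  # 2
```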
@@ -0,0 +1,87 @@
```toml
# torchtitan Config.toml

[job]
dump_folder = "./outputs"
description = "Qwen 3 debug training"
print_config = true

[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 5
enable_memory_snapshot = false
save_memory_snapshot_folder = "memory_snapshot"

[metrics]
log_freq = 1
disable_color_printing = false
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = false

[model]
name = "Qwen/Qwen3-4B-Instruct-2507"
flavor = "debugmodel"
# test folder with tokenizer.json, for debug purpose only
hf_assets_path = "./tests/assets/tokenizer"
# converters = ["float8"]

[optimizer]
name = "AdamW"
lr = 8e-4
eps = 1e-8

[lr_scheduler]
warmup_steps = 2  # lr scheduler warm up, normally 20% of the train steps
decay_ratio = 0.8  # lr scheduler decay ratio, 80% of the train steps
decay_type = "linear"
min_lr_factor = 0.0

[training]
local_batch_size = 2
seq_len = 2048
max_norm = 1.0  # grad norm clipping
steps = 10
dataset = "c4_test"  # supported datasets: c4_test (2K), c4 (177M)
dataset_path = "./tests/assets/c4_test"
mixed_precision_param = "float32"  # force float32 for comparison
mixed_precision_reduce = "float32"

[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = 2
fsdp_reshard_after_forward = "default"  # default / never / always
tensor_parallel_degree = 2
enable_async_tensor_parallel = false
pipeline_parallel_degree = 2
pipeline_parallel_schedule = "1F1B"
context_parallel_degree = 1
expert_parallel_degree = 1
expert_tensor_parallel_degree = 1

[checkpoint]
enable = false
folder = "checkpoint"
interval = 10
last_save_model_only = false
export_dtype = "float32"
async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

[activation_checkpoint]
mode = "selective"  # ["none", "selective", "full"]
selective_ac_option = '2'  # 'int' = ac every positive int layer or 'op', ac based on ops policy

[compile]
enable = false
components = ["model", "loss"]

[quantize.linear.float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output"]

[validation]
enable = false
dataset = "c4_validation"
freq = 5
steps = 10
```
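For reference, the parallelism degrees in this config multiply out to the 8-GPU world size assumed by the README's run command and the integration-test workflow; a minimal illustrative check (variable names chosen here just for clarity):

```python
# Illustrative check: the product of the parallelism degrees above equals the
# world size (8 GPUs) assumed elsewhere in this PR.
dp_replicate, dp_shard, tp, pp, cp = 1, 2, 2, 2, 1
world_size = dp_replicate * dp_shard * tp * pp * cp
assert world_size == 8
```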