From 2b8ecd008fc963c2a9f9ab10029a0bb75f66f816 Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 20:59:32 +0800
Subject: [PATCH 1/9] Add Cosmos3-Nano LIBERO-10 action-policy SFT recipe,
 config, eval harness, and doc

Mirrors the DROID action-policy counterpart (action_policy_droid_nano + repro
toml + launch + doc). Net-new LIBERO features:
- experiment config: action_policy_libero_nano
- dataset: LIBEROLeRobotDataset + get_action_libero_sft_dataset (frame_wise_relative
  rot6d, quantile_rot, concat_view, 20fps); base_dataset tasks.parquet fallback for
  community LIBERO layouts; resample-on-decode-failure guard (matches i4 behavior)
- closed-loop eval harness (vectorized sim) + batched /predict_batch inference path
  + single-rank no_dist checkpoint load for the policy server
- canonical recipe action_policy_libero_repro.toml + launch_sft_action_policy_libero.sh
  (lr 5e-5, warmup 500, cycle 16000, global batch 2048; ~95% libero_10 500-ep eval)
- docs/action_policy_libero_sft.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
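---
NOTE(review): the config.py hunk below also adds an import of
``action_policy_libero_nano_4suite`` inside ``make_config()``, but this patch
does not create that module (it is absent from the file list). Unless it
already exists in the base tree, ``make_config()`` will raise
ModuleNotFoundError at this commit -- confirm, or move that import into the
patch of the series that introduces the module.
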
 cosmos_framework/configs/base/config.py       |    2 +
 .../action_policy_libero_nano.py              |  258 ++++
 .../data/vfm/action/datasets/__init__.py      |    2 +
 .../vfm/action/datasets/action_sft_dataset.py |  107 +-
 .../data/vfm/action/datasets/base_dataset.py  |   51 +-
 .../action/datasets/libero_lerobot_dataset.py |  335 ++++
 ...bero_native_frame_wise_relative_rot6d.json |   37 +
 .../data/vfm/action/libero_pose_utils.py      |   69 +
 .../scripts/action_policy_server_libero.py    |  211 ++-
 cosmos_framework/simulation/__init__.py       |    2 +
 .../simulation/libero/__init__.py             |    3 +
 .../simulation/libero/closed_loop_eval.py     | 1343 +++++++++++++++++
 .../libero/dataset_reply_action_server.py     |  653 ++++++++
 cosmos_framework/utils/vfm/model_loader.py    |   67 +-
 docs/action_policy_libero_sft.md              |  206 +++
 examples/launch_sft_action_policy_libero.sh   |   46 +
 .../action_policy_libero_repro.toml           |   79 +
 17 files changed, 3333 insertions(+), 138 deletions(-)
 create mode 100644 cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
 create mode 100644 cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py
 create mode 100644 cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json
 create mode 100644 cosmos_framework/data/vfm/action/libero_pose_utils.py
 create mode 100644 cosmos_framework/simulation/__init__.py
 create mode 100644 cosmos_framework/simulation/libero/__init__.py
 create mode 100644 cosmos_framework/simulation/libero/closed_loop_eval.py
 create mode 100644 cosmos_framework/simulation/libero/dataset_reply_action_server.py
 create mode 100644 docs/action_policy_libero_sft.md
 create mode 100755 examples/launch_sft_action_policy_libero.sh
 create mode 100644 examples/toml/sft_config/action_policy_libero_repro.toml

diff --git a/cosmos_framework/configs/base/config.py b/cosmos_framework/configs/base/config.py
index e766c5c..1fb0514 100644
--- a/cosmos_framework/configs/base/config.py
+++ b/cosmos_framework/configs/base/config.py
@@ -97,4 +97,6 @@ def make_config() -> Config:
     import cosmos_framework.configs.base.experiment.sft.vision_sft_nano  # noqa: F401
     import cosmos_framework.configs.base.experiment.sft.vision_sft_super  # noqa: F401
     import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_droid_nano  # noqa: F401
+    import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_libero_nano  # noqa: F401
+    import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_libero_nano_4suite  # noqa: F401
     return c
diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
new file mode 100644
index 0000000..98b5b12
--- /dev/null
+++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
@@ -0,0 +1,258 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""``action_policy_libero_nano`` — Cosmos3-Nano LIBERO action-policy SFT recipe.
+
+Reproduces the Cosmos3-Nano LIBERO-10 result (Table 20, 97.4% @ ckpt 2000).
+Mirrors ``action_policy_droid_nano`` (PackingDataLoader + RankPartitionedDataLoader
++ ActionIterableShuffleDataset), but feeds ``LIBEROLeRobotDataset`` (frame-wise-relative
+rot6d actions, ``quantile_rot``-normalized, concat_view third-person + wrist at
+256x256 each -> 256x512) through ``ActionTransformPipeline``, and trains the
+generation + action heads from the public ``nvidia/Cosmos3-Nano`` base. Full SFT
+(no LoRA) — the LoRA variant is the 32B "super" tier only.
+
+LIBERO-10 reproduction note: the public Table-20 number is reached training on
+``libero_10`` ALONE. Training on the full 4-suite mix dilutes libero_10 to ~1 pass
+in 2000 steps (~82%); libero_10 alone is ~2.7 passes (~97%). Point ``LIBERO_ROOT``
+(and ``LIBERO_REPO_ID``) at the libero_10 LeRobot conversion only.
+
+Usage (1 node, 8 GPU)::
+
+    LIBERO_ROOT=/path/to/libero_10_lerobot \\
+    LIBERO_REPO_ID=lerobot/libero_10 \\
+    BASE_CHECKPOINT_PATH=<Cosmos3-Nano DCP dir> \\
+    WAN_VAE_PATH=<Wan2.2_VAE.pth> \\
+    torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \\
+        --sft-toml examples/toml/sft_config/action_policy_libero_repro.toml
+"""
+
+import copy
+
+from hydra.core.config_store import ConfigStore
+
+from cosmos_framework.utils.lazy_config import LazyCall as L
+from cosmos_framework.utils.lazy_config import LazyDict
+
+from cosmos_framework.configs.base.experiment.sft.models.nano_model_config import NANO_MODEL_CONFIG
+from cosmos_framework.data.vfm.joint_dataloader import (
+    PackingDataLoader,
+    RankPartitionedDataLoader,
+)
+from cosmos_framework.data.vfm.action.datasets.action_sft_dataset import get_action_libero_sft_dataset
+
+cs = ConfigStore.instance()
+
+
+def _action_policy_libero_nano_model_config() -> dict:
+    """GA LIBERO model config: capped packed tokens, selective activation
+    checkpointing, fresh diffusion-expert init, 10x vision flow-matching loss, and
+    the VAE encode durations [17, 61, 73] carried by the Cosmos3 base.
+
+    NOTE: keep ``encode_exact_durations=[17, 61, 73]`` — do NOT reduce it to [17]
+    even though ``mode="policy"`` only produces 17-frame windows at the data level.
+    The public Cosmos3-Nano base was pretrained with [17, 61, 73]; the reference
+    GA LIBERO SFT (``action_policy_sft_nano`` on ``mharrim-nv-patch-1``) retains it,
+    and empirically reducing it to [17] regresses the policy badly
+    (60.8% vs 94.6% at iter 2000)."""
+    cfg = copy.deepcopy(NANO_MODEL_CONFIG)  # action_gen=True, max_action_dim=64
+    # Cap the packed sequence. Uncapped (-1) + a large max_samples_per_batch packs
+    # one very long sequence and OOMs even on H200; 74000 keeps the GA-validated bound.
+    cfg["max_num_tokens_after_packing"] = 74000
+    cfg["activation_checkpointing"]["mode"] = "selective"
+    cfg["diffusion_expert_config"]["load_weights_from_pretrained"] = False
+    cfg["rectified_flow_training_config"]["loss_scale"] = 10.0
+    cfg["rectified_flow_training_config"]["image_loss_scale"] = None
+    cfg["tokenizer"]["encode_exact_durations"] = [17, 61, 73]  # match Cosmos3 base + reference SFT (do NOT reduce)
+    return cfg
+
+
+action_policy_libero_nano = LazyDict(
+    dict(
+        defaults=[
+            {"override /model": "mot_fsdp"},
+            {"override /data_train": None},
+            {"override /data_val": None},
+            # FusedAdam with fp32 master_weights + eps 1e-8 (bf16 params + eps 1e-6
+            # diverged on the action loss).
+            {"override /optimizer": "fusedadamw"},
+            {"override /scheduler": "lambdalinear"},  # linear LR decay
+            {"override /checkpoint": "s3"},
+            {
+                "override /callbacks": [
+                    "basic",
+                    "optimization",
+                    "job_monitor",
+                ]
+            },
+            {"override /ema": "power"},
+            {"override /tokenizer": "wan2pt2_tokenizer"},
+            {"override /sound_tokenizer": None},
+            {"override /vlm_config": None},
+            {"override /ckpt_type": "dcp"},
+            "_self_",
+        ],
+        job=dict(
+            project="cosmos3",
+            group="action_sft",
+            name="action_policy_libero_nano",
+            wandb_mode="disabled",
+        ),
+        model=dict(
+            config=_action_policy_libero_nano_model_config(),
+        ),
+        optimizer=dict(
+            betas=[0.9, 0.99],
+            eps=1.0e-08,
+            fused=True,  # popped by build_optimizer for FusedAdam (fused by construction)
+            # Train the generation + action heads.
+            keys_to_select=[
+                "moe_gen",
+                "time_embedder",
+                "vae2llm",
+                "llm2vae",
+                "action2llm",
+                "llm2action",
+                "action_modality_embed",
+            ],
+            lr=5.0e-05,
+            lr_multipliers={
+                "action2llm": 5.0,
+                "llm2action": 5.0,
+                "action_modality_embed": 5.0,
+            },
+            optimizer_type="FusedAdam",
+            weight_decay=0.05,
+        ),
+        scheduler=dict(
+            lr_scheduler_type="LambdaLinear",
+            cycle_lengths=[100],  # smoke: 100 iters (real run sets via TOML, GA=10000)
+            f_max=[1.0],
+            f_min=[0.0],
+            f_start=[1.0e-06],
+            verbosity_interval=0,
+            warm_up_steps=[0],  # smoke (real run sets via TOML, GA=2000)
+        ),
+        trainer=dict(
+            distributed_parallelism="fsdp",
+            grad_accum_iter=1,  # real run sets via TOML (GA=2)
+            logging_iter=1,
+            max_iter=100,  # smoke
+            max_val_iter=None,
+            run_validation=False,
+            run_validation_on_start=False,
+            save_zero_checkpoint=False,
+            seed=42,
+            timeout_period=999999999,
+            validation_iter=100,
+            compile_config=dict(recompile_limit=8, use_duck_shape=False),
+            cudnn=dict(benchmark=True, deterministic=False),
+            ddp=dict(broadcast_buffers=True, find_unused_parameters=False, static_graph=True),
+            grad_scaler_args=dict(enabled=False),
+            callbacks=dict(
+                dataloader_speed=dict(every_n=100, save_s3=False, step_size=1),
+                device_monitor=dict(
+                    every_n=200, log_memory_detail=True, save_s3=False, step_size=1, upload_every_n_mul=5
+                ),
+                grad_clip=dict(clip_norm=1.0, force_finite=True),
+                heart_beat=dict(every_n=200, save_s3=False, step_size=1, update_interval_in_minute=20),
+                iter_speed=dict(every_n=1, hit_thres=50, save_s3=False, save_s3_every_log_n=500),
+                low_precision=dict(update_iter=1),
+                manual_gc=dict(every_n=5, gc_level=1, warm_up=1),
+                param_count=dict(save_s3=False),
+                skip_nan_step=dict(max_consecutive_nan=100),
+                training_stats=dict(log_freq=100),
+            ),
+        ),
+        checkpoint=dict(
+            broadcast_via_filesystem=False,
+            dcp_async_mode_enabled=False,
+            enable_gcs_patch_in_boto3=True,
+            keys_not_to_resume=[],
+            # Skip net_ema (EMA warm-starts from net, see dcp.py) and the action
+            # heads, so they init fresh from the base (the public Cosmos3-Nano base
+            # has no LIBERO-trained action heads).
+            keys_to_skip_loading=[
+                "net_ema.",
+                "action2llm",
+                "llm2action",
+                "action_modality_embed",
+                "action_pos_embed",
+            ],
+            load_ema_to_reg=False,
+            load_path="???",  # Cosmos3-Nano DCP dir; supply via TOML/env
+            load_training_state=False,
+            only_load_scheduler_state=False,
+            save_iter=100,
+            strict_resume=False,  # base init: tolerate key set differences
+            verbose=True,
+            hf_export=dict(
+                enabled=False,
+                export_every_n=1,
+                hf_repo_id=None,
+                upload_to_object_store=dict(bucket="", credentials="", enabled=False),
+            ),
+            jit=dict(device="cuda", dtype="bfloat16", enabled=False, input_shape=None, strict=True),
+            load_from_object_store=dict(bucket="", credentials="", enabled=False),
+            save_to_object_store=dict(bucket="", credentials="", enabled=False),
+        ),
+        dataloader_train=L(PackingDataLoader)(
+            audio_sample_rate=48000,
+            dataset_name="action_libero",
+            max_samples_per_batch=128,  # peak-mem bound (256 OOMs on H200); global = 128 x DP8 x grad_accum2 = 2048
+            max_sequence_length=None,  # None disables token packing (TOML can't express null)
+            patch_spatial=2,
+            sound_latent_fps=0,
+            tokenizer_spatial_compression_factor=16,
+            tokenizer_temporal_compression_factor=4,
+            dataloader=L(RankPartitionedDataLoader)(
+                batch_size=1,
+                in_order=False,
+                num_workers=4,
+                persistent_workers=True,
+                pin_memory=True,
+                prefetch_factor=4,
+                sampler=None,
+                # Shuffling is handled by the dataset (iterable_shuffle=True below):
+                # ActionIterableShuffleDataset streams rank x worker-sharded, episode-order-
+                # shuffled, sequential-within-episode.
+                datasets=dict(
+                    libero=dict(
+                        ratio=1,
+                        dataset=L(get_action_libero_sft_dataset)(
+                            # Local LeRobot dir for the libero_10 suite ONLY (Table-20
+                            # reproduction; full suite mix -> ~82%, see module docstring). Use the
+                            # 20 FPS nvidia/LIBERO_LeRobot_v3 (matches the bundled stats + 20 Hz eval):
+                            #   hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \
+                            #     --include 'libero_10/**' --local-dir <dir>   # LIBERO_ROOT=<dir>/libero_10
+                            root="${oc.env:LIBERO_ROOT}",
+                            fps=20,  # metadata only (FPS-agnostic loader reads native fps from info.json)
+                            chunk_length=16,
+                            image_size=256,  # concat_view -> 256x512
+                            mode="policy",
+                            camera_mode="concat_view",
+                            action_space="frame_wise_relative",
+                            rotation_space="6d",
+                            pose_coordinate_frame="native",
+                            action_normalization="quantile_rot",
+                            val_ratio=0.01,
+                            iterable_shuffle=True,
+                            episode_shuffle_seed=42,
+                            resolution=None,
+                            max_action_dim="${model.config.max_action_dim}",
+                            cfg_dropout_rate=0.1,
+                            tokenizer_config="${model.config.vlm_config.tokenizer}",
+                        ),
+                    ),
+                ),
+            ),
+        ),
+        dataloader_val=None,
+        upload_reproducible_setup=False,
+    ),
+    flags={"allow_objects": True},
+)
+
+
+for _item in [action_policy_libero_nano]:
+    _name = [k for k, v in globals().items() if v is _item][0]
+    cs.store(group="experiment", package="_global_", name=_name, node=_item)
diff --git a/cosmos_framework/data/vfm/action/datasets/__init__.py b/cosmos_framework/data/vfm/action/datasets/__init__.py
index 0b01e6b..6365693 100644
--- a/cosmos_framework/data/vfm/action/datasets/__init__.py
+++ b/cosmos_framework/data/vfm/action/datasets/__init__.py
@@ -12,6 +12,7 @@
 from cosmos_framework.data.vfm.action.datasets.base_dataset import ActionBaseDataset
 from cosmos_framework.data.vfm.action.datasets.bridge_orig_lerobot_dataset import BridgeOrigLeRobotDataset
 from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import DROIDLeRobotDataset
+from cosmos_framework.data.vfm.action.datasets.libero_lerobot_dataset import LIBEROLeRobotDataset
 from cosmos_framework.data.vfm.action.datasets.robomind_franka_dataset import RoboMINDFrankaDataset
 
 __all__ = [
@@ -19,5 +20,6 @@
     "AgiBotWorldBetaLeRobotDataset",
     "BridgeOrigLeRobotDataset",
     "DROIDLeRobotDataset",
+    "LIBEROLeRobotDataset",
     "RoboMINDFrankaDataset",
 ]
diff --git a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
index 1790de5..afe76da 100644
--- a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
+++ b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
@@ -18,7 +18,11 @@
 
 from torch.utils.data import Dataset, IterableDataset, get_worker_info
 
-from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import DROIDLeRobotDataset
+from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import (
+    DROIDLeRobotDataset,
+    ShardedDROIDLeRobotDataset,
+)
+from cosmos_framework.data.vfm.action.datasets.libero_lerobot_dataset import LIBEROLeRobotDataset
 from cosmos_framework.data.vfm.action.transforms import ActionTransformPipeline
 
 
@@ -98,6 +102,7 @@ def get_action_droid_sft_dataset(
     action_normalization: str | None = None,
     viewpoint: str = "concat_view",
     use_image_augmentation: bool = False,
+    apply_color_jitter: bool = True,
     use_filter_dict: bool = False,
     filter_dict_path: str | None = None,
     resolution: str | int = "256",
@@ -110,11 +115,24 @@ def get_action_droid_sft_dataset(
     append_idle_frames: bool = False,
     iterable_shuffle: bool = False,
     episode_shuffle_seed: int = 42,
+    sharded: bool = False,
+    lerobot_roots: list[str] | None = None,
+    use_success_only: bool = True,
 ) -> Dataset:
     """Build the DROID action SFT dataset: ``action_space='joint_pos'`` (8D) +
-    ``use_state`` (raw/un-normalized), concat_view, chunk_length 32."""
-    dataset = DROIDLeRobotDataset(
-        root=root,
+    ``use_state`` (raw/un-normalized), concat_view, chunk_length 32.
+
+    ``sharded=True`` consumes the per-lab sharded layout (``<root>/success/<lab>``)
+    via :class:`ShardedDROIDLeRobotDataset` — one ``DROIDLeRobotDataset`` per lab
+    concatenated into one flat index — reproducing the internal sharded run's
+    per-shard index construction. ``sharded=False`` (default) reads ``root`` as a
+    single flat LeRobot dataset (the prior behavior). ``lerobot_roots`` optionally
+    pins the shard sub-paths (relative to ``root``); otherwise they are
+    auto-discovered."""
+    # ``sharded`` may arrive as a string from env-var config resolution.
+    if isinstance(sharded, str):
+        sharded = sharded.strip().lower() in ("1", "true", "yes", "on")
+    shard_kwargs = dict(
         fps=fps,
         chunk_length=chunk_length,
         viewpoint=viewpoint,
@@ -123,9 +141,90 @@ def get_action_droid_sft_dataset(
         use_state=use_state,
         action_normalization=action_normalization,
         use_image_augmentation=use_image_augmentation,
+        apply_color_jitter=apply_color_jitter,
         use_filter_dict=use_filter_dict,
         filter_dict_path=filter_dict_path,
     )
+    if sharded:
+        dataset: Dataset = ShardedDROIDLeRobotDataset(
+            root=root,
+            lerobot_roots=lerobot_roots,
+            use_success_only=use_success_only,
+            **shard_kwargs,
+        )
+    else:
+        dataset = DROIDLeRobotDataset(root=root, **shard_kwargs)
+    transform = ActionTransformPipeline(
+        tokenizer_config=tokenizer_config,
+        cfg_dropout_rate=cfg_dropout_rate,
+        max_action_dim=max_action_dim,
+        append_viewpoint_info=append_viewpoint_info,
+        append_duration_fps_timestamps=append_duration_fps_timestamps,
+        append_resolution_info=append_resolution_info,
+        append_idle_frames=append_idle_frames,
+    )
+    sft = ActionSFTDataset(dataset, transform, resolution)
+    if iterable_shuffle:
+        return ActionIterableShuffleDataset(sft, seed=episode_shuffle_seed)
+    return sft
+
+
+def get_action_libero_sft_dataset(
+    *,
+    root: str,
+    fps: float = 20.0,
+    chunk_length: int = 16,
+    image_size: int = 256,
+    mode: str = "policy",
+    camera_mode: str = "concat_view",
+    action_space: str = "frame_wise_relative",
+    rotation_space: str = "6d",
+    pose_coordinate_frame: str = "native",
+    action_normalization: str | None = "quantile_rot",
+    action_stats_path: str | None = None,
+    split: str = "train",
+    val_ratio: float = 0.01,
+    seed: int = 0,
+    resolution: str | int | None = None,
+    max_action_dim: int = 64,
+    tokenizer_config: dict | None = None,
+    cfg_dropout_rate: float = 0.1,
+    append_viewpoint_info: bool = True,
+    append_duration_fps_timestamps: bool = True,
+    append_resolution_info: bool = True,
+    append_idle_frames: bool = True,
+    iterable_shuffle: bool = False,
+    episode_shuffle_seed: int = 42,
+) -> Dataset:
+    """Build the LIBERO action-policy SFT dataset (GA reproduction defaults).
+
+    Mirrors :func:`get_action_droid_sft_dataset` but feeds ``LIBEROLeRobotDataset``
+    (frame-wise-relative rot6d actions, ``quantile_rot``-normalized, concat_view
+    third-person + wrist at 256x256 each → 256x512) through
+    ``ActionTransformPipeline``. ``root`` is a LOCAL LeRobot dir (read parquet +
+    video directly, like DROID); pre-sync the HF dataset once, e.g.
+    ``hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset --include 'libero_10/**' --local-dir <dir>``
+    and pass ``root=<dir>/libero_10`` (20 FPS, like the bundled stats; ``lerobot/libero_*`` is 10 FPS).
+    For the Table-20 LIBERO-10 reproduction point ``root`` at libero_10 alone (the
+    4-suite mix dilutes libero_10 to ~1 pass in 2000 steps → ~82% vs ~97%). The dataset is FPS-agnostic
+    (decodes at real frame timestamps); ``fps`` is metadata for ``conditioning_fps`` / prompt duration.
+    """
+    dataset = LIBEROLeRobotDataset(
+        root=root,
+        image_size=image_size,
+        chunk_length=chunk_length,
+        fps=fps,
+        mode=mode,
+        split=split,
+        val_ratio=val_ratio,
+        seed=seed,
+        camera_mode=camera_mode,
+        action_space=action_space,
+        rotation_space=rotation_space,
+        pose_coordinate_frame=pose_coordinate_frame,
+        action_normalization=action_normalization,
+        action_stats_path=action_stats_path,
+    )
     transform = ActionTransformPipeline(
         tokenizer_config=tokenizer_config,
         cfg_dropout_rate=cfg_dropout_rate,
diff --git a/cosmos_framework/data/vfm/action/datasets/base_dataset.py b/cosmos_framework/data/vfm/action/datasets/base_dataset.py
index 2c9c4cb..f9b0f61 100644
--- a/cosmos_framework/data/vfm/action/datasets/base_dataset.py
+++ b/cosmos_framework/data/vfm/action/datasets/base_dataset.py
@@ -69,18 +69,25 @@ def __init__(
             for path in sorted((self._root / "meta" / "episodes").glob("chunk-*/file-*.parquet"))
             for row in pq.read_table(path).to_pylist()
         }
-        self._tasks = {
-            int(row["task_index"]): str(row["task"])
-            for row in pq.read_table(self._root / "meta" / "tasks.parquet").to_pylist()
-        }
-        self._rows = sorted(
-            (
-                row
-                for path in sorted((self._root / "data").glob("chunk-*/file-*.parquet"))
-                for row in pq.read_table(path).to_pylist()
-            ),
-            key=lambda row: int(row["index"]),
-        )
+        # ``meta/tasks.parquet`` normally has a ``task`` column. Some LeRobot
+        # conversions (e.g. the community LIBERO datasets) instead store the task
+        # string as the (unnamed) pandas index, which pyarrow surfaces as
+        # ``__index_level_0__``. Fall back to the lone non-``task_index`` field so
+        # both layouts work (datasets that have ``task`` are unaffected).
+        self._tasks = {}
+        for row in pq.read_table(self._root / "meta" / "tasks.parquet").to_pylist():
+            if "task" in row:
+                task = row["task"]
+            else:
+                extras = [v for k, v in row.items() if k != "task_index"]
+                task = extras[0] if extras else ""
+            self._tasks[int(row["task_index"])] = str(task)
+        # ``self._rows`` (the flat, index-sorted list of every frame dict) is built
+        # lazily on first access — see the ``_rows`` property. Materializing all
+        # ~18M frames as Python dicts plus a full sort costs ~13 min and tens of GB;
+        # subclasses that build their own compact index (e.g. DROIDLeRobotDataset)
+        # never touch it, so they must not pay for it at construction.
+        self._rows_cache: list[dict[str, Any]] | None = None
 
     @property
     def fps(self) -> float:
@@ -213,5 +220,25 @@ def _build_result(
             **extras,
         }
 
+    @property
+    def _rows(self) -> list[dict[str, Any]]:
+        """Flat, index-sorted list of every frame dict, built lazily on first access.
+
+        Only datasets that don't build their own compact index (bridge / agibot /
+        robomind) touch this; for them it materializes once and caches. Datasets with
+        a bespoke index (e.g. DROIDLeRobotDataset) never read it, so they skip the
+        ~13 min / tens-of-GB construction entirely.
+        """
+        if self._rows_cache is None:
+            self._rows_cache = sorted(
+                (
+                    row
+                    for path in sorted((self._root / "data").glob("chunk-*/file-*.parquet"))
+                    for row in pq.read_table(path).to_pylist()
+                ),
+                key=lambda row: int(row["index"]),
+            )
+        return self._rows_cache
+
     def __len__(self) -> int:
         return max(0, (len(self._rows) - self._chunk_length + self._sample_stride - 1) // self._sample_stride)
diff --git a/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py b/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py
new file mode 100644
index 0000000..146fcc1
--- /dev/null
+++ b/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py
@@ -0,0 +1,335 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""LIBERO LeRobot dataset (frame-wise-relative action policy).
+
+Mirrors ``DROIDLeRobotDataset``: reads the LeRobot parquet directly, windows by
+frame index, and decodes video at each frame's REAL timestamp. That makes it
+FPS-agnostic — it works with the 10 FPS community ``lerobot/libero_*`` datasets
+and a 20 FPS conversion alike, without LeRobot's ``delta_timestamps`` grid (which
+rejects any window whose synthetic timestamps don't land on real frames).
+
+Action layout (``frame_wise_relative``): the stored 7D ``action`` is already a
+per-frame delta ``[dpos(3), drot_axisangle(3), gripper(1)]``; only the rotation is
+re-encoded to the requested ``rotation_space`` -> ``[dpos(3), rot6d(6), gripper(1)]``
+(10D for ``6d``).
+
+NOTE on FPS / stats fidelity: the bundled ``quantile_rot`` stats were computed on
+a 20 FPS conversion. Per-frame deltas at 10 FPS span 2x the wall-clock motion, so
+for a faithful Table-20 reproduction use a 20 FPS LIBERO dataset (or recompute
+stats for the dataset's FPS). Loading/training is correct at any FPS regardless.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+from pathlib import Path
+from typing import Any, Literal
+
+import numpy as np
+import pyarrow.parquet as pq
+import torch
+import torch.nn.functional as F
+from lerobot.datasets.video_utils import decode_video_frames
+
+from cosmos_framework.utils import log
+from cosmos_framework.data.vfm.action.action_normalization import normalize_action
+from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec
+from cosmos_framework.data.vfm.action.datasets.base_dataset import ActionBaseDataset
+from cosmos_framework.data.vfm.action.libero_pose_utils import libero_action_dim, libero_rotation_format
+from cosmos_framework.data.vfm.action.pose_utils import convert_rotation
+
+CameraMode = Literal["image", "wrist_image", "concat_view"]  # accepted ``camera_mode`` values
+RotationSpace = Literal["3d", "6d", "9d"]  # accepted ``rotation_space`` values
+
+_ACTION_FEATURE = "action"  # parquet column holding the stored 7D per-frame action
+_IMAGE_FEATURE = "observation.images.image"  # third-person camera video key
+_WRIST_FEATURE = "observation.images.wrist_image"  # wrist camera video key
+_STAT_KEYS = ("mean", "std", "min", "max", "q01", "q99")  # stats entries kept by _load_norm_stats
+_NORMALIZERS_DIR = Path(__file__).parent / "stats"  # directory of the bundled stats JSON files
+
+_VIEWPOINT_BY_CAMERA = {  # camera_mode -> ``viewpoint`` handed to the base dataset
+    "image": "third_person_view",
+    "wrist_image": "wrist_view",
+    "concat_view": "concat_view",
+}
+
+
+class LIBEROLeRobotDataset(ActionBaseDataset):
+    """LIBERO action-policy dataset with frame-wise-relative rot6d actions.
+
+    10D ``[pos_delta(3), rot6d_delta(6), gripper(1)]`` (for ``rotation_space='6d'``),
+    ``concat_view`` third-person + wrist video, and ``quantile_rot`` normalization
+    against the bundled stats. Reads parquet + decodes video at real timestamps,
+    so the requested ``fps`` is metadata only (it sets ``conditioning_fps`` and the
+    prompt duration); frame windows always use the data's actual frames.
+    """
+
+    def __init__(
+        self,
+        root: str,
+        fps: float = 20.0,
+        chunk_length: int = 16,
+        mode: str = "policy",
+        tolerance_s: float = 1e-4,
+        camera_mode: CameraMode = "concat_view",
+        image_size: int = 256,
+        action_space: str = "frame_wise_relative",
+        rotation_space: RotationSpace = "6d",
+        pose_coordinate_frame: str = "native",
+        embodiment_type: str = "libero",
+        action_normalization: str | None = "quantile_rot",
+        action_stats_path: str | None = None,
+        split: str = "train",
+        val_ratio: float = 0.01,
+        seed: int = 0,
+        sample_stride: int = 1,
+    ) -> None:
+        if action_space != "frame_wise_relative":
+            raise NotImplementedError(
+                f"This LIBERO dataset only supports action_space='frame_wise_relative', got {action_space!r}."
+            )
+        if camera_mode not in _VIEWPOINT_BY_CAMERA:
+            raise ValueError(f"Unsupported camera_mode={camera_mode!r}. Use image/wrist_image/concat_view.")
+        split = split.lower().strip()
+        if split not in {"train", "val", "valid", "validation", "eval", "test", "full"}:
+            raise ValueError(f"Unsupported split={split!r}. Use train/val/full.")
+        if chunk_length % 4 != 0:
+            raise ValueError(f"chunk_length must be divisible by 4, got {chunk_length}.")
+
+        super().__init__(
+            root=root,
+            domain_name=embodiment_type,
+            fps=fps,
+            chunk_length=chunk_length,
+            mode=mode,
+            pose_convention="backward_framewise",  # unused for frame_wise deltas; satisfies the base assert
+            tolerance_s=tolerance_s,
+            viewpoint=_VIEWPOINT_BY_CAMERA[camera_mode],
+            # frame_wise_relative ⇔ backward_framewise idle semantics. quantile_rot is a
+            # LIBERO convention -> normalize with the "quantile" formula on raw-rotation
+            # stats (see _load_norm_stats); pass the method the base will call.
+            action_normalization=None if action_normalization is None else "quantile",
+            sample_stride=sample_stride,
+        )
+        # FPS-agnostic loader: trust the dataset's NATIVE fps for conditioning_fps /
+        # prompt duration so the metadata is truthful (10 for the public
+        # lerobot/libero_*, 20 for a 20 FPS conversion). Frame sampling uses each
+        # frame's real timestamp regardless, so the requested ``fps`` is ignored here.
+        info_fps = self._info.get("fps")
+        if info_fps:
+            if float(info_fps) != float(fps):  # exact compare: int() truncation hid e.g. 20 vs 20.5
+                log.info(f"Using dataset native fps={info_fps} for conditioning (requested {fps}).")
+            self._fps = float(info_fps)
+            self._dt = 1.0 / self._fps
+        self._camera_mode = camera_mode
+        self._image_size = int(image_size)
+        self._rotation_space = rotation_space.lower().strip()
+        self._pose_coordinate_frame = pose_coordinate_frame
+        self._embodiment_type = embodiment_type
+        self._requested_normalization = action_normalization
+        # quantile_rot normalizes against the raw (un-orthonormalized) rotation stats
+        # under "global_raw"; everything else uses "global".
+        self._stats_key = "global_raw" if action_normalization == "quantile_rot" else "global"
+        self._stats_file = self._resolve_stats_file(action_stats_path)
+
+        if self._camera_mode == "image":
+            self._video_keys = [_IMAGE_FEATURE]
+        elif self._camera_mode == "wrist_image":
+            self._video_keys = [_WRIST_FEATURE]
+        else:
+            self._video_keys = [_IMAGE_FEATURE, _WRIST_FEATURE]
+
+        # Compact, lazy frame index (mirrors DROIDLeRobotDataset): read only the
+        # columns the sample builder needs into contiguous arrays, ordered by global
+        # frame index, so DataLoader worker forks share them copy-on-write.
+        index_parts, episode_parts, task_parts, ts_parts, action_parts = [], [], [], [], []
+        for path in sorted((self._root / "data").glob("chunk-*/file-*.parquet")):
+            table = pq.read_table(path, columns=["index", "episode_index", "task_index", "timestamp", _ACTION_FEATURE])
+            index_parts.append(table["index"].to_numpy())
+            episode_parts.append(table["episode_index"].to_numpy())
+            task_parts.append(table["task_index"].to_numpy())
+            ts_parts.append(table["timestamp"].to_numpy())
+            action_parts.append(np.asarray(table[_ACTION_FEATURE].to_pylist(), dtype=np.float32))
+        if not index_parts:
+            raise FileNotFoundError(f"No data parquet found under {self._root / 'data'}.")
+        order = np.argsort(np.concatenate(index_parts).astype(np.int64), kind="stable")
+        self._row_episode = np.concatenate(episode_parts).astype(np.int64)[order]
+        self._row_task = np.concatenate(task_parts).astype(np.int64)[order]
+        self._row_timestamp = np.concatenate(ts_parts).astype(np.float64)[order]
+        self._row_action = np.concatenate(action_parts, axis=0).astype(np.float32)[order]
+
+        if np.any(np.diff(self._row_episode) < 0):  # explicit raise: asserts vanish under ``python -O``
+            raise ValueError("episode_index not contiguous after sorting by frame index")
+        ep_vals, ep_starts, ep_counts = np.unique(self._row_episode, return_index=True, return_counts=True)
+        # Deterministic per-episode train/val split (seeded; same on every rank).
+        keep = self._split_episode_ids(ep_vals.tolist(), split, val_ratio, seed)
+        kept = np.array([int(v) in keep for v in ep_vals], dtype=bool)
+        self._ep_vals = ep_vals.astype(np.int64)[kept]
+        self._ep_starts = ep_starts.astype(np.int64)[kept]
+        kept_counts = ep_counts.astype(np.int64)[kept]
+        # Within-episode windows only: a sample reads chunk_length + 1 frames -> max(0, N - chunk_length) starts.
+        self._valid_cum = np.cumsum(np.maximum(0, kept_counts - self._chunk_length)).astype(np.int64)
+
+        log.info(
+            f"Loaded LIBERO dataset root={self._root} split={split!r} camera_mode={camera_mode!r} "
+            f"fps={self._fps} kept_episodes={len(self._ep_vals)}/{len(ep_vals)} "
+            f"valid_indices={int(self._valid_cum[-1]) if self._valid_cum.size else 0}"
+        )
+
+    # ---- spec / dims -------------------------------------------------------
+
+    @property
+    def action_dim(self) -> int:
+        return libero_action_dim(self._rotation_space)  # [xyz, rotation, gripper] width: 7 / 10 / 13
+
+    def _action_spec(self) -> ActionSpec:  # position + rotation (in the configured format) + gripper
+        return build_action_spec(Pos(), Rot(libero_rotation_format(self._rotation_space)), Gripper())
+
+    @classmethod
+    def _stats_path(cls) -> Path:
+        # Class-level fallback only: the bundled native-frame rot6d stats. Instances read
+        # self._stats_file instead, which honors action_stats_path and the per-config filename.
+        return _NORMALIZERS_DIR / "libero_native_frame_wise_relative_rot6d.json"
+
+    # ---- normalization (nested global/global_raw + quantile_rot) ------------
+
+    def _bundled_stats_filename(self) -> str:  # <embodiment>_<frame>_frame_wise_relative_<rot>.json
+        rot_names = {"3d": "3d", "6d": "rot6d", "9d": "rot9d"}
+        if self._rotation_space not in rot_names:
+            raise ValueError(f"Unsupported rotation_space={self._rotation_space!r}.")
+        prefix = f"{self._embodiment_type}_{self._pose_coordinate_frame}_frame_wise_relative"
+        return f"{prefix}_{rot_names[self._rotation_space]}.json"
+
+    def _resolve_stats_file(self, action_stats_path: str | None) -> Path:
+        """Pick the stats JSON: explicit path (relative -> bundled dir by file name) or bundled default."""
+        if action_stats_path:
+            p = Path(action_stats_path)
+            p = p if p.is_absolute() else _NORMALIZERS_DIR / p.name
+            if not p.exists():  # report the path actually checked, not only the caller's spelling
+                raise FileNotFoundError(f"action_stats_path not found: {action_stats_path!r} (resolved to {p})")
+            return p
+        p = _NORMALIZERS_DIR / self._bundled_stats_filename()
+        if not p.exists():
+            raise FileNotFoundError(
+                f"Bundled LIBERO stats not found at {p}. Pass action_stats_path or recompute stats."
+            )
+        return p
+
+    def _load_norm_stats(self) -> dict[str, torch.Tensor]:
+        # Parse the selected stats section (``self._stats_key``) once and memoize the tensors.
+        if self._norm_stats is not None:
+            return self._norm_stats
+        section = json.loads(self._stats_file.read_text())[self._stats_key]
+        self._norm_stats = {k: torch.tensor(section[k], dtype=torch.float32) for k in section if k in _STAT_KEYS}
+        return self._norm_stats
+
+    # ---- index helpers -----------------------------------------------------
+
+    @staticmethod
+    def _split_episode_ids(ep_ids: list[int], split: str, val_ratio: float, seed: int) -> set[int]:
+        # Seeded per-episode split: "full" -> all ids, "train" -> complement of the val draw, else the val draw.
+        ids = [int(v) for v in ep_ids]
+        if split == "full":
+            return set(ids)
+        if not (0.0 < val_ratio < 1.0):
+            raise ValueError(f"val_ratio must be in (0, 1), got {val_ratio}.")
+        # At least one val episode, clamped to the population so an empty list cannot crash random.sample.
+        n_val = min(len(ids), max(1, int(round(len(ids) * val_ratio))))
+        val = set(random.Random(seed).sample(ids, n_val))  # identical selection on every rank
+        return set(ids) - val if split == "train" else val  # val/valid/validation/eval/test
+
+    def __len__(self) -> int:
+        return 0 if self._valid_cum.size == 0 else int(self._valid_cum[-1])  # last cumulative window count
+
+    def get_shuffle_blocks(self) -> list[tuple[int, int]]:
+        """Return one ``(start, length)`` flat-index block per kept episode with samples.
+
+        Intended for ``ActionIterableShuffleDataset`` (shuffle block ORDER + shard across
+        ranks, sequential within a block).
+        """
+
+        ends = [int(c) for c in np.asarray(self._valid_cum).tolist()]
+        starts = [0, *ends[:-1]]  # each block begins where the previous cumulative count ended
+        # An episode too short for a window leaves the cumulative count flat, so its
+        # would-be block has zero length and is dropped.
+        return [(lo, hi - lo) for lo, hi in zip(starts, ends) if hi > lo]
+
+    # ---- sample build ------------------------------------------------------
+
+    def __getitem__(self, idx: int) -> dict[str, Any]:
+        # Resilience: a corrupt video frame must not crash a multi-node run -> resample a valid
+        # window (8 attempts in total). An out-of-range ``idx`` is a caller bug -> IndexError.
+        n = len(self)
+        if not 0 <= int(idx) < n:
+            raise IndexError(f"LIBERO: index {idx} out of range for dataset of length {n}")
+        last_err: Exception | None = None
+        for _attempt in range(8):
+            try:
+                return self._build_item(idx)
+            except Exception as e:  # noqa: BLE001 — skip past undecodable frames
+                last_err = e
+                log.warning(f"LIBERO: sample idx={idx} failed to load ({type(e).__name__}: {e}); resampling")
+                idx = random.randint(0, n - 1)  # n > 0 is guaranteed by the range check above
+        raise RuntimeError(f"LIBERO: failed to load a sample after 8 resamples; last error: {last_err}") from last_err
+
+    def _build_item(self, idx: int) -> dict[str, Any]:  # build the sample for flat window index ``idx``
+        mode = self._choose_mode()
+        idx = int(idx)
+        ep = int(np.searchsorted(self._valid_cum, idx, side="right"))  # kept-episode slot that owns flat idx
+        prev = int(self._valid_cum[ep - 1]) if ep > 0 else 0  # windows contributed by earlier kept episodes
+        start = int(self._ep_starts[ep]) + (idx - prev)  # global row of the window's first frame
+        episode_index = int(self._ep_vals[ep])
+        episode = self._episodes[episode_index]
+
+        stop = start + self._chunk_length + 1  # chunk_length + 1 frames: one more frame than actions
+        timestamps = [float(self._row_timestamp[j]) for j in range(start, stop)]
+        video = self._load_video(episode, timestamps)
+
+        # frame_wise_relative: the chunk's per-frame deltas are the stored actions, used as-is.
+        raw = self._row_action[start : start + self._chunk_length]  # [chunk, 7]
+        action = self._build_frame_wise_action(raw)
+
+        task = self._tasks[int(self._row_task[start])]
+        ai_caption = random.choice([p.strip() for p in task.split(" | ") if p.strip()] or [task])
+
+        extras: dict[str, Any] = {}  # optional extra fields forwarded to _build_result
+        if self._camera_mode == "concat_view":
+            extras["additional_view_description"] = (
+                "The left half shows the third-person view; the right half shows the wrist-mounted camera."
+            )
+        return self._build_result(mode=mode, video=video, action=action, ai_caption=ai_caption, **extras)
+
+    def _build_frame_wise_action(self, raw: np.ndarray) -> torch.Tensor:
+        # [chunk, 7] stored deltas -> [chunk, action_dim]; only the rotation columns are re-encoded.
+        deltas = torch.from_numpy(np.ascontiguousarray(raw)).float()
+        dpos, axis_angle, grip = deltas[:, 0:3], deltas[:, 3:6], deltas[:, 6:7]
+        # Axis-angle -> matrix -> requested format: always routed through a rotation matrix.
+        as_matrix = convert_rotation(axis_angle, input_format="axisangle", output_format="matrix")
+        target_format = libero_rotation_format(self._rotation_space)
+        encoded = convert_rotation(as_matrix, input_format="matrix", output_format=target_format)
+        return torch.cat([dpos, encoded, grip], dim=-1)
+
+    def _load_video(self, episode: dict[str, Any], timestamps: list[float]) -> torch.Tensor:
+        """Decode and resize the window's frames for every configured camera.
+
+        Returns the single view for ``image`` / ``wrist_image``; for ``concat_view`` the
+        third-person (left) and wrist (right) views are joined along the last axis.
+        """
+        views: dict[str, torch.Tensor] = {}
+        for key in self._video_keys:
+            offset = float(episode.get(f"videos/{key}/from_timestamp", 0.0))
+            shifted = [offset + ts for ts in timestamps]  # shift by this view's ``from_timestamp``
+            decoded = decode_video_frames(self._video_path(episode, key), shifted, self._tolerance_s)
+            views[key] = self._resize(decoded)  # [T, C, H, W] in [0, 1]
+        if self._camera_mode != "concat_view":
+            return views[self._video_keys[0]]
+        return torch.cat([views[_IMAGE_FEATURE], views[_WRIST_FEATURE]], dim=-1)  # -> [T, C, H, 2W]
+
+    def _resize(self, frames: torch.Tensor) -> torch.Tensor:
+        # Bilinear-resize to a square ``image_size``; frames already at that size are returned as-is.
+        side = self._image_size
+        if tuple(frames.shape[-2:]) == (side, side):
+            return frames
+        return F.interpolate(frames, size=(side, side), mode="bilinear", align_corners=False)
diff --git a/cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json b/cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json
new file mode 100644
index 0000000..a705e7c
--- /dev/null
+++ b/cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json
@@ -0,0 +1,37 @@
+{
+  "metadata": {
+    "embodiment_type":    "libero",
+    "pose_convention":    "frame_wise_relative",
+    "pose_coordinate_frame": "native",
+    "rotation_format":    "6d",
+    "action_dim":         10,
+    "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
+    "chunk_length":       16,
+    "sample_stride":      null,
+    "dataset_name":       "libero",
+    "dataset_class":      "LIBEROLeRobotDataset",
+    "dataset_root":       ["outputs/libero_datasets/libero_10", "outputs/libero_datasets/libero_object", "outputs/libero_datasets/libero_spatial", "outputs/libero_datasets/libero_goal"],
+    "_comment": "Dataset paths are placeholders; the statistics values are independent of local dataset location.",
+    "split":              "train",
+    "num_samples_stats":  10000,
+    "reservoir_size":     50000,
+    "max_samples":        10000,
+    "sampling_seed":      42
+  },
+  "global": {
+    "mean": [ 0.050704,  0.097407, -0.094833,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.476725],
+    "std":  [ 0.333621,  0.387175,  0.457140,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.499460],
+    "min":  [-0.937500, -0.937500, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
+    "max":  [ 0.937500,  0.937500,  0.937500,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000],
+    "q01":  [-0.723214, -0.808929, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
+    "q99":  [ 0.937500,  0.870536,  0.937500,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000]
+  },
+  "global_raw": {
+    "mean": [ 0.050704,  0.097407, -0.094833,  0.994873, -0.004579, -0.004288,  0.004389,  0.996104,  0.001109,  0.476725],
+    "std":  [ 0.333621,  0.387175,  0.457140,  0.010807,  0.077802,  0.063386,  0.078571,  0.009994,  0.038504,  0.499460],
+    "min":  [-0.937500, -0.937500, -0.937500,  0.902028, -0.356085, -0.367416, -0.370434,  0.921907, -0.255000,  0.000000],
+    "max":  [ 0.937500,  0.937500,  0.937500,  1.000000,  0.368853,  0.341214,  0.356395,  1.000000,  0.348251,  1.000000],
+    "q01":  [-0.723214, -0.808929, -0.937500,  0.934955, -0.223431, -0.189878, -0.334735,  0.938516, -0.107736,  0.000000],
+    "q99":  [ 0.937500,  0.870536,  0.937500,  1.000000,  0.331000,  0.163153,  0.226216,  1.000000,  0.127158,  1.000000]
+  }
+}
diff --git a/cosmos_framework/data/vfm/action/libero_pose_utils.py b/cosmos_framework/data/vfm/action/libero_pose_utils.py
new file mode 100644
index 0000000..5cc9fff
--- /dev/null
+++ b/cosmos_framework/data/vfm/action/libero_pose_utils.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""Small LIBERO pose helpers shared by training and closed-loop eval."""
+
+from __future__ import annotations
+
+import numpy as np
+import torch
+
+from cosmos_framework.data.vfm.action.pose_utils import (
+    RotationConvention,
+    build_abs_pose_from_components,
+)
+
+# Same local-frame post-rotation pattern used by DROID/Bridge/Fractal:
+# R_opencv = R_native @ *_TO_OPENCV.
+LIBERO_TO_OPENCV: np.ndarray = np.array(
+    [[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]],
+    dtype=np.float32,
+)
+
+LIBERO_ROTATION_FORMATS: dict[str, RotationConvention] = {  # rotation_space -> pose_utils format name
+    "3d": "axisangle",
+    "6d": "rot6d",
+    "9d": "rot9d",
+}
+LIBERO_ACTION_DIMS: dict[str, int] = {"3d": 7, "6d": 10, "9d": 13}  # xyz(3) + rotation(3/6/9) + gripper(1)
+
+
+def libero_rotation_format(rotation_space: str) -> RotationConvention:
+    """Map a LIBERO ``rotation_space`` (3d/6d/9d) to its shared ``pose_utils`` rotation format."""
+    try:
+        return LIBERO_ROTATION_FORMATS[rotation_space]
+    except KeyError:
+        raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.") from None
+
+
+def libero_action_dim(rotation_space: str) -> int:
+    """Return the ``[xyz, rotation, gripper]`` action width for a LIBERO ``rotation_space``."""
+    # Guard first so an unknown setting fails with the supported values spelled out.
+    if rotation_space not in LIBERO_ACTION_DIMS:
+        raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
+    return LIBERO_ACTION_DIMS[rotation_space]
+
+
+def libero_rotation_space_from_action_dim(action_dim: int) -> str:
+    """Return the rotation space whose unpadded LIBERO action width equals ``action_dim``."""
+    matches = [space for space, width in LIBERO_ACTION_DIMS.items() if width == action_dim]
+    if matches:
+        return matches[0]
+    raise ValueError(f"Unable to infer rotation_space from action_dim={action_dim}.")
+
+
+def build_libero_abs_pose(state_raw: torch.Tensor | np.ndarray, *, to_opencv: bool) -> np.ndarray:
+    """Convert LIBERO state rows into absolute end-effector poses.
+
+    ``state_raw`` rows are ``[x,y,z,axisangle(3),gripper(2)]``; only the first six columns
+    are read. With ``to_opencv`` the rotation block is post-multiplied by ``LIBERO_TO_OPENCV``.
+    """
+    if not isinstance(state_raw, torch.Tensor):
+        state_np = np.asarray(state_raw, dtype=np.float32)
+    else:
+        state_np = state_raw.detach().cpu().numpy().astype(np.float32, copy=False)
+    position, axis_angle = state_np[:, :3], state_np[:, 3:6]
+    poses_abs = build_abs_pose_from_components(position, axis_angle, "axisangle")
+    if to_opencv:
+        poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ LIBERO_TO_OPENCV
+    return poses_abs
diff --git a/cosmos_framework/scripts/action_policy_server_libero.py b/cosmos_framework/scripts/action_policy_server_libero.py
index 7382b97..2fa0567 100644
--- a/cosmos_framework/scripts/action_policy_server_libero.py
+++ b/cosmos_framework/scripts/action_policy_server_libero.py
@@ -63,6 +63,10 @@
 # Action-specific helpers live in the in-tree project tree. Imports stay as
 # `projects.cosmos3.vfm.*` and are auto-rewritten to `cosmos3._src.vfm.*` by the
 # cosmos-framework release script.
+from cosmos_framework.data.vfm.action.action_processing import (
+    ActionProcessingRecord,
+    make_batched_action_processing_fields,
+)
 from cosmos_framework.data.vfm.action.domain_utils import get_domain_id
 from cosmos_framework.data.vfm.action.transforms import (
     build_sequence_plan_from_mode,
@@ -431,7 +435,8 @@ class ActionServerArgs(pydantic.BaseModel):
     # ``OmniSetupOverrides`` programmatically in ``build_setup_overrides``.
 
     checkpoint: tyro.conf.OmitArgPrefixes[CheckpointOverrides] = CheckpointOverrides.model_construct()
-    """Checkpoint and config loading configuration."""
+    """Checkpoint and config loading configuration. ``use_ema_weights`` lives here and
+    defaults True at inference (suppressed from CLI) -> evals load net_ema by default."""
 
     output_dir: Path | None = None
     """Output directory for ``OmniInference`` (saved config.yaml, benchmarks).
@@ -804,101 +809,53 @@ def get_info(self) -> dict[str, Any]:
     # Predict
     # ------------------------------------------------------------------
 
-    def predict_policy(self, req: dict[str, Any]) -> dict[str, Any]:
-        """
-        Run policy inference: given an observation image and prompt, predict actions.
-
-        Input request format:
-        {
-            "image": "<base64_encoded_png>",
-            "prompt": "<task_description>",
-            "domain_name": "<domain_name>",
-            "image_size": <int>
-        }
-
-        Output format:
-        {
-            "action": [[a0_0, a0_1, ...], ..., [aN_0, aN_1, ...]],
-            "video": ["<base64_png>", ...]  # List of T base64-encoded PNG frames
-        }
-
-        All action dimensions are returned. Video is the decoded predicted rollout as base64 PNGs.
-        """
-        t0 = time.monotonic()
-
-        # Get or assign request ID
-        injected_id = req.get("request_id", None)
-        if isinstance(injected_id, int) and injected_id > 0:
-            request_id = int(injected_id)
-        else:
-            with self._req_id_lock:
-                self._req_id += 1
-                request_id = int(self._req_id)
+    def _input_video_key(self) -> str:
+        # Batch key under which the input video is passed to the model.
+        key = getattr(self.model, "input_video_key", None)
+        # Fall back to the model config when the model itself does not expose the key.
+        return key if key is not None else getattr(self.model, "config", None).input_video_key  # type: ignore[union-attr]
 
-        # Validate request
+    def _prep_policy_item(self, req: dict[str, Any]) -> dict[str, Any]:
+        """Validate one request and build the per-sample model inputs (video pad,
+        prompt augmentation, sequence_plan). Shared by predict_policy (batch=1) and
+        predict_policy_batch (batch=N) so the two paths stay byte-identical per item."""
         image_b64 = req.get("image")
         if not isinstance(image_b64, str):
             raise ValueError("'image' must be a base64 string")
-
         prompt = req.get("prompt")
         if not isinstance(prompt, str):
             raise ValueError("'prompt' must be a string")
-
         domain_name = req.get("domain_name")
         if not isinstance(domain_name, str):
             raise ValueError("'domain_name' must be a string")
-
         image_size = req.get("image_size")
         if not isinstance(image_size, int) or image_size <= 0:
             raise ValueError("'image_size' must be a positive integer")
 
-        # Decode image
-        t_decode0 = time.monotonic()
         img_chw_uint8 = _decode_base64_png_to_rgb_uint8(image_b64)
         img_h, img_w = img_chw_uint8.shape[-2:]
-
-        # Handle resizing: for multi-view (non-square) images, scale proportionally
-        # to maintain aspect ratio while matching height to image_size
+        # Multi-view (non-square) images: scale proportionally, matching height to image_size.
         if img_h != image_size:
-            # Calculate new width to maintain aspect ratio
             scale = image_size / img_h
             new_w = int(round(img_w * scale))
-            hwc = img_chw_uint8.permute(1, 2, 0).cpu().numpy()  # [H,W,3]
+            hwc = img_chw_uint8.permute(1, 2, 0).cpu().numpy()
             resized = Image.fromarray(hwc).resize((new_w, image_size), resample=Image.Resampling.BILINEAR)
             arr = np.asarray(resized, dtype=np.uint8).copy()
-            img_chw_uint8 = torch.from_numpy(arr).permute(2, 0, 1).contiguous()  # [3,H,W]  # [3,H,W]
-        t_decode1 = time.monotonic()
+            img_chw_uint8 = torch.from_numpy(arr).permute(2, 0, 1).contiguous()
 
-        # Construct batch in IterativeJointDataLoader format (list-of-lists for multi-item keys)
         t_frames = self.cfg.action_chunk_size + 1
         _, final_h, final_w = img_chw_uint8.shape
         video_c_t_h_w_uint8 = img_chw_uint8.unsqueeze(1).repeat(1, t_frames, 1, 1)  # [3,T,H,W]
-
-        # Apply reflection padding to match closest predefined resolution
         resolution = get_vision_data_resolution((final_h, final_w))
         target_w, target_h = find_closest_target_size(final_h, final_w, resolution)
         pad_dict: dict[str, Any] = {"video": video_c_t_h_w_uint8}
         reflection_pad_to_target(pad_dict, ["video"], True, target_w, target_h)
-        video_padded = pad_dict["video"]  # (C, T, target_h, target_w)
-        padded_image_size = pad_dict["image_size"]  # (4,)
-
-        # Action: zeros tensor as noise starting point for policy mode
-        action_t_d = torch.zeros(
-            (self.cfg.action_chunk_size, self.cfg.max_action_dim),
-            dtype=torch.float32,
-        )  # [T,action_dim]
-
-        input_video_key = getattr(self.model, "input_video_key", None)
-        if input_video_key is None:
-            input_video_key = getattr(self.model, "config", None).input_video_key  # type: ignore[union-attr]
-
         sequence_plan = build_sequence_plan_from_mode(
             mode="policy",
             video_length=self.cfg.action_chunk_size + 1,
             action_length=self.cfg.action_chunk_size,
             has_text=True,
         )
-
         augmented_prompt = _augment_prompt_with_metadata(
             prompt,
             t_frames=t_frames,
@@ -908,10 +865,126 @@ def predict_policy(self, req: dict[str, Any]) -> dict[str, Any]:
             append_duration_fps=self.append_duration_fps,
             append_resolution_info=self.append_resolution_info,
         )
+        return {
+            "img_chw_uint8": img_chw_uint8,
+            "video_padded": pad_dict["video"],
+            "padded_image_size": pad_dict["image_size"],
+            "augmented_prompt": augmented_prompt,
+            "sequence_plan": sequence_plan,
+            "domain_name": domain_name,
+            "image_size": image_size,
+        }
+
+    def predict_policy_batch(self, reqs: list[dict[str, Any]]) -> dict[str, Any]:
+        """Batched policy inference: N requests -> ONE diffusion forward (batch_size=N)
+        -> N denormalized action chunks. Skips vision decode (the vectorized eval client
+        only needs actions), so it is ~N x faster than N serial /predict calls."""
+        t0 = time.monotonic()
+        if not isinstance(reqs, list) or not reqs:
+            raise ValueError("'items' must be a non-empty list of policy requests")
+        preps = [self._prep_policy_item(r) for r in reqs]
+        n = len(preps)
+        action_t_d = torch.zeros(
+            (self.cfg.action_chunk_size, self.cfg.max_action_dim), dtype=torch.float32
+        )
+        input_video_key = self._input_video_key()
+        batch: dict[str, Any] = {
+            input_video_key: [[p["video_padded"]] for p in preps],
+            **make_batched_action_processing_fields(
+                ActionProcessingRecord(raw_action_dim=self.raw_action_dim, action_normalizer=None),
+                batch_size=n,
+            ),
+            "action": [[action_t_d] for _ in preps],
+            "mode": ["policy"] * n,
+            "ai_caption": [p["augmented_prompt"] for p in preps],
+            "prompt": [p["augmented_prompt"] for p in preps],
+            "conditioning_fps": [torch.tensor(self.cfg.fps, dtype=torch.long) for _ in preps],
+            "image_size": torch.stack([p["padded_image_size"] for p in preps]).to(device="cuda"),
+            "domain_id": [torch.tensor(get_domain_id(p["domain_name"]), dtype=torch.long) for p in preps],
+            "sequence_plan": [p["sequence_plan"] for p in preps],
+        }
+        t_inf0 = time.monotonic()
+        with self._lock:
+            with torch.inference_mode():
+                samples = self.model.generate_samples_from_batch(
+                    batch,
+                    guidance=self.cfg.guidance,
+                    seed=[self.cfg.seed] * n,
+                    num_steps=self.cfg.num_steps,
+                    has_negative_prompt=False,
+                )
+        t_inf1 = time.monotonic()
+        actions: list[list[list[float]]] = []
+        for i in range(n):
+            pred = samples["action"][i].float().squeeze(0)  # [T,D]
+            pred = self._denormalize_action(pred)
+            actions.append(pred.detach().cpu().numpy().tolist())
+        log.info(
+            f"[action-server] predict_batch n={n} steps={self.cfg.num_steps} "
+            f"ms_total={(time.monotonic() - t0) * 1000.0:.1f} ms_infer={(t_inf1 - t_inf0) * 1000.0:.1f}"
+        )
+        return {"actions": actions}
+
+    def predict_policy(self, req: dict[str, Any]) -> dict[str, Any]:
+        """
+        Run policy inference: given an observation image and prompt, predict actions.
+
+        Input request format:
+        {
+            "image": "<base64_encoded_png>",
+            "prompt": "<task_description>",
+            "domain_name": "<domain_name>",
+            "image_size": <int>
+        }
+
+        Output format:
+        {
+            "action": [[a0_0, a0_1, ...], ..., [aN_0, aN_1, ...]],
+            "video": ["<base64_png>", ...]  # List of T base64-encoded PNG frames
+        }
+
+        All action dimensions are returned. Video is the decoded predicted rollout as base64 PNGs.
+        """
+        t0 = time.monotonic()
+
+        # Get or assign request ID
+        injected_id = req.get("request_id", None)
+        if isinstance(injected_id, int) and injected_id > 0:
+            request_id = int(injected_id)
+        else:
+            with self._req_id_lock:
+                self._req_id += 1
+                request_id = int(self._req_id)
+
+        # Per-item preprocessing (validation, decode/resize/pad, prompt, sequence_plan).
+        t_decode0 = time.monotonic()
+        prep = self._prep_policy_item(req)
+        t_decode1 = time.monotonic()
+        img_chw_uint8 = prep["img_chw_uint8"]
+        video_padded = prep["video_padded"]
+        padded_image_size = prep["padded_image_size"]
+        augmented_prompt = prep["augmented_prompt"]
+        sequence_plan = prep["sequence_plan"]
+        domain_name = prep["domain_name"]
+        image_size = prep["image_size"]
+
+        # Action: zeros tensor as noise starting point for policy mode
+        action_t_d = torch.zeros(
+            (self.cfg.action_chunk_size, self.cfg.max_action_dim),
+            dtype=torch.float32,
+        )  # [T,action_dim]
+
+        input_video_key = self._input_video_key()
 
         batch: dict[str, Any] = {
             input_video_key: [[video_padded]],
-            "raw_action_dim": [torch.tensor(self.raw_action_dim, dtype=torch.long)],
+            # Provide BOTH raw_action_dim and the action_processing_record the model
+            # needs to externalize (invert) the generated action; building the batch
+            # by hand previously omitted the record -> "cannot be externalized".
+            **make_batched_action_processing_fields(
+                ActionProcessingRecord(raw_action_dim=self.raw_action_dim, action_normalizer=None),
+                batch_size=1,
+            ),
             "action": [[action_t_d]],
             "mode": ["policy"],
             "ai_caption": [augmented_prompt],
@@ -1103,7 +1176,7 @@ def do_GET(self) -> None:  # noqa: N802
             self._send_json(404, {"error": "Not found"})
 
     def do_POST(self) -> None:  # noqa: N802
-        if self.path not in ("/", "/predict"):
+        if self.path not in ("/", "/predict", "/predict_batch"):
             self._send_json(404, {"error": "Not found"})
             return
 
@@ -1147,13 +1220,21 @@ def do_POST(self) -> None:  # noqa: N802
             f"path={self.path} bytes={length}"
         )
 
+        is_batch = self.path == "/predict_batch"
         try:
-            out = service.predict_policy(req)
+            if is_batch:
+                out = service.predict_policy_batch(req.get("items", []))
+            else:
+                out = service.predict_policy(req)
         except Exception as e:
             err = str(e)
             traceback.print_exc()
 
-            payload = {"action": [], "error": err, "request_id": req.get("request_id")}
+            payload = (
+                {"actions": [], "error": err}
+                if is_batch
+                else {"action": [], "error": err, "request_id": req.get("request_id")}
+            )
             log.error(f"[action-server] request_id={req.get('request_id')} ERROR: {err}")
 
             # Dump failed request for offline debugging if enabled.
diff --git a/cosmos_framework/simulation/__init__.py b/cosmos_framework/simulation/__init__.py
new file mode 100644
index 0000000..28a81be
--- /dev/null
+++ b/cosmos_framework/simulation/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
diff --git a/cosmos_framework/simulation/libero/__init__.py b/cosmos_framework/simulation/libero/__init__.py
new file mode 100644
index 0000000..503ec1b
--- /dev/null
+++ b/cosmos_framework/simulation/libero/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
diff --git a/cosmos_framework/simulation/libero/closed_loop_eval.py b/cosmos_framework/simulation/libero/closed_loop_eval.py
new file mode 100644
index 0000000..0205f9f
--- /dev/null
+++ b/cosmos_framework/simulation/libero/closed_loop_eval.py
@@ -0,0 +1,1343 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""
+Closed-loop evaluation for LIBERO using the Action HTTP inference server.
+
+# Single-view example (agentview camera):
+PYTHONPATH=. python cosmos_framework/simulation/libero/closed_loop_eval.py \
+  --server_url http://localhost:8000 \
+  --task_suite libero_10 \
+  --num_trials_per_task 10 \
+  --action_horizon 16 \
+  --camera agentview \
+  --save_gifs --gif_fps 20 \
+  --action_space frame_wise_relative \
+  --rotation_space 6d \
+  --action_dim 10 \
+  --output_dir results/libero_closed_loop_10_single_view
+
+# Multi-view example (agentview + wrist cameras):
+PYTHONPATH=. python cosmos_framework/simulation/libero/closed_loop_eval.py \
+  --server_url http://localhost:8000 \
+  --task_suite libero_goal \
+  --num_trials_per_task 2 \
+  --action_horizon 16 \
+  --camera agentview,wrist \
+  --save_gifs --gif_fps 20 \
+  --action_space frame_wise_relative \
+  --rotation_space 6d \
+  --action_dim 10 \
+  --output_dir results/libero_closed_loop_goal_multiview
+"""
+
+from __future__ import annotations
+
+import argparse
+import base64
+import io
+import json
+import os
+import random
+import sys
+import time
+from dataclasses import dataclass
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import requests
+from PIL import Image
+from scipy.spatial.transform import Rotation as R
+
+from cosmos_framework.data.vfm.action.libero_pose_utils import (
+    libero_rotation_format,
+    libero_rotation_space_from_action_dim,
+)
+from cosmos_framework.data.vfm.action.pose_utils import convert_rotation
+from cosmos_framework.data.vfm.action.viewpoint_utils import DEFAULT_VIEWPOINT_TEMPLATES
+
+benchmark: Any
+get_libero_path: Any
+OffScreenRenderEnv: Any
+
+
+TASK_MAX_STEPS: dict[str, int] = {
+    "libero_spatial": 220,
+    "libero_object": 280,
+    "libero_goal": 300,
+    "libero_10": 520,
+    "libero_90": 400,
+}
+
+
+_CAMERA_PROMPT_NAMES: dict[str, str] = {
+    "agentview": "third-person view",
+    "wrist": "wrist-mounted camera",
+}
+
+
+def _append_prompt_sentence(prompt: str, sentence: str) -> str:
+    """Append one metadata sentence using the same separator convention as training augmentors."""
+    if sentence in prompt:
+        return prompt
+    prompt = prompt.rstrip()
+    if not prompt:
+        return sentence.rstrip()
+    separator = " " if prompt.rstrip().endswith(".") else ". "
+    return prompt + separator + sentence.rstrip()
+
+
+def _concat_view_layout_description(cameras: list[str]) -> str:
+    """Describe the horizontal camera layout sent by ``ActionEnvironmentClient``."""
+    camera_names = [_CAMERA_PROMPT_NAMES[camera] for camera in cameras]
+    if len(camera_names) == 2:
+        return f"The left half shows the {camera_names[0]}; the right half shows the {camera_names[1]}."
+    layout = ", ".join(camera_names)
+    return f"The views are concatenated horizontally from left to right as: {layout}."
+
+
+def _augment_task_prompt_with_viewpoint(task_description: str, cameras: list[str]) -> str:
+    """Mirror DROID-style concat-view caption augmentation for closed-loop LIBERO eval."""
+    if len(cameras) <= 1:
+        return task_description
+    prompt = _append_prompt_sentence(task_description, DEFAULT_VIEWPOINT_TEMPLATES["concat_view"])
+    return _append_prompt_sentence(prompt, _concat_view_layout_description(cameras))
+
+
+def _rotation_repr_to_mat(rotation: np.ndarray, rotation_space: str) -> np.ndarray:
+    """Convert a single LIBERO rotation block to a 3x3 rotation matrix."""
+    matrix = convert_rotation(
+        rotation,
+        libero_rotation_format(rotation_space),
+        "matrix",
+        normalize_matrix=rotation_space != "3d",
+    )
+    if not isinstance(matrix, np.ndarray):
+        raise TypeError(f"Expected NumPy rotation matrix, got {type(matrix)!r}")
+    return matrix
+
+
+@dataclass
+class EpisodeResult:
+    success: bool
+    steps: int
+    error: str | None
+    actions: list[list[float]]
+
+
+class ActionEnvironmentClient:
+    """Client for interacting with the Action model server."""
+
+    server_url: str
+    domain_name: str
+    prompt: str
+    image_size: int
+    timeout: float
+
+    def __init__(
+        self,
+        server_url: str,
+        domain_name: str,
+        prompt: str,
+        image_size: int,
+        timeout: float,
+    ) -> None:
+        self.server_url = server_url.rstrip("/")
+        self.domain_name = domain_name
+        self.prompt = prompt
+        self.image_size = image_size
+        self.timeout = timeout
+
+    def check_health(self) -> bool:
+        """Check if the model server is healthy."""
+        try:
+            resp = requests.get(f"{self.server_url}/", timeout=5.0)
+            return resp.status_code == 200
+        except requests.RequestException:
+            return False
+
+    def get_info(self) -> dict[str, str]:
+        """Get model server info."""
+        resp = requests.get(f"{self.server_url}/info", timeout=5.0)
+        resp.raise_for_status()
+        return resp.json()
+
+    def notify_next_episode(self) -> None:
+        """Notify server to advance to next episode (used with dataset action server)."""
+        try:
+            requests.post(
+                f"{self.server_url}/next_episode",
+                json={"prompt": self.prompt},
+                timeout=5.0,
+            )
+        except requests.RequestException:
+            pass
+
+    def encode_image(self, image: np.ndarray) -> str:
+        """Encode a numpy image (H, W, 3) uint8 to base64 PNG, resizing to image_size."""
+        if image.dtype != np.uint8:
+            if image.max() <= 1.0:
+                image = (image * 255.0).round().astype(np.uint8)
+            else:
+                image = image.astype(np.uint8)
+        pil_img = Image.fromarray(image)
+        if pil_img.size != (self.image_size, self.image_size):
+            pil_img = pil_img.resize(
+                (self.image_size, self.image_size),
+                resample=Image.Resampling.BILINEAR,
+            )
+        buf = io.BytesIO()
+        pil_img.save(buf, format="PNG")
+        return base64.b64encode(buf.getvalue()).decode("ascii")
+
+    def encode_image_raw(self, image: np.ndarray) -> str:
+        """Encode a numpy image (H, W, 3) uint8 to base64 PNG without resizing."""
+        if image.dtype != np.uint8:
+            if image.max() <= 1.0:
+                image = (image * 255.0).round().astype(np.uint8)
+            else:
+                image = image.astype(np.uint8)
+        pil_img = Image.fromarray(image)
+        buf = io.BytesIO()
+        pil_img.save(buf, format="PNG")
+        return base64.b64encode(buf.getvalue()).decode("ascii")
+
+    def resize_image(self, image: np.ndarray) -> np.ndarray:
+        """Resize image to model input size."""
+        if image.dtype != np.uint8:
+            if image.max() <= 1.0:
+                image = (image * 255.0).round().astype(np.uint8)
+            else:
+                image = image.astype(np.uint8)
+        pil_img = Image.fromarray(image)
+        if pil_img.size != (self.image_size, self.image_size):
+            pil_img = pil_img.resize(
+                (self.image_size, self.image_size),
+                resample=Image.Resampling.BILINEAR,
+            )
+        return np.array(pil_img)
+
+    def concatenate_images(self, images: list[np.ndarray]) -> np.ndarray:
+        """Resize each image and concatenate horizontally (side-by-side).
+
+        Args:
+            images: List of images with shape (H, W, 3).
+
+        Returns:
+            Concatenated image with shape (image_size, image_size*num_views, 3).
+        """
+        resized = [self.resize_image(img) for img in images]
+        return np.concatenate(resized, axis=1)
+
+    def predict(self, observation: np.ndarray | list[np.ndarray]) -> dict[str, Any]:
+        """Send observation(s) to model server and get predicted actions.
+
+        Args:
+            observation: Single image as np.ndarray or list of images for multi-view.
+                For multi-view, images are resized and concatenated horizontally before sending.
+        """
+        if isinstance(observation, list):
+            # Multi-view: resize each, concatenate horizontally, and send as single image
+            concatenated = self.concatenate_images(observation)
+            encoded = self.encode_image_raw(concatenated)
+        else:
+            # Single view: send single image
+            encoded = self.encode_image(observation)
+
+        payload = {
+            "image": encoded,
+            "prompt": self.prompt,
+            "domain_name": self.domain_name,
+            "image_size": self.image_size,
+        }
+
+        resp = requests.post(
+            f"{self.server_url}/predict",
+            json=payload,
+            headers={"Content-Type": "application/json"},
+            timeout=self.timeout,
+        )
+        resp.raise_for_status()
+
+        result = resp.json()
+        if "error" in result and result["error"]:
+            raise RuntimeError(f"Model server error: {result['error']}")
+        return result
+
+    def predict_batch(self, observations: list[list[np.ndarray]]) -> list[list[list[float]]]:
+        """Batched inference: per-env observations (each a list of >=1 camera views)
+        -> ONE POST /predict_batch -> a list of action chunks (one per env). Used by
+        the vectorized eval so N parallel envs share a single diffusion forward."""
+        items = []
+        for obs_imgs in observations:
+            concat = self.concatenate_images(obs_imgs) if len(obs_imgs) > 1 else self.resize_image(obs_imgs[0])
+            items.append(
+                {
+                    "image": self.encode_image_raw(concat),
+                    "prompt": self.prompt,
+                    "domain_name": self.domain_name,
+                    "image_size": self.image_size,
+                }
+            )
+        resp = requests.post(
+            f"{self.server_url}/predict_batch",
+            json={"items": items},
+            headers={"Content-Type": "application/json"},
+            timeout=max(self.timeout, 300.0),
+        )
+        resp.raise_for_status()
+        result = resp.json()
+        if "error" in result and result["error"]:
+            raise RuntimeError(f"Model server error: {result['error']}")
+        return result["actions"]
+
+
+def _find_accessible_dri_nodes() -> list[Path]:
+    dri_path = Path("/dev/dri")
+    if not dri_path.exists():
+        return []
+    nodes = list(dri_path.glob("renderD*")) + list(dri_path.glob("card*"))
+    return [node for node in nodes if os.access(node, os.R_OK | os.W_OK)]
+
+
+def _resolve_mujoco_backend(requested_backend: str) -> tuple[str, str]:
+    requested_backend = requested_backend.lower()
+    if requested_backend != "auto":
+        return requested_backend, "requested"
+
+    env_backend = os.environ.get("MUJOCO_GL")
+    if env_backend:
+        return env_backend.lower(), "env"
+
+    if _find_accessible_dri_nodes():
+        return "egl", "auto-gpu"
+    return "osmesa", "auto-cpu"
+
+
+def _configure_mujoco_env(requested_backend: str) -> str:
+    backend, source = _resolve_mujoco_backend(requested_backend)
+    if backend not in {"egl", "osmesa", "glfw"}:
+        raise ValueError(f"Unsupported MuJoCo GL backend: {backend!r}. Use auto, egl, osmesa, or glfw.")
+
+    os.environ["MUJOCO_GL"] = backend
+    if backend == "egl":
+        os.environ["PYOPENGL_PLATFORM"] = "egl"
+    elif backend == "osmesa":
+        os.environ["PYOPENGL_PLATFORM"] = "osmesa"
+    return f"{backend} ({source})"
+
+
+def _import_libero() -> None:
+    global benchmark, get_libero_path, OffScreenRenderEnv
+    try:
+        from libero.libero import benchmark as libero_benchmark
+        from libero.libero import get_libero_path as libero_get_libero_path
+        from libero.libero.envs import OffScreenRenderEnv as libero_offscreen_render_env
+    except ImportError as exc:  # pragma: no cover - environment-specific dependency
+        raise RuntimeError(
+            "Failed to import LIBERO. Make sure the LIBERO environment is activated. "
+            f"python={sys.executable!r}, import_error={exc!r}"
+        ) from exc
+
+    benchmark = libero_benchmark
+    get_libero_path = libero_get_libero_path
+    OffScreenRenderEnv = libero_offscreen_render_env
+
+
+def _wait_for_server(client: ActionEnvironmentClient, timeout_s: float) -> None:
+    start = time.perf_counter()
+    while time.perf_counter() - start < timeout_s:
+        if client.check_health():
+            return
+        time.sleep(1.0)
+    raise RuntimeError(f"Timed out waiting for server at {client.server_url}")
+
+
+def _get_libero_env(
+    task: Any,
+    *,
+    resolution: int,
+    seed: int,
+    render_gpu_device_id: int,
+) -> tuple[Any, str]:
+    task_description = str(task.language)
+    task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
+    env_args = {
+        "bddl_file_name": task_bddl_file,
+        "camera_heights": resolution,
+        "camera_widths": resolution,
+        "render_gpu_device_id": render_gpu_device_id,
+    }
+    env = OffScreenRenderEnv(**env_args)
+    env.seed(seed)
+    return env, task_description
+
+
+def _get_libero_dummy_action() -> list[float]:
+    return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0]
+
+
+def _get_libero_image(
+    obs: dict[str, Any],
+    camera: str,
+    *,
+    flip_images: bool,
+    rotate_180: bool,
+) -> np.ndarray:
+    if camera == "agentview":
+        image = obs["agentview_image"]
+    elif camera == "wrist":
+        image = obs["robot0_eye_in_hand_image"]
+    else:
+        raise ValueError(f"Unsupported camera={camera!r}. Use 'agentview' or 'wrist'.")
+
+    if rotate_180:
+        image = image[::-1, ::-1]
+    if flip_images:
+        image = np.flipud(image)
+    return image
+
+
+def _get_libero_images(
+    obs: dict[str, Any],
+    cameras: list[str],
+    *,
+    flip_images: bool,
+    rotate_180: bool,
+) -> list[np.ndarray]:
+    """Get images from multiple cameras."""
+    return [_get_libero_image(obs, camera, flip_images=flip_images, rotate_180=rotate_180) for camera in cameras]
+
+
+def _ensure_uint8_image(image: np.ndarray) -> np.ndarray:
+    if image.dtype != np.uint8:
+        if image.max() <= 1.0:
+            image = (image * 255.0).round().astype(np.uint8)
+        else:
+            image = image.astype(np.uint8)
+    return image
+
+
+def _save_gif(frames: list[Image.Image], output_path: Path, fps: int) -> None:
+    if not frames:
+        return
+    duration_ms = int(1000 / fps) if fps > 0 else 100
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    first, *rest = frames
+    first.save(
+        output_path,
+        save_all=True,
+        append_images=rest,
+        duration=duration_ms,
+        loop=0,
+    )
+
+
+def _decode_b64_frames(b64_frames: list[str]) -> list[Image.Image]:
+    """Decode a list of base64-encoded PNG strings into PIL Images."""
+    images: list[Image.Image] = []
+    for b64 in b64_frames:
+        raw = base64.b64decode(b64)
+        images.append(Image.open(io.BytesIO(raw)).convert("RGB"))
+    return images
+
+
+def _save_comparison_gif(
+    comparison_windows: list[tuple[list[Image.Image], list[Image.Image]]],
+    output_path: Path,
+    fps: int,
+    target_height: int = 256,
+    separator_width: int = 4,
+) -> None:
+    """Create and save a side-by-side comparison GIF (Action prediction | env rollout).
+
+    Each window is a (action_frames, env_frames) pair from one prediction call.
+    Frames are paired index-by-index; the conditioning frame (index 0) of
+    subsequent windows is skipped to avoid duplicating the boundary frame.
+    """
+    from PIL import ImageDraw
+
+    combined_frames: list[Image.Image] = []
+    banner_h = 16
+
+    for window_idx, (action_frames, env_frames) in enumerate(comparison_windows):
+        n = min(len(action_frames), len(env_frames))
+        start = 1 if window_idx > 0 else 0
+        for i in range(start, n):
+            action_img = action_frames[i]
+            env_img = env_frames[i]
+
+            action_w = int(action_img.width * target_height / action_img.height)
+            env_w = int(env_img.width * target_height / env_img.height)
+            action_resized = action_img.resize((action_w, target_height), Image.Resampling.BILINEAR)
+            env_resized = env_img.resize((env_w, target_height), Image.Resampling.BILINEAR)
+
+            total_w = action_w + separator_width + env_w
+            total_h = target_height + banner_h
+            combined = Image.new("RGB", (total_w, total_h), color=0)
+
+            draw = ImageDraw.Draw(combined)
+            draw.rectangle([(0, 0), (action_w, banner_h)], fill=(30, 30, 60))
+            draw.rectangle([(action_w + separator_width, 0), (total_w, banner_h)], fill=(30, 60, 30))
+            draw.text((4, 1), "Action Prediction", fill=(100, 180, 255))
+            draw.text((action_w + separator_width + 4, 1), "Environment", fill=(100, 255, 100))
+
+            combined.paste(action_resized, (0, banner_h))
+            combined.paste(env_resized, (action_w + separator_width, banner_h))
+            combined_frames.append(combined)
+
+    if combined_frames:
+        _save_gif(combined_frames, output_path, fps)
+
+
+def _select_action_chunk(actions: list[list[float]], action_horizon: int) -> list[list[float]]:
+    if action_horizon <= 0 or action_horizon >= len(actions):
+        return actions
+    return actions[:action_horizon]
+
+
+def _format_action(action: list[float], action_dim: int) -> list[float]:
+    if len(action) < action_dim:
+        raise ValueError(f"Action dimension {len(action)} smaller than expected {action_dim}")
+    return action[:action_dim]
+
+
+def _remap_gripper(action: list[float], mode: str) -> list[float]:
+    """Map the model's gripper command to the LIBERO env's [-1, 1] (negative = open).
+
+    The right mapping depends on the gripper convention of the dataset the policy
+    was trained on (the server denormalizes back to that raw convention):
+
+    * ``zero_one`` (NVIDIA LIBERO_LeRobot_v3): raw gripper in [0, 1]; the env wants
+      [-1, 1] with negative=open. This function applies the continuous, clamped
+      ``1 - 2g`` mapping from issue #50. NOTE(review): the i4/cosmos-rl reference
+      instead BINARIZES to hard {-1, +1} via ``-sign(2g - 1)``. For a confident policy
+      the two agree (g~0/1), but an undertrained policy emits g~0.5, where continuous
+      ``1-2g``~0 never actuates the gripper -> grasps fail; confirm which is intended.
+    * ``pm_one`` (community ``lerobot/libero_*``): raw gripper already in {-1, +1}
+      (robosuite convention) -> pass through (clamped).
+    * ``pm_one_flip``: {-1, +1} but with inverted open/close sign.
+    """
+    action = list(action)  # avoid mutating the caller's list
+    g = action[-1]
+    if mode == "zero_one":
+        action[-1] = max(-1.0, min(1.0, g * 2.0 - 1.0)) * -1.0  # [0,1] -> [-1,1], negative=open (issue #50)
+    elif mode == "pm_one":
+        action[-1] = max(-1.0, min(1.0, g))
+    elif mode == "pm_one_flip":
+        action[-1] = max(-1.0, min(1.0, -g))
+    else:
+        raise ValueError(f"Unknown gripper_mode={mode!r}. Use zero_one/pm_one/pm_one_flip.")
+    return action
+
+
+def _infer_rotation_space(action_dim: int, rotation_space: str) -> str:
+    if rotation_space != "auto":
+        return rotation_space
+    return libero_rotation_space_from_action_dim(action_dim)
+
+
+def _obs_to_pose(obs: dict[str, Any]) -> tuple[np.ndarray, np.ndarray]:
+    position = np.asarray(obs["robot0_eef_pos"], dtype=np.float32)
+    quat = np.asarray(obs["robot0_eef_quat"], dtype=np.float32)
+    rotation = R.from_quat(quat).as_matrix()
+    return position, rotation
+
+
+def _anchored_action_to_delta(
+    anchored_action: np.ndarray,
+    base_pose: tuple[np.ndarray, np.ndarray],
+    current_pose: tuple[np.ndarray, np.ndarray],
+    rotation_space: str,
+) -> np.ndarray:
+    anchored_translation = anchored_action[:3]
+    rotation_dim = anchored_action.shape[0] - 4
+    anchored_rotation = anchored_action[3 : 3 + rotation_dim]
+    gripper = anchored_action[3 + rotation_dim : 4 + rotation_dim]
+
+    base_pos, base_rot = base_pose
+    current_pos, current_rot = current_pose
+
+    if rotation_space == "3d":
+        anchored_rot = R.from_rotvec(anchored_rotation).as_matrix()
+    elif rotation_space == "6d":
+        anchored_rot = _rotation_repr_to_mat(anchored_rotation, rotation_space)
+    elif rotation_space == "9d":
+        anchored_rot = anchored_rotation.reshape(3, 3)
+    else:
+        raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
+    target_rot = base_rot @ anchored_rot
+    target_pos = base_pos + base_rot @ anchored_translation
+    delta_pos = target_pos - current_pos
+    delta_rot = target_rot @ current_rot.T
+    delta_rotvec = R.from_matrix(delta_rot).as_rotvec()
+
+    return np.concatenate([delta_pos, delta_rotvec, gripper], axis=0)
+
+
+def _framewise_action_to_delta(
+    framewise_action: np.ndarray,
+    rotation_space: str,
+) -> np.ndarray:
+    """Convert a frame-wise policy action to LIBERO's 7D simulator command.
+
+    Frame-wise actions are already per-step deltas in the LIBERO controller's
+    convention (see ``LIBEROLeRobotDataset`` with ``action_space='frame_wise_relative'``),
+    so the only conversion required is decoding the chosen rotation
+    representation back to a rotation vector. No anchor/current pose is needed.
+    """
+    if rotation_space == "3d":
+        return framewise_action
+
+    translation = framewise_action[:3]
+    rotation_dim = framewise_action.shape[0] - 4
+    rotation_repr = framewise_action[3 : 3 + rotation_dim]
+    gripper = framewise_action[3 + rotation_dim : 4 + rotation_dim]
+    rotation_delta = _rotation_repr_to_mat(rotation_repr, rotation_space)
+
+    delta_pos = translation
+    delta_rotvec = R.from_matrix(rotation_delta).as_rotvec()
+    return np.concatenate([delta_pos, delta_rotvec, gripper], axis=0)
+
+
+def _run_episode(
+    env: Any,
+    client: ActionEnvironmentClient,
+    *,
+    cameras: list[str],
+    flip_images: bool,
+    rotate_180: bool,
+    action_horizon: int,
+    action_dim: int,
+    action_space: str,
+    rotation_space: str,
+    gripper_mode: str,
+    max_steps: int,
+    warmup_steps: int,
+    initial_state: np.ndarray | None,
+    gif_path: Path | None,
+    gif_fps: int,
+    comparison_path: Path | None = None,
+) -> EpisodeResult:  # one closed-loop episode: warm-up steps, then server action chunks executed in `env`
+    env.reset()
+    if initial_state is not None:
+        obs = env.set_init_state(initial_state)
+    else:
+        obs = env.get_observation()
+
+    action_queue: list[list[float]] = []  # actions still to execute from the latest server chunk
+    base_pose: tuple[np.ndarray, np.ndarray] | None = None  # pose read when a chunk is requested ("relative" only)
+    step = 0
+    success = False
+    gif_frames: list[Image.Image] = []
+    action_log: list[list[float]] = []  # every action passed to env.step, warm-up actions included
+    is_multi_view = len(cameras) > 1
+    resolved_rotation_space = _infer_rotation_space(action_dim, rotation_space)
+
+    comparison_windows: list[tuple[list[Image.Image], list[Image.Image]]] = []  # (server frames, env frames) per chunk
+
+    def record_frame(current_obs: dict[str, Any]) -> None:
+        if gif_path is None:  # frames are only collected when a GIF was requested
+            return
+        image = _get_libero_image(
+            current_obs,
+            cameras[0],  # the GIF always shows the first configured camera
+            flip_images=flip_images,
+            rotate_180=rotate_180,
+        )
+        image = _ensure_uint8_image(image)
+        gif_frames.append(Image.fromarray(image).convert("RGB"))
+
+    def capture_comparison_frame(current_obs: dict[str, Any]) -> Image.Image:
+        """Capture an env frame matching Action's input view (multi-view concatenated if applicable)."""
+        if is_multi_view:
+            imgs = _get_libero_images(current_obs, cameras, flip_images=flip_images, rotate_180=rotate_180)
+            concat = client.concatenate_images(imgs)
+            return Image.fromarray(_ensure_uint8_image(concat)).convert("RGB")
+        img = _get_libero_image(current_obs, cameras[0], flip_images=flip_images, rotate_180=rotate_180)
+        return Image.fromarray(_ensure_uint8_image(img)).convert("RGB")
+
+    record_frame(obs)
+
+    while step < max_steps:
+        if step < warmup_steps:  # warm-up steps advance `step`, so they count toward max_steps
+            dummy = _get_libero_dummy_action()
+            obs, _, _, _ = env.step(dummy)
+            action_log.append(dummy)
+            step += 1
+            record_frame(obs)
+            continue
+
+        if not action_queue:  # previous chunk fully executed: request a new one from the server
+            if is_multi_view:
+                observation_imgs = _get_libero_images(
+                    obs,
+                    cameras,
+                    flip_images=flip_images,
+                    rotate_180=rotate_180,
+                )
+                result = client.predict(observation_imgs)
+            else:
+                observation_img = _get_libero_image(
+                    obs,
+                    cameras[0],
+                    flip_images=flip_images,
+                    rotate_180=rotate_180,
+                )
+                result = client.predict(observation_img)
+            actions = result.get("action", [])
+            if not actions:
+                return EpisodeResult(False, step, "Empty action chunk from server", action_log)  # NOTE: exits before any GIF is saved
+            action_queue = _select_action_chunk(actions, action_horizon)
+
+            if comparison_path is not None:
+                action_video_b64 = result.get("video", [])
+                if action_video_b64:
+                    action_frames = _decode_b64_frames(action_video_b64)
+                    env_comparison_frames = [capture_comparison_frame(obs)]  # first env frame of this window
+                    comparison_windows.append((action_frames, env_comparison_frames))
+
+            if action_space == "relative":
+                base_pose = _obs_to_pose(obs)  # anchor pose shared by every action of this chunk
+
+        raw_action = _format_action(action_queue.pop(0), action_dim)
+        if action_space == "relative":
+            if base_pose is None:
+                raise RuntimeError("Missing base pose for relative action conversion")
+            current_pose = _obs_to_pose(obs)
+            action = _anchored_action_to_delta(
+                np.asarray(raw_action, dtype=np.float32),
+                base_pose,
+                current_pose,
+                resolved_rotation_space,
+            )
+            action_list = action.tolist()
+        else:
+            action = _framewise_action_to_delta(
+                np.asarray(raw_action, dtype=np.float32),
+                resolved_rotation_space,
+            )
+            action_list = action.tolist()
+
+        # Map the model's gripper command to the env's [-1, 1] per the dataset convention.
+        action_list = _remap_gripper(action_list, gripper_mode)
+
+        action_log.append(action_list)
+        obs, _, done, info = env.step(action_list)
+        step += 1
+        record_frame(obs)
+
+        if comparison_path is not None and comparison_windows:
+            comparison_windows[-1][1].append(capture_comparison_frame(obs))  # extend the current window's env frames
+
+        if isinstance(info, dict) and info.get("success"):
+            success = True
+            break
+        if done:  # `done` without a "success" entry in `info` is counted as a success
+            success = True if not isinstance(info, dict) else bool(info.get("success", True))
+            break
+
+    if gif_path is not None:
+        _save_gif(gif_frames, gif_path, gif_fps)
+    if comparison_path is not None and comparison_windows:
+        _save_comparison_gif(comparison_windows, comparison_path, gif_fps)
+    return EpisodeResult(success, step, None, action_log)
+
+
+def _load_initial_states(
+    task_suite: Any,
+    task_id: int,
+    *,
+    task_description: str,
+    initial_states_path: str,
+    episode_idx: int,
+) -> np.ndarray | None:
+    """Return the initial sim state for one trial, or None when the JSON marks its demo as unsuccessful."""
+    if initial_states_path == "DEFAULT":
+        # Ask the suite for its states only on this branch; the JSON branch never reads them.
+        return np.array(task_suite.get_task_init_states(task_id)[episode_idx])
+
+    with open(initial_states_path, "r", encoding="utf-8") as f:
+        all_initial_states = json.load(f)
+
+    # Lookup keys: task description with spaces replaced by "_", then "demo_<episode_idx>".
+    entry = all_initial_states[task_description.replace(" ", "_")][f"demo_{episode_idx}"]
+    if not entry["success"]:
+        return None
+    return np.array(entry["initial_state"])
+
+
+def _parse_args() -> argparse.Namespace:  # build the CLI of the closed-loop LIBERO evaluation and parse sys.argv
+    parser = argparse.ArgumentParser(description="LIBERO closed-loop evaluation via Action HTTP server")
+    parser.add_argument(
+        "--server_url", type=str, required=True, help="Base URL for Action server (e.g., http://host:8000)"
+    )
+    parser.add_argument("--task_suite", type=str, default="libero_spatial", choices=sorted(TASK_MAX_STEPS.keys()))
+    parser.add_argument("--num_trials_per_task", type=int, default=10)
+    parser.add_argument("--task_ids", type=str, default="", help="Comma-separated task IDs to evaluate (default: all)")
+    parser.add_argument("--image_size", type=int, default=256, help="Model input image size")
+    parser.add_argument("--env_image_size", type=int, default=256, help="Environment render resolution")
+    parser.add_argument("--action_horizon", type=int, default=0, help="Actions to execute per request (0=full chunk)")
+    parser.add_argument("--action_dim", type=int, default=10, help="Action dimension for LIBERO")
+    parser.add_argument(
+        "--action_space",
+        type=str,
+        default="frame_wise_relative",
+        choices=["relative", "frame_wise_relative"],
+        help="Action space expected from the model (relative=anchored, frame_wise_relative=framewise deltas).",
+    )
+    parser.add_argument(
+        "--rotation_space",
+        type=str,
+        default="auto",
+        choices=["auto", "3d", "6d", "9d"],
+        help="Rotation representation of the model's actions (auto infers from action_dim).",
+    )
+    parser.add_argument(
+        "--gripper_mode",
+        type=str,
+        default="zero_one",
+        choices=["zero_one", "pm_one", "pm_one_flip"],
+        help="Gripper convention of the training data: 'zero_one' = [0,1] (NVIDIA "
+        "LIBERO_LeRobot_v3, mapped 1-2g); 'pm_one' = {-1,+1} (community lerobot/libero_*, "
+        "pass-through); 'pm_one_flip' = {-1,+1} with inverted sign.",
+    )
+    parser.add_argument("--domain_name", type=str, default="libero")
+    parser.add_argument(
+        "--camera",
+        type=str,
+        default="agentview",
+        help="Camera(s) to use. Single camera: 'agentview' or 'wrist'. Multiple cameras: comma-separated, e.g., 'agentview,wrist'.",
+    )
+    parser.add_argument("--flip_images", action="store_true", help="Flip images vertically before encoding")
+    parser.add_argument(
+        "--rotate_180",
+        action=argparse.BooleanOptionalAction,  # also registers the negated flag "--no-rotate_180"
+        default=True,
+        help="Rotate images by 180 degrees before encoding (default: True; pass --no-rotate_180 to disable)",
+    )
+    parser.add_argument("--warmup_steps", type=int, default=10, help="Stabilization steps with dummy actions")
+    parser.add_argument("--max_steps", type=int, default=0, help="Override max steps per episode (0=default)")
+    parser.add_argument("--timeout", type=float, default=30.0, help="HTTP request timeout in seconds")
+    parser.add_argument("--wait_timeout", type=float, default=60.0, help="Seconds to wait for server health")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--save_gifs", action="store_true", help="Save per-episode GIFs of rendered frames")
+    parser.add_argument(
+        "--save_comparison",
+        action="store_true",
+        help="Save side-by-side comparison GIFs (Action prediction vs environment rollout)",
+    )
+    parser.add_argument("--gif_fps", type=int, default=20, help="Frames per second for saved GIFs")
+    parser.add_argument(
+        "--mujoco_gl",
+        type=str,
+        default="auto",
+        choices=["auto", "egl", "osmesa", "glfw"],
+        help="MuJoCo GL backend (auto picks egl if /dev/dri is accessible, else osmesa).",
+    )
+    parser.add_argument(
+        "--render_gpu_device_id",
+        type=int,
+        default=-1,
+        help="GPU device index for EGL rendering (-1 uses default device).",
+    )
+    parser.add_argument(
+        "--initial_states_path",
+        type=str,
+        default="DEFAULT",
+        help='Path to initial states JSON. Use "DEFAULT" for benchmark defaults.',
+    )
+    parser.add_argument(
+        "--num_envs",
+        type=int,
+        default=1,
+        help="Number of parallel LIBERO envs (SubprocVectorEnv). >1 runs trials in waves "
+        "with ONE batched /predict_batch per control step (~num_envs x faster). 1 = serial.",
+    )
+    parser.add_argument("--output_dir", type=str, default="", help="Directory to save evaluation summary JSON")
+    return parser.parse_args()
+
+
+class _LiberoEnvFactory:
+    """Top-level callable that builds one LIBERO env inside a SubprocVectorEnv worker.
+
+    Under the spawn start method each env_fn is pickled and this module is
+    re-imported in the child, so a lambda or closure cannot be used here. The GL
+    backend is configured and OffScreenRenderEnv imported inside the worker so
+    that the EGL context is created fresh in that process."""
+
+    def __init__(
+        self,
+        *,
+        bddl_file_name: str,
+        camera_heights: int,
+        camera_widths: int,
+        render_gpu_device_id: int,
+        mujoco_gl: str,
+    ) -> None:
+        self.bddl_file_name = bddl_file_name
+        self.camera_heights, self.camera_widths = camera_heights, camera_widths
+        self.render_gpu_device_id = render_gpu_device_id
+        self.mujoco_gl = mujoco_gl
+        # Nothing but the constructor arguments is kept on the instance.
+
+    def __call__(self) -> Any:
+        # A negative (auto) device id is replaced by GPU 0: automatic EGL device
+        # selection races/fails across spawned workers (EGLError / "'EGLGLContext'
+        # object has no attribute '_context'"). The env vars must be set BEFORE
+        # OffScreenRenderEnv is imported, because that import dlopen's the GL stack.
+        device_id = max(self.render_gpu_device_id, 0)
+        backend = self.mujoco_gl
+        os.environ["MUJOCO_GL"] = backend
+        if backend in ("egl", "osmesa"):
+            os.environ["PYOPENGL_PLATFORM"] = backend
+        if backend == "egl":
+            for key in ("MUJOCO_EGL_DEVICE_ID", "EGL_DEVICE_ID"):
+                os.environ[key] = str(device_id)
+        from libero.libero.envs import OffScreenRenderEnv as _OffScreenRenderEnv
+
+        return _OffScreenRenderEnv(
+            bddl_file_name=self.bddl_file_name,
+            camera_heights=self.camera_heights,
+            camera_widths=self.camera_widths,
+            render_gpu_device_id=device_id,
+        )
+
+
+def _run_task_vectorized(
+    task: Any,
+    task_description: str,  # not read in this function
+    *,
+    num_trials: int,
+    num_envs: int,
+    env_image_size: int,
+    seed: int,
+    render_gpu_device_id: int,
+    client: ActionEnvironmentClient,
+    cameras: list[str],
+    flip_images: bool,
+    rotate_180: bool,
+    action_horizon: int,
+    action_dim: int,
+    rotation_space: str,
+    gripper_mode: str,
+    max_steps: int,
+    warmup_steps: int,
+    init_states: list[np.ndarray | None],
+) -> list[dict[str, Any]]:
+    """Run all `num_trials` of one task across up to `num_envs` parallel LIBERO envs
+    (SubprocVectorEnv), in waves. Each chunk request gathers obs from the ACTIVE
+    (not-done) envs and issues ONE batched predict_batch call; the returned actions are
+    always decoded as frame-wise deltas (anchored "relative" actions are not handled here).
+    Returns per-trial result dicts in trial order, shaped like the serial path's episode_results."""
+    import multiprocessing as _mp
+
+    from libero.libero.envs.venv import SubprocVectorEnv
+
+    # LIBERO's SubprocVectorEnv defaults to the fork start method; forked children
+    # inherit the parent's already-dlopen'd EGL/GL state, which corrupts per-child
+    # render-context creation (EGLError / 'EGLGLContext' has no attribute '_context').
+    # Force spawn so each env worker starts clean — exactly like the (working) serial
+    # single-process path. spawn pickles env_fns, so the factory below is picklable.
+    try:
+        _mp.set_start_method("spawn", force=True)
+    except RuntimeError:  # pragma: no cover - already set
+        pass
+
+    resolved_rotation_space = _infer_rotation_space(action_dim, rotation_space)
+    bddl = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
+
+    results: list[dict[str, Any]] = [None] * num_trials  # type: ignore[list-item]
+    for t in range(num_trials):
+        if init_states[t] is None:  # no usable initial state: record the trial as skipped, never run it
+            results[t] = {
+                "episode": t,
+                "success": False,
+                "steps": 0,
+                "error": "Skipped due to failed expert demo",
+                "elapsed_s": 0.0,
+            }
+    runnable = [t for t in range(num_trials) if init_states[t] is not None]
+    if not runnable:
+        return results
+
+    n = min(num_envs, len(runnable))  # never spawn more workers than there are runnable trials
+
+    mujoco_gl = os.environ.get("MUJOCO_GL", "egl")
+    env_fn = _LiberoEnvFactory(
+        bddl_file_name=bddl,
+        camera_heights=env_image_size,
+        camera_widths=env_image_size,
+        render_gpu_device_id=render_gpu_device_id,
+        mujoco_gl=mujoco_gl,
+    )
+    venv = SubprocVectorEnv([env_fn for _ in range(n)])
+    try:
+        venv.seed(seed)
+        for w0 in range(0, len(runnable), n):
+            wave = runnable[w0 : w0 + n]  # trial indices for this wave
+            slots = list(range(len(wave)))  # env slots in use
+            t_wave0 = time.perf_counter()
+            venv.reset(id=slots)
+            states = np.stack([np.asarray(init_states[t], dtype=np.float64) for t in wave])
+            obs_arr = venv.set_init_state(states, id=slots)
+            obs_by_slot = {s: obs_arr[i] for i, s in enumerate(slots)}
+            done = {s: False for s in slots}
+            succ = {s: False for s in slots}
+            err: dict[int, str | None] = {s: None for s in slots}
+            nsteps = {s: max_steps for s in slots}  # value reported for envs that never finish
+            step = 0
+
+            for _ in range(warmup_steps):  # warm-up steps advance `step`, so they count toward max_steps
+                act = np.stack([_get_libero_dummy_action() for _ in slots])
+                obs_arr, _, _, _ = venv.step(act, id=slots)
+                for i, s in enumerate(slots):
+                    obs_by_slot[s] = obs_arr[i]
+                step += 1
+
+            while step < max_steps:
+                active = [s for s in slots if not done[s]]
+                if not active:
+                    break
+                obs_batch = [
+                    _get_libero_images(obs_by_slot[s], cameras, flip_images=flip_images, rotate_180=rotate_180)
+                    for s in active
+                ]
+                try:
+                    chunks = client.predict_batch(obs_batch)
+                except Exception as e:  # noqa: BLE001
+                    for s in active:
+                        done[s] = True
+                        err[s] = f"server error: {e}"
+                        nsteps[s] = step
+                    break
+                if not chunks or len(chunks) != len(active) or any(len(c) == 0 for c in chunks):
+                    # Empty chunks are rejected too: they would advance no step and spin this loop forever.
+                    for s in active:
+                        done[s], err[s], nsteps[s] = True, "bad batch response from server", step
+                    break
+                chunk_by_slot = {s: chunks[k] for k, s in enumerate(active)}
+                shortest = min(len(c) for c in chunks)  # never index past the shortest chunk
+                horizon = min(action_horizon, shortest) if action_horizon > 0 else shortest
+                for h in range(horizon):
+                    cur = [s for s in slots if not done[s]]  # envs finished mid-chunk are no longer stepped
+                    if not cur or step >= max_steps:
+                        break
+                    env_actions = []
+                    for s in cur:
+                        raw = _format_action(chunk_by_slot[s][h], action_dim)
+                        a = _framewise_action_to_delta(np.asarray(raw, dtype=np.float32), resolved_rotation_space)
+                        env_actions.append(_remap_gripper(a.tolist(), gripper_mode))
+                    obs_arr, _, d, info = venv.step(np.stack(env_actions), id=cur)
+                    step += 1
+                    for i, s in enumerate(cur):
+                        obs_by_slot[s] = obs_arr[i]
+                        di = bool(d[i])
+                        ii = info[i] if isinstance(info, (list, np.ndarray)) else info
+                        is_succ = bool(ii.get("success")) if isinstance(ii, dict) else False
+                        if is_succ:
+                            done[s], succ[s], nsteps[s] = True, True, step
+                        elif di:
+                            # mirror serial: done w/o explicit success defaults to success
+                            done[s] = True
+                            succ[s] = ii.get("success", True) if isinstance(ii, dict) else True
+                            nsteps[s] = step
+            per_ep_elapsed = round((time.perf_counter() - t_wave0) / max(1, len(wave)), 3)  # wave time split evenly
+            for s, t in zip(slots, wave):
+                results[t] = {
+                    "episode": t,
+                    "success": bool(succ[s]),
+                    "steps": int(nsteps[s]),
+                    "error": err[s],
+                    "elapsed_s": per_ep_elapsed,
+                }
+    finally:
+        try:
+            venv.close()
+        except Exception:  # noqa: BLE001
+            pass
+    return results
+
+
+def main() -> None:
+    """Evaluate the selected LIBERO tasks against the action server and print/save the success rates."""
+    args = _parse_args()
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    if args.save_gifs and not args.output_dir:
+        raise ValueError("--save_gifs requires --output_dir to be set")
+    if args.save_comparison and not args.output_dir:
+        raise ValueError("--save_comparison requires --output_dir to be set")
+    if args.num_envs > 1 and args.action_space != "frame_wise_relative":  # vectorized path decodes frame-wise only
+        raise ValueError("--num_envs > 1 only supports --action_space frame_wise_relative")
+
+    # Parse cameras from comma-separated string
+    cameras = [c.strip() for c in args.camera.split(",") if c.strip()]
+    if not cameras:
+        raise ValueError("At least one camera must be specified")
+    for cam in cameras:
+        if cam not in ("agentview", "wrist"):
+            raise ValueError(f"Unsupported camera={cam!r}. Use 'agentview' or 'wrist'.")
+
+    mujoco_backend = _configure_mujoco_env(args.mujoco_gl)
+    _import_libero()
+
+    client = ActionEnvironmentClient(
+        server_url=args.server_url,
+        domain_name=args.domain_name,
+        prompt="",  # replaced with the task prompt before each task/episode
+        image_size=args.image_size,
+        timeout=args.timeout,
+    )
+    print(f"MuJoCo GL backend: {mujoco_backend}", flush=True)
+    print("Waiting for model server...", flush=True)
+    _wait_for_server(client, args.wait_timeout)
+    print(f"Connected to model server: {client.get_info()}", flush=True)
+
+    benchmark_dict = benchmark.get_benchmark_dict()
+    task_suite = benchmark_dict[args.task_suite]()
+
+    if args.task_ids:
+        selected_task_ids = [int(t) for t in args.task_ids.split(",") if t.strip()]
+    else:
+        selected_task_ids = list(range(int(task_suite.n_tasks)))
+
+    max_steps = args.max_steps if args.max_steps > 0 else TASK_MAX_STEPS[args.task_suite]
+
+    total_episodes = 0
+    total_successes = 0
+    task_results: list[dict[str, Any]] = []
+
+    output_dir = Path(args.output_dir) if args.output_dir else None
+    gif_root = output_dir / "gifs" if output_dir and args.save_gifs else None
+    comparison_root = output_dir / "comparisons" if output_dir and args.save_comparison else None
+
+    for task_id in selected_task_ids:
+        task = task_suite.get_task(task_id)
+
+        # ---- Vectorized path: N parallel envs + one batched /predict_batch per step ----
+        if args.num_envs > 1:  # NOTE: GIFs, comparison GIFs and action logs are only written by the serial path
+            task_description = str(task.language)
+            client.prompt = _augment_task_prompt_with_viewpoint(task_description, cameras)
+            init_states = [
+                _load_initial_states(
+                    task_suite,
+                    task_id,
+                    task_description=task_description,
+                    initial_states_path=args.initial_states_path,
+                    episode_idx=e,
+                )
+                for e in range(args.num_trials_per_task)
+            ]
+            episode_results = _run_task_vectorized(
+                task,
+                task_description,
+                num_trials=args.num_trials_per_task,
+                num_envs=args.num_envs,
+                env_image_size=args.env_image_size,
+                seed=args.seed,
+                render_gpu_device_id=args.render_gpu_device_id,
+                client=client,
+                cameras=cameras,
+                flip_images=args.flip_images,
+                rotate_180=args.rotate_180,
+                action_horizon=args.action_horizon,
+                action_dim=args.action_dim,
+                rotation_space=args.rotation_space,
+                gripper_mode=args.gripper_mode,
+                max_steps=max_steps,
+                warmup_steps=args.warmup_steps,
+                init_states=init_states,
+            )
+            task_episodes = task_successes = 0
+            for er in episode_results:
+                task_episodes += 1  # NOTE(review): skipped trials are counted here, but not in the serial path below
+                total_episodes += 1
+                if er["success"]:
+                    task_successes += 1
+                    total_successes += 1
+                print(
+                    f"Task {task_id} | Episode {er['episode'] + 1}/{args.num_trials_per_task} | "
+                    f"success={er['success']} steps={er['steps']} elapsed_s={er['elapsed_s']:.1f} | "
+                    f"task SR {task_successes}/{task_episodes} ({100.0 * task_successes / max(1, task_episodes):.1f}%) | "
+                    f"overall SR {total_successes}/{total_episodes} "
+                    f"({100.0 * total_successes / max(1, total_episodes):.1f}%)",
+                    flush=True,
+                )
+            task_success_rate = float(task_successes) / float(task_episodes) if task_episodes > 0 else 0.0
+            task_results.append(
+                {
+                    "task_id": task_id,
+                    "task_description": task_description,
+                    "episodes": task_episodes,
+                    "successes": task_successes,
+                    "success_rate": task_success_rate,
+                    "episode_results": episode_results,
+                }
+            )
+            print(
+                f"Task {task_id} summary: {task_successes}/{task_episodes} ({task_success_rate * 100:.1f}%)",
+                flush=True,
+            )
+            continue
+
+        env, task_description = _get_libero_env(
+            task,
+            resolution=args.env_image_size,
+            seed=args.seed,
+            render_gpu_device_id=args.render_gpu_device_id,
+        )
+
+        task_episodes = task_successes = 0
+        episode_results: list[dict[str, Any]] = []
+
+        for episode_idx in range(args.num_trials_per_task):
+            episode_t0 = time.perf_counter()
+            client.prompt = _augment_task_prompt_with_viewpoint(task_description, cameras)
+            initial_state = _load_initial_states(
+                task_suite,
+                task_id,
+                task_description=task_description,
+                initial_states_path=args.initial_states_path,
+                episode_idx=episode_idx,
+            )
+            if initial_state is None:  # skipped trials are logged but not counted in the success rate
+                episode_elapsed_s = time.perf_counter() - episode_t0
+                episode_results.append(
+                    {
+                        "episode": episode_idx,
+                        "success": False,
+                        "steps": 0,
+                        "error": "Skipped due to failed expert demo",
+                        "elapsed_s": round(episode_elapsed_s, 3),
+                    }
+                )
+                print(
+                    f"Task {task_id} | Episode {episode_idx + 1}/{args.num_trials_per_task} | "
+                    "success=False steps=0 "
+                    f"elapsed_s={episode_elapsed_s:.1f} "
+                    "error='Skipped due to failed expert demo'",
+                    flush=True,
+                )
+                continue
+
+            gif_path = (
+                gif_root / f"task_{task_id:03d}" / f"episode_{episode_idx:03d}.gif" if gif_root is not None else None
+            )
+            comparison_path = (
+                comparison_root / f"task_{task_id:03d}" / f"episode_{episode_idx:03d}.gif"
+                if comparison_root is not None
+                else None
+            )
+            try:
+                result = _run_episode(
+                    env,
+                    client,
+                    cameras=cameras,
+                    flip_images=args.flip_images,
+                    rotate_180=args.rotate_180,
+                    action_horizon=args.action_horizon,
+                    action_dim=args.action_dim,
+                    action_space=args.action_space,
+                    rotation_space=args.rotation_space,
+                    gripper_mode=args.gripper_mode,
+                    max_steps=max_steps,
+                    warmup_steps=args.warmup_steps,
+                    initial_state=initial_state,
+                    gif_path=gif_path,
+                    gif_fps=args.gif_fps,
+                    comparison_path=comparison_path,
+                )
+            except Exception as exc:  # a crashed episode is recorded as a failure; the evaluation continues
+                result = EpisodeResult(False, 0, str(exc), [])
+            episode_elapsed_s = time.perf_counter() - episode_t0
+
+            task_episodes += 1
+            total_episodes += 1
+            if result.success:
+                task_successes += 1
+                total_successes += 1
+
+            episode_results.append(
+                {
+                    "episode": episode_idx,
+                    "success": result.success,
+                    "steps": result.steps,
+                    "error": result.error,
+                    "elapsed_s": round(episode_elapsed_s, 3),
+                }
+            )
+
+            # Save per-episode action log as JSON
+            if output_dir is not None and result.actions:
+                action_log_dir = output_dir / "actions" / f"task_{task_id:03d}"
+                action_log_dir.mkdir(parents=True, exist_ok=True)
+                action_log_path = action_log_dir / f"episode_{episode_idx:03d}.json"
+                action_log_path.write_text(
+                    json.dumps(result.actions, indent=2),
+                    encoding="utf-8",
+                )
+
+            client.notify_next_episode()
+
+            print(
+                f"Task {task_id} | Episode {episode_idx + 1}/{args.num_trials_per_task} | "
+                f"success={result.success} steps={result.steps} elapsed_s={episode_elapsed_s:.1f} | "
+                f"task SR {task_successes}/{task_episodes} ({100.0 * task_successes / max(1, task_episodes):.1f}%) | "
+                f"overall SR {total_successes}/{total_episodes} ({100.0 * total_successes / max(1, total_episodes):.1f}%)",
+                flush=True,
+            )
+
+        task_success_rate = float(task_successes) / float(task_episodes) if task_episodes > 0 else 0.0
+        task_results.append(
+            {
+                "task_id": task_id,
+                "task_description": task_description,
+                "episodes": task_episodes,
+                "successes": task_successes,
+                "success_rate": task_success_rate,
+                "episode_results": episode_results,
+            }
+        )
+        print(
+            f"Task {task_id} summary: {task_successes}/{task_episodes} ({task_success_rate * 100:.1f}%)",
+            flush=True,
+        )
+        # Close the env (and its EGL/MuJoCo render context) before the next task.
+        # Leaving it open leaks one EGL context per task and hangs after ~8 tasks.
+        try:
+            env.close()
+        except Exception:
+            pass
+
+    overall_success_rate = float(total_successes) / float(total_episodes) if total_episodes > 0 else 0.0
+    summary = {
+        "task_suite": args.task_suite,
+        "total_episodes": total_episodes,
+        "total_successes": total_successes,
+        "overall_success_rate": overall_success_rate,
+        "num_trials_per_task": args.num_trials_per_task,
+        "selected_task_ids": selected_task_ids,
+        "action_space": args.action_space,
+        "rotation_space": _infer_rotation_space(args.action_dim, args.rotation_space),
+        "action_dim": args.action_dim,
+        "task_results": task_results,
+    }
+
+    print(
+        f"Overall success rate: {total_successes}/{total_episodes} ({overall_success_rate * 100:.1f}%)",
+        flush=True,
+    )
+
+    if output_dir is not None:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        summary_path = output_dir / "summary.json"
+        summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+        print(f"Saved summary to {summary_path}", flush=True)
+
+
+if __name__ == "__main__":  # run the evaluation only when this file is executed as a script
+    main()
diff --git a/cosmos_framework/simulation/libero/dataset_reply_action_server.py b/cosmos_framework/simulation/libero/dataset_reply_action_server.py
new file mode 100644
index 0000000..bb5d9a4
--- /dev/null
+++ b/cosmos_framework/simulation/libero/dataset_reply_action_server.py
@@ -0,0 +1,653 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""
+HTTP server that serves ground-truth actions from LIBERO LeRobot datasets.
+
+Same HTTP interface as `cosmos3.scripts.action_policy_server` (the model-backed
+server), enabling drop-in replacement for closed-loop evaluation to verify the
+action pipeline with known-good GT actions.
+
+Endpoints:
+- POST /predict: Return next chunk of GT actions for the given task (matched by prompt)
+- GET  /info:    Return dataset info (tasks, episode counts)
+- POST /next_episode: Advance to next episode for the task specified in request body
+- POST /reset:   Reset all per-task episode/step tracking
+
+Episode advancement:
+  The server auto-advances to the next episode when the current episode's actions
+  are exhausted.  For early-termination cases (e.g. success before all actions are
+  consumed), call POST /next_episode with {"prompt": "<task>"} between episodes.
+
+Example usage:
+
+
+PYTHONPATH=. python cosmos_framework/simulation/libero/dataset_reply_action_server.py \
+  --repo_id libero_10 \
+  --root /path/to/libero_10_no_noops_1.0.0_lerobot_aligned \
+  --action_space frame_wise_relative \
+  --rotation_space 6d \
+  --pose_coordinate_frame opencv \
+  --action_chunk_size 16 \
+  --send_video \
+  --camera_mode agentview \
+  --port 8000
+
+# Multiple datasets:
+PYTHONPATH=. python cosmos_framework/simulation/libero/dataset_reply_action_server.py \
+  --repo_id libero_10,libero_goal \
+  --root /path/to/libero_10,/path/to/libero_goal \
+  --action_space relative \
+  --rotation_space 6d \
+  --pose_coordinate_frame opencv \
+  --action_chunk_size 16 \
+  --port 8000
+"""
+
+from __future__ import annotations
+
+import argparse
+import base64
+import datetime
+import io
+import json
+import socket
+import threading
+import time
+from dataclasses import dataclass
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from typing import Any
+
+import numpy as np
+import torch
+from PIL import Image
+
+from cosmos_framework.data.vfm.action.libero_pose_utils import (
+    libero_rotation_format,
+)
+from cosmos_framework.data.vfm.action.pose_utils import convert_rotation
+
+
+def _ts() -> str:  # UTC timestamp (microsecond precision) used to prefix log lines
+    return f"{datetime.datetime.now(tz=datetime.timezone.utc):%Y-%m-%dT%H:%M:%S.%fZ}"
+
+
+def _get_local_ip() -> str:  # best-effort local address, used by main() for the startup banner
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+            s.connect(("8.8.8.8", 80))  # datagram socket: used only so getsockname() has a local address
+            return str(s.getsockname()[0])
+    except Exception:
+        return socket.gethostbyname(socket.gethostname())  # fallback: resolve our own hostname (may raise)
+
+
+# ---------------------------------------------------------------------------
+# Action processing (mirrors LIBEROLeRobotDataset.__getitem__ logic)
+# ---------------------------------------------------------------------------
+
+
+def _compute_anchored_actions(
+    state_raw: torch.Tensor,
+    action_raw: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Re-express per-frame delta actions as targets anchored to the first frame.
+
+    Intended to match LIBEROLeRobotDataset._compute_anchored_actions.
+    Each target is state[t] composed with action[t]; the result is then mapped
+    into the local coordinate frame of state_raw[0].
+
+    Args:
+        state_raw: (T+1, 8) states [x, y, z, ax, ay, az, grip1, grip2].
+        action_raw: (T+1, 7) actions [dx, dy, dz, dax, day, daz, grip]; last row unused.
+
+    Returns:
+        anchored_translation (T, 3), anchored_rotation (T, 3, 3), gripper (T, 1).
+    """
+    # Only the first T action rows contribute to the output.
+    steps = action_raw[:-1]
+    gripper = steps[:, 6:7]
+
+    # Axis-angle -> rotation matrices for the poses and for the deltas.
+    pose_rot = convert_rotation(state_raw[:, 3:6], "axisangle", "matrix")
+    delta_rot = convert_rotation(steps[:, 3:6], "axisangle", "matrix")
+
+    # Anchor: position of frame 0 and the transpose of its rotation matrix.
+    origin = state_raw[0, :3]
+    anchor_inv = pose_rot[0].T
+
+    # Targets before anchoring: add the translation delta, left-multiply the rotation delta.
+    target_pos = state_raw[:-1, :3] + steps[:, :3]
+    target_rot = torch.bmm(delta_rot, pose_rot[:-1])
+
+    # Map the targets into the anchor frame.
+    anchored_p = (anchor_inv @ (target_pos - origin).T).T
+    anchored_R = torch.bmm(anchor_inv.unsqueeze(0).expand(target_rot.shape[0], -1, -1), target_rot)
+
+    return anchored_p, anchored_R, gripper
+
+
+def _convert_rotation_to_repr(rotation_matrix: torch.Tensor, rotation_space: str) -> torch.Tensor:
+    return convert_rotation(rotation_matrix, "matrix", libero_rotation_format(rotation_space))  # matrix -> requested repr
+
+
+def _process_action_chunk(
+    action_raw: torch.Tensor,
+    state_raw: torch.Tensor,
+    action_space: str,
+    rotation_space: str,
+) -> torch.Tensor:
+    """Turn one window of raw dataset actions into the representation served to clients.
+
+    Args:
+        action_raw: (chunk+1, 7) raw actions covering chunk+1 consecutive frames.
+        state_raw:  (chunk+1, 8) raw states  covering chunk+1 consecutive frames.
+        action_space: "relative" (anchored to the first frame) or "frame_wise_relative".
+        rotation_space: "3d", "6d", or "9d".
+
+    Returns:
+        Processed actions (chunk, D) where D depends on rotation_space.
+    """
+    if action_space not in ("relative", "frame_wise_relative"):
+        raise ValueError(f"Unsupported action_space: {action_space}")
+
+    if action_space == "relative":
+        translation, rotation_matrix, gripper = _compute_anchored_actions(state_raw, action_raw)
+    else:
+        # Per-frame deltas are used as stored; the last row of the window is dropped.
+        per_frame = action_raw[:-1].clone()
+        translation, axis_angle, gripper = per_frame[:, :3], per_frame[:, 3:6], per_frame[:, 6:]
+        rotation_matrix = convert_rotation(axis_angle, "axisangle", "matrix")
+
+    return torch.cat([translation, _convert_rotation_to_repr(rotation_matrix, rotation_space), gripper], dim=-1)
+
+
+# ---------------------------------------------------------------------------
+# Data structures
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class EpisodeData:  # one pre-loaded episode; instances are built in DatasetActionService._load_datasets
+    action_raw: torch.Tensor  # (N, 7) per-frame raw actions for the full episode
+    state_raw: torch.Tensor  # (N, 8) per-frame raw states for the full episode
+    task_description: str  # same string the episode is filed under in episodes_by_task
+    dataset_ref_idx: int  # index into DatasetActionService._hf_datasets and ._lerobot_datasets
+    frame_start: int  # first global frame index in the HF dataset
+    frame_end: int  # one-past-last global frame index
+
+
+@dataclass(frozen=True)
+class DatasetServerConfig:  # immutable settings; main() fills these from the CLI flags
+    repo_id: list[str]  # one LeRobot repo id per dataset
+    root: list[str | None]  # dataset roots, zipped pairwise with repo_id when loading
+    action_space: str  # "relative" or "frame_wise_relative" (see _process_action_chunk)
+    rotation_space: str  # handed to libero_rotation_format(); the CLI allows "3d", "6d", "9d"
+    pose_coordinate_frame: str  # stored only; DatasetActionService never reads it
+    action_chunk_size: int  # upper bound on the number of actions per predict() call
+    max_action_dim: int  # predict() zero-pads each action vector up to this width
+    split: str  # stored only; DatasetActionService never reads it
+    send_video: bool  # when True, predict() also returns base64-encoded PNG frames
+    camera_mode: str  # "concat_view"/"both" -> both cameras, "wrist_image" -> wrist, else agent image
+    image_size: int  # frames are resized to image_size x image_size
+
+
+# ---------------------------------------------------------------------------
+# Service
+# ---------------------------------------------------------------------------
+
+
+class DatasetActionService:
+    """Serves GT actions (and optionally GT video) from pre-loaded LIBERO LeRobot episodes."""
+
+    def __init__(self, cfg: DatasetServerConfig) -> None:
+        self.cfg = cfg
+        self.episodes_by_task: dict[str, list[EpisodeData]] = {}
+        self._hf_datasets: list[Any] = []
+        self._lerobot_datasets: list[Any] = []  # parallel to _hf_datasets; None when send_video is off
+        self._task_state: dict[str, dict[str, int]] = {}  # per-task cursor: {"episode_idx", "step"}
+        self._lock = threading.Lock()  # guards _task_state
+
+        if cfg.camera_mode in ("concat_view", "both"):
+            self._image_keys = ["observation.images.image", "observation.images.wrist_image"]
+        elif cfg.camera_mode == "wrist_image":
+            self._image_keys = ["observation.images.wrist_image"]
+        else:  # "agentview" and any other value
+            self._image_keys = ["observation.images.image"]
+
+        self._load_datasets()
+
+    def _load_datasets(self) -> None:
+        """Load every configured dataset and index its episodes by task description."""
+        from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+        for repo_id, root in zip(self.cfg.repo_id, self.cfg.root):
+            print(f"[{_ts()}] [dataset-server] loading repo_id={repo_id} root={root} ...", flush=True)
+            t0 = time.monotonic()
+
+            dataset = LeRobotDataset(repo_id=repo_id, root=root)
+            tasks_df = dataset.meta.tasks
+            hf = dataset.hf_dataset
+            ds_ref_idx = len(self._hf_datasets)
+            self._hf_datasets.append(hf)
+
+            if self.cfg.send_video:
+                delta_ts: dict[str, list[float]] = {k: [0.0] for k in self._image_keys}
+                video_dataset = LeRobotDataset(repo_id=repo_id, root=root, delta_timestamps=delta_ts)
+                self._lerobot_datasets.append(video_dataset)
+            else:
+                self._lerobot_datasets.append(None)
+
+            for ep_meta in dataset.meta.episodes:
+                start = int(ep_meta["dataset_from_index"])  # type: ignore[index]
+                end = int(ep_meta["dataset_to_index"])  # type: ignore[index]
+
+                ep_slice = hf.select(range(start, end))
+                actions = torch.tensor(np.array(ep_slice["action"], dtype=np.float32))
+                states = torch.tensor(np.array(ep_slice["observation.state"], dtype=np.float32))
+
+                task_idx = int(ep_slice[0]["task_index"])
+                matching = tasks_df[tasks_df["task_index"] == task_idx]
+                task_desc = str(matching.iloc[0].name) if not matching.empty else f"task_{task_idx}"
+
+                self.episodes_by_task.setdefault(task_desc, []).append(
+                    EpisodeData(
+                        action_raw=actions,
+                        state_raw=states,
+                        task_description=task_desc,
+                        dataset_ref_idx=ds_ref_idx,
+                        frame_start=start,
+                        frame_end=end,
+                    )
+                )
+
+            dt = time.monotonic() - t0
+            print(
+                f"[{_ts()}] [dataset-server] loaded {repo_id}: {dataset.meta.total_episodes} episodes in {dt:.1f}s",
+                flush=True,
+            )
+
+        total_tasks = len(self.episodes_by_task)
+        total_eps = sum(len(eps) for eps in self.episodes_by_task.values())
+        print(
+            f"[{_ts()}] [dataset-server] ready: {total_tasks} tasks, {total_eps} episodes "
+            f"send_video={self.cfg.send_video} camera_mode={self.cfg.camera_mode}",
+            flush=True,
+        )
+
+    def _load_video_frames(self, episode: EpisodeData, step: int, num_frames: int) -> list[str]:
+        """Load GT video frames from the dataset and encode as base64 PNGs.
+
+        Uses the LeRobotDataset wrapper (not the raw HF dataset) so that video-backed
+        datasets are decoded correctly via the configured video backend.
+
+        Args:
+            episode: Episode data with dataset reference.
+            step: Step offset within the episode (0-based).
+            num_frames: Number of frames to load (typically action_chunk_size + 1).
+
+        Returns:
+            List of base64-encoded PNG strings.
+        """
+        lr_dataset = self._lerobot_datasets[episode.dataset_ref_idx]
+        if lr_dataset is None:
+            return []
+        image_size = self.cfg.image_size
+        b64_frames: list[str] = []
+
+        for i in range(num_frames):
+            global_idx = episode.frame_start + step + i
+            if global_idx >= episode.frame_end:
+                break
+
+            item = lr_dataset[global_idx]
+
+            pil_images: list[Image.Image] = []
+            for key in self._image_keys:
+                img_tensor = item[key]
+                if isinstance(img_tensor, torch.Tensor):
+                    # LeRobot returns (T, C, H, W) with delta_timestamps=[0.0] -> (1, C, H, W)
+                    if img_tensor.dim() == 4:
+                        img_tensor = img_tensor[0]
+                    # (C, H, W) float [0, 1] -> PIL
+                    arr = (img_tensor.permute(1, 2, 0).clamp(0, 1) * 255).to(torch.uint8).numpy()
+                    img = Image.fromarray(arr)
+                elif isinstance(img_tensor, Image.Image):
+                    img = img_tensor
+                else:
+                    img = Image.fromarray(np.asarray(img_tensor, dtype=np.uint8))
+                img = img.convert("RGB").resize((image_size, image_size), Image.Resampling.BILINEAR)
+                pil_images.append(img)
+
+            if len(pil_images) > 1:
+                total_w = sum(im.width for im in pil_images)
+                combined = Image.new("RGB", (total_w, image_size))
+                x = 0
+                for im in pil_images:
+                    combined.paste(im, (x, 0))
+                    x += im.width
+                frame = combined
+            else:
+                frame = pil_images[0]
+
+            buf = io.BytesIO()
+            frame.save(buf, format="PNG")
+            b64_frames.append(base64.b64encode(buf.getvalue()).decode("ascii"))
+
+        return b64_frames
+
+    # -- state management --
+
+    def _get_task_state(self, prompt: str) -> dict[str, int]:
+        """Return the mutable replay cursor for a task, creating it at episode 0 / step 0."""
+        return self._task_state.setdefault(prompt, {"episode_idx": 0, "step": 0})
+
+    def _resolve_prompt(self, prompt: str) -> str:
+        """Resolve prompt to a known task description (exact or substring match)."""
+        if prompt in self.episodes_by_task:
+            return prompt
+        prompt_lower = prompt.lower().strip()
+        for task_desc in self.episodes_by_task:
+            if task_desc.lower().strip() == prompt_lower:
+                return task_desc
+        for task_desc in self.episodes_by_task:
+            td_lower = task_desc.lower().strip()
+            # "" is a substring of every string, so a blank side must never count as a match.
+            if prompt_lower and td_lower and (prompt_lower in td_lower or td_lower in prompt_lower):
+                return task_desc
+        raise ValueError(
+            f"Task not found for prompt: {prompt!r}. Available tasks: {sorted(self.episodes_by_task.keys())}"
+        )
+
+    # -- endpoints --
+
+    def get_info(self) -> dict[str, Any]:
+        return {
+            "type": "dataset_action_server",
+            "action_space": self.cfg.action_space,
+            "rotation_space": self.cfg.rotation_space,
+            "action_chunk_size": self.cfg.action_chunk_size,
+            "tasks": {k: len(v) for k, v in sorted(self.episodes_by_task.items())},
+        }
+
+    def predict(self, req: dict[str, Any]) -> dict[str, Any]:
+        """Return the next GT action chunk (plus frames if enabled) for the task named by req["prompt"]."""
+        prompt = req.get("prompt")
+        if not isinstance(prompt, str):
+            raise ValueError("'prompt' must be a string")
+
+        resolved_prompt = self._resolve_prompt(prompt)
+
+        with self._lock:
+            state = self._get_task_state(resolved_prompt)
+            episodes = self.episodes_by_task[resolved_prompt]
+
+            ep_idx = state["episode_idx"] % len(episodes)
+            episode = episodes[ep_idx]
+            step = state["step"]
+
+            # Number of valid actions = num_frames - 1 (need pairs of consecutive frames)
+            max_actions = len(episode.action_raw) - 1
+
+            if step >= max_actions:
+                state["episode_idx"] = (ep_idx + 1) % len(episodes)
+                state["step"] = 0
+                ep_idx = state["episode_idx"]
+                episode = episodes[ep_idx]
+                step = 0
+                max_actions = len(episode.action_raw) - 1
+
+            chunk_size = min(self.cfg.action_chunk_size, max_actions - step)
+            # Slice chunk+1 frames for action computation (needs next-frame state)
+            raw_slice_end = step + chunk_size + 1
+            action_chunk_raw = episode.action_raw[step:raw_slice_end]
+            state_chunk_raw = episode.state_raw[step:raw_slice_end]
+
+            processed = _process_action_chunk(
+                action_chunk_raw,
+                state_chunk_raw,
+                self.cfg.action_space,
+                self.cfg.rotation_space,
+            )
+
+            # Pad to max_action_dim (same as the Action transform pipeline)
+            t, d = processed.shape
+            if d < self.cfg.max_action_dim:
+                processed = torch.cat(
+                    [processed, torch.zeros(t, self.cfg.max_action_dim - d)],
+                    dim=-1,
+                )
+
+            # Snapshot the new cursor under the lock; the log line below runs after release.
+            next_step = state["step"] = step + chunk_size
+            action_list = processed.float().numpy().tolist()
+            video_b64: list[str] = []
+            if self.cfg.send_video:
+                video_b64 = self._load_video_frames(episode, step, num_frames=chunk_size + 1)
+
+        print(
+            f"[{_ts()}] [dataset-server] predict prompt={resolved_prompt!r} "
+            f"ep={ep_idx} step={step}..{next_step} actions={len(action_list)} "
+            f"video_frames={len(video_b64)}",
+            flush=True,
+        )
+        return {"action": action_list, "video": video_b64}
+
+    def next_episode(self, prompt: str | None = None) -> dict[str, Any]:
+        """Advance one task (when prompt is given) or every tracked task to its next episode."""
+        with self._lock:
+            if prompt is not None:
+                resolved = self._resolve_prompt(prompt)
+                state = self._get_task_state(resolved)
+                episodes = self.episodes_by_task[resolved]
+                state["episode_idx"] = (state["episode_idx"] + 1) % len(episodes)
+                state["step"] = 0
+                print(
+                    f"[{_ts()}] [dataset-server] next_episode task={resolved!r} -> ep={state['episode_idx']}",
+                    flush=True,
+                )
+                return {"task": resolved, "episode_idx": state["episode_idx"]}
+
+            for task, state in self._task_state.items():
+                n_episodes = max(len(self.episodes_by_task.get(task, [])), 1)
+                state["episode_idx"] = (state["episode_idx"] + 1) % n_episodes
+                state["step"] = 0
+            print(f"[{_ts()}] [dataset-server] next_episode (all tasks)", flush=True)
+            return {"advanced_all": True}
+
+    def reset(self) -> dict[str, str]:
+        """Forget every per-task cursor so each task restarts at episode 0, step 0."""
+        with self._lock:
+            self._task_state.clear()
+        print(f"[{_ts()}] [dataset-server] reset", flush=True)
+        return {"status": "reset"}
+
+
+# ---------------------------------------------------------------------------
+# HTTP handler
+# ---------------------------------------------------------------------------
+
+
+class _DatasetHandler(BaseHTTPRequestHandler):
+    server: ThreadingHTTPServer  # type: ignore[assignment]
+
+    def _send_json(self, status_code: int, payload: dict[str, Any]) -> None:  # write one complete JSON response
+        body = json.dumps(payload).encode("utf-8")
+        self.send_response(status_code)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Cache-Control", "no-store")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        try:
+            self.wfile.write(body)
+        except (BrokenPipeError, ConnectionResetError):  # client disconnected; nobody left to tell
+            return
+
+    def _read_json_body(self) -> dict[str, Any] | None:  # None means a 400 response was already sent
+        try:
+            length = int(self.headers.get("Content-Length") or "0")
+        except ValueError:
+            self._send_json(400, {"error": "Invalid Content-Length"})
+            return None
+        body = self.rfile.read(max(0, length))
+        if not body:
+            return {}
+        try:
+            req = json.loads(body.decode("utf-8"))
+        except Exception as e:
+            self._send_json(400, {"error": f"Invalid JSON: {e}"})
+            return None
+        if not isinstance(req, dict):
+            self._send_json(400, {"error": "JSON body must be an object"})
+            return None
+        return req
+
+    def do_GET(self) -> None:  # noqa: N802
+        service: DatasetActionService = getattr(self.server, "service")
+        if self.path == "/info":
+            self._send_json(200, service.get_info())
+        elif self.path == "/":
+            self._send_json(200, {"status": "ok"})
+        else:
+            self._send_json(404, {"error": "Not found"})
+
+    def do_POST(self) -> None:  # noqa: N802
+        service: DatasetActionService = getattr(self.server, "service")
+
+        if self.path in ("/", "/predict"):
+            req = self._read_json_body()
+            if req is None:
+                return
+            try:
+                out = service.predict(req)
+            except Exception as e:
+                print(f"[{_ts()}] [dataset-server] predict ERROR: {e}", flush=True)
+                self._send_json(400, {"action": [], "error": str(e)})
+                return
+            self._send_json(200, out)
+
+        elif self.path == "/next_episode":
+            req = self._read_json_body()
+            if req is None:  # malformed body: the 400 is already on the wire, so neither advance nor reply again
+                return
+            try:
+                out = service.next_episode(req.get("prompt"))
+            except Exception as e:
+                self._send_json(400, {"error": str(e)})
+                return
+            self._send_json(200, out)
+
+        elif self.path == "/reset":
+            self._send_json(200, service.reset())
+
+        else:
+            self._send_json(404, {"error": "Not found"})
+
+    def log_message(self, format: str, *args: Any) -> None:  # noqa: A002
+        """Discard the base class's per-request log output; the service prints its own lines."""
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:  # CLI entry point: parse flags, pre-load the datasets, then serve until killed
+    parser = argparse.ArgumentParser(
+        description="HTTP server serving ground-truth actions from LIBERO LeRobot datasets."
+    )
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        required=True,
+        help="Comma-separated LeRobot repo IDs (e.g. libero_10,libero_goal)",
+    )
+    parser.add_argument(
+        "--root",
+        type=str,
+        required=True,
+        help="Comma-separated local paths to dataset roots (one per repo_id)",
+    )
+    parser.add_argument(
+        "--action_space",
+        type=str,
+        default="frame_wise_relative",
+        choices=["relative", "frame_wise_relative"],
+        help="Action space (must match closed-loop eval's --action_space).",
+    )
+    parser.add_argument(
+        "--rotation_space",
+        type=str,
+        default="6d",
+        choices=["3d", "6d", "9d"],
+        help="Rotation representation (must match closed-loop eval's action_dim).",
+    )
+    parser.add_argument(
+        "--pose_coordinate_frame",
+        type=str,
+        default="native",
+        choices=["native", "opencv"],
+        help="Pose/action coordinate frame. Accepted for compatibility with LIBERO eval launchers.",
+    )
+    parser.add_argument("--action_chunk_size", type=int, default=16, help="Number of actions per predict call")
+    parser.add_argument("--max_action_dim", type=int, default=32, help="Pad actions to this dimension")
+    parser.add_argument("--split", type=str, default="full", help="Dataset split (train/val/full)")
+    parser.add_argument(
+        "--send_video",
+        action="store_true",
+        help="Include GT video frames (base64 PNGs) in /predict responses, same format as the Action server.",
+    )
+    parser.add_argument(
+        "--camera_mode",
+        type=str,
+        default="agentview",  # was "image", which is not one of the choices; both select the agent-view key
+        choices=["agentview", "wrist_image", "concat_view", "both"],
+        help="Camera view(s) to include in video frames.",
+    )
+    parser.add_argument("--image_size", type=int, default=256, help="Resize video frames to this height/width")
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=8000)
+    args = parser.parse_args()
+
+    repo_ids = [r.strip() for r in args.repo_id.split(",") if r.strip()]
+    roots = [r.strip() for r in args.root.split(",") if r.strip()]
+    if len(repo_ids) != len(roots):
+        raise ValueError(f"Number of repo_ids ({len(repo_ids)}) must match number of roots ({len(roots)})")
+
+    cfg = DatasetServerConfig(
+        repo_id=repo_ids,
+        root=roots,
+        action_space=args.action_space,
+        rotation_space=args.rotation_space,
+        pose_coordinate_frame=args.pose_coordinate_frame,
+        action_chunk_size=int(args.action_chunk_size),
+        max_action_dim=int(args.max_action_dim),
+        split=args.split,
+        send_video=bool(args.send_video),
+        camera_mode=args.camera_mode,
+        image_size=int(args.image_size),
+    )
+
+    service = DatasetActionService(cfg)
+    local_ip = _get_local_ip()
+
+    print(
+        f"[{_ts()}] [dataset-server] starting host={args.host} port={args.port} "
+        f"action_space={cfg.action_space} rotation_space={cfg.rotation_space} "
+        f"action_chunk_size={cfg.action_chunk_size}",
+        flush=True,
+    )
+    print(f"[{_ts()}] [dataset-server] Server accessible at: http://{local_ip}:{args.port}/", flush=True)
+    print(f"[{_ts()}] [dataset-server] Endpoints:", flush=True)
+    print("  - GET  /             : Health check", flush=True)
+    print("  - GET  /info         : Dataset info (tasks, episode counts)", flush=True)
+    print("  - POST /predict      : Get next GT action chunk (same interface as Action server)", flush=True)
+    print("  - POST /next_episode : Advance to next episode for a task", flush=True)
+    print("  - POST /reset        : Reset all per-task state", flush=True)
+
+    httpd = ThreadingHTTPServer((args.host, int(args.port)), _DatasetHandler)
+    setattr(httpd, "service", service)  # handlers fetch it back via getattr(self.server, "service")
+    httpd.serve_forever()
+
+
+if __name__ == "__main__":  # start the server only when run as a script, not on import
+    main()
diff --git a/cosmos_framework/utils/vfm/model_loader.py b/cosmos_framework/utils/vfm/model_loader.py
index 6e6a0dd..b94817a 100644
--- a/cosmos_framework/utils/vfm/model_loader.py
+++ b/cosmos_framework/utils/vfm/model_loader.py
@@ -18,21 +18,7 @@
 try:
     from filelock import SoftReadWriteLock
 except ImportError:  # Older filelock versions in some inference containers.
-    try:
-        from filelock import ReadWriteLock as SoftReadWriteLock
-    except ImportError:
-        from filelock import FileLock
-
-        class SoftReadWriteLock:
-            """Compatibility adapter for filelock versions without read/write locks."""
-
-            def __init__(self, *args: Any, **kwargs: Any) -> None:
-                self._lock = FileLock(*args, **kwargs)
-
-            def write_lock(self) -> FileLock:
-                return self._lock
-
-
+    from filelock import ReadWriteLock as SoftReadWriteLock
 from torch.distributed.checkpoint.filesystem import FileSystemReader, FileSystemWriter
 
 from cosmos_framework.checkpoint.s3_filesystem import S3StorageReader
@@ -185,32 +171,6 @@ def _checkpoint_cache_group_lock(
         yield action
 
 
-def _reload_pretrained_reasoner_after_checkpoint_load(model: torch.nn.Module) -> None:
-    """Re-seed the reasoner pathway after a DCP load, mirroring the LoadPretrained
-    callback that runs during training (inference does not run training callbacks).
-
-    The decision is delegated entirely to the model's own gate in
-    ``load_pretrained_model_if_needed``: this is a no-op unless the model was built
-    with ``exclude_reasoner_weights_from_checkpoint=True`` (and pretrained weights
-    enabled), i.e. the case where the DCP checkpoint deliberately omits the reasoner
-    tower so it must be re-seeded from the pretrained source. For a normal checkpoint
-    that already contains the reasoner, the model's gate evaluates to False and
-    nothing is reloaded.
-
-    ``has_resumable_checkpoint=True`` / ``has_load_path=False`` is load-bearing: it
-    re-seeds the reasoner from the pretrained source while skipping the
-    understanding->generation copy (the generation pathway was already populated by
-    the DCP load). Passing ``has_load_path=True`` would instead force a reasoner
-    reload even for non-excluded checkpoints, clobbering any fine-tuned reasoner
-    weights restored from the DCP.
-    """
-    load_pretrained_model_if_needed = getattr(model, "load_pretrained_model_if_needed")
-    load_pretrained_model_if_needed(
-        has_resumable_checkpoint=True,
-        has_load_path=False,
-    )
-
-
 def _load_model(
     model: torch.nn.Module,
     checkpoint_path: str,
@@ -234,9 +194,6 @@ def _load_model(
     start_time = time.time()
 
     state_dict = ModelWrapper(model).state_dict()
-    if any(key.startswith("net_teacher.") for key in state_dict):
-        log.info("Dropping net_teacher.* keys from inference load target; distillation checkpoints do not save them.")
-        state_dict = {key: value for key, value in state_dict.items() if not key.startswith("net_teacher.")}
 
     if checkpoint_path.startswith("s3://"):
         storage_reader = S3StorageReader(
@@ -252,10 +209,19 @@ def _load_model(
         keys_to_skip_loading=keys_to_skip_loading or [],
     )
 
+    # Single-rank load (e.g. the action policy inference server): force no_dist so
+    # ``dcp.load`` skips the collective ``gather_object`` over the load plan. That
+    # gather pickles the plan, which fails with "cannot pickle code objects" for
+    # training/EMA DCPs whose metadata carries non-tensor objects; a single process
+    # owns the full checkpoint anyway, so the collective is unnecessary. Multi-rank
+    # (sharded) loads keep the default distributed path.
+    no_dist = not (dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1)
+
     dcp.load(
         state_dict=state_dict,
         storage_reader=storage_reader,
         planner=load_planner,
+        no_dist=no_dist,
     )
 
     log.info(f"Successfully loaded model from {checkpoint_path}")
@@ -394,16 +360,6 @@ def load_model_from_checkpoint(
 
     # Disable EMA for inference.
     config.model.config.ema.enabled = False
-    if hasattr(config.model.config, "load_teacher_weights"):
-        log.info("Setting load_teacher_weights=False for inference to skip teacher checkpoint download.")
-        config.model.config.load_teacher_weights = False
-
-    if (
-        config.model.config.exclude_reasoner_weights_from_checkpoint
-        and not config.model.config.vlm_config.pretrained_weights.enabled
-    ):
-        log.info("Enabling pretrained reasoner weights because this checkpoint excludes the reasoner tower from DCP.")
-        config.model.config.vlm_config.pretrained_weights.enabled = True
 
     config.validate()
     config.freeze()  # type: ignore
@@ -479,7 +435,6 @@ def load_model(checkpoint_load_path: str) -> None:
 
     if checkpoint_cache_path is None:
         load_model(checkpoint_path)
-        _reload_pretrained_reasoner_after_checkpoint_load(model)
         return model, config
 
     cache_lock_path = f"{checkpoint_cache_path}.lock"
@@ -497,6 +452,4 @@ def load_model(checkpoint_load_path: str) -> None:
     if cache_action == _CheckpointCacheAction.LOAD_CACHE:
         load_model(checkpoint_cache_path)
 
-    _reload_pretrained_reasoner_after_checkpoint_load(model)
-
     return model, config
diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
new file mode 100644
index 0000000..02af929
--- /dev/null
+++ b/docs/action_policy_libero_sft.md
@@ -0,0 +1,206 @@
+# Cosmos3-Nano LIBERO action-policy SFT (reproduction)
+
+Reproduces the Cosmos3-Nano LIBERO-10 result (technical report Table 20, ~97.4%
+success at checkpoint 2000) as an action policy: vision + language in, action
+chunks out. Full SFT (no LoRA) on the public `nvidia/Cosmos3-Nano` base.
+
+Pieces:
+
+| Piece | Path |
+| --- | --- |
+| Dataset | `cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py` (`LIBEROLeRobotDataset`) |
+| SFT wrapper | `get_action_libero_sft_dataset` in `.../datasets/action_sft_dataset.py` |
+| Norm stats | `.../datasets/stats/libero_native_frame_wise_relative_rot6d.json` |
+| Experiment | `cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py` |
+| Run TOML | `examples/toml/sft_config/action_policy_libero_repro.toml` |
+| Launch | `examples/launch_sft_action_policy_libero.sh` |
+| Inference server | `cosmos_framework/scripts/action_policy_server_libero.py` |
+| Closed-loop eval | `cosmos_framework/simulation/libero/closed_loop_eval.py` |
+
+## 1. Data
+
+`LIBEROLeRobotDataset` reads a **local** LeRobot dir directly (parquet + video,
+like `DROIDLeRobotDataset`) — set `LIBERO_ROOT` to it. Use NVIDIA's **20 FPS**
+conversion [`nvidia/LIBERO_LeRobot_v3`](https://huggingface.co/datasets/nvidia/LIBERO_LeRobot_v3)
+(public, OpenMDW-1.1), which is what the bundled `quantile_rot` stats and the
+20 Hz eval cadence assume. It ships one subdirectory per suite, so pre-sync just
+`libero_10`:
+
+```bash
+hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \
+  --include 'libero_10/**' --local-dir <nfs>/LIBERO_LeRobot_v3
+export LIBERO_ROOT=<nfs>/LIBERO_LeRobot_v3/libero_10
+```
+
+**For the Table-20 number, use `libero_10` ALONE.** Training on the full suite
+mix dilutes libero_10 to ~1 pass in 2000 steps (~82%); libero_10 alone is ~2.7
+passes (~97%). For more suites, sync the other subdirs and add more
+`datasets=dict(...)` entries to the experiment's dataloader.
+
+It uses `frame_wise_relative` rot6d actions (10D = `pos(3) + rot6d(6) +
+gripper(1)`), `concat_view` (third-person + wrist, each resized to 256×256,
+concatenated horizontally → 256×512), normalized with `quantile_rot` against the
+bundled stats.
+
+**FPS-agnostic loader.** It windows by frame index and decodes video at each
+frame's real timestamp (no `delta_timestamps` grid), so any LeRobot LIBERO dataset
+loads regardless of its `fps` label, and `conditioning_fps` is read from the
+dataset's own `meta/info.json`. Prefer the 20 FPS `nvidia/LIBERO_LeRobot_v3` so
+`conditioning_fps=20` matches the stats and the eval (serve with `--fps 20`). The
+community `lerobot/libero_*` repos carry the *same frames* but label them 10 FPS;
+see [§5](#5-fps--stats).
+
+**Model-input resolution = 192×320.** The 256×512 concat is aspect-2.0, so with
+`resolution=None` the `ActionTransformPipeline` snaps it to the closest `"256"`
+tier canvas — 16:9 → **320×192 (w×h) = 192×320 (h×w)** — by aspect-preserving
+resize + bottom reflection pad. The training prompt therefore reads
+`"...is of 192x320 resolution."`. Keep this; the eval server reproduces the same
+snap (see §4).
+
+## 2. Train (1 node, 8 GPUs)
+
+```bash
+export LD_LIBRARY_PATH=''                      # NGC/PyTorch container: avoid torch._C import error
+export LIBERO_ROOT=/path/to/libero_10_lerobot  # libero_10 conversion ONLY
+export BASE_CHECKPOINT_PATH=<Cosmos3-Nano DCP dir>
+export WAN_VAE_PATH=<Wan2.2_VAE.pth>
+export IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root
+
+bash examples/launch_sft_action_policy_libero.sh
+```
+
+Or drive `cosmos_framework.scripts.train` directly:
+
+```bash
+torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \
+  --sft-toml examples/toml/sft_config/action_policy_libero_repro.toml
+```
+
+Recipe knobs live in the registered `action_policy_libero_nano` experiment (full
+SFT of the generation + action heads at lr 5e-5 with a 5× LR multiplier on the
+action bridge, FusedAdam, selective activation checkpointing, `quantile_rot`
+actions, action heads init fresh from the base via `keys_to_skip_loading`). The
+TOML sets only run-level scalars: DP=8, `max_iter=2000`, `warm_up_steps=500`,
+`grad_accum_iter=2`, `save_iter=500`. Checkpoint 2000 is the reference. On
+lower-memory GPUs reduce the per-rank batch:
+`--opts dataloader_train.max_samples_per_batch=32`.
+
+## 3. Closed-loop eval
+
+Start the policy server on a **trained** checkpoint, then run the LIBERO
+simulator client against it. (The base `nvidia/Cosmos3-Nano` DCP has no action
+heads — use a checkpoint from §2.)
+
+```bash
+# Server (training venv). Loads the DCP (single-rank no_dist), denormalizes with
+# quantile_rot + the bundled libero rot6d stats. The experiment supplies the VAE
+# path via the override (the server loads the experiment directly, no TOML).
+python -m cosmos_framework.scripts.action_policy_server_libero \
+  --experiment action_policy_libero_nano \
+  --experiment-overrides "model.config.tokenizer.vae_path=$WAN_VAE_PATH" \
+  --checkpoint-path <trained DCP dir, e.g. $OUTPUT_ROOT/.../checkpoints/iter_000002000> \
+  --action-normalization quantile_rot \
+  --action-stats-path cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json \
+  --raw-action-dim 10 --fps 20 --port 8000
+```
+
+**Eval environment** (the LIBERO sim needs a *separate* venv — robosuite/mujoco
+versions conflict with the training env, and the NGC image needs graphics
+enabled). This combo is validated headless on an NVIDIA GPU:
+
+```bash
+# 1. Enable the NVIDIA graphics libs in the container (mounts host libEGL_nvidia
+#    etc.); do NOT apt-install libnvidia-gl (it mismatches the mounted driver).
+export NVIDIA_DRIVER_CAPABILITIES=all
+apt-get install -y libegl1 libglvnd0 libgl1 libglib2.0-0 ffmpeg
+mkdir -p /usr/share/glvnd/egl_vendor.d   # ICD (usually already mounted)
+echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \
+  > /usr/share/glvnd/egl_vendor.d/10_nvidia.json
+
+# 2. Separate py3.10 venv with LIBERO-compatible sim pins + torch<2.6
+#    (torch>=2.6 defaults weights_only=True and breaks LIBERO init-state loads).
+uv venv --python 3.10 .libenv && VV=.libenv/bin/python
+git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git && \
+  uv pip install -p $VV -e LIBERO -r LIBERO/requirements.txt
+uv pip install -p $VV "robosuite==1.4.1" "mujoco==2.3.7" "torch<2.6" loguru requests scipy pillow numpy
+
+# 3. LIBERO first-run config (avoids the interactive prompt) + robosuite macros
+mkdir -p ~/.libero && touch ~/.libero/config.yaml
+RS=$($VV -c "import robosuite,os;print(os.path.dirname(robosuite.__file__))")
+$VV "$RS/scripts/setup_macros.py"
+$VV -c "from libero.libero import set_libero_default_path; set_libero_default_path()"
+
+# 4. Run the client (concat agentview+wrist matches the 256x512 training view).
+MUJOCO_GL=egl PYTHONPATH=$PWD:$PWD/LIBERO $VV \
+  cosmos_framework/simulation/libero/closed_loop_eval.py \
+  --server_url http://localhost:8000 \
+  --task_suite libero_10 --num_trials_per_task 10 --action_horizon 16 \
+  --camera agentview,wrist --image_size 256 \
+  --action_space frame_wise_relative --rotation_space 6d --action_dim 10 \
+  --save_gifs --gif_fps 20 --output_dir results/libero_closed_loop_10
+```
+
+Validated end-to-end against a stub server (episode runs, `summary.json` + GIFs
+written, `rc=0`); a benign `EGLError` may print during context teardown on exit.
+
+## 4. Gotchas (from NVIDIA/cosmos-framework#50)
+
+These cost real accuracy if missed; the shipped eval client already handles the
+first two, but verify them against your checkpoint:
+
+- **Train ↔ serve parity (resolution + prompt).** Training snaps the 256×512
+  concat to a **192×320** model-input canvas (see §1) and the prompt suffix
+  encodes that resolution + clip duration (`append_resolution_info` /
+  `append_duration_fps_timestamps`). The server applies the *same* snap
+  (`get_vision_data_resolution` + `find_closest_target_size` + reflection pad),
+  so parity is automatic **as long as the client sends the same 2:1 concat
+  layout** — run `closed_loop_eval` with `--camera agentview,wrist --image_size
+  256` (agentview left, wrist right, matching training). A single-view client (or
+  an old server that skipped the snap) sends a different aspect → different
+  canvas → the reported 192×320-train vs 256×512-serve mismatch and ~62% (vs
+  ~97%). This is the first thing to check if numbers are low. Note the clip
+  *duration* string is computed slightly differently on each side (training's
+  rounds to `0.0s`); resolution is the dominant factor — verify both against a
+  `--dump_dir` server capture if accuracy is off.
+- **Gripper.** The model emits gripper in `[0, 1]`; the LIBERO env wants
+  `[-1, 1]` with negative = open. `closed_loop_eval._remap_gripper_to_neg1_pos1`
+  applies `1 - 2·g`. If the gripper never opens, the sign is inverted for your
+  data — flip it.
+- **Image orientation.** Sim frames are rotated 180° relative to training;
+  `closed_loop_eval` rotates them back (`img[::-1, ::-1]`).
+- **Normalization.** Always start the server with `--action-normalization
+  quantile_rot` and the bundled libero rot6d stats file, or actions come out at
+  the wrong scale.
+
+## 5. FPS & stats
+
+`LIBEROLeRobotDataset` follows `DROIDLeRobotDataset`: it reads the LeRobot parquet
+directly, windows by **frame index**, and decodes video at each frame's **real
+timestamp** — so it never builds LeRobot's `delta_timestamps` grid and works at
+any native FPS. (The earlier `delta_timestamps` port failed on the 10 FPS public
+dataset because a 1/20 s grid doesn't land on 10 FPS frames.)
+
+- **Use the 20 FPS `nvidia/LIBERO_LeRobot_v3`.** LIBERO demos are recorded at
+  robosuite's default 20 Hz `control_freq`. NVIDIA's conversion labels them 20 FPS
+  (correct); the community `lerobot/libero_*` repos contain the *same frames* (e.g.
+  libero_10 = 379 eps / 101,469 frames in both) but label them 10 FPS. Nothing was
+  subsampled — only the `fps` metadata differs.
+- **Why 20 FPS is the clean choice for THIS eval.** The closed-loop harness steps
+  the env at LIBERO's default 20 Hz and applies one predicted action per
+  `env.step` (no action-repeat, no `control_freq` override — see `_get_libero_env`
+  / `_run_episode`). So the policy's per-action cadence must be 20 Hz. Training on
+  the 20 FPS dataset makes `conditioning_fps=20` (read from `meta/info.json`),
+  matches the bundled `quantile_rot` stats, and lines up with the eval's 20 Hz —
+  serve with `--fps 20`, no harness change.
+- **The normalization gap was never the issue.** `normalize_action(quantile)` is an
+  *unclamped* affine map `2(a−q01)/(q99−q01)−1`; training and the server share the
+  same stats file, so any scale cancels (same reason DROID is fine at its own
+  15 FPS). The real consistency requirement is the **control rate**, which the
+  20 FPS dataset satisfies by construction.
+- **If you must use a differently-labelled dataset**, keep cadence consistent:
+  serve at the dataset's `fps`, and if its frames are genuinely sub-sampled (fewer
+  frames than the 20 Hz original), either run the eval env at a matching
+  `control_freq` or action-repeat. With `nvidia/LIBERO_LeRobot_v3` none of this is
+  needed.
+- `fps` only sets `conditioning_fps` + prompt duration; the loader always windows
+  by frame index and decodes at real timestamps.
diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh
new file mode 100755
index 0000000..79aec05
--- /dev/null
+++ b/examples/launch_sft_action_policy_libero.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for action_policy_libero_nano — Cosmos3-Nano LIBERO
+# action-policy SFT (8-GPU FSDP, full SFT, no LoRA). Reproduces the Table-20
+# LIBERO-10 result (~97.4% @ ckpt 2000). Drives cosmos_framework.scripts.train
+# against examples/toml/sft_config/action_policy_libero_repro.toml.
+#
+# REPRODUCTION: point LIBERO_ROOT at the libero_10 suite ONLY. The full suite
+# mix dilutes libero_10 to ~1 pass in 2000 steps (~82%); libero_10 alone is ~2.7
+# passes (~97%). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. See docs/action_policy_libero_sft.md.
+#
+# Required env vars:
+#   LIBERO_ROOT           local LIBERO-10 LeRobot dataset dir, e.g. <dir>/libero_10 (no default)
+# Optional env vars (defaults below; override to relocate data/checkpoints):
+#   BASE_CHECKPOINT_PATH  default: examples/checkpoints/Cosmos3-Nano
+#   WAN_VAE_PATH          default: examples/checkpoints/wan22_vae/Wan2.2_VAE.pth
+#   HF_TOKEN              if any tokenizer download requires gated HF access
+#   OUTPUT_ROOT           default: outputs/train
+#
+# Pre-sync the 20 FPS suite once:
+#   hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset --include 'libero_10/**' --local-dir <dir>
+#   export LIBERO_ROOT=<dir>/libero_10
+#
+# Usage (8-GPU allocation, inside the training container, from the repo root):
+#   LIBERO_ROOT=<dir>/libero_10 bash examples/launch_sft_action_policy_libero.sh
+
+TOML_FILE="examples/toml/sft_config/action_policy_libero_repro.toml"
+: "${BASE_CHECKPOINT_PATH:=examples/checkpoints/Cosmos3-Nano}"
+
+# LIBEROLeRobotDataset reads ${oc.env:LIBERO_ROOT} directly (a LOCAL LeRobot dir);
+# export it so torchrun (launched in this shell) inherits it.
+export LIBERO_ROOT="${LIBERO_ROOT:-}"
+
+EXTRA_DATASET_CHECK='[[ -f "$LIBERO_ROOT/meta/info.json" ]] || { echo "ERROR: LIBERO_ROOT must be a local LeRobot dir containing meta/info.json (got: '\''$LIBERO_ROOT'\''). Pre-sync: hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset --include '\''libero_10/**'\'' --local-dir <dir> (then LIBERO_ROOT=<dir>/libero_10). See docs/action_policy_libero_sft.md" >&2; exit 1; }'
+
+# Extra Hydra overrides from the environment: a space-separated string word-split into
+# the TAIL_OVERRIDES array. An exported string survives `bash <wrapper>` (a child
+# process), unlike a TAIL_OVERRIDES array set in your shell. Use it for smoke runs,
+# e.g. EXTRA_TAIL_OVERRIDES="trainer.max_iter=5 job.wandb_mode=offline".
+TAIL_OVERRIDES=(
+    ${EXTRA_TAIL_OVERRIDES:-}
+)
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/examples/toml/sft_config/action_policy_libero_repro.toml b/examples/toml/sft_config/action_policy_libero_repro.toml
new file mode 100644
index 0000000..7fd788d
--- /dev/null
+++ b/examples/toml/sft_config/action_policy_libero_repro.toml
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# ============================================================================
+# LIBERO action-policy SFT — run config for the `action_policy_libero_nano`
+# experiment (Cosmos3-Nano LIBERO-10). The recipe knobs (optimizer base, count-
+# based batch, action-head skip-on-load, dataset knobs) live in the registered
+# experiment; this file sets run-level scalars (lr/schedule, iters, ckpt cadence,
+# parallelism shape, wandb, VAE path).
+#
+# RECIPE (recommended): lr 5e-5, warmup 500, cycle 16000 (so LR is barely decayed
+# at iter 2000, ~4.5e-5), global batch 2048, save every 500 -> sweep 500..2000.
+# Best observed: ~95.2% @ iter_1500 (libero_10, 500-ep closed-loop eval), with
+# task-0 success stable across the sweep (no over-fit collapse). This gentle-LR
+# schedule is more robust than a higher lr (e.g. 1e-4), which peaks near iter_1000
+# then over-fits task 0 and regresses. See docs/action_policy_libero_sft.md.
+#
+# REPRODUCTION: train on libero_10 ALONE (point LIBERO_ROOT at the libero_10
+# LeRobot conversion only). The 4-suite mix dilutes libero_10 (~1/4 the exposure
+# per step) and converges more slowly.
+#
+# Env required:
+#   LIBERO_ROOT=/path/to/libero_10_lerobot
+#   BASE_CHECKPOINT_PATH=<Cosmos3-Nano DCP dir>
+#   WAN_VAE_PATH=<Wan2.2_VAE.pth>
+#   IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root   # persist checkpoints
+# ============================================================================
+
+[job]
+task         = "vfm"
+experiment   = "action_policy_libero_nano"
+project      = "cosmos3_action_libero"
+group        = "action_sft"
+name         = "action_policy_libero_repro"
+wandb_mode   = "online"
+
+[model]
+precision = "bfloat16"
+# Cap the packed sequence (GA-validated). Uncapped (-1) packs one very long sequence
+# and OOMs even on H200.
+max_num_tokens_after_packing = 74000
+
+[model.parallelism]
+data_parallel_shard_degree     = 8    # 1-node 8-GPU shard; raise replicate for multi-node HSDP
+data_parallel_replicate_degree = 1
+
+[model.activation_checkpointing]
+mode           = "selective"          # GA recipe (full is slower; selective fits 256x512)
+save_ops_regex = ["fmha"]
+
+[model.tokenizer]
+vae_path = "${oc.env:WAN_VAE_PATH}"
+
+[optimizer]
+lr = 5.0e-05              # recommended base lr
+
+[scheduler]
+cycle_lengths = [16000]   # LR trajectory: warmup 500 -> linear decay over 16k (barely decayed at 2k)
+warm_up_steps = [500]
+
+[trainer]
+max_iter        = 2000    # pause at 2k; sweep checkpoints 500/1000/1500/2000 for the peak
+logging_iter    = 50
+grad_accum_iter = 2       # global batch = max_samples_per_batch 128 x DP 8 x grad_accum 2 = 2048
+
+[checkpoint]
+load_path = "${oc.env:BASE_CHECKPOINT_PATH}"
+save_iter = 500           # sweep cadence; peak is typically iter_1500
+
+# NOTE (train/serve parity — see GitHub issue NVIDIA/cosmos-framework#50): the
+# 256x512 concat_view is snapped to a 192x320 model canvas (resize+reflect-pad), and
+# the eval server reproduces the same snap. Run the client with the same 2:1 concat
+# (--camera agentview,wrist --image_size 256) so resolution + prompt suffix match, and
+# use --action-normalization quantile_rot + the bundled libero rot6d stats on the
+# server so denormalization matches training. See docs/action_policy_libero_sft.md.
+#
+# max_samples_per_batch is 128 in the experiment (256 OOMs: per-forward peak, not grad_accum).
+# On lower-memory GPUs reduce at launch:
+#   --opts dataloader_train.max_samples_per_batch=64

From 3ecd0a75394703c18a4cb78dc71f3f54028ca6cb Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 21:04:17 +0800
Subject: [PATCH 2/9] libero(doc): align markdown tables (rumdl-fmt / MD060)

---
 docs/action_policy_libero_sft.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
index 02af929..4929f31 100644
--- a/docs/action_policy_libero_sft.md
+++ b/docs/action_policy_libero_sft.md
@@ -6,16 +6,16 @@ chunks out. Full SFT (no LoRA) on the public `nvidia/Cosmos3-Nano` base.
 
 Pieces:
 
-| Piece | Path |
-| --- | --- |
-| Dataset | `cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py` (`LIBEROLeRobotDataset`) |
-| SFT wrapper | `get_action_libero_sft_dataset` in `.../datasets/action_sft_dataset.py` |
-| Norm stats | `.../datasets/stats/libero_native_frame_wise_relative_rot6d.json` |
-| Experiment | `cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py` |
-| Run TOML | `examples/toml/sft_config/action_policy_libero_repro.toml` |
-| Launch | `examples/launch_sft_action_policy_libero.sh` |
-| Inference server | `cosmos_framework/scripts/action_policy_server_libero.py` |
-| Closed-loop eval | `cosmos_framework/simulation/libero/closed_loop_eval.py` |
+| Piece            | Path                                                                                            |
+| ---------------- | ----------------------------------------------------------------------------------------------- |
+| Dataset          | `cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py` (`LIBEROLeRobotDataset`)  |
+| SFT wrapper      | `get_action_libero_sft_dataset` in `.../datasets/action_sft_dataset.py`                         |
+| Norm stats       | `.../datasets/stats/libero_native_frame_wise_relative_rot6d.json`                               |
+| Experiment       | `cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py` |
+| Run TOML         | `examples/toml/sft_config/action_policy_libero_repro.toml`                                      |
+| Launch           | `examples/launch_sft_action_policy_libero.sh`                                                   |
+| Inference server | `cosmos_framework/scripts/action_policy_server_libero.py`                                       |
+| Closed-loop eval | `cosmos_framework/simulation/libero/closed_loop_eval.py`                                        |
 
 ## 1. Data
 

From 6b22dd5ac66cb2c7ac1b28bdf64b87f65a10605f Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 21:24:27 +0800
Subject: [PATCH 3/9] libero: trim recipe/doc comments to essentials; HSDP 2x8
 ga1 canonical

Trim the toml/config/launch/doc comments (drop SR numbers and experimental
detail), and set the canonical recipe to HSDP 2x8 with grad_accum=1 (global
batch 2048) instead of single-node grad_accum=2.
---
 .../action_policy_libero_nano.py              |  45 ++---
 .../action/datasets/libero_lerobot_dataset.py |   8 +-
 docs/action_policy_libero_sft.md              | 183 ++++--------------
 examples/launch_sft_action_policy_libero.sh   |  10 +-
 .../action_policy_libero_repro.toml           |  57 ++----
 5 files changed, 72 insertions(+), 231 deletions(-)

diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
index 98b5b12..38c03d4 100644
--- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
+++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
@@ -1,29 +1,13 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-"""``action_policy_libero_nano`` — Cosmos3-Nano LIBERO action-policy SFT recipe.
+"""``action_policy_libero_nano`` — Cosmos3-Nano LIBERO-10 action-policy SFT recipe.
 
-Reproduces the Cosmos3-Nano LIBERO-10 result (Table 20, 97.4% @ ckpt 2000).
-Mirrors ``action_policy_droid_nano`` (PackingDataLoader + RankPartitionedDataLoader
-+ ActionIterableShuffleDataset), but feeds ``LIBEROLeRobotDataset`` (frame-wise-relative
-rot6d actions, ``quantile_rot``-normalized, concat_view third-person + wrist at
-256x256 each -> 256x512) through ``ActionTransformPipeline``, and trains the
-generation + action heads from the public ``nvidia/Cosmos3-Nano`` base. Full SFT
-(no LoRA) — the LoRA variant is the 32B "super" tier only.
-
-LIBERO-10 reproduction note: the public Table-20 number is reached training on
-``libero_10`` ALONE. Training on the full 4-suite mix dilutes libero_10 to ~1 pass
-in 2000 steps (~82%); libero_10 alone is ~2.7 passes (~97%). Point ``LIBERO_ROOT``
-(and ``LIBERO_REPO_ID``) at the libero_10 LeRobot conversion only.
-
-Usage (1 node, 8 GPU)::
-
-    LIBERO_ROOT=/path/to/libero_10_lerobot \\
-    LIBERO_REPO_ID=lerobot/libero_10 \\
-    BASE_CHECKPOINT_PATH=<Cosmos3-Nano DCP dir> \\
-    WAN_VAE_PATH=<Wan2.2_VAE.pth> \\
-    torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \\
-        --sft-toml examples/toml/sft_config/action_policy_libero_repro.toml
+Mirrors ``action_policy_droid_nano`` but feeds ``LIBEROLeRobotDataset``
+(frame-wise-relative rot6d, ``quantile_rot``, concat_view third-person + wrist)
+and trains the generation + action heads from the public ``nvidia/Cosmos3-Nano``
+base. Train on ``libero_10`` alone (``LIBERO_ROOT``).
+See docs/action_policy_libero_sft.md.
 """
 
 import copy
@@ -44,16 +28,9 @@
 
 
 def _action_policy_libero_nano_model_config() -> dict:
-    """GA LIBERO model config: capped packed tokens, selective activation
-    checkpointing, fresh diffusion-expert init, 10x vision flow-matching loss, and
-    the VAE encode durations [17, 61, 73] carried by the Cosmos3 base.
-
-    NOTE: keep ``encode_exact_durations=[17, 61, 73]`` — do NOT reduce it to [17]
-    even though ``mode="policy"`` only produces 17-frame windows at the data level.
-    The public Cosmos3-Nano base was pretrained with [17, 61, 73]; the reference
-    GA LIBERO SFT (``action_policy_sft_nano`` on ``mharrim-nv-patch-1``) retains it,
-    and empirically reducing it to [17] regresses the policy badly
-    (60.8% vs 94.6% at iter 2000)."""
+    """LIBERO model config: capped packed tokens, selective activation
+    checkpointing, fresh diffusion-expert init, 10x vision flow-matching loss.
+    Keep ``encode_exact_durations=[17, 61, 73]`` to match the Cosmos3-Nano base."""
     cfg = copy.deepcopy(NANO_MODEL_CONFIG)  # action_gen=True, max_action_dim=64
     # Cap the packed sequence. Uncapped (-1) + a large max_samples_per_batch packs
     # one very long sequence and OOMs even on H200; 74000 keeps the GA-validated bound.
@@ -219,8 +196,8 @@ def _action_policy_libero_nano_model_config() -> dict:
                     libero=dict(
                         ratio=1,
                         dataset=L(get_action_libero_sft_dataset)(
-                            # Local LeRobot dir for the libero_10 suite ONLY (Table-20
-                            # reproduction; full suite mix -> ~82%, see module docstring). Use the
+                            # Local LeRobot dir for the libero_10 suite ONLY (the
+                            # full suite mix dilutes libero_10; see module docstring). Use the
                             # 20 FPS nvidia/LIBERO_LeRobot_v3 (matches the bundled stats + 20 Hz eval):
                             #   hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \
                             #     --include 'libero_10/**' --local-dir <dir>   # LIBERO_ROOT=<dir>/libero_10
diff --git a/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py b/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py
index 146fcc1..1e5ef01 100644
--- a/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py
+++ b/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py
@@ -16,8 +16,8 @@
 
 NOTE on FPS / stats fidelity: the bundled ``quantile_rot`` stats were computed on
 a 20 FPS conversion. Per-frame deltas at 10 FPS span 2x the wall-clock motion, so
-for a faithful Table-20 reproduction use a 20 FPS LIBERO dataset (or recompute
-stats for the dataset's FPS). Loading/training is correct at any FPS regardless.
+use a 20 FPS LIBERO dataset (or recompute stats for the dataset's FPS).
+Loading/training is correct at any FPS regardless.
 """
 
 from __future__ import annotations
@@ -259,9 +259,7 @@ def get_shuffle_blocks(self) -> list[tuple[int, int]]:
     # ---- sample build ------------------------------------------------------
 
     def __getitem__(self, idx: int) -> dict[str, Any]:
-        # Resilience: a single unreadable/corrupt video frame (e.g. a torchcodec
-        # decode error on the packed LeRobot-v3 mp4s) must not crash a multi-node
-        # run. Resample a different valid window on failure (bounded retries).
+        # Resample a different valid window if a frame fails to decode (bounded retries).
         n = len(self)
         last_err: Exception | None = None
         for _attempt in range(8):
diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
index 4929f31..735c630 100644
--- a/docs/action_policy_libero_sft.md
+++ b/docs/action_policy_libero_sft.md
@@ -1,10 +1,7 @@
-# Cosmos3-Nano LIBERO action-policy SFT (reproduction)
+# Cosmos3-Nano LIBERO-10 action-policy SFT
 
-Reproduces the Cosmos3-Nano LIBERO-10 result (technical report Table 20, ~97.4%
-success at checkpoint 2000) as an action policy: vision + language in, action
-chunks out. Full SFT (no LoRA) on the public `nvidia/Cosmos3-Nano` base.
-
-Pieces:
+Full SFT (no LoRA) of the public `nvidia/Cosmos3-Nano` base into a LIBERO-10
+action policy: vision + language in, action chunks out.
 
 | Piece            | Path                                                                                            |
 | ---------------- | ----------------------------------------------------------------------------------------------- |
@@ -19,12 +16,10 @@ Pieces:
 
 ## 1. Data
 
-`LIBEROLeRobotDataset` reads a **local** LeRobot dir directly (parquet + video,
-like `DROIDLeRobotDataset`) — set `LIBERO_ROOT` to it. Use NVIDIA's **20 FPS**
-conversion [`nvidia/LIBERO_LeRobot_v3`](https://huggingface.co/datasets/nvidia/LIBERO_LeRobot_v3)
-(public, OpenMDW-1.1), which is what the bundled `quantile_rot` stats and the
-20 Hz eval cadence assume. It ships one subdirectory per suite, so pre-sync just
-`libero_10`:
+`LIBEROLeRobotDataset` reads a local LeRobot dir (`LIBERO_ROOT`). Use the 20 FPS
+[`nvidia/LIBERO_LeRobot_v3`](https://huggingface.co/datasets/nvidia/LIBERO_LeRobot_v3),
+which the bundled `quantile_rot` stats and the 20 Hz eval assume. Train on
+`libero_10` alone:
 
 ```bash
 hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \
@@ -32,175 +27,79 @@ hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \
 export LIBERO_ROOT=<nfs>/LIBERO_LeRobot_v3/libero_10
 ```
 
-**For the Table-20 number, use `libero_10` ALONE.** Training on the full suite
-mix dilutes libero_10 to ~1 pass in 2000 steps (~82%); libero_10 alone is ~2.7
-passes (~97%). For more suites, sync the other subdirs and add more
-`datasets=dict(...)` entries to the experiment's dataloader.
-
-It uses `frame_wise_relative` rot6d actions (10D = `pos(3) + rot6d(6) +
-gripper(1)`), `concat_view` (third-person + wrist, each resized to 256×256,
-concatenated horizontally → 256×512), normalized with `quantile_rot` against the
-bundled stats.
-
-**FPS-agnostic loader.** It windows by frame index and decodes video at each
-frame's real timestamp (no `delta_timestamps` grid), so any LeRobot LIBERO dataset
-loads regardless of its `fps` label, and `conditioning_fps` is read from the
-dataset's own `meta/info.json`. Prefer the 20 FPS `nvidia/LIBERO_LeRobot_v3` so
-`conditioning_fps=20` matches the stats and the eval (serve with `--fps 20`). The
-community `lerobot/libero_*` repos carry the *same frames* but label them 10 FPS;
-see [§5](#5-fps--stats).
-
-**Model-input resolution = 192×320.** The 256×512 concat is aspect-2.0, so with
-`resolution=None` the `ActionTransformPipeline` snaps it to the closest `"256"`
-tier canvas — 16:9 → **320×192 (w×h) = 192×320 (h×w)** — by aspect-preserving
-resize + bottom reflection pad. The training prompt therefore reads
-`"...is of 192x320 resolution."`. Keep this; the eval server reproduces the same
-snap (see §4).
-
-## 2. Train (1 node, 8 GPUs)
+Actions are `frame_wise_relative` rot6d (10D = pos 3 + rot6d 6 + gripper 1),
+`concat_view` (third-person + wrist, each 256×256 → 256×512), `quantile_rot`
+normalized. The pipeline snaps the 256×512 concat to a 192×320 model canvas; the
+eval server reproduces the same snap (§4).
+
+## 2. Train
 
 ```bash
-export LD_LIBRARY_PATH=''                      # NGC/PyTorch container: avoid torch._C import error
-export LIBERO_ROOT=/path/to/libero_10_lerobot  # libero_10 conversion ONLY
+export LD_LIBRARY_PATH=''                      # NGC container: avoid torch._C import error
+export LIBERO_ROOT=/path/to/libero_10_lerobot
 export BASE_CHECKPOINT_PATH=<Cosmos3-Nano DCP dir>
 export WAN_VAE_PATH=<Wan2.2_VAE.pth>
 export IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root
 
-bash examples/launch_sft_action_policy_libero.sh
+bash examples/launch_sft_action_policy_libero.sh   # HSDP 2x8; set NNODES/NODE_RANK/MASTER_ADDR per node
 ```
 
-Or drive `cosmos_framework.scripts.train` directly:
-
-```bash
-torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \
-  --sft-toml examples/toml/sft_config/action_policy_libero_repro.toml
-```
-
-Recipe knobs live in the registered `action_policy_libero_nano` experiment (full
-SFT of the generation + action heads at lr 5e-5 with a 5× LR multiplier on the
-action bridge, FusedAdam, selective activation checkpointing, `quantile_rot`
-actions, action heads init fresh from the base via `keys_to_skip_loading`). The
-TOML sets only run-level scalars: DP=8, `max_iter=10000`, `warm_up_steps=2000`,
-`grad_accum_iter=2`, `save_iter=1000`. Checkpoint 2000 is the reference. On
-lower-memory GPUs reduce the per-rank batch:
-`--opts dataloader_train.max_samples_per_batch=32`.
+Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars
+(lr 5e-5, warmup 500, cycle 16000, global batch 2048, `save_iter=500`). Sweep the
+saved checkpoints to pick the best iteration. On lower-memory GPUs reduce the
+per-rank batch: `--opts dataloader_train.max_samples_per_batch=32`.
 
 ## 3. Closed-loop eval
 
-Start the policy server on a **trained** checkpoint, then run the LIBERO
-simulator client against it. (The base `nvidia/Cosmos3-Nano` DCP has no action
-heads — use a checkpoint from §2.)
+Start the policy server on a **trained** checkpoint (the base DCP has no action
+heads), then run the LIBERO simulator client against it.
 
 ```bash
-# Server (training venv). Loads the DCP (single-rank no_dist), denormalizes with
-# quantile_rot + the bundled libero rot6d stats. The experiment supplies the VAE
-# path via the override (the server loads the experiment directly, no TOML).
 python -m cosmos_framework.scripts.action_policy_server_libero \
   --experiment action_policy_libero_nano \
   --experiment-overrides "model.config.tokenizer.vae_path=$WAN_VAE_PATH" \
-  --checkpoint-path <trained DCP dir, e.g. $OUTPUT_ROOT/.../checkpoints/iter_000002000> \
+  --checkpoint-path <trained DCP dir>/checkpoints/iter_000001500 \
   --action-normalization quantile_rot \
   --action-stats-path cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json \
   --raw-action-dim 10 --fps 20 --port 8000
 ```
 
-**Eval environment** (the LIBERO sim needs a *separate* venv — robosuite/mujoco
-versions conflict with the training env, and the NGC image needs graphics
-enabled). This combo is validated headless on an NVIDIA GPU:
+The LIBERO sim needs a separate venv (robosuite/mujoco pins conflict with the
+training env) and graphics enabled in the container:
 
 ```bash
-# 1. Enable the NVIDIA graphics libs in the container (mounts host libEGL_nvidia
-#    etc.); do NOT apt-install libnvidia-gl (it mismatches the mounted driver).
 export NVIDIA_DRIVER_CAPABILITIES=all
 apt-get install -y libegl1 libglvnd0 libgl1 libglib2.0-0 ffmpeg
-mkdir -p /usr/share/glvnd/egl_vendor.d   # ICD (usually already mounted)
+mkdir -p /usr/share/glvnd/egl_vendor.d
 echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \
   > /usr/share/glvnd/egl_vendor.d/10_nvidia.json
 
-# 2. Separate py3.10 venv with LIBERO-compatible sim pins + torch<2.6
-#    (torch>=2.6 defaults weights_only=True and breaks LIBERO init-state loads).
 uv venv --python 3.10 .libenv && VV=.libenv/bin/python
 git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git && \
   uv pip install -p $VV -e LIBERO -r LIBERO/requirements.txt
 uv pip install -p $VV "robosuite==1.4.1" "mujoco==2.3.7" "torch<2.6" loguru requests scipy pillow numpy
-
-# 3. LIBERO first-run config (avoids the interactive prompt) + robosuite macros
 mkdir -p ~/.libero && touch ~/.libero/config.yaml
-RS=$($VV -c "import robosuite,os;print(os.path.dirname(robosuite.__file__))")
-$VV "$RS/scripts/setup_macros.py"
+RS=$($VV -c "import robosuite,os;print(os.path.dirname(robosuite.__file__))"); $VV "$RS/scripts/setup_macros.py"
 $VV -c "from libero.libero import set_libero_default_path; set_libero_default_path()"
 
-# 4. Run the client (concat agentview+wrist matches the 256x512 training view).
 MUJOCO_GL=egl PYTHONPATH=$PWD:$PWD/LIBERO $VV \
   cosmos_framework/simulation/libero/closed_loop_eval.py \
   --server_url http://localhost:8000 \
-  --task_suite libero_10 --num_trials_per_task 10 --action_horizon 16 \
+  --task_suite libero_10 --num_trials_per_task 50 --num_envs 8 \
   --camera agentview,wrist --image_size 256 \
   --action_space frame_wise_relative --rotation_space 6d --action_dim 10 \
-  --save_gifs --gif_fps 20 --output_dir results/libero_closed_loop_10
+  --output_dir results/libero_closed_loop_10
 ```
 
-Validated end-to-end against a stub server (episode runs, `summary.json` + GIFs
-written, `rc=0`); a benign `EGLError` may print during context teardown on exit.
-
-## 4. Gotchas (from NVIDIA/cosmos-framework#50)
-
-These cost real accuracy if missed; the shipped eval client already handles the
-first two, but verify them against your checkpoint:
-
-- **Train ↔ serve parity (resolution + prompt).** Training snaps the 256×512
-  concat to a **192×320** model-input canvas (see §1) and the prompt suffix
-  encodes that resolution + clip duration (`append_resolution_info` /
-  `append_duration_fps_timestamps`). The server applies the *same* snap
-  (`get_vision_data_resolution` + `find_closest_target_size` + reflection pad),
-  so parity is automatic **as long as the client sends the same 2:1 concat
-  layout** — run `closed_loop_eval` with `--camera agentview,wrist --image_size
-  256` (agentview left, wrist right, matching training). A single-view client (or
-  an old server that skipped the snap) sends a different aspect → different
-  canvas → the reported 192×320-train vs 256×512-serve mismatch and ~62% (vs
-  ~97%). This is the first thing to check if numbers are low. Note the clip
-  *duration* string is computed slightly differently on each side (training's
-  rounds to `0.0s`); resolution is the dominant factor — verify both against a
-  `--dump_dir` server capture if accuracy is off.
-- **Gripper.** The model emits gripper in `[0, 1]`; the LIBERO env wants
-  `[-1, 1]` with negative = open. `closed_loop_eval._remap_gripper_to_neg1_pos1`
-  applies `1 - 2·g`. If the gripper never opens, the sign is inverted for your
-  data — flip it.
-- **Image orientation.** Sim frames are rotated 180° relative to training;
-  `closed_loop_eval` rotates them back (`img[::-1, ::-1]`).
-- **Normalization.** Always start the server with `--action-normalization
-  quantile_rot` and the bundled libero rot6d stats file, or actions come out at
-  the wrong scale.
-
-## 5. FPS & stats
-
-`LIBEROLeRobotDataset` follows `DROIDLeRobotDataset`: it reads the LeRobot parquet
-directly, windows by **frame index**, and decodes video at each frame's **real
-timestamp** — so it never builds LeRobot's `delta_timestamps` grid and works at
-any native FPS. (The earlier `delta_timestamps` port failed on the 10 FPS public
-dataset because a 1/20 s grid doesn't land on 10 FPS frames.)
-
-- **Use the 20 FPS `nvidia/LIBERO_LeRobot_v3`.** LIBERO demos are recorded at
-  robosuite's default 20 Hz `control_freq`. NVIDIA's conversion labels them 20 FPS
-  (correct); the community `lerobot/libero_*` repos contain the *same frames* (e.g.
-  libero_10 = 379 eps / 101,469 frames in both) but label them 10 FPS. Nothing was
-  subsampled — only the `fps` metadata differs.
-- **Why 20 FPS is the clean choice for THIS eval.** The closed-loop harness steps
-  the env at LIBERO's default 20 Hz and applies one predicted action per
-  `env.step` (no action-repeat, no `control_freq` override — see `_get_libero_env`
-  / `_run_episode`). So the policy's per-action cadence must be 20 Hz. Training on
-  the 20 FPS dataset makes `conditioning_fps=20` (read from `meta/info.json`),
-  matches the bundled `quantile_rot` stats, and lines up with the eval's 20 Hz —
-  serve with `--fps 20`, no harness change.
-- **The normalization gap was never the issue.** `normalize_action(quantile)` is an
-  *unclamped* affine map `2(a−q01)/(q99−q01)−1`; training and the server share the
-  same stats file, so any scale cancels (same reason DROID is fine at its own
-  15 FPS). The real consistency requirement is the **control rate**, which the
-  20 FPS dataset satisfies by construction.
-- **If you must use a differently-labelled dataset**, keep cadence consistent:
-  serve at the dataset's `fps`, and if its frames are genuinely sub-sampled (fewer
-  frames than the 20 Hz original), either run the eval env at a matching
-  `control_freq` or action-repeat. With `nvidia/LIBERO_LeRobot_v3` none of this is
-  needed.
-- `fps` only sets `conditioning_fps` + prompt duration; the loader always windows
-  by frame index and decodes at real timestamps.
+## 4. Eval parity
+
+The client/server already handle these; verify them if accuracy is low:
+
+- **Concat layout** — run with `--camera agentview,wrist --image_size 256` so the
+  256×512 concat matches training (the server snaps it to 192×320 identically).
+- **Gripper** — model emits `[0, 1]`; the env wants `[-1, 1]` (negative = open).
+  The client applies `1 − 2·g`; flip the sign if the gripper never opens.
+- **Image orientation** — sim frames are rotated 180° vs training; the client
+  rotates them back.
+- **Normalization** — start the server with `--action-normalization quantile_rot`
+  and the bundled rot6d stats, or actions come out at the wrong scale.
diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh
index 79aec05..4188d9d 100755
--- a/examples/launch_sft_action_policy_libero.sh
+++ b/examples/launch_sft_action_policy_libero.sh
@@ -3,13 +3,13 @@
 # SPDX-License-Identifier: OpenMDW-1.1
 
 # Structured-TOML launch for action_policy_libero_nano — Cosmos3-Nano LIBERO
-# action-policy SFT (8-GPU FSDP, full SFT, no LoRA). Reproduces the Table-20
-# LIBERO-10 result (~97.4% @ ckpt 2000). Drives cosmos_framework.scripts.train
+# action-policy SFT (HSDP, full SFT, no LoRA). Drives cosmos_framework.scripts.train
 # against examples/toml/sft_config/action_policy_libero_repro.toml.
 #
-# REPRODUCTION: point LIBERO_ROOT at the libero_10 suite ONLY. The full suite
-# mix dilutes libero_10 to ~1 pass in 2000 steps (~82%); libero_10 alone is ~2.7
-# passes (~97%). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. See docs/action_policy_libero_sft.md.
+# Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes
+# libero_10). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. The default recipe is
+# HSDP 2x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node.
+# See docs/action_policy_libero_sft.md.
 #
 # Required env vars:
 #   LIBERO_ROOT           local LIBERO-10 LeRobot dataset dir, e.g. <dir>/libero_10 (no default)
diff --git a/examples/toml/sft_config/action_policy_libero_repro.toml b/examples/toml/sft_config/action_policy_libero_repro.toml
index 7fd788d..d74237b 100644
--- a/examples/toml/sft_config/action_policy_libero_repro.toml
+++ b/examples/toml/sft_config/action_policy_libero_repro.toml
@@ -1,30 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-# ============================================================================
-# LIBERO action-policy SFT — run config for the `action_policy_libero_nano`
-# experiment (Cosmos3-Nano LIBERO-10). The recipe knobs (optimizer base, count-
-# based batch, action-head skip-on-load, dataset knobs) live in the registered
-# experiment; this file sets run-level scalars (lr/schedule, iters, ckpt cadence,
-# parallelism shape, wandb, VAE path).
-#
-# RECIPE (recommended): lr 5e-5, warmup 500, cycle 16000 (so LR is barely decayed
-# at iter 2000, ~4.5e-5), global batch 2048, save every 500 -> sweep 500..2000.
-# Best observed: ~95.2% @ iter_1500 (libero_10, 500-ep closed-loop eval), with
-# task-0 success stable across the sweep (no over-fit collapse). This gentle-LR
-# schedule is more robust than a higher lr (e.g. 1e-4), which peaks near iter_1000
-# then over-fits task 0 and regresses. See docs/action_policy_libero_sft.md.
-#
-# REPRODUCTION: train on libero_10 ALONE (point LIBERO_ROOT at the libero_10
-# LeRobot conversion only). The 4-suite mix dilutes libero_10 (~1/4 the exposure
-# per step) and converges more slowly.
-#
-# Env required:
-#   LIBERO_ROOT=/path/to/libero_10_lerobot
-#   BASE_CHECKPOINT_PATH=<Cosmos3-Nano DCP dir>
-#   WAN_VAE_PATH=<Wan2.2_VAE.pth>
-#   IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root   # persist checkpoints
-# ============================================================================
+# LIBERO-10 action-policy SFT run config for the `action_policy_libero_nano`
+# experiment. Train on libero_10 alone (HSDP 2x8, global batch 2048).
+# Env: LIBERO_ROOT, BASE_CHECKPOINT_PATH, WAN_VAE_PATH, IMAGINAIRE_OUTPUT_ROOT.
+# See docs/action_policy_libero_sft.md.
 
 [job]
 task         = "vfm"
@@ -36,44 +16,31 @@ wandb_mode   = "online"
 
 [model]
 precision = "bfloat16"
-# Cap the packed sequence (GA-validated). Uncapped (-1) packs one very long sequence
-# and OOMs even on H200.
 max_num_tokens_after_packing = 74000
 
 [model.parallelism]
-data_parallel_shard_degree     = 8    # 1-node 8-GPU shard; raise replicate for multi-node HSDP
-data_parallel_replicate_degree = 1
+data_parallel_shard_degree     = 8
+data_parallel_replicate_degree = 2    # HSDP 2x8 = 16 ranks (2 nodes)
 
 [model.activation_checkpointing]
-mode           = "selective"          # GA recipe (full is slower; selective fits 256x512)
+mode           = "selective"
 save_ops_regex = ["fmha"]
 
 [model.tokenizer]
 vae_path = "${oc.env:WAN_VAE_PATH}"
 
 [optimizer]
-lr = 5.0e-05              # recommended base lr
+lr = 5.0e-05
 
 [scheduler]
-cycle_lengths = [16000]   # LR trajectory: warmup 500 -> linear decay over 16k (barely decayed at 2k)
+cycle_lengths = [16000]
 warm_up_steps = [500]
 
 [trainer]
-max_iter        = 2000    # pause at 2k; sweep checkpoints 500/1000/1500/2000 for the peak
+max_iter        = 2000
 logging_iter    = 50
-grad_accum_iter = 2       # global batch = max_samples_per_batch 128 x DP 8 x grad_accum 2 = 2048
+grad_accum_iter = 1       # global batch = 128 x (8 x 2) x 1 = 2048
 
 [checkpoint]
 load_path = "${oc.env:BASE_CHECKPOINT_PATH}"
-save_iter = 500           # sweep cadence; peak is typically iter_1500
-
-# NOTE (train/serve parity — see GitHub issue NVIDIA/cosmos-framework#50): the
-# 256x512 concat_view is snapped to a 192x320 model canvas (resize+reflect-pad), and
-# the eval server reproduces the same snap. Run the client with the same 2:1 concat
-# (--camera agentview,wrist --image_size 256) so resolution + prompt suffix match, and
-# use --action-normalization quantile_rot + the bundled libero rot6d stats on the
-# server so denormalization matches training. See docs/action_policy_libero_sft.md.
-#
-# max_samples_per_batch is 128 in the experiment (256 OOMs: per-forward peak, not grad_accum).
-# On lower-memory GPUs reduce at launch:
-#   --opts dataloader_train.max_samples_per_batch=64
+save_iter = 500

From ffca1a1960f9d842742a0a4bb7bd975769badd60 Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 21:39:16 +0800
Subject: [PATCH 4/9] libero: fix clean-branch deps + drop
 droid/LoRA/reply-server mentions

- action_sft_dataset.py: rebuild from origin/main plus the LIBERO-only additions (drops
  the speedup-era ShardedDROIDLeRobotDataset import that broke config load on a clean main).
- remove dataset_reply_action_server.py (GT-replay debug tool, not part of the recipe).
- drop DROID/LoRA references from libero docstrings/comments/doc/launch.
---
 cosmos_framework/configs/base/config.py       |   1 -
 .../action_policy_libero_nano.py              |   9 +-
 .../vfm/action/datasets/action_sft_dataset.py |  49 +-
 .../data/vfm/action/libero_pose_utils.py      |   2 +-
 .../simulation/libero/closed_loop_eval.py     |   2 +-
 .../libero/dataset_reply_action_server.py     | 653 ------------------
 docs/action_policy_libero_sft.md              |   4 +-
 examples/launch_sft_action_policy_libero.sh   |   2 +-
 8 files changed, 19 insertions(+), 703 deletions(-)
 delete mode 100644 cosmos_framework/simulation/libero/dataset_reply_action_server.py

diff --git a/cosmos_framework/configs/base/config.py b/cosmos_framework/configs/base/config.py
index 1fb0514..5ac2b41 100644
--- a/cosmos_framework/configs/base/config.py
+++ b/cosmos_framework/configs/base/config.py
@@ -98,5 +98,4 @@ def make_config() -> Config:
     import cosmos_framework.configs.base.experiment.sft.vision_sft_super  # noqa: F401
     import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_droid_nano  # noqa: F401
     import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_libero_nano  # noqa: F401
-    import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_libero_nano_4suite  # noqa: F401
     return c
diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
index 38c03d4..e5b5696 100644
--- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
+++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
@@ -3,11 +3,10 @@
 
 """``action_policy_libero_nano`` — Cosmos3-Nano LIBERO-10 action-policy SFT recipe.
 
-Mirrors ``action_policy_droid_nano`` but feeds ``LIBEROLeRobotDataset``
-(frame-wise-relative rot6d, ``quantile_rot``, concat_view third-person + wrist)
-and trains the generation + action heads from the public ``nvidia/Cosmos3-Nano``
-base. Train on ``libero_10`` alone (``LIBERO_ROOT``).
-See docs/action_policy_libero_sft.md.
+Feeds ``LIBEROLeRobotDataset`` (frame-wise-relative rot6d, ``quantile_rot``,
+concat_view third-person + wrist) and trains the generation + action heads from
+the public ``nvidia/Cosmos3-Nano`` base. Train on ``libero_10`` alone
+(``LIBERO_ROOT``). See docs/action_policy_libero_sft.md.
 """
 
 import copy
diff --git a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
index afe76da..7776875 100644
--- a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
+++ b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
@@ -18,10 +18,7 @@
 
 from torch.utils.data import Dataset, IterableDataset, get_worker_info
 
-from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import (
-    DROIDLeRobotDataset,
-    ShardedDROIDLeRobotDataset,
-)
+from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import DROIDLeRobotDataset
 from cosmos_framework.data.vfm.action.datasets.libero_lerobot_dataset import LIBEROLeRobotDataset
 from cosmos_framework.data.vfm.action.transforms import ActionTransformPipeline
 
@@ -102,7 +99,6 @@ def get_action_droid_sft_dataset(
     action_normalization: str | None = None,
     viewpoint: str = "concat_view",
     use_image_augmentation: bool = False,
-    apply_color_jitter: bool = True,
     use_filter_dict: bool = False,
     filter_dict_path: str | None = None,
     resolution: str | int = "256",
@@ -115,24 +111,11 @@ def get_action_droid_sft_dataset(
     append_idle_frames: bool = False,
     iterable_shuffle: bool = False,
     episode_shuffle_seed: int = 42,
-    sharded: bool = False,
-    lerobot_roots: list[str] | None = None,
-    use_success_only: bool = True,
 ) -> Dataset:
     """Build the DROID action SFT dataset: ``action_space='joint_pos'`` (8D) +
-    ``use_state`` (raw/un-normalized), concat_view, chunk_length 32.
-
-    ``sharded=True`` consumes the per-lab sharded layout (``<root>/success/<lab>``)
-    via :class:`ShardedDROIDLeRobotDataset` — one ``DROIDLeRobotDataset`` per lab
-    concatenated into one flat index — reproducing the internal sharded run's
-    per-shard index construction. ``sharded=False`` (default) reads ``root`` as a
-    single flat LeRobot dataset (the prior behavior). ``lerobot_roots`` optionally
-    pins the shard sub-paths (relative to ``root``); otherwise they are
-    auto-discovered."""
-    # ``sharded`` may arrive as a string from env-var config resolution.
-    if isinstance(sharded, str):
-        sharded = sharded.strip().lower() in ("1", "true", "yes", "on")
-    shard_kwargs = dict(
+    ``use_state`` (raw/un-normalized), concat_view, chunk_length 32."""
+    dataset = DROIDLeRobotDataset(
+        root=root,
         fps=fps,
         chunk_length=chunk_length,
         viewpoint=viewpoint,
@@ -141,19 +124,9 @@ def get_action_droid_sft_dataset(
         use_state=use_state,
         action_normalization=action_normalization,
         use_image_augmentation=use_image_augmentation,
-        apply_color_jitter=apply_color_jitter,
         use_filter_dict=use_filter_dict,
         filter_dict_path=filter_dict_path,
     )
-    if sharded:
-        dataset: Dataset = ShardedDROIDLeRobotDataset(
-            root=root,
-            lerobot_roots=lerobot_roots,
-            use_success_only=use_success_only,
-            **shard_kwargs,
-        )
-    else:
-        dataset = DROIDLeRobotDataset(root=root, **shard_kwargs)
     transform = ActionTransformPipeline(
         tokenizer_config=tokenizer_config,
         cfg_dropout_rate=cfg_dropout_rate,
@@ -198,14 +171,12 @@ def get_action_libero_sft_dataset(
 ) -> Dataset:
     """Build the LIBERO action-policy SFT dataset (GA reproduction defaults).
 
-    Mirrors :func:`get_action_droid_sft_dataset` but feeds ``LIBEROLeRobotDataset``
-    (frame-wise-relative rot6d actions, ``quantile_rot``-normalized, concat_view
-    third-person + wrist at 256x256 each → 256x512) through
-    ``ActionTransformPipeline``. ``root`` is a LOCAL LeRobot dir (read parquet +
-    video directly, like DROID); pre-sync the HF dataset once, e.g.
-    ``hf download lerobot/libero_10 --repo-type dataset --local-dir <root>``. For
-    the Table-20 LIBERO-10 reproduction point ``root`` at libero_10 alone (the
-    4-suite mix dilutes libero_10 to ~1 pass in 2000 steps → ~82% vs ~97%). The
+    Feeds ``LIBEROLeRobotDataset`` (frame-wise-relative rot6d actions,
+    ``quantile_rot``-normalized, concat_view third-person + wrist at 256x256 each
+    → 256x512) through ``ActionTransformPipeline``. ``root`` is a LOCAL LeRobot dir
+    (read parquet + video directly); pre-sync the HF dataset once, e.g.
+    ``hf download lerobot/libero_10 --repo-type dataset --local-dir <root>``. Point
+    ``root`` at libero_10 alone (the all-suites mix dilutes libero_10 per step). The
     dataset is FPS-agnostic (decodes at real frame timestamps); ``fps`` is metadata
     for ``conditioning_fps`` / prompt duration.
     """
diff --git a/cosmos_framework/data/vfm/action/libero_pose_utils.py b/cosmos_framework/data/vfm/action/libero_pose_utils.py
index 5cc9fff..3a4fd8e 100644
--- a/cosmos_framework/data/vfm/action/libero_pose_utils.py
+++ b/cosmos_framework/data/vfm/action/libero_pose_utils.py
@@ -13,7 +13,7 @@
     build_abs_pose_from_components,
 )
 
-# Same local-frame post-rotation pattern used by DROID/Bridge/Fractal:
+# Local-frame post-rotation pattern:
 # R_opencv = R_native @ *_TO_OPENCV.
 LIBERO_TO_OPENCV: np.ndarray = np.array(
     [[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]],
diff --git a/cosmos_framework/simulation/libero/closed_loop_eval.py b/cosmos_framework/simulation/libero/closed_loop_eval.py
index 0205f9f..660be36 100644
--- a/cosmos_framework/simulation/libero/closed_loop_eval.py
+++ b/cosmos_framework/simulation/libero/closed_loop_eval.py
@@ -99,7 +99,7 @@ def _concat_view_layout_description(cameras: list[str]) -> str:
 
 
 def _augment_task_prompt_with_viewpoint(task_description: str, cameras: list[str]) -> str:
-    """Mirror DROID-style concat-view caption augmentation for closed-loop LIBERO eval."""
+    """Concat-view caption augmentation for closed-loop LIBERO eval."""
     if len(cameras) <= 1:
         return task_description
     prompt = _append_prompt_sentence(task_description, DEFAULT_VIEWPOINT_TEMPLATES["concat_view"])
diff --git a/cosmos_framework/simulation/libero/dataset_reply_action_server.py b/cosmos_framework/simulation/libero/dataset_reply_action_server.py
deleted file mode 100644
index bb5d9a4..0000000
--- a/cosmos_framework/simulation/libero/dataset_reply_action_server.py
+++ /dev/null
@@ -1,653 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-"""
-HTTP server that serves ground-truth actions from LIBERO LeRobot datasets.
-
-Same HTTP interface as `cosmos3.scripts.action_policy_server` (the model-backed
-server), enabling drop-in replacement for closed-loop evaluation to verify the
-action pipeline with known-good GT actions.
-
-Endpoints:
-- POST /predict: Return next chunk of GT actions for the given task (matched by prompt)
-- GET  /info:    Return dataset info (tasks, episode counts)
-- POST /next_episode: Advance to next episode for the task specified in request body
-- POST /reset:   Reset all per-task episode/step tracking
-
-Episode advancement:
-  The server auto-advances to the next episode when the current episode's actions
-  are exhausted.  For early-termination cases (e.g. success before all actions are
-  consumed), call POST /next_episode with {"prompt": "<task>"} between episodes.
-
-Example usage:
-
-
-PYTHONPATH=. python cosmos_framework/simulation/libero/dataset_reply_action_server.py \
-  --repo_id libero_10 \
-  --root /path/to/libero_10_no_noops_1.0.0_lerobot_aligned \
-  --action_space frame_wise_relative \
-  --rotation_space 6d \
-  --pose_coordinate_frame opencv \
-  --action_chunk_size 16 \
-  --send_video \
-  --camera_mode agentview \
-  --port 8000
-
-# Multiple datasets:
-PYTHONPATH=. python cosmos_framework/simulation/libero/dataset_reply_action_server.py \
-  --repo_id libero_10,libero_goal \
-  --root /path/to/libero_10,/path/to/libero_goal \
-  --action_space relative \
-  --rotation_space 6d \
-  --pose_coordinate_frame opencv \
-  --action_chunk_size 16 \
-  --port 8000
-"""
-
-from __future__ import annotations
-
-import argparse
-import base64
-import datetime
-import io
-import json
-import socket
-import threading
-import time
-from dataclasses import dataclass
-from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
-from typing import Any
-
-import numpy as np
-import torch
-from PIL import Image
-
-from cosmos_framework.data.vfm.action.libero_pose_utils import (
-    libero_rotation_format,
-)
-from cosmos_framework.data.vfm.action.pose_utils import convert_rotation
-
-
-def _ts() -> str:
-    return datetime.datetime.now(tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
-
-
-def _get_local_ip() -> str:
-    try:
-        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
-            s.connect(("8.8.8.8", 80))
-            return str(s.getsockname()[0])
-    except Exception:
-        return socket.gethostbyname(socket.gethostname())
-
-
-# ---------------------------------------------------------------------------
-# Action processing (mirrors LIBEROLeRobotDataset.__getitem__ logic)
-# ---------------------------------------------------------------------------
-
-
-def _compute_anchored_actions(
-    state_raw: torch.Tensor,
-    action_raw: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Compute anchored relative actions, same as LIBEROLeRobotDataset._compute_anchored_actions.
-
-    Actions are expressed in state_raw[0]'s local coordinate frame.
-
-    Args:
-        state_raw: (T+1, 8) states [x, y, z, ax, ay, az, grip1, grip2].
-        action_raw: (T+1, 7) actions [dx, dy, dz, dax, day, daz, grip].
-
-    Returns:
-        anchored_translation (T, 3), anchored_rotation (T, 3, 3), gripper (T, 1).
-    """
-    p_states = state_raw[:, :3]
-    rotvec_states = state_raw[:, 3:6]
-    delta_p = action_raw[:-1, :3]
-    delta_rotvec = action_raw[:-1, 3:6]
-    gripper = action_raw[:-1, 6:7]
-
-    R_states = convert_rotation(rotvec_states, "axisangle", "matrix")
-    R_deltas = convert_rotation(delta_rotvec, "axisangle", "matrix")
-
-    p_0 = p_states[0]
-    R_0_T = R_states[0].T
-
-    p_t = p_states[:-1]
-    R_t = R_states[:-1]
-
-    p_target = p_t + delta_p
-    R_target = torch.bmm(R_deltas, R_t)
-
-    anchored_p = (R_0_T @ (p_target - p_0).T).T
-    R_0_T_expanded = R_0_T.unsqueeze(0).expand(R_target.shape[0], -1, -1)
-    anchored_R = torch.bmm(R_0_T_expanded, R_target)
-
-    return anchored_p, anchored_R, gripper
-
-
-def _convert_rotation_to_repr(rotation_matrix: torch.Tensor, rotation_space: str) -> torch.Tensor:
-    return convert_rotation(rotation_matrix, "matrix", libero_rotation_format(rotation_space))
-
-
-def _process_action_chunk(
-    action_raw: torch.Tensor,
-    state_raw: torch.Tensor,
-    action_space: str,
-    rotation_space: str,
-) -> torch.Tensor:
-    """Process a chunk of raw actions with the same logic as LIBEROLeRobotDataset.__getitem__.
-
-    Args:
-        action_raw: (chunk+1, 7) raw actions covering chunk+1 consecutive frames.
-        state_raw:  (chunk+1, 8) raw states  covering chunk+1 consecutive frames.
-        action_space: "relative" or "frame_wise_relative".
-        rotation_space: "3d", "6d", or "9d".
-
-    Returns:
-        Processed actions (chunk, D) where D depends on rotation_space.
-    """
-    if action_space == "relative":
-        translation, rotation_matrix, gripper = _compute_anchored_actions(state_raw, action_raw)
-    elif action_space == "frame_wise_relative":
-        action = action_raw[:-1].clone()
-        translation = action[:, :3]
-        rotation_matrix = convert_rotation(action[:, 3:6], "axisangle", "matrix")
-        gripper = action[:, 6:]
-    else:
-        raise ValueError(f"Unsupported action_space: {action_space}")
-
-    rotation = _convert_rotation_to_repr(rotation_matrix, rotation_space)
-    return torch.cat([translation, rotation, gripper], dim=-1)
-
-
-# ---------------------------------------------------------------------------
-# Data structures
-# ---------------------------------------------------------------------------
-
-
-@dataclass(frozen=True)
-class EpisodeData:
-    action_raw: torch.Tensor  # (N, 7) per-frame raw actions for the full episode
-    state_raw: torch.Tensor  # (N, 8) per-frame raw states for the full episode
-    task_description: str
-    dataset_ref_idx: int  # index into DatasetActionService._hf_datasets
-    frame_start: int  # first global frame index in the HF dataset
-    frame_end: int  # one-past-last global frame index
-
-
-@dataclass(frozen=True)
-class DatasetServerConfig:
-    repo_id: list[str]
-    root: list[str | None]
-    action_space: str
-    rotation_space: str
-    pose_coordinate_frame: str
-    action_chunk_size: int
-    max_action_dim: int
-    split: str
-    send_video: bool
-    camera_mode: str
-    image_size: int
-
-
-# ---------------------------------------------------------------------------
-# Service
-# ---------------------------------------------------------------------------
-
-
-class DatasetActionService:
-    """Serves GT actions (and optionally GT video) from pre-loaded LIBERO LeRobot episodes."""
-
-    def __init__(self, cfg: DatasetServerConfig) -> None:
-        self.cfg = cfg
-        self.episodes_by_task: dict[str, list[EpisodeData]] = {}
-        self._hf_datasets: list[Any] = []
-        self._lerobot_datasets: list[Any] = []
-        self._task_state: dict[str, dict[str, int]] = {}
-        self._lock = threading.Lock()
-
-        if cfg.camera_mode in ("concat_view", "both"):
-            self._image_keys = ["observation.images.image", "observation.images.wrist_image"]
-        elif cfg.camera_mode == "wrist_image":
-            self._image_keys = ["observation.images.wrist_image"]
-        else:
-            self._image_keys = ["observation.images.image"]
-
-        self._load_datasets()
-
-    def _load_datasets(self) -> None:
-        from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-        for repo_id, root in zip(self.cfg.repo_id, self.cfg.root):
-            print(f"[{_ts()}] [dataset-server] loading repo_id={repo_id} root={root} ...", flush=True)
-            t0 = time.monotonic()
-
-            dataset = LeRobotDataset(repo_id=repo_id, root=root)
-            tasks_df = dataset.meta.tasks
-            hf = dataset.hf_dataset
-            ds_ref_idx = len(self._hf_datasets)
-            self._hf_datasets.append(hf)
-
-            if self.cfg.send_video:
-                delta_ts: dict[str, list[float]] = {k: [0.0] for k in self._image_keys}
-                video_dataset = LeRobotDataset(repo_id=repo_id, root=root, delta_timestamps=delta_ts)
-                self._lerobot_datasets.append(video_dataset)
-            else:
-                self._lerobot_datasets.append(None)
-
-            for ep_meta in dataset.meta.episodes:
-                ep_idx = int(ep_meta["episode_index"])  # type: ignore[index]
-                start = int(ep_meta["dataset_from_index"])  # type: ignore[index]
-                end = int(ep_meta["dataset_to_index"])  # type: ignore[index]
-
-                ep_slice = hf.select(range(start, end))
-                actions = torch.tensor(np.array(ep_slice["action"], dtype=np.float32))
-                states = torch.tensor(np.array(ep_slice["observation.state"], dtype=np.float32))
-
-                task_idx = int(ep_slice[0]["task_index"])
-                matching = tasks_df[tasks_df["task_index"] == task_idx]
-                task_desc = str(matching.iloc[0].name) if not matching.empty else f"task_{task_idx}"
-
-                self.episodes_by_task.setdefault(task_desc, []).append(
-                    EpisodeData(
-                        action_raw=actions,
-                        state_raw=states,
-                        task_description=task_desc,
-                        dataset_ref_idx=ds_ref_idx,
-                        frame_start=start,
-                        frame_end=end,
-                    )
-                )
-
-            dt = time.monotonic() - t0
-            print(
-                f"[{_ts()}] [dataset-server] loaded {repo_id}: {dataset.meta.total_episodes} episodes in {dt:.1f}s",
-                flush=True,
-            )
-
-        total_tasks = len(self.episodes_by_task)
-        total_eps = sum(len(eps) for eps in self.episodes_by_task.values())
-        print(
-            f"[{_ts()}] [dataset-server] ready: {total_tasks} tasks, {total_eps} episodes "
-            f"send_video={self.cfg.send_video} camera_mode={self.cfg.camera_mode}",
-            flush=True,
-        )
-
-    def _load_video_frames(self, episode: EpisodeData, step: int, num_frames: int) -> list[str]:
-        """Load GT video frames from the dataset and encode as base64 PNGs.
-
-        Uses the LeRobotDataset wrapper (not the raw HF dataset) so that video-backed
-        datasets are decoded correctly via the configured video backend.
-
-        Args:
-            episode: Episode data with dataset reference.
-            step: Step offset within the episode (0-based).
-            num_frames: Number of frames to load (typically action_chunk_size + 1).
-
-        Returns:
-            List of base64-encoded PNG strings.
-        """
-        lr_dataset = self._lerobot_datasets[episode.dataset_ref_idx]
-        if lr_dataset is None:
-            return []
-        image_size = self.cfg.image_size
-        b64_frames: list[str] = []
-
-        for i in range(num_frames):
-            global_idx = episode.frame_start + step + i
-            if global_idx >= episode.frame_end:
-                break
-
-            item = lr_dataset[global_idx]
-
-            pil_images: list[Image.Image] = []
-            for key in self._image_keys:
-                img_tensor = item[key]
-                if isinstance(img_tensor, torch.Tensor):
-                    # LeRobot returns (T, C, H, W) with delta_timestamps=[0.0] -> (1, C, H, W)
-                    if img_tensor.dim() == 4:
-                        img_tensor = img_tensor[0]
-                    # (C, H, W) float [0, 1] -> PIL
-                    arr = (img_tensor.permute(1, 2, 0).clamp(0, 1) * 255).to(torch.uint8).numpy()
-                    img = Image.fromarray(arr)
-                elif isinstance(img_tensor, Image.Image):
-                    img = img_tensor
-                else:
-                    img = Image.fromarray(np.asarray(img_tensor, dtype=np.uint8))
-                img = img.convert("RGB").resize((image_size, image_size), Image.Resampling.BILINEAR)
-                pil_images.append(img)
-
-            if len(pil_images) > 1:
-                total_w = sum(im.width for im in pil_images)
-                combined = Image.new("RGB", (total_w, image_size))
-                x = 0
-                for im in pil_images:
-                    combined.paste(im, (x, 0))
-                    x += im.width
-                frame = combined
-            else:
-                frame = pil_images[0]
-
-            buf = io.BytesIO()
-            frame.save(buf, format="PNG")
-            b64_frames.append(base64.b64encode(buf.getvalue()).decode("ascii"))
-
-        return b64_frames
-
-    # -- state management --
-
-    def _get_task_state(self, prompt: str) -> dict[str, int]:
-        if prompt not in self._task_state:
-            self._task_state[prompt] = {"episode_idx": 0, "step": 0}
-        return self._task_state[prompt]
-
-    def _resolve_prompt(self, prompt: str) -> str:
-        """Resolve prompt to a known task description (exact or substring match)."""
-        if prompt in self.episodes_by_task:
-            return prompt
-        prompt_lower = prompt.lower().strip()
-        for task_desc in self.episodes_by_task:
-            if task_desc.lower().strip() == prompt_lower:
-                return task_desc
-        for task_desc in self.episodes_by_task:
-            td_lower = task_desc.lower().strip()
-            if prompt_lower in td_lower or td_lower in prompt_lower:
-                return task_desc
-        raise ValueError(
-            f"Task not found for prompt: {prompt!r}. Available tasks: {sorted(self.episodes_by_task.keys())}"
-        )
-
-    # -- endpoints --
-
-    def get_info(self) -> dict[str, Any]:
-        return {
-            "type": "dataset_action_server",
-            "action_space": self.cfg.action_space,
-            "rotation_space": self.cfg.rotation_space,
-            "action_chunk_size": self.cfg.action_chunk_size,
-            "tasks": {k: len(v) for k, v in sorted(self.episodes_by_task.items())},
-        }
-
-    def predict(self, req: dict[str, Any]) -> dict[str, Any]:
-        prompt = req.get("prompt")
-        if not isinstance(prompt, str):
-            raise ValueError("'prompt' must be a string")
-
-        resolved_prompt = self._resolve_prompt(prompt)
-
-        with self._lock:
-            state = self._get_task_state(resolved_prompt)
-            episodes = self.episodes_by_task[resolved_prompt]
-
-            ep_idx = state["episode_idx"] % len(episodes)
-            episode = episodes[ep_idx]
-            step = state["step"]
-
-            # Number of valid actions = num_frames - 1 (need pairs of consecutive frames)
-            max_actions = len(episode.action_raw) - 1
-
-            if step >= max_actions:
-                state["episode_idx"] = (ep_idx + 1) % len(episodes)
-                state["step"] = 0
-                ep_idx = state["episode_idx"]
-                episode = episodes[ep_idx]
-                step = 0
-                max_actions = len(episode.action_raw) - 1
-
-            chunk_size = min(self.cfg.action_chunk_size, max_actions - step)
-            # Slice chunk+1 frames for action computation (needs next-frame state)
-            raw_slice_end = step + chunk_size + 1
-            action_chunk_raw = episode.action_raw[step:raw_slice_end]
-            state_chunk_raw = episode.state_raw[step:raw_slice_end]
-
-            processed = _process_action_chunk(
-                action_chunk_raw,
-                state_chunk_raw,
-                self.cfg.action_space,
-                self.cfg.rotation_space,
-            )
-
-            # Pad to max_action_dim (same as the Action transform pipeline)
-            t, d = processed.shape
-            if d < self.cfg.max_action_dim:
-                processed = torch.cat(
-                    [processed, torch.zeros(t, self.cfg.max_action_dim - d)],
-                    dim=-1,
-                )
-
-            state["step"] += chunk_size
-
-            action_list = processed.float().numpy().tolist()
-
-            video_b64: list[str] = []
-            if self.cfg.send_video:
-                video_b64 = self._load_video_frames(episode, step, num_frames=chunk_size + 1)
-
-        print(
-            f"[{_ts()}] [dataset-server] predict prompt={resolved_prompt!r} "
-            f"ep={ep_idx} step={step}..{state['step']} actions={len(action_list)} "
-            f"video_frames={len(video_b64)}",
-            flush=True,
-        )
-        return {"action": action_list, "video": video_b64}
-
-    def next_episode(self, prompt: str | None = None) -> dict[str, Any]:
-        with self._lock:
-            if prompt is not None:
-                resolved = self._resolve_prompt(prompt)
-                state = self._get_task_state(resolved)
-                episodes = self.episodes_by_task[resolved]
-                state["episode_idx"] = (state["episode_idx"] + 1) % len(episodes)
-                state["step"] = 0
-                print(
-                    f"[{_ts()}] [dataset-server] next_episode task={resolved!r} -> ep={state['episode_idx']}",
-                    flush=True,
-                )
-                return {"task": resolved, "episode_idx": state["episode_idx"]}
-
-            for task in self._task_state:
-                episodes = self.episodes_by_task.get(task, [])
-                self._task_state[task]["episode_idx"] = (self._task_state[task]["episode_idx"] + 1) % max(
-                    len(episodes), 1
-                )
-                self._task_state[task]["step"] = 0
-            print(f"[{_ts()}] [dataset-server] next_episode (all tasks)", flush=True)
-            return {"advanced_all": True}
-
-    def reset(self) -> dict[str, str]:
-        with self._lock:
-            self._task_state.clear()
-        print(f"[{_ts()}] [dataset-server] reset", flush=True)
-        return {"status": "reset"}
-
-
-# ---------------------------------------------------------------------------
-# HTTP handler
-# ---------------------------------------------------------------------------
-
-
-class _DatasetHandler(BaseHTTPRequestHandler):
-    server: ThreadingHTTPServer  # type: ignore[assignment]
-
-    def _send_json(self, status_code: int, payload: dict[str, Any]) -> None:
-        body = json.dumps(payload).encode("utf-8")
-        self.send_response(status_code)
-        self.send_header("Content-Type", "application/json")
-        self.send_header("Cache-Control", "no-store")
-        self.send_header("Content-Length", str(len(body)))
-        self.end_headers()
-        try:
-            self.wfile.write(body)
-        except (BrokenPipeError, ConnectionResetError):
-            return
-
-    def _read_json_body(self) -> dict[str, Any] | None:
-        try:
-            length = int(self.headers.get("Content-Length") or "0")
-        except ValueError:
-            self._send_json(400, {"error": "Invalid Content-Length"})
-            return None
-        body = self.rfile.read(max(0, length))
-        if not body:
-            return {}
-        try:
-            req = json.loads(body.decode("utf-8"))
-        except Exception as e:
-            self._send_json(400, {"error": f"Invalid JSON: {e}"})
-            return None
-        if not isinstance(req, dict):
-            self._send_json(400, {"error": "JSON body must be an object"})
-            return None
-        return req
-
-    def do_GET(self) -> None:  # noqa: N802
-        service: DatasetActionService = getattr(self.server, "service")
-        if self.path == "/info":
-            self._send_json(200, service.get_info())
-        elif self.path == "/":
-            self._send_json(200, {"status": "ok"})
-        else:
-            self._send_json(404, {"error": "Not found"})
-
-    def do_POST(self) -> None:  # noqa: N802
-        service: DatasetActionService = getattr(self.server, "service")
-
-        if self.path in ("/", "/predict"):
-            req = self._read_json_body()
-            if req is None:
-                return
-            try:
-                out = service.predict(req)
-            except Exception as e:
-                print(f"[{_ts()}] [dataset-server] predict ERROR: {e}", flush=True)
-                self._send_json(400, {"action": [], "error": str(e)})
-                return
-            self._send_json(200, out)
-
-        elif self.path == "/next_episode":
-            req = self._read_json_body()
-            prompt = req.get("prompt") if req else None
-            try:
-                out = service.next_episode(prompt)
-            except Exception as e:
-                self._send_json(400, {"error": str(e)})
-                return
-            self._send_json(200, out)
-
-        elif self.path == "/reset":
-            out = service.reset()
-            self._send_json(200, out)
-
-        else:
-            self._send_json(404, {"error": "Not found"})
-
-    def log_message(self, format: str, *args: Any) -> None:  # noqa: A002
-        return
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="HTTP server serving ground-truth actions from LIBERO LeRobot datasets."
-    )
-    parser.add_argument(
-        "--repo_id",
-        type=str,
-        required=True,
-        help="Comma-separated LeRobot repo IDs (e.g. libero_10,libero_goal)",
-    )
-    parser.add_argument(
-        "--root",
-        type=str,
-        required=True,
-        help="Comma-separated local paths to dataset roots (one per repo_id)",
-    )
-    parser.add_argument(
-        "--action_space",
-        type=str,
-        default="frame_wise_relative",
-        choices=["relative", "frame_wise_relative"],
-        help="Action space (must match closed-loop eval's --action_space).",
-    )
-    parser.add_argument(
-        "--rotation_space",
-        type=str,
-        default="6d",
-        choices=["3d", "6d", "9d"],
-        help="Rotation representation (must match closed-loop eval's action_dim).",
-    )
-    parser.add_argument(
-        "--pose_coordinate_frame",
-        type=str,
-        default="native",
-        choices=["native", "opencv"],
-        help="Pose/action coordinate frame. Accepted for compatibility with LIBERO eval launchers.",
-    )
-    parser.add_argument("--action_chunk_size", type=int, default=16, help="Number of actions per predict call")
-    parser.add_argument("--max_action_dim", type=int, default=32, help="Pad actions to this dimension")
-    parser.add_argument("--split", type=str, default="full", help="Dataset split (train/val/full)")
-    parser.add_argument(
-        "--send_video",
-        action="store_true",
-        help="Include GT video frames (base64 PNGs) in /predict responses, same format as the Action server.",
-    )
-    parser.add_argument(
-        "--camera_mode",
-        type=str,
-        default="image",
-        choices=["agentview", "wrist_image", "concat_view", "both"],
-        help="Camera view(s) to include in video frames.",
-    )
-    parser.add_argument("--image_size", type=int, default=256, help="Resize video frames to this height/width")
-    parser.add_argument("--host", type=str, default="0.0.0.0")
-    parser.add_argument("--port", type=int, default=8000)
-    args = parser.parse_args()
-
-    repo_ids = [r.strip() for r in args.repo_id.split(",") if r.strip()]
-    roots = [r.strip() for r in args.root.split(",") if r.strip()]
-    if len(repo_ids) != len(roots):
-        raise ValueError(f"Number of repo_ids ({len(repo_ids)}) must match number of roots ({len(roots)})")
-
-    cfg = DatasetServerConfig(
-        repo_id=repo_ids,
-        root=roots,
-        action_space=args.action_space,
-        rotation_space=args.rotation_space,
-        pose_coordinate_frame=args.pose_coordinate_frame,
-        action_chunk_size=int(args.action_chunk_size),
-        max_action_dim=int(args.max_action_dim),
-        split=args.split,
-        send_video=bool(args.send_video),
-        camera_mode=args.camera_mode,
-        image_size=int(args.image_size),
-    )
-
-    service = DatasetActionService(cfg)
-    local_ip = _get_local_ip()
-
-    print(
-        f"[{_ts()}] [dataset-server] starting host={args.host} port={args.port} "
-        f"action_space={cfg.action_space} rotation_space={cfg.rotation_space} "
-        f"action_chunk_size={cfg.action_chunk_size}",
-        flush=True,
-    )
-    print(f"[{_ts()}] [dataset-server] Server accessible at: http://{local_ip}:{args.port}/", flush=True)
-    print(f"[{_ts()}] [dataset-server] Endpoints:", flush=True)
-    print(f"  - GET  /             : Health check", flush=True)
-    print(f"  - GET  /info         : Dataset info (tasks, episode counts)", flush=True)
-    print(f"  - POST /predict      : Get next GT action chunk (same interface as Action server)", flush=True)
-    print(f"  - POST /next_episode : Advance to next episode for a task", flush=True)
-    print(f"  - POST /reset        : Reset all per-task state", flush=True)
-
-    httpd = ThreadingHTTPServer((args.host, int(args.port)), _DatasetHandler)
-    setattr(httpd, "service", service)
-    httpd.serve_forever()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
index 735c630..386f7c6 100644
--- a/docs/action_policy_libero_sft.md
+++ b/docs/action_policy_libero_sft.md
@@ -1,7 +1,7 @@
 # Cosmos3-Nano LIBERO-10 action-policy SFT
 
-Full SFT (no LoRA) of the public `nvidia/Cosmos3-Nano` base into a LIBERO-10
-action policy: vision + language in, action chunks out.
+Full SFT of the public `nvidia/Cosmos3-Nano` base into a LIBERO-10 action
+policy: vision + language in, action chunks out.
 
 | Piece            | Path                                                                                            |
 | ---------------- | ----------------------------------------------------------------------------------------------- |
diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh
index 4188d9d..7ec4ccc 100755
--- a/examples/launch_sft_action_policy_libero.sh
+++ b/examples/launch_sft_action_policy_libero.sh
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: OpenMDW-1.1
 
 # Structured-TOML launch for action_policy_libero_nano — Cosmos3-Nano LIBERO
-# action-policy SFT (HSDP, full SFT, no LoRA). Drives cosmos_framework.scripts.train
+# action-policy SFT (HSDP, full SFT). Drives cosmos_framework.scripts.train
 # against examples/toml/sft_config/action_policy_libero_repro.toml.
 #
 # Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes

From dd78c68a1c8a86ac412edd87f87e3e63c5928c3d Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 21:43:14 +0800
Subject: [PATCH 5/9] libero: model_loader = origin/main + no_dist only (drop
 unrelated deletions); EGL setup optional in doc

---
 cosmos_framework/utils/vfm/model_loader.py | 68 +++++++++++++++++++---
 docs/action_policy_libero_sft.md           | 13 +++--
 2 files changed, 68 insertions(+), 13 deletions(-)

diff --git a/cosmos_framework/utils/vfm/model_loader.py b/cosmos_framework/utils/vfm/model_loader.py
index b94817a..51140b3 100644
--- a/cosmos_framework/utils/vfm/model_loader.py
+++ b/cosmos_framework/utils/vfm/model_loader.py
@@ -18,7 +18,21 @@
 try:
     from filelock import SoftReadWriteLock
 except ImportError:  # Older filelock versions in some inference containers.
-    from filelock import ReadWriteLock as SoftReadWriteLock
+    try:
+        from filelock import ReadWriteLock as SoftReadWriteLock
+    except ImportError:
+        from filelock import FileLock
+
+        class SoftReadWriteLock:
+            """Compatibility adapter for filelock versions without read/write locks."""
+
+            def __init__(self, *args: Any, **kwargs: Any) -> None:
+                self._lock = FileLock(*args, **kwargs)
+
+            def write_lock(self) -> FileLock:
+                return self._lock
+
+
 from torch.distributed.checkpoint.filesystem import FileSystemReader, FileSystemWriter
 
 from cosmos_framework.checkpoint.s3_filesystem import S3StorageReader
@@ -171,6 +185,32 @@ def _checkpoint_cache_group_lock(
         yield action
 
 
+def _reload_pretrained_reasoner_after_checkpoint_load(model: torch.nn.Module) -> None:
+    """Re-seed the reasoner pathway after a DCP load, mirroring the LoadPretrained
+    callback that runs during training (inference does not run training callbacks).
+
+    The decision is delegated entirely to the model's own gate in
+    ``load_pretrained_model_if_needed``: this is a no-op unless the model was built
+    with ``exclude_reasoner_weights_from_checkpoint=True`` (and pretrained weights
+    enabled), i.e. the case where the DCP checkpoint deliberately omits the reasoner
+    tower so it must be re-seeded from the pretrained source. For a normal checkpoint
+    that already contains the reasoner, the model's gate evaluates to False and
+    nothing is reloaded.
+
+    ``has_resumable_checkpoint=True`` / ``has_load_path=False`` is load-bearing: it
+    re-seeds the reasoner from the pretrained source while skipping the
+    understanding->generation copy (the generation pathway was already populated by
+    the DCP load). Passing ``has_load_path=True`` would instead force a reasoner
+    reload even for non-excluded checkpoints, clobbering any fine-tuned reasoner
+    weights restored from the DCP.
+    """
+    load_pretrained_model_if_needed = getattr(model, "load_pretrained_model_if_needed")
+    load_pretrained_model_if_needed(
+        has_resumable_checkpoint=True,
+        has_load_path=False,
+    )
+
+
 def _load_model(
     model: torch.nn.Module,
     checkpoint_path: str,
@@ -194,6 +234,9 @@ def _load_model(
     start_time = time.time()
 
     state_dict = ModelWrapper(model).state_dict()
+    if any(key.startswith("net_teacher.") for key in state_dict):
+        log.info("Dropping net_teacher.* keys from inference load target; distillation checkpoints do not save them.")
+        state_dict = {key: value for key, value in state_dict.items() if not key.startswith("net_teacher.")}
 
     if checkpoint_path.startswith("s3://"):
         storage_reader = S3StorageReader(
@@ -209,12 +252,10 @@ def _load_model(
         keys_to_skip_loading=keys_to_skip_loading or [],
     )
 
-    # Single-rank load (e.g. the action policy inference server): force no_dist so
-    # ``dcp.load`` skips the collective ``gather_object`` over the load plan. That
-    # gather pickles the plan, which fails with "cannot pickle code objects" for
-    # training/EMA DCPs whose metadata carries non-tensor objects; a single process
-    # owns the full checkpoint anyway, so the collective is unnecessary. Multi-rank
-    # (sharded) loads keep the default distributed path.
+    # Single-rank load (e.g. the action-policy inference server): force no_dist so
+    # ``dcp.load`` skips the collective ``gather_object`` over the load plan, which
+    # pickles the plan and can fail on training/EMA DCPs. Multi-rank loads keep the
+    # default distributed path.
     no_dist = not (dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1)
 
     dcp.load(
@@ -360,6 +401,16 @@ def load_model_from_checkpoint(
 
     # Disable EMA for inference.
     config.model.config.ema.enabled = False
+    if hasattr(config.model.config, "load_teacher_weights"):
+        log.info("Setting load_teacher_weights=False for inference to skip teacher checkpoint download.")
+        config.model.config.load_teacher_weights = False
+
+    if (
+        config.model.config.exclude_reasoner_weights_from_checkpoint
+        and not config.model.config.vlm_config.pretrained_weights.enabled
+    ):
+        log.info("Enabling pretrained reasoner weights because this checkpoint excludes the reasoner tower from DCP.")
+        config.model.config.vlm_config.pretrained_weights.enabled = True
 
     config.validate()
     config.freeze()  # type: ignore
@@ -435,6 +486,7 @@ def load_model(checkpoint_load_path: str) -> None:
 
     if checkpoint_cache_path is None:
         load_model(checkpoint_path)
+        _reload_pretrained_reasoner_after_checkpoint_load(model)
         return model, config
 
     cache_lock_path = f"{checkpoint_cache_path}.lock"
@@ -452,4 +504,6 @@ def load_model(checkpoint_load_path: str) -> None:
     if cache_action == _CheckpointCacheAction.LOAD_CACHE:
         load_model(checkpoint_cache_path)
 
+    _reload_pretrained_reasoner_after_checkpoint_load(model)
+
     return model, config
diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
index 386f7c6..c239499 100644
--- a/docs/action_policy_libero_sft.md
+++ b/docs/action_policy_libero_sft.md
@@ -65,14 +65,15 @@ python -m cosmos_framework.scripts.action_policy_server_libero \
 ```
 
 The LIBERO sim needs a separate venv (robosuite/mujoco pins conflict with the
-training env) and graphics enabled in the container:
+training env):
 
 ```bash
-export NVIDIA_DRIVER_CAPABILITIES=all
-apt-get install -y libegl1 libglvnd0 libgl1 libglib2.0-0 ffmpeg
-mkdir -p /usr/share/glvnd/egl_vendor.d
-echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \
-  > /usr/share/glvnd/egl_vendor.d/10_nvidia.json
+# Optional — only on a headless container without working GPU EGL:
+#   export NVIDIA_DRIVER_CAPABILITIES=all
+#   apt-get install -y libegl1 libglvnd0 libgl1 libglib2.0-0 ffmpeg
+#   mkdir -p /usr/share/glvnd/egl_vendor.d
+#   echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \
+#     > /usr/share/glvnd/egl_vendor.d/10_nvidia.json
 
 uv venv --python 3.10 .libenv && VV=.libenv/bin/python
 git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git && \

From 5f1847e894a9ca203b4b76b682d96f9dc106ee1b Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 21:49:31 +0800
Subject: [PATCH 6/9] libero: canonical recipe = HSDP 8x8 (replicate 8,
 max_samples 32 in launch -> gbs 2048)

---
 docs/action_policy_libero_sft.md                         | 8 ++++----
 examples/launch_sft_action_policy_libero.sh              | 3 ++-
 examples/toml/sft_config/action_policy_libero_repro.toml | 6 +++---
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
index c239499..fa41346 100644
--- a/docs/action_policy_libero_sft.md
+++ b/docs/action_policy_libero_sft.md
@@ -41,13 +41,13 @@ export BASE_CHECKPOINT_PATH=<Cosmos3-Nano DCP dir>
 export WAN_VAE_PATH=<Wan2.2_VAE.pth>
 export IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root
 
-bash examples/launch_sft_action_policy_libero.sh   # HSDP 2x8; set NNODES/NODE_RANK/MASTER_ADDR per node
+bash examples/launch_sft_action_policy_libero.sh   # HSDP 8x8; set NNODES/NODE_RANK/MASTER_ADDR per node
 ```
 
 Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars
-(lr 5e-5, warmup 500, cycle 16000, global batch 2048, `save_iter=500`). Sweep the
-saved checkpoints to pick the best iteration. On lower-memory GPUs reduce the
-per-rank batch: `--opts dataloader_train.max_samples_per_batch=32`.
+(lr 5e-5, warmup 500, cycle 16000, global batch 2048, `save_iter=500`). The launch
+sets `max_samples_per_batch=32` (32 × 64 ranks = gbs 2048); reduce it further on
+lower-memory GPUs. Sweep the saved checkpoints to pick the best iteration.
 
 ## 3. Closed-loop eval
 
diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh
index 7ec4ccc..1c40954 100755
--- a/examples/launch_sft_action_policy_libero.sh
+++ b/examples/launch_sft_action_policy_libero.sh
@@ -8,7 +8,7 @@
 #
 # Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes
 # libero_10). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. The default recipe is
-# HSDP 2x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node.
+# HSDP 8x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node.
 # See docs/action_policy_libero_sft.md.
 #
 # Required env vars:
@@ -40,6 +40,7 @@ EXTRA_DATASET_CHECK='[[ -f "$LIBERO_ROOT/meta/info.json" ]] || { echo "ERROR: LI
 # process), unlike a TAIL_OVERRIDES array set in your shell. Use it for smoke runs,
 # e.g. EXTRA_TAIL_OVERRIDES="trainer.max_iter=5 job.wandb_mode=offline".
 TAIL_OVERRIDES=(
+    dataloader_train.max_samples_per_batch=32   # 32 x (shard 8 x replicate 8) x ga1 = global batch 2048
     ${EXTRA_TAIL_OVERRIDES:-}
 )
 
diff --git a/examples/toml/sft_config/action_policy_libero_repro.toml b/examples/toml/sft_config/action_policy_libero_repro.toml
index d74237b..63ab032 100644
--- a/examples/toml/sft_config/action_policy_libero_repro.toml
+++ b/examples/toml/sft_config/action_policy_libero_repro.toml
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: OpenMDW-1.1
 
 # LIBERO-10 action-policy SFT run config for the `action_policy_libero_nano`
-# experiment. Train on libero_10 alone (HSDP 2x8, global batch 2048).
+# experiment. Train on libero_10 alone (HSDP 8x8, global batch 2048).
 # Env: LIBERO_ROOT, BASE_CHECKPOINT_PATH, WAN_VAE_PATH, IMAGINAIRE_OUTPUT_ROOT.
 # See docs/action_policy_libero_sft.md.
 
@@ -20,7 +20,7 @@ max_num_tokens_after_packing = 74000
 
 [model.parallelism]
 data_parallel_shard_degree     = 8
-data_parallel_replicate_degree = 2    # HSDP 2x8 = 16 ranks (2 nodes)
+data_parallel_replicate_degree = 8    # HSDP 8x8 = 64 ranks (8 nodes)
 
 [model.activation_checkpointing]
 mode           = "selective"
@@ -39,7 +39,7 @@ warm_up_steps = [500]
 [trainer]
 max_iter        = 2000
 logging_iter    = 50
-grad_accum_iter = 1       # global batch = 128 x (8 x 2) x 1 = 2048
+grad_accum_iter = 1       # global batch = max_samples 32 x (shard 8 x replicate 8) x 1 = 2048
 
 [checkpoint]
 load_path = "${oc.env:BASE_CHECKPOINT_PATH}"

From 21d34caa733ec8f71fb384c9d41ac15a62622392 Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 21:52:57 +0800
Subject: [PATCH 7/9] libero: recipe = minimum HSDP 2x8 (gbs 2048, grad_accum
 1); doc/launch synced

---
 docs/action_policy_libero_sft.md                         | 9 +++++----
 examples/launch_sft_action_policy_libero.sh              | 3 +--
 examples/toml/sft_config/action_policy_libero_repro.toml | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
index fa41346..63abb4b 100644
--- a/docs/action_policy_libero_sft.md
+++ b/docs/action_policy_libero_sft.md
@@ -41,13 +41,14 @@ export BASE_CHECKPOINT_PATH=<Cosmos3-Nano DCP dir>
 export WAN_VAE_PATH=<Wan2.2_VAE.pth>
 export IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root
 
-bash examples/launch_sft_action_policy_libero.sh   # HSDP 8x8; set NNODES/NODE_RANK/MASTER_ADDR per node
+bash examples/launch_sft_action_policy_libero.sh   # HSDP 2x8; set NNODES/NODE_RANK/MASTER_ADDR per node
 ```
 
 Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars
-(lr 5e-5, warmup 500, cycle 16000, global batch 2048, `save_iter=500`). The launch
-sets `max_samples_per_batch=32` (32 × 64 ranks = gbs 2048); reduce it further on
-lower-memory GPUs. Sweep the saved checkpoints to pick the best iteration.
+(lr 5e-5, warmup 500, cycle 16000, `save_iter=500`, HSDP 2x8). Global batch is
+2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1; on lower-memory GPUs
+reduce it: `--opts dataloader_train.max_samples_per_batch=64`. Sweep the saved
+checkpoints to pick the best iteration.
 
 ## 3. Closed-loop eval
 
diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh
index 1c40954..7ec4ccc 100755
--- a/examples/launch_sft_action_policy_libero.sh
+++ b/examples/launch_sft_action_policy_libero.sh
@@ -8,7 +8,7 @@
 #
 # Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes
 # libero_10). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. The default recipe is
-# HSDP 8x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node.
+# HSDP 2x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node.
 # See docs/action_policy_libero_sft.md.
 #
 # Required env vars:
@@ -40,7 +40,6 @@ EXTRA_DATASET_CHECK='[[ -f "$LIBERO_ROOT/meta/info.json" ]] || { echo "ERROR: LI
 # process), unlike a TAIL_OVERRIDES array set in your shell. Use it for smoke runs,
 # e.g. EXTRA_TAIL_OVERRIDES="trainer.max_iter=5 job.wandb_mode=offline".
 TAIL_OVERRIDES=(
-    dataloader_train.max_samples_per_batch=32   # 32 x (shard 8 x replicate 8) x ga1 = global batch 2048
     ${EXTRA_TAIL_OVERRIDES:-}
 )
 
diff --git a/examples/toml/sft_config/action_policy_libero_repro.toml b/examples/toml/sft_config/action_policy_libero_repro.toml
index 63ab032..a0c49c7 100644
--- a/examples/toml/sft_config/action_policy_libero_repro.toml
+++ b/examples/toml/sft_config/action_policy_libero_repro.toml
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: OpenMDW-1.1
 
 # LIBERO-10 action-policy SFT run config for the `action_policy_libero_nano`
-# experiment. Train on libero_10 alone (HSDP 8x8, global batch 2048).
+# experiment. Train on libero_10 alone (HSDP 2x8, global batch 2048).
 # Env: LIBERO_ROOT, BASE_CHECKPOINT_PATH, WAN_VAE_PATH, IMAGINAIRE_OUTPUT_ROOT.
 # See docs/action_policy_libero_sft.md.
 
@@ -20,7 +20,7 @@ max_num_tokens_after_packing = 74000
 
 [model.parallelism]
 data_parallel_shard_degree     = 8
-data_parallel_replicate_degree = 8    # HSDP 8x8 = 64 ranks (8 nodes)
+data_parallel_replicate_degree = 2    # HSDP 2x8 = 16 ranks (2 nodes); minimum for gbs 2048 at grad_accum 1
 
 [model.activation_checkpointing]
 mode           = "selective"
@@ -39,7 +39,7 @@ warm_up_steps = [500]
 [trainer]
 max_iter        = 2000
 logging_iter    = 50
-grad_accum_iter = 1       # global batch = max_samples 32 x (shard 8 x replicate 8) x 1 = 2048
+grad_accum_iter = 1       # global batch = max_samples 128 x (shard 8 x replicate 2) x 1 = 2048
 
 [checkpoint]
 load_path = "${oc.env:BASE_CHECKPOINT_PATH}"

From 82a5a840a8dcdc79517ac4e25693e049002b86b4 Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 21:58:16 +0800
Subject: [PATCH 8/9] libero: move lower-mem caveat to Heads-up section; drop
 all-suites mention

---
 .../posttrain_config/action_policy_libero_nano.py     |  3 +--
 .../data/vfm/action/datasets/action_sft_dataset.py    |  2 +-
 docs/action_policy_libero_sft.md                      | 11 +++++++----
 examples/launch_sft_action_policy_libero.sh           |  6 +++---
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
index e5b5696..b05d3ea 100644
--- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
+++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py
@@ -195,8 +195,7 @@ def _action_policy_libero_nano_model_config() -> dict:
                     libero=dict(
                         ratio=1,
                         dataset=L(get_action_libero_sft_dataset)(
-                            # Local LeRobot dir for the libero_10 suite ONLY (the
-                            # full suite mix dilutes libero_10; see module docstring). Use the
+                            # Local LeRobot dir for the libero_10 suite ONLY. Use the
                             # 20 FPS nvidia/LIBERO_LeRobot_v3 (matches the bundled stats + 20 Hz eval):
                             #   hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \
                             #     --include 'libero_10/**' --local-dir <dir>   # LIBERO_ROOT=<dir>/libero_10
diff --git a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
index 7776875..96a5219 100644
--- a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
+++ b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
@@ -176,7 +176,7 @@ def get_action_libero_sft_dataset(
     → 256x512) through ``ActionTransformPipeline``. ``root`` is a LOCAL LeRobot dir
     (read parquet + video directly); pre-sync the HF dataset once, e.g.
     ``hf download lerobot/libero_10 --repo-type dataset --local-dir <root>``. Point
-    ``root`` at libero_10 alone (the all-suites mix dilutes libero_10 per step). The
+    ``root`` at libero_10 alone. The
     dataset is FPS-agnostic (decodes at real frame timestamps); ``fps`` is metadata
     for ``conditioning_fps`` / prompt duration.
     """
diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
index 63abb4b..84a9cbc 100644
--- a/docs/action_policy_libero_sft.md
+++ b/docs/action_policy_libero_sft.md
@@ -46,8 +46,7 @@ bash examples/launch_sft_action_policy_libero.sh   # HSDP 2x8; set NNODES/NODE_R
 
 Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars
 (lr 5e-5, warmup 500, cycle 16000, `save_iter=500`, HSDP 2x8). Global batch is
-2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1; on lower-memory GPUs
-reduce it: `--opts dataloader_train.max_samples_per_batch=64`. Sweep the saved
+2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1. Sweep the saved
 checkpoints to pick the best iteration.
 
 ## 3. Closed-loop eval
@@ -93,9 +92,13 @@ MUJOCO_GL=egl PYTHONPATH=$PWD:$PWD/LIBERO $VV \
   --output_dir results/libero_closed_loop_10
 ```
 
-## 4. Eval parity
+## 4. Heads-up
 
-The client/server already handle these; verify them if accuracy is low:
+- **Lower-memory GPUs** — reduce the per-rank batch:
+  `--opts dataloader_train.max_samples_per_batch=64` (and raise
+  `data_parallel_replicate_degree` to 4 to keep global batch 2048).
+
+Eval parity — the client/server already handle these; verify if accuracy is low:
 
 - **Concat layout** — run with `--camera agentview,wrist --image_size 256` so the
   256×512 concat matches training (the server snaps it to 192×320 identically).
diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh
index 7ec4ccc..24ab760 100755
--- a/examples/launch_sft_action_policy_libero.sh
+++ b/examples/launch_sft_action_policy_libero.sh
@@ -6,9 +6,9 @@
 # action-policy SFT (HSDP, full SFT). Drives cosmos_framework.scripts.train
 # against examples/toml/sft_config/action_policy_libero_repro.toml.
 #
-# Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes
-# libero_10). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. The default recipe is
-# HSDP 2x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node.
+# Point LIBERO_ROOT at the libero_10 suite ONLY. Use the 20 FPS
+# nvidia/LIBERO_LeRobot_v3. The default recipe is HSDP 2x8 (global batch 2048);
+# set NNODES/NODE_RANK/MASTER_ADDR per node.
 # See docs/action_policy_libero_sft.md.
 #
 # Required env vars:

From 4d351ddcbb1a6fea9d62d5020829b1caa16f6908 Mon Sep 17 00:00:00 2001
From: Liang Hao <hliangac@connect.ust.hk>
Date: Fri, 26 Jun 2026 22:09:40 +0800
Subject: [PATCH 9/9] libero: lint launch headers (drop GPU counts), drop sweep
 mention

---
 docs/action_policy_libero_sft.md            | 3 +--
 examples/launch_sft_action_policy_libero.sh | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md
index 84a9cbc..660de11 100644
--- a/docs/action_policy_libero_sft.md
+++ b/docs/action_policy_libero_sft.md
@@ -46,8 +46,7 @@ bash examples/launch_sft_action_policy_libero.sh   # HSDP 2x8; set NNODES/NODE_R
 
 Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars
 (lr 5e-5, warmup 500, cycle 16000, `save_iter=500`, HSDP 2x8). Global batch is
-2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1. Sweep the saved
-checkpoints to pick the best iteration.
+2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1.
 
 ## 3. Closed-loop eval
 
diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh
index 24ab760..29a9a16 100755
--- a/examples/launch_sft_action_policy_libero.sh
+++ b/examples/launch_sft_action_policy_libero.sh
@@ -23,7 +23,7 @@
 #   hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset --include 'libero_10/**' --local-dir <dir>
 #   export LIBERO_ROOT=<dir>/libero_10
 #
-# Usage (8-GPU allocation, inside the training container, from the repo root):
+# Usage (HSDP 2x8; set NNODES/NODE_RANK/MASTER_ADDR per node):
 #   LIBERO_ROOT=<dir>/libero_10 bash examples/launch_sft_action_policy_libero.sh
 
 TOML_FILE="examples/toml/sft_config/action_policy_libero_repro.toml"