From 2b8ecd008fc963c2a9f9ab10029a0bb75f66f816 Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 20:59:32 +0800 Subject: [PATCH 1/9] Add Cosmos3-Nano LIBERO-10 action-policy SFT recipe, config, eval harness, and doc Mirrors the DROID action-policy counterpart (action_policy_droid_nano + repro toml + launch + doc). Net-new LIBERO feature: - experiment config: action_policy_libero_nano - dataset: LIBEROLeRobotDataset + get_action_libero_sft_dataset (frame_wise_relative rot6d, quantile_rot, concat_view, 20fps); base_dataset tasks.parquet fallback for community LIBERO layouts; resample-on-decode-failure guard (matches i4 behavior) - closed-loop eval harness (vectorized sim) + batched /predict_batch inference path + single-rank no_dist checkpoint load for the policy server - canonical recipe action_policy_libero_repro.toml + launch_sft_action_policy_libero.sh (lr 5e-5, warmup 500, cycle 16000, global batch 2048; ~95% libero_10 500-ep eval) - docs/action_policy_libero_sft.md Co-Authored-By: Claude Opus 4.8 (1M context) --- cosmos_framework/configs/base/config.py | 2 + .../action_policy_libero_nano.py | 258 ++++ .../data/vfm/action/datasets/__init__.py | 2 + .../vfm/action/datasets/action_sft_dataset.py | 107 +- .../data/vfm/action/datasets/base_dataset.py | 51 +- .../action/datasets/libero_lerobot_dataset.py | 335 ++++ ...bero_native_frame_wise_relative_rot6d.json | 37 + .../data/vfm/action/libero_pose_utils.py | 69 + .../scripts/action_policy_server_libero.py | 211 ++- cosmos_framework/simulation/__init__.py | 2 + .../simulation/libero/__init__.py | 3 + .../simulation/libero/closed_loop_eval.py | 1343 +++++++++++++++++ .../libero/dataset_reply_action_server.py | 653 ++++++++ cosmos_framework/utils/vfm/model_loader.py | 67 +- docs/action_policy_libero_sft.md | 206 +++ examples/launch_sft_action_policy_libero.sh | 46 + .../action_policy_libero_repro.toml | 79 + 17 files changed, 3333 insertions(+), 138 deletions(-) create mode 100644 
cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py create mode 100644 cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py create mode 100644 cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json create mode 100644 cosmos_framework/data/vfm/action/libero_pose_utils.py create mode 100644 cosmos_framework/simulation/__init__.py create mode 100644 cosmos_framework/simulation/libero/__init__.py create mode 100644 cosmos_framework/simulation/libero/closed_loop_eval.py create mode 100644 cosmos_framework/simulation/libero/dataset_reply_action_server.py create mode 100644 docs/action_policy_libero_sft.md create mode 100755 examples/launch_sft_action_policy_libero.sh create mode 100644 examples/toml/sft_config/action_policy_libero_repro.toml diff --git a/cosmos_framework/configs/base/config.py b/cosmos_framework/configs/base/config.py index e766c5c..1fb0514 100644 --- a/cosmos_framework/configs/base/config.py +++ b/cosmos_framework/configs/base/config.py @@ -97,4 +97,6 @@ def make_config() -> Config: import cosmos_framework.configs.base.experiment.sft.vision_sft_nano # noqa: F401 import cosmos_framework.configs.base.experiment.sft.vision_sft_super # noqa: F401 import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_droid_nano # noqa: F401 + import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_libero_nano # noqa: F401 + import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_libero_nano_4suite # noqa: F401 return c diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py new file mode 100644 index 0000000..98b5b12 --- /dev/null +++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py @@ -0,0 +1,258 @@ 
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""``action_policy_libero_nano`` — Cosmos3-Nano LIBERO action-policy SFT recipe. + +Reproduces the Cosmos3-Nano LIBERO-10 result (Table 20, 97.4% @ ckpt 2000). +Mirrors ``action_policy_droid_nano`` (PackingDataLoader + RankPartitionedDataLoader ++ ActionIterableShuffleDataset), but feeds ``LIBEROLeRobotDataset`` (frame-wise-relative +rot6d actions, ``quantile_rot``-normalized, concat_view third-person + wrist at +256x256 each -> 256x512) through ``ActionTransformPipeline``, and trains the +generation + action heads from the public ``nvidia/Cosmos3-Nano`` base. Full SFT +(no LoRA) — the LoRA variant is the 32B "super" tier only. + +LIBERO-10 reproduction note: the public Table-20 number is reached training on +``libero_10`` ALONE. Training on the full 4-suite mix dilutes libero_10 to ~1 pass +in 2000 steps (~82%); libero_10 alone is ~2.7 passes (~97%). Point ``LIBERO_ROOT`` +(and ``LIBERO_REPO_ID``) at the libero_10 LeRobot conversion only. 
+ +Usage (1 node, 8 GPU):: + + LIBERO_ROOT=/path/to/libero_10_lerobot \\ + LIBERO_REPO_ID=lerobot/libero_10 \\ + BASE_CHECKPOINT_PATH= \\ + WAN_VAE_PATH= \\ + torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \\ + --sft-toml examples/toml/sft_config/action_policy_libero_repro.toml +""" + +import copy + +from hydra.core.config_store import ConfigStore + +from cosmos_framework.utils.lazy_config import LazyCall as L +from cosmos_framework.utils.lazy_config import LazyDict + +from cosmos_framework.configs.base.experiment.sft.models.nano_model_config import NANO_MODEL_CONFIG +from cosmos_framework.data.vfm.joint_dataloader import ( + PackingDataLoader, + RankPartitionedDataLoader, +) +from cosmos_framework.data.vfm.action.datasets.action_sft_dataset import get_action_libero_sft_dataset + +cs = ConfigStore.instance() + + +def _action_policy_libero_nano_model_config() -> dict: + """GA LIBERO model config: capped packed tokens, selective activation + checkpointing, fresh diffusion-expert init, 10x vision flow-matching loss, and + the VAE encode durations [17, 61, 73] carried by the Cosmos3 base. + + NOTE: keep ``encode_exact_durations=[17, 61, 73]`` — do NOT reduce it to [17] + even though ``mode="policy"`` only produces 17-frame windows at the data level. + The public Cosmos3-Nano base was pretrained with [17, 61, 73]; the reference + GA LIBERO SFT (``action_policy_sft_nano`` on ``mharrim-nv-patch-1``) retains it, + and empirically reducing it to [17] regresses the policy badly + (60.8% vs 94.6% at iter 2000).""" + cfg = copy.deepcopy(NANO_MODEL_CONFIG) # action_gen=True, max_action_dim=64 + # Cap the packed sequence. Uncapped (-1) + a large max_samples_per_batch packs + # one very long sequence and OOMs even on H200; 74000 keeps the GA-validated bound. 
+ cfg["max_num_tokens_after_packing"] = 74000 + cfg["activation_checkpointing"]["mode"] = "selective" + cfg["diffusion_expert_config"]["load_weights_from_pretrained"] = False + cfg["rectified_flow_training_config"]["loss_scale"] = 10.0 + cfg["rectified_flow_training_config"]["image_loss_scale"] = None + cfg["tokenizer"]["encode_exact_durations"] = [17, 61, 73] # match Cosmos3 base + reference SFT (do NOT reduce) + return cfg + + +action_policy_libero_nano = LazyDict( + dict( + defaults=[ + {"override /model": "mot_fsdp"}, + {"override /data_train": None}, + {"override /data_val": None}, + # FusedAdam with fp32 master_weights + eps 1e-8 (bf16 params + eps 1e-6 + # diverged on the action loss). + {"override /optimizer": "fusedadamw"}, + {"override /scheduler": "lambdalinear"}, # linear LR decay + {"override /checkpoint": "s3"}, + { + "override /callbacks": [ + "basic", + "optimization", + "job_monitor", + ] + }, + {"override /ema": "power"}, + {"override /tokenizer": "wan2pt2_tokenizer"}, + {"override /sound_tokenizer": None}, + {"override /vlm_config": None}, + {"override /ckpt_type": "dcp"}, + "_self_", + ], + job=dict( + project="cosmos3", + group="action_sft", + name="action_policy_libero_nano", + wandb_mode="disabled", + ), + model=dict( + config=_action_policy_libero_nano_model_config(), + ), + optimizer=dict( + betas=[0.9, 0.99], + eps=1.0e-08, + fused=True, # popped by build_optimizer for FusedAdam (fused by construction) + # Train the generation + action heads. 
+ keys_to_select=[ + "moe_gen", + "time_embedder", + "vae2llm", + "llm2vae", + "action2llm", + "llm2action", + "action_modality_embed", + ], + lr=5.0e-05, + lr_multipliers={ + "action2llm": 5.0, + "llm2action": 5.0, + "action_modality_embed": 5.0, + }, + optimizer_type="FusedAdam", + weight_decay=0.05, + ), + scheduler=dict( + lr_scheduler_type="LambdaLinear", + cycle_lengths=[100], # smoke: 100 iters (real run sets via TOML, GA=10000) + f_max=[1.0], + f_min=[0.0], + f_start=[1.0e-06], + verbosity_interval=0, + warm_up_steps=[0], # smoke (real run sets via TOML, GA=2000) + ), + trainer=dict( + distributed_parallelism="fsdp", + grad_accum_iter=1, # real run sets via TOML (GA=2) + logging_iter=1, + max_iter=100, # smoke + max_val_iter=None, + run_validation=False, + run_validation_on_start=False, + save_zero_checkpoint=False, + seed=42, + timeout_period=999999999, + validation_iter=100, + compile_config=dict(recompile_limit=8, use_duck_shape=False), + cudnn=dict(benchmark=True, deterministic=False), + ddp=dict(broadcast_buffers=True, find_unused_parameters=False, static_graph=True), + grad_scaler_args=dict(enabled=False), + callbacks=dict( + dataloader_speed=dict(every_n=100, save_s3=False, step_size=1), + device_monitor=dict( + every_n=200, log_memory_detail=True, save_s3=False, step_size=1, upload_every_n_mul=5 + ), + grad_clip=dict(clip_norm=1.0, force_finite=True), + heart_beat=dict(every_n=200, save_s3=False, step_size=1, update_interval_in_minute=20), + iter_speed=dict(every_n=1, hit_thres=50, save_s3=False, save_s3_every_log_n=500), + low_precision=dict(update_iter=1), + manual_gc=dict(every_n=5, gc_level=1, warm_up=1), + param_count=dict(save_s3=False), + skip_nan_step=dict(max_consecutive_nan=100), + training_stats=dict(log_freq=100), + ), + ), + checkpoint=dict( + broadcast_via_filesystem=False, + dcp_async_mode_enabled=False, + enable_gcs_patch_in_boto3=True, + keys_not_to_resume=[], + # Skip net_ema (EMA warm-starts from net, see dcp.py) and the action + 
# heads, so they init fresh from the base (the public Cosmos3-Nano base + # has no LIBERO-trained action heads). + keys_to_skip_loading=[ + "net_ema.", + "action2llm", + "llm2action", + "action_modality_embed", + "action_pos_embed", + ], + load_ema_to_reg=False, + load_path="???", # Cosmos3-Nano DCP dir; supply via TOML/env + load_training_state=False, + only_load_scheduler_state=False, + save_iter=100, + strict_resume=False, # base init: tolerate key set differences + verbose=True, + hf_export=dict( + enabled=False, + export_every_n=1, + hf_repo_id=None, + upload_to_object_store=dict(bucket="", credentials="", enabled=False), + ), + jit=dict(device="cuda", dtype="bfloat16", enabled=False, input_shape=None, strict=True), + load_from_object_store=dict(bucket="", credentials="", enabled=False), + save_to_object_store=dict(bucket="", credentials="", enabled=False), + ), + dataloader_train=L(PackingDataLoader)( + audio_sample_rate=48000, + dataset_name="action_libero", + max_samples_per_batch=128, # peak-mem bound (256 OOMs on H200); global = 128 x DP8 x grad_accum2 = 2048 + max_sequence_length=None, # None disables token packing (TOML can't express null) + patch_spatial=2, + sound_latent_fps=0, + tokenizer_spatial_compression_factor=16, + tokenizer_temporal_compression_factor=4, + dataloader=L(RankPartitionedDataLoader)( + batch_size=1, + in_order=False, + num_workers=4, + persistent_workers=True, + pin_memory=True, + prefetch_factor=4, + sampler=None, + # Shuffling is handled by the dataset (iterable_shuffle=True below): + # ActionIterableShuffleDataset streams rank x worker-sharded, episode-order- + # shuffled, sequential-within-episode. + datasets=dict( + libero=dict( + ratio=1, + dataset=L(get_action_libero_sft_dataset)( + # Local LeRobot dir for the libero_10 suite ONLY (Table-20 + # reproduction; full suite mix -> ~82%, see module docstring). 
Use the + # 20 FPS nvidia/LIBERO_LeRobot_v3 (matches the bundled stats + 20 Hz eval): + # hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \ + # --include 'libero_10/**' --local-dir # LIBERO_ROOT=/libero_10 + root="${oc.env:LIBERO_ROOT}", + fps=20, # metadata only (FPS-agnostic loader reads native fps from info.json) + chunk_length=16, + image_size=256, # concat_view -> 256x512 + mode="policy", + camera_mode="concat_view", + action_space="frame_wise_relative", + rotation_space="6d", + pose_coordinate_frame="native", + action_normalization="quantile_rot", + val_ratio=0.01, + iterable_shuffle=True, + episode_shuffle_seed=42, + resolution=None, + max_action_dim="${model.config.max_action_dim}", + cfg_dropout_rate=0.1, + tokenizer_config="${model.config.vlm_config.tokenizer}", + ), + ), + ), + ), + ), + dataloader_val=None, + upload_reproducible_setup=False, + ), + flags={"allow_objects": True}, +) + + +for _item in [action_policy_libero_nano]: + _name = [k for k, v in globals().items() if v is _item][0] + cs.store(group="experiment", package="_global_", name=_name, node=_item) diff --git a/cosmos_framework/data/vfm/action/datasets/__init__.py b/cosmos_framework/data/vfm/action/datasets/__init__.py index 0b01e6b..6365693 100644 --- a/cosmos_framework/data/vfm/action/datasets/__init__.py +++ b/cosmos_framework/data/vfm/action/datasets/__init__.py @@ -12,6 +12,7 @@ from cosmos_framework.data.vfm.action.datasets.base_dataset import ActionBaseDataset from cosmos_framework.data.vfm.action.datasets.bridge_orig_lerobot_dataset import BridgeOrigLeRobotDataset from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import DROIDLeRobotDataset +from cosmos_framework.data.vfm.action.datasets.libero_lerobot_dataset import LIBEROLeRobotDataset from cosmos_framework.data.vfm.action.datasets.robomind_franka_dataset import RoboMINDFrankaDataset __all__ = [ @@ -19,5 +20,6 @@ "AgiBotWorldBetaLeRobotDataset", "BridgeOrigLeRobotDataset", "DROIDLeRobotDataset", + 
"LIBEROLeRobotDataset", "RoboMINDFrankaDataset", ] diff --git a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py index 1790de5..afe76da 100644 --- a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py +++ b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py @@ -18,7 +18,11 @@ from torch.utils.data import Dataset, IterableDataset, get_worker_info -from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import DROIDLeRobotDataset +from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import ( + DROIDLeRobotDataset, + ShardedDROIDLeRobotDataset, +) +from cosmos_framework.data.vfm.action.datasets.libero_lerobot_dataset import LIBEROLeRobotDataset from cosmos_framework.data.vfm.action.transforms import ActionTransformPipeline @@ -98,6 +102,7 @@ def get_action_droid_sft_dataset( action_normalization: str | None = None, viewpoint: str = "concat_view", use_image_augmentation: bool = False, + apply_color_jitter: bool = True, use_filter_dict: bool = False, filter_dict_path: str | None = None, resolution: str | int = "256", @@ -110,11 +115,24 @@ def get_action_droid_sft_dataset( append_idle_frames: bool = False, iterable_shuffle: bool = False, episode_shuffle_seed: int = 42, + sharded: bool = False, + lerobot_roots: list[str] | None = None, + use_success_only: bool = True, ) -> Dataset: """Build the DROID action SFT dataset: ``action_space='joint_pos'`` (8D) + - ``use_state`` (raw/un-normalized), concat_view, chunk_length 32.""" - dataset = DROIDLeRobotDataset( - root=root, + ``use_state`` (raw/un-normalized), concat_view, chunk_length 32. + + ``sharded=True`` consumes the per-lab sharded layout (``/success/``) + via :class:`ShardedDROIDLeRobotDataset` — one ``DROIDLeRobotDataset`` per lab + concatenated into one flat index — reproducing the internal sharded run's + per-shard index construction. 
``sharded=False`` (default) reads ``root`` as a + single flat LeRobot dataset (the prior behavior). ``lerobot_roots`` optionally + pins the shard sub-paths (relative to ``root``); otherwise they are + auto-discovered.""" + # ``sharded`` may arrive as a string from env-var config resolution. + if isinstance(sharded, str): + sharded = sharded.strip().lower() in ("1", "true", "yes", "on") + shard_kwargs = dict( fps=fps, chunk_length=chunk_length, viewpoint=viewpoint, @@ -123,9 +141,90 @@ def get_action_droid_sft_dataset( use_state=use_state, action_normalization=action_normalization, use_image_augmentation=use_image_augmentation, + apply_color_jitter=apply_color_jitter, use_filter_dict=use_filter_dict, filter_dict_path=filter_dict_path, ) + if sharded: + dataset: Dataset = ShardedDROIDLeRobotDataset( + root=root, + lerobot_roots=lerobot_roots, + use_success_only=use_success_only, + **shard_kwargs, + ) + else: + dataset = DROIDLeRobotDataset(root=root, **shard_kwargs) + transform = ActionTransformPipeline( + tokenizer_config=tokenizer_config, + cfg_dropout_rate=cfg_dropout_rate, + max_action_dim=max_action_dim, + append_viewpoint_info=append_viewpoint_info, + append_duration_fps_timestamps=append_duration_fps_timestamps, + append_resolution_info=append_resolution_info, + append_idle_frames=append_idle_frames, + ) + sft = ActionSFTDataset(dataset, transform, resolution) + if iterable_shuffle: + return ActionIterableShuffleDataset(sft, seed=episode_shuffle_seed) + return sft + + +def get_action_libero_sft_dataset( + *, + root: str, + fps: float = 20.0, + chunk_length: int = 16, + image_size: int = 256, + mode: str = "policy", + camera_mode: str = "concat_view", + action_space: str = "frame_wise_relative", + rotation_space: str = "6d", + pose_coordinate_frame: str = "native", + action_normalization: str | None = "quantile_rot", + action_stats_path: str | None = None, + split: str = "train", + val_ratio: float = 0.01, + seed: int = 0, + resolution: str | int | None = None, 
+ max_action_dim: int = 64, + tokenizer_config: dict | None = None, + cfg_dropout_rate: float = 0.1, + append_viewpoint_info: bool = True, + append_duration_fps_timestamps: bool = True, + append_resolution_info: bool = True, + append_idle_frames: bool = True, + iterable_shuffle: bool = False, + episode_shuffle_seed: int = 42, +) -> Dataset: + """Build the LIBERO action-policy SFT dataset (GA reproduction defaults). + + Mirrors :func:`get_action_droid_sft_dataset` but feeds ``LIBEROLeRobotDataset`` + (frame-wise-relative rot6d actions, ``quantile_rot``-normalized, concat_view + third-person + wrist at 256x256 each → 256x512) through + ``ActionTransformPipeline``. ``root`` is a LOCAL LeRobot dir (read parquet + + video directly, like DROID); pre-sync the HF dataset once, e.g. + ``hf download lerobot/libero_10 --repo-type dataset --local-dir ``. For + the Table-20 LIBERO-10 reproduction point ``root`` at libero_10 alone (the + 4-suite mix dilutes libero_10 to ~1 pass in 2000 steps → ~82% vs ~97%). The + dataset is FPS-agnostic (decodes at real frame timestamps); ``fps`` is metadata + for ``conditioning_fps`` / prompt duration. 
+ """ + dataset = LIBEROLeRobotDataset( + root=root, + image_size=image_size, + chunk_length=chunk_length, + fps=fps, + mode=mode, + split=split, + val_ratio=val_ratio, + seed=seed, + camera_mode=camera_mode, + action_space=action_space, + rotation_space=rotation_space, + pose_coordinate_frame=pose_coordinate_frame, + action_normalization=action_normalization, + action_stats_path=action_stats_path, + ) transform = ActionTransformPipeline( tokenizer_config=tokenizer_config, cfg_dropout_rate=cfg_dropout_rate, diff --git a/cosmos_framework/data/vfm/action/datasets/base_dataset.py b/cosmos_framework/data/vfm/action/datasets/base_dataset.py index 2c9c4cb..f9b0f61 100644 --- a/cosmos_framework/data/vfm/action/datasets/base_dataset.py +++ b/cosmos_framework/data/vfm/action/datasets/base_dataset.py @@ -69,18 +69,25 @@ def __init__( for path in sorted((self._root / "meta" / "episodes").glob("chunk-*/file-*.parquet")) for row in pq.read_table(path).to_pylist() } - self._tasks = { - int(row["task_index"]): str(row["task"]) - for row in pq.read_table(self._root / "meta" / "tasks.parquet").to_pylist() - } - self._rows = sorted( - ( - row - for path in sorted((self._root / "data").glob("chunk-*/file-*.parquet")) - for row in pq.read_table(path).to_pylist() - ), - key=lambda row: int(row["index"]), - ) + # ``meta/tasks.parquet`` normally has a ``task`` column. Some LeRobot + # conversions (e.g. the community LIBERO datasets) instead store the task + # string as the (unnamed) pandas index, which pyarrow surfaces as + # ``__index_level_0__``. Fall back to the lone non-``task_index`` field so + # both layouts work (datasets that have ``task`` are unaffected). 
+ self._tasks = {} + for row in pq.read_table(self._root / "meta" / "tasks.parquet").to_pylist(): + if "task" in row: + task = row["task"] + else: + extras = [v for k, v in row.items() if k != "task_index"] + task = extras[0] if extras else "" + self._tasks[int(row["task_index"])] = str(task) + # ``self._rows`` (the flat, index-sorted list of every frame dict) is built + # lazily on first access — see the ``_rows`` property. Materializing all + # ~18M frames as Python dicts plus a full sort costs ~13 min and tens of GB; + # subclasses that build their own compact index (e.g. DROIDLeRobotDataset) + # never touch it, so they must not pay for it at construction. + self._rows_cache: list[dict[str, Any]] | None = None @property def fps(self) -> float: @@ -213,5 +220,25 @@ def _build_result( **extras, } + @property + def _rows(self) -> list[dict[str, Any]]: + """Flat, index-sorted list of every frame dict, built lazily on first access. + + Only datasets that don't build their own compact index (bridge / agibot / + robomind) touch this; for them it materializes once and caches. Datasets with + a bespoke index (e.g. DROIDLeRobotDataset) never read it, so they skip the + ~13 min / tens-of-GB construction entirely. 
+ """ + if self._rows_cache is None: + self._rows_cache = sorted( + ( + row + for path in sorted((self._root / "data").glob("chunk-*/file-*.parquet")) + for row in pq.read_table(path).to_pylist() + ), + key=lambda row: int(row["index"]), + ) + return self._rows_cache + def __len__(self) -> int: return max(0, (len(self._rows) - self._chunk_length + self._sample_stride - 1) // self._sample_stride) diff --git a/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py b/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py new file mode 100644 index 0000000..146fcc1 --- /dev/null +++ b/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py @@ -0,0 +1,335 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""LIBERO LeRobot dataset (frame-wise-relative action policy). + +Mirrors ``DROIDLeRobotDataset``: reads the LeRobot parquet directly, windows by +frame index, and decodes video at each frame's REAL timestamp. That makes it +FPS-agnostic — it works with the 10 FPS community ``lerobot/libero_*`` datasets +and a 20 FPS conversion alike, without LeRobot's ``delta_timestamps`` grid (which +rejects any window whose synthetic timestamps don't land on real frames). + +Action layout (``frame_wise_relative``): the stored 7D ``action`` is already a +per-frame delta ``[dpos(3), drot_axisangle(3), gripper(1)]``; only the rotation is +re-encoded to the requested ``rotation_space`` -> ``[dpos(3), rot6d(6), gripper(1)]`` +(10D for ``6d``). + +NOTE on FPS / stats fidelity: the bundled ``quantile_rot`` stats were computed on +a 20 FPS conversion. Per-frame deltas at 10 FPS span 2x the wall-clock motion, so +for a faithful Table-20 reproduction use a 20 FPS LIBERO dataset (or recompute +stats for the dataset's FPS). Loading/training is correct at any FPS regardless. 
+""" + +from __future__ import annotations + +import json +import random +from pathlib import Path +from typing import Any, Literal + +import numpy as np +import pyarrow.parquet as pq +import torch +import torch.nn.functional as F +from lerobot.datasets.video_utils import decode_video_frames + +from cosmos_framework.utils import log +from cosmos_framework.data.vfm.action.action_normalization import normalize_action +from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec +from cosmos_framework.data.vfm.action.datasets.base_dataset import ActionBaseDataset +from cosmos_framework.data.vfm.action.libero_pose_utils import libero_action_dim, libero_rotation_format +from cosmos_framework.data.vfm.action.pose_utils import convert_rotation + +CameraMode = Literal["image", "wrist_image", "concat_view"] +RotationSpace = Literal["3d", "6d", "9d"] + +_ACTION_FEATURE = "action" +_IMAGE_FEATURE = "observation.images.image" +_WRIST_FEATURE = "observation.images.wrist_image" +_STAT_KEYS = ("mean", "std", "min", "max", "q01", "q99") +_NORMALIZERS_DIR = Path(__file__).parent / "stats" + +_VIEWPOINT_BY_CAMERA = { + "image": "third_person_view", + "wrist_image": "wrist_view", + "concat_view": "concat_view", +} + + +class LIBEROLeRobotDataset(ActionBaseDataset): + """LIBERO action-policy dataset with frame-wise-relative rot6d actions. + + 10D ``[pos_delta(3), rot6d_delta(6), gripper(1)]`` (for ``rotation_space='6d'``), + ``concat_view`` third-person + wrist video, and ``quantile_rot`` normalization + against the bundled stats. Reads parquet + decodes video at real timestamps, + so the requested ``fps`` is metadata only (it sets ``conditioning_fps`` and the + prompt duration); frame windows always use the data's actual frames. 
+ """ + + def __init__( + self, + root: str, + fps: float = 20.0, + chunk_length: int = 16, + mode: str = "policy", + tolerance_s: float = 1e-4, + camera_mode: CameraMode = "concat_view", + image_size: int = 256, + action_space: str = "frame_wise_relative", + rotation_space: RotationSpace = "6d", + pose_coordinate_frame: str = "native", + embodiment_type: str = "libero", + action_normalization: str | None = "quantile_rot", + action_stats_path: str | None = None, + split: str = "train", + val_ratio: float = 0.01, + seed: int = 0, + sample_stride: int = 1, + ) -> None: + if action_space != "frame_wise_relative": + raise NotImplementedError( + f"This LIBERO dataset only supports action_space='frame_wise_relative', got {action_space!r}." + ) + if camera_mode not in _VIEWPOINT_BY_CAMERA: + raise ValueError(f"Unsupported camera_mode={camera_mode!r}. Use image/wrist_image/concat_view.") + split = split.lower().strip() + if split not in {"train", "val", "valid", "validation", "eval", "test", "full"}: + raise ValueError(f"Unsupported split={split!r}. Use train/val/full.") + if chunk_length % 4 != 0: + raise ValueError(f"chunk_length must be divisible by 4, got {chunk_length}.") + + super().__init__( + root=root, + domain_name=embodiment_type, + fps=fps, + chunk_length=chunk_length, + mode=mode, + pose_convention="backward_framewise", # unused for frame_wise deltas; satisfies the base assert + tolerance_s=tolerance_s, + viewpoint=_VIEWPOINT_BY_CAMERA[camera_mode], + # frame_wise_relative ⇔ backward_framewise idle semantics. quantile_rot is a + # LIBERO convention -> normalize with the "quantile" formula on raw-rotation + # stats (see _load_norm_stats); pass the method the base will call. 
+ action_normalization=None if action_normalization is None else "quantile", + sample_stride=sample_stride, + ) + # FPS-agnostic loader: trust the dataset's NATIVE fps for conditioning_fps / + # prompt duration so the metadata is truthful (10 for the public + # lerobot/libero_*, 20 for a 20 FPS conversion). Frame sampling uses each + # frame's real timestamp regardless, so the requested ``fps`` is ignored here. + info_fps = self._info.get("fps") + if info_fps: + if int(info_fps) != int(fps): + log.info(f"Using dataset native fps={info_fps} for conditioning (requested {fps}).") + self._fps = float(info_fps) + self._dt = 1.0 / self._fps + self._camera_mode = camera_mode + self._image_size = int(image_size) + self._rotation_space = rotation_space.lower().strip() + self._pose_coordinate_frame = pose_coordinate_frame + self._embodiment_type = embodiment_type + self._requested_normalization = action_normalization + # quantile_rot normalizes against the raw (un-orthonormalized) rotation stats + # under "global_raw"; everything else uses "global". + self._stats_key = "global_raw" if action_normalization == "quantile_rot" else "global" + self._stats_file = self._resolve_stats_file(action_stats_path) + + if self._camera_mode == "image": + self._video_keys = [_IMAGE_FEATURE] + elif self._camera_mode == "wrist_image": + self._video_keys = [_WRIST_FEATURE] + else: + self._video_keys = [_IMAGE_FEATURE, _WRIST_FEATURE] + + # Compact, lazy frame index (mirrors DROIDLeRobotDataset): read only the + # columns the sample builder needs into contiguous arrays, ordered by global + # frame index, so DataLoader worker forks share them copy-on-write. 
+ index_parts, episode_parts, task_parts, ts_parts, action_parts = [], [], [], [], [] + for path in sorted((self._root / "data").glob("chunk-*/file-*.parquet")): + table = pq.read_table(path, columns=["index", "episode_index", "task_index", "timestamp", _ACTION_FEATURE]) + index_parts.append(table["index"].to_numpy()) + episode_parts.append(table["episode_index"].to_numpy()) + task_parts.append(table["task_index"].to_numpy()) + ts_parts.append(table["timestamp"].to_numpy()) + action_parts.append(np.asarray(table[_ACTION_FEATURE].to_pylist(), dtype=np.float32)) + if not index_parts: + raise FileNotFoundError(f"No data parquet found under {self._root / 'data'}.") + order = np.argsort(np.concatenate(index_parts).astype(np.int64), kind="stable") + self._row_episode = np.concatenate(episode_parts).astype(np.int64)[order] + self._row_task = np.concatenate(task_parts).astype(np.int64)[order] + self._row_timestamp = np.concatenate(ts_parts).astype(np.float64)[order] + self._row_action = np.concatenate(action_parts, axis=0).astype(np.float32)[order] + + assert np.all(np.diff(self._row_episode) >= 0), "episode_index not contiguous after sorting by frame index" + ep_vals, ep_starts, ep_counts = np.unique(self._row_episode, return_index=True, return_counts=True) + + # Deterministic per-episode train/val split (seeded; same on every rank). + keep = self._split_episode_ids(ep_vals.tolist(), split, val_ratio, seed) + kept = np.array([int(v) in keep for v in ep_vals], dtype=bool) + self._ep_vals = ep_vals.astype(np.int64)[kept] + self._ep_starts = ep_starts.astype(np.int64)[kept] + kept_counts = ep_counts.astype(np.int64)[kept] + # Within-episode windows only: total - n_kept_episodes * chunk_length valid samples. 
+ self._valid_cum = np.cumsum(np.maximum(0, kept_counts - self._chunk_length)).astype(np.int64) + + log.info( + f"Loaded LIBERO dataset root={self._root} split={split!r} camera_mode={camera_mode!r} " + f"fps={self._fps} kept_episodes={len(self._ep_vals)}/{len(ep_vals)} " + f"valid_indices={int(self._valid_cum[-1]) if self._valid_cum.size else 0}" + ) + + # ---- spec / dims ------------------------------------------------------- + + @property + def action_dim(self) -> int: + return libero_action_dim(self._rotation_space) + + def _action_spec(self) -> ActionSpec: + return build_action_spec(Pos(), Rot(libero_rotation_format(self._rotation_space)), Gripper()) + + @classmethod + def _stats_path(cls) -> Path: + # Base classmethod fallback; the instance uses self._stats_file (which also + # honors action_stats_path + the rotation/coordinate-frame-specific filename). + return _NORMALIZERS_DIR / "libero_native_frame_wise_relative_rot6d.json" + + # ---- normalization (nested global/global_raw + quantile_rot) ------------ + + def _bundled_stats_filename(self) -> str: + rotation_suffix = {"3d": "3d", "6d": "rot6d", "9d": "rot9d"}.get(self._rotation_space) + if rotation_suffix is None: + raise ValueError(f"Unsupported rotation_space={self._rotation_space!r}.") + action_space = "frame_wise_relative" + return f"{self._embodiment_type}_{self._pose_coordinate_frame}_{action_space}_{rotation_suffix}.json" + + def _resolve_stats_file(self, action_stats_path: str | None) -> Path: + if action_stats_path: + p = Path(action_stats_path) + if not p.is_absolute(): + p = _NORMALIZERS_DIR / p.name + if not p.exists(): + raise FileNotFoundError(f"action_stats_path not found: {action_stats_path!r}") + return p + p = _NORMALIZERS_DIR / self._bundled_stats_filename() + if not p.exists(): + raise FileNotFoundError( + f"Bundled LIBERO stats not found at {p}. Pass action_stats_path or recompute stats." 
+ ) + return p + + def _load_norm_stats(self) -> dict[str, torch.Tensor]: + if self._norm_stats is None: + raw = json.loads(self._stats_file.read_text())[self._stats_key] + self._norm_stats = { + k: torch.tensor(v, dtype=torch.float32) for k, v in raw.items() if k in _STAT_KEYS + } + return self._norm_stats + + # ---- index helpers ----------------------------------------------------- + + @staticmethod + def _split_episode_ids(ep_ids: list[int], split: str, val_ratio: float, seed: int) -> set[int]: + if split == "full": + return set(int(v) for v in ep_ids) + if not (0.0 < val_ratio < 1.0): + raise ValueError(f"val_ratio must be in (0, 1), got {val_ratio}.") + n_val = max(1, int(round(len(ep_ids) * val_ratio))) + rng = random.Random(seed) # identical selection on every rank + val = set(int(v) for v in rng.sample(list(ep_ids), n_val)) + if split == "train": + return set(int(v) for v in ep_ids) - val + return val # val/valid/validation/eval/test + + def __len__(self) -> int: + return int(self._valid_cum[-1]) if self._valid_cum.size else 0 + + def get_shuffle_blocks(self) -> list[tuple[int, int]]: + """Per-episode ``(start, length)`` flat-index blocks for + ``ActionIterableShuffleDataset`` (shuffle block ORDER + shard across + ranks, sequential within a block).""" + blocks: list[tuple[int, int]] = [] + prev = 0 + for c in np.asarray(self._valid_cum).tolist(): + c = int(c) + if c > prev: + blocks.append((prev, c - prev)) + prev = c + return blocks + + # ---- sample build ------------------------------------------------------ + + def __getitem__(self, idx: int) -> dict[str, Any]: + # Resilience: a single unreadable/corrupt video frame (e.g. a torchcodec + # decode error on the packed LeRobot-v3 mp4s) must not crash a multi-node + # run. Resample a different valid window on failure (bounded retries). 
+ n = len(self) + last_err: Exception | None = None + for _attempt in range(8): + try: + return self._build_item(idx) + except Exception as e: # noqa: BLE001 — skip past undecodable frames + last_err = e + log.warning(f"LIBERO: sample idx={idx} failed to load ({type(e).__name__}: {e}); resampling") + if n > 0: + idx = random.randint(0, n - 1) + raise RuntimeError(f"LIBERO: failed to load a sample after 8 resamples; last error: {last_err}") + + def _build_item(self, idx: int) -> dict[str, Any]: + mode = self._choose_mode() + idx = int(idx) + ep = int(np.searchsorted(self._valid_cum, idx, side="right")) + prev = int(self._valid_cum[ep - 1]) if ep > 0 else 0 + start = int(self._ep_starts[ep]) + (idx - prev) + episode_index = int(self._ep_vals[ep]) + episode = self._episodes[episode_index] + + stop = start + self._chunk_length + 1 + timestamps = [float(self._row_timestamp[j]) for j in range(start, stop)] + video = self._load_video(episode, timestamps) + + # frame_wise_relative: chunk per-frame deltas are the stored actions directly. + raw = self._row_action[start : start + self._chunk_length] # [chunk, 7] + action = self._build_frame_wise_action(raw) + + task = self._tasks[int(self._row_task[start])] + ai_caption = random.choice([p.strip() for p in task.split(" | ") if p.strip()] or [task]) + + extras: dict[str, Any] = {} + if self._camera_mode == "concat_view": + extras["additional_view_description"] = ( + "The left half shows the third-person view; the right half shows the wrist-mounted camera." 
+ ) + return self._build_result(mode=mode, video=video, action=action, ai_caption=ai_caption, **extras) + + def _build_frame_wise_action(self, raw: np.ndarray) -> torch.Tensor: + raw_t = torch.from_numpy(np.ascontiguousarray(raw)).float() # [chunk, 7] + translation = raw_t[:, 0:3] + rotation_matrix = convert_rotation(raw_t[:, 3:6], input_format="axisangle", output_format="matrix") + rotation = convert_rotation( + rotation_matrix, input_format="matrix", output_format=libero_rotation_format(self._rotation_space) + ) + gripper = raw_t[:, 6:7] + return torch.cat([translation, rotation, gripper], dim=-1) # [chunk, action_dim] + + def _load_video(self, episode: dict[str, Any], timestamps: list[float]) -> torch.Tensor: + frames_by_view = {} + for key in self._video_keys: + from_ts = float(episode.get(f"videos/{key}/from_timestamp", 0.0)) + frames = decode_video_frames( + self._video_path(episode, key), + [from_ts + ts for ts in timestamps], + self._tolerance_s, + ) # [T, C, H, W] in [0, 1] + frames = self._resize(frames) + frames_by_view[key] = frames + if self._camera_mode == "concat_view": + # third-person (left) + wrist (right), horizontally concatenated -> [T, C, H, 2W] + return torch.cat([frames_by_view[_IMAGE_FEATURE], frames_by_view[_WRIST_FEATURE]], dim=-1) + return frames_by_view[self._video_keys[0]] + + def _resize(self, frames: torch.Tensor) -> torch.Tensor: + if frames.shape[-1] == self._image_size and frames.shape[-2] == self._image_size: + return frames + return F.interpolate( + frames, size=(self._image_size, self._image_size), mode="bilinear", align_corners=False + ) diff --git a/cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json b/cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json new file mode 100644 index 0000000..a705e7c --- /dev/null +++ b/cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json @@ -0,0 +1,37 @@ +{ + "metadata": { + 
"embodiment_type": "libero", + "pose_convention": "frame_wise_relative", + "pose_coordinate_frame": "native", + "rotation_format": "6d", + "action_dim": 10, + "skip_rotation_dims": [3, 4, 5, 6, 7, 8], + "chunk_length": 16, + "sample_stride": null, + "dataset_name": "libero", + "dataset_class": "LIBEROLeRobotDataset", + "dataset_root": ["outputs/libero_datasets/libero_10", "outputs/libero_datasets/libero_object", "outputs/libero_datasets/libero_spatial", "outputs/libero_datasets/libero_goal"], + "_comment": "Dataset paths are placeholders; the statistics values are independent of local dataset location.", + "split": "train", + "num_samples_stats": 10000, + "reservoir_size": 50000, + "max_samples": 10000, + "sampling_seed": 42 + }, + "global": { + "mean": [ 0.050704, 0.097407, -0.094833, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.476725], + "std": [ 0.333621, 0.387175, 0.457140, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.499460], + "min": [-0.937500, -0.937500, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "max": [ 0.937500, 0.937500, 0.937500, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000], + "q01": [-0.723214, -0.808929, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "q99": [ 0.937500, 0.870536, 0.937500, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] + }, + "global_raw": { + "mean": [ 0.050704, 0.097407, -0.094833, 0.994873, -0.004579, -0.004288, 0.004389, 0.996104, 0.001109, 0.476725], + "std": [ 0.333621, 0.387175, 0.457140, 0.010807, 0.077802, 0.063386, 0.078571, 0.009994, 0.038504, 0.499460], + "min": [-0.937500, -0.937500, -0.937500, 0.902028, -0.356085, -0.367416, -0.370434, 0.921907, -0.255000, 0.000000], + "max": [ 0.937500, 0.937500, 0.937500, 1.000000, 0.368853, 0.341214, 0.356395, 1.000000, 0.348251, 1.000000], + "q01": [-0.723214, -0.808929, -0.937500, 0.934955, -0.223431, -0.189878, 
-0.334735, 0.938516, -0.107736, 0.000000], + "q99": [ 0.937500, 0.870536, 0.937500, 1.000000, 0.331000, 0.163153, 0.226216, 1.000000, 0.127158, 1.000000] + } +} diff --git a/cosmos_framework/data/vfm/action/libero_pose_utils.py b/cosmos_framework/data/vfm/action/libero_pose_utils.py new file mode 100644 index 0000000..5cc9fff --- /dev/null +++ b/cosmos_framework/data/vfm/action/libero_pose_utils.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""Small LIBERO pose helpers shared by training and closed-loop eval.""" + +from __future__ import annotations + +import numpy as np +import torch + +from cosmos_framework.data.vfm.action.pose_utils import ( + RotationConvention, + build_abs_pose_from_components, +) + +# Same local-frame post-rotation pattern used by DROID/Bridge/Fractal: +# R_opencv = R_native @ *_TO_OPENCV. +LIBERO_TO_OPENCV: np.ndarray = np.array( + [[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], + dtype=np.float32, +) + +LIBERO_ROTATION_FORMATS: dict[str, RotationConvention] = { + "3d": "axisangle", + "6d": "rot6d", + "9d": "rot9d", +} +LIBERO_ACTION_DIMS: dict[str, int] = {"3d": 7, "6d": 10, "9d": 13} + + +def libero_rotation_format(rotation_space: str) -> RotationConvention: + """Return the shared ``pose_utils`` rotation format for a LIBERO setting.""" + rotation_format = LIBERO_ROTATION_FORMATS.get(rotation_space) + if rotation_format is None: + raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.") + return rotation_format + + +def libero_action_dim(rotation_space: str) -> int: + """Return ``[xyz, rotation, gripper]`` action width for LIBERO.""" + action_dim = LIBERO_ACTION_DIMS.get(rotation_space) + if action_dim is None: + raise ValueError(f"Unsupported rotation_space={rotation_space!r}. 
Use 3d/6d/9d.") + return action_dim + + +def libero_rotation_space_from_action_dim(action_dim: int) -> str: + """Infer LIBERO rotation space from unpadded action width.""" + for rotation_space, dim in LIBERO_ACTION_DIMS.items(): + if dim == action_dim: + return rotation_space + raise ValueError(f"Unable to infer rotation_space from action_dim={action_dim}.") + + +def build_libero_abs_pose(state_raw: torch.Tensor | np.ndarray, *, to_opencv: bool) -> np.ndarray: + """Build absolute LIBERO EE poses from state rows. + + ``state_raw`` is ``[x,y,z,axisangle(3),gripper(2)]``. When requested, the + local EE frame is post-rotated into the shared OpenCV-style action frame. + """ + if isinstance(state_raw, torch.Tensor): + state_np = state_raw.detach().cpu().numpy().astype(np.float32, copy=False) + else: + state_np = np.asarray(state_raw, dtype=np.float32) + + poses_abs = build_abs_pose_from_components(state_np[:, :3], state_np[:, 3:6], "axisangle") + if to_opencv: + poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ LIBERO_TO_OPENCV + return poses_abs diff --git a/cosmos_framework/scripts/action_policy_server_libero.py b/cosmos_framework/scripts/action_policy_server_libero.py index 7382b97..2fa0567 100644 --- a/cosmos_framework/scripts/action_policy_server_libero.py +++ b/cosmos_framework/scripts/action_policy_server_libero.py @@ -63,6 +63,10 @@ # Action-specific helpers live in the in-tree project tree. Imports stay as # `projects.cosmos3.vfm.*` and are auto-rewritten to `cosmos3._src.vfm.*` by the # cosmos-framework release script. +from cosmos_framework.data.vfm.action.action_processing import ( + ActionProcessingRecord, + make_batched_action_processing_fields, +) from cosmos_framework.data.vfm.action.domain_utils import get_domain_id from cosmos_framework.data.vfm.action.transforms import ( build_sequence_plan_from_mode, @@ -431,7 +435,8 @@ class ActionServerArgs(pydantic.BaseModel): # ``OmniSetupOverrides`` programmatically in ``build_setup_overrides``. 
checkpoint: tyro.conf.OmitArgPrefixes[CheckpointOverrides] = CheckpointOverrides.model_construct() - """Checkpoint and config loading configuration.""" + """Checkpoint and config loading configuration. ``use_ema_weights`` lives here and + defaults True at inference (suppressed from CLI) -> evals load net_ema by default.""" output_dir: Path | None = None """Output directory for ``OmniInference`` (saved config.yaml, benchmarks). @@ -804,101 +809,53 @@ def get_info(self) -> dict[str, Any]: # Predict # ------------------------------------------------------------------ - def predict_policy(self, req: dict[str, Any]) -> dict[str, Any]: - """ - Run policy inference: given an observation image and prompt, predict actions. - - Input request format: - { - "image": "", - "prompt": "", - "domain_name": "", - "image_size": - } - - Output format: - { - "action": [[a0_0, a0_1, ...], ..., [aN_0, aN_1, ...]], - "video": ["", ...] # List of T base64-encoded PNG frames - } - - All action dimensions are returned. Video is the decoded predicted rollout as base64 PNGs. - """ - t0 = time.monotonic() - - # Get or assign request ID - injected_id = req.get("request_id", None) - if isinstance(injected_id, int) and injected_id > 0: - request_id = int(injected_id) - else: - with self._req_id_lock: - self._req_id += 1 - request_id = int(self._req_id) + def _input_video_key(self) -> str: + input_video_key = getattr(self.model, "input_video_key", None) + if input_video_key is None: + input_video_key = getattr(self.model, "config", None).input_video_key # type: ignore[union-attr] + return input_video_key - # Validate request + def _prep_policy_item(self, req: dict[str, Any]) -> dict[str, Any]: + """Validate one request and build the per-sample model inputs (video pad, + prompt augmentation, sequence_plan). 
Shared by predict_policy (batch=1) and + predict_policy_batch (batch=N) so the two paths stay byte-identical per item.""" image_b64 = req.get("image") if not isinstance(image_b64, str): raise ValueError("'image' must be a base64 string") - prompt = req.get("prompt") if not isinstance(prompt, str): raise ValueError("'prompt' must be a string") - domain_name = req.get("domain_name") if not isinstance(domain_name, str): raise ValueError("'domain_name' must be a string") - image_size = req.get("image_size") if not isinstance(image_size, int) or image_size <= 0: raise ValueError("'image_size' must be a positive integer") - # Decode image - t_decode0 = time.monotonic() img_chw_uint8 = _decode_base64_png_to_rgb_uint8(image_b64) img_h, img_w = img_chw_uint8.shape[-2:] - - # Handle resizing: for multi-view (non-square) images, scale proportionally - # to maintain aspect ratio while matching height to image_size + # Multi-view (non-square) images: scale proportionally, matching height to image_size. 
if img_h != image_size: - # Calculate new width to maintain aspect ratio scale = image_size / img_h new_w = int(round(img_w * scale)) - hwc = img_chw_uint8.permute(1, 2, 0).cpu().numpy() # [H,W,3] + hwc = img_chw_uint8.permute(1, 2, 0).cpu().numpy() resized = Image.fromarray(hwc).resize((new_w, image_size), resample=Image.Resampling.BILINEAR) arr = np.asarray(resized, dtype=np.uint8).copy() - img_chw_uint8 = torch.from_numpy(arr).permute(2, 0, 1).contiguous() # [3,H,W] # [3,H,W] - t_decode1 = time.monotonic() + img_chw_uint8 = torch.from_numpy(arr).permute(2, 0, 1).contiguous() - # Construct batch in IterativeJointDataLoader format (list-of-lists for multi-item keys) t_frames = self.cfg.action_chunk_size + 1 _, final_h, final_w = img_chw_uint8.shape video_c_t_h_w_uint8 = img_chw_uint8.unsqueeze(1).repeat(1, t_frames, 1, 1) # [3,T,H,W] - - # Apply reflection padding to match closest predefined resolution resolution = get_vision_data_resolution((final_h, final_w)) target_w, target_h = find_closest_target_size(final_h, final_w, resolution) pad_dict: dict[str, Any] = {"video": video_c_t_h_w_uint8} reflection_pad_to_target(pad_dict, ["video"], True, target_w, target_h) - video_padded = pad_dict["video"] # (C, T, target_h, target_w) - padded_image_size = pad_dict["image_size"] # (4,) - - # Action: zeros tensor as noise starting point for policy mode - action_t_d = torch.zeros( - (self.cfg.action_chunk_size, self.cfg.max_action_dim), - dtype=torch.float32, - ) # [T,action_dim] - - input_video_key = getattr(self.model, "input_video_key", None) - if input_video_key is None: - input_video_key = getattr(self.model, "config", None).input_video_key # type: ignore[union-attr] - sequence_plan = build_sequence_plan_from_mode( mode="policy", video_length=self.cfg.action_chunk_size + 1, action_length=self.cfg.action_chunk_size, has_text=True, ) - augmented_prompt = _augment_prompt_with_metadata( prompt, t_frames=t_frames, @@ -908,10 +865,126 @@ def predict_policy(self, req: 
dict[str, Any]) -> dict[str, Any]: append_duration_fps=self.append_duration_fps, append_resolution_info=self.append_resolution_info, ) + return { + "img_chw_uint8": img_chw_uint8, + "video_padded": pad_dict["video"], + "padded_image_size": pad_dict["image_size"], + "augmented_prompt": augmented_prompt, + "sequence_plan": sequence_plan, + "domain_name": domain_name, + "image_size": image_size, + } + + def predict_policy_batch(self, reqs: list[dict[str, Any]]) -> dict[str, Any]: + """Batched policy inference: N requests -> ONE diffusion forward (batch_size=N) + -> N denormalized action chunks. Skips vision decode (the vectorized eval client + only needs actions), so it is ~N x faster than N serial /predict calls.""" + t0 = time.monotonic() + if not isinstance(reqs, list) or not reqs: + raise ValueError("'items' must be a non-empty list of policy requests") + preps = [self._prep_policy_item(r) for r in reqs] + n = len(preps) + action_t_d = torch.zeros( + (self.cfg.action_chunk_size, self.cfg.max_action_dim), dtype=torch.float32 + ) + input_video_key = self._input_video_key() + batch: dict[str, Any] = { + input_video_key: [[p["video_padded"]] for p in preps], + **make_batched_action_processing_fields( + ActionProcessingRecord(raw_action_dim=self.raw_action_dim, action_normalizer=None), + batch_size=n, + ), + "action": [[action_t_d] for _ in preps], + "mode": ["policy"] * n, + "ai_caption": [p["augmented_prompt"] for p in preps], + "prompt": [p["augmented_prompt"] for p in preps], + "conditioning_fps": [torch.tensor(self.cfg.fps, dtype=torch.long) for _ in preps], + "image_size": torch.stack([p["padded_image_size"] for p in preps]).to(device="cuda"), + "domain_id": [torch.tensor(get_domain_id(p["domain_name"]), dtype=torch.long) for p in preps], + "sequence_plan": [p["sequence_plan"] for p in preps], + } + t_inf0 = time.monotonic() + with self._lock: + with torch.inference_mode(): + samples = self.model.generate_samples_from_batch( + batch, + guidance=self.cfg.guidance, 
+ seed=[self.cfg.seed] * n, + num_steps=self.cfg.num_steps, + has_negative_prompt=False, + ) + t_inf1 = time.monotonic() + actions: list[list[list[float]]] = [] + for i in range(n): + pred = samples["action"][i].float().squeeze(0) # [T,D] + pred = self._denormalize_action(pred) + actions.append(pred.detach().cpu().numpy().tolist()) + log.info( + f"[action-server] predict_batch n={n} steps={self.cfg.num_steps} " + f"ms_total={(time.monotonic() - t0) * 1000.0:.1f} ms_infer={(t_inf1 - t_inf0) * 1000.0:.1f}" + ) + return {"actions": actions} + + def predict_policy(self, req: dict[str, Any]) -> dict[str, Any]: + """ + Run policy inference: given an observation image and prompt, predict actions. + + Input request format: + { + "image": "", + "prompt": "", + "domain_name": "", + "image_size": + } + + Output format: + { + "action": [[a0_0, a0_1, ...], ..., [aN_0, aN_1, ...]], + "video": ["", ...] # List of T base64-encoded PNG frames + } + + All action dimensions are returned. Video is the decoded predicted rollout as base64 PNGs. + """ + t0 = time.monotonic() + + # Get or assign request ID + injected_id = req.get("request_id", None) + if isinstance(injected_id, int) and injected_id > 0: + request_id = int(injected_id) + else: + with self._req_id_lock: + self._req_id += 1 + request_id = int(self._req_id) + + # Per-item preprocessing (validation, decode/resize/pad, prompt, sequence_plan). 
+ t_decode0 = time.monotonic() + prep = self._prep_policy_item(req) + t_decode1 = time.monotonic() + img_chw_uint8 = prep["img_chw_uint8"] + video_padded = prep["video_padded"] + padded_image_size = prep["padded_image_size"] + augmented_prompt = prep["augmented_prompt"] + sequence_plan = prep["sequence_plan"] + domain_name = prep["domain_name"] + image_size = prep["image_size"] + + # Action: zeros tensor as noise starting point for policy mode + action_t_d = torch.zeros( + (self.cfg.action_chunk_size, self.cfg.max_action_dim), + dtype=torch.float32, + ) # [T,action_dim] + + input_video_key = self._input_video_key() batch: dict[str, Any] = { input_video_key: [[video_padded]], - "raw_action_dim": [torch.tensor(self.raw_action_dim, dtype=torch.long)], + # Provide BOTH raw_action_dim and the action_processing_record the model + # needs to externalize (invert) the generated action; building the batch + # by hand previously omitted the record -> "cannot be externalized". + **make_batched_action_processing_fields( + ActionProcessingRecord(raw_action_dim=self.raw_action_dim, action_normalizer=None), + batch_size=1, + ), "action": [[action_t_d]], "mode": ["policy"], "ai_caption": [augmented_prompt], @@ -1103,7 +1176,7 @@ def do_GET(self) -> None: # noqa: N802 self._send_json(404, {"error": "Not found"}) def do_POST(self) -> None: # noqa: N802 - if self.path not in ("/", "/predict"): + if self.path not in ("/", "/predict", "/predict_batch"): self._send_json(404, {"error": "Not found"}) return @@ -1147,13 +1220,21 @@ def do_POST(self) -> None: # noqa: N802 f"path={self.path} bytes={length}" ) + is_batch = self.path == "/predict_batch" try: - out = service.predict_policy(req) + if is_batch: + out = service.predict_policy_batch(req.get("items", [])) + else: + out = service.predict_policy(req) except Exception as e: err = str(e) traceback.print_exc() - payload = {"action": [], "error": err, "request_id": req.get("request_id")} + payload = ( + {"actions": [], "error": err} + if 
is_batch + else {"action": [], "error": err, "request_id": req.get("request_id")} + ) log.error(f"[action-server] request_id={req.get('request_id')} ERROR: {err}") # Dump failed request for offline debugging if enabled. diff --git a/cosmos_framework/simulation/__init__.py b/cosmos_framework/simulation/__init__.py new file mode 100644 index 0000000..28a81be --- /dev/null +++ b/cosmos_framework/simulation/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 diff --git a/cosmos_framework/simulation/libero/__init__.py b/cosmos_framework/simulation/libero/__init__.py new file mode 100644 index 0000000..503ec1b --- /dev/null +++ b/cosmos_framework/simulation/libero/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + diff --git a/cosmos_framework/simulation/libero/closed_loop_eval.py b/cosmos_framework/simulation/libero/closed_loop_eval.py new file mode 100644 index 0000000..0205f9f --- /dev/null +++ b/cosmos_framework/simulation/libero/closed_loop_eval.py @@ -0,0 +1,1343 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +""" +Closed-loop evaluation for LIBERO using the Action HTTP inference server. + +# Single-view example (agentview camera): +PYTHONPATH=. python cosmos_framework/simulation/libero/closed_loop_eval.py \ + --server_url http://localhost:8000 \ + --task_suite libero_10 \ + --num_trials_per_task 10 \ + --action_horizon 16 \ + --camera agentview \ + --save_gifs --gif_fps 20 \ + --action_space frame_wise_relative \ + --rotation_space 6d \ + --action_dim 10 \ + --output_dir results/libero_closed_loop_10_single_view + +# Multi-view example (agentview + wrist cameras): +PYTHONPATH=. 
python cosmos_framework/simulation/libero/closed_loop_eval.py \ + --server_url http://localhost:8000 \ + --task_suite libero_goal \ + --num_trials_per_task 2 \ + --action_horizon 16 \ + --camera agentview,wrist \ + --save_gifs --gif_fps 20 \ + --action_space frame_wise_relative \ + --rotation_space 6d \ + --action_dim 10 \ + --output_dir results/libero_closed_loop_goal_multiview +""" + +from __future__ import annotations + +import argparse +import base64 +import io +import json +import os +import random +import sys +import time +from dataclasses import dataclass +from collections.abc import Callable +from pathlib import Path +from typing import Any + +import numpy as np +import requests +from PIL import Image +from scipy.spatial.transform import Rotation as R + +from cosmos_framework.data.vfm.action.libero_pose_utils import ( + libero_rotation_format, + libero_rotation_space_from_action_dim, +) +from cosmos_framework.data.vfm.action.pose_utils import convert_rotation +from cosmos_framework.data.vfm.action.viewpoint_utils import DEFAULT_VIEWPOINT_TEMPLATES + +benchmark: Any +get_libero_path: Any +OffScreenRenderEnv: Any + + +TASK_MAX_STEPS: dict[str, int] = { + "libero_spatial": 220, + "libero_object": 280, + "libero_goal": 300, + "libero_10": 520, + "libero_90": 400, +} + + +_CAMERA_PROMPT_NAMES: dict[str, str] = { + "agentview": "third-person view", + "wrist": "wrist-mounted camera", +} + + +def _append_prompt_sentence(prompt: str, sentence: str) -> str: + """Append one metadata sentence using the same separator convention as training augmentors.""" + if sentence in prompt: + return prompt + prompt = prompt.rstrip() + if not prompt: + return sentence.rstrip() + separator = " " if prompt.rstrip().endswith(".") else ". 
" + return prompt + separator + sentence.rstrip() + + +def _concat_view_layout_description(cameras: list[str]) -> str: + """Describe the horizontal camera layout sent by ``ActionEnvironmentClient``.""" + camera_names = [_CAMERA_PROMPT_NAMES[camera] for camera in cameras] + if len(camera_names) == 2: + return f"The left half shows the {camera_names[0]}; the right half shows the {camera_names[1]}." + layout = ", ".join(camera_names) + return f"The views are concatenated horizontally from left to right as: {layout}." + + +def _augment_task_prompt_with_viewpoint(task_description: str, cameras: list[str]) -> str: + """Mirror DROID-style concat-view caption augmentation for closed-loop LIBERO eval.""" + if len(cameras) <= 1: + return task_description + prompt = _append_prompt_sentence(task_description, DEFAULT_VIEWPOINT_TEMPLATES["concat_view"]) + return _append_prompt_sentence(prompt, _concat_view_layout_description(cameras)) + + +def _rotation_repr_to_mat(rotation: np.ndarray, rotation_space: str) -> np.ndarray: + """Convert a single LIBERO rotation block to a 3x3 rotation matrix.""" + matrix = convert_rotation( + rotation, + libero_rotation_format(rotation_space), + "matrix", + normalize_matrix=rotation_space != "3d", + ) + if not isinstance(matrix, np.ndarray): + raise TypeError(f"Expected NumPy rotation matrix, got {type(matrix)!r}") + return matrix + + +@dataclass +class EpisodeResult: + success: bool + steps: int + error: str | None + actions: list[list[float]] + + +class ActionEnvironmentClient: + """Client for interacting with the Action model server.""" + + server_url: str + domain_name: str + prompt: str + image_size: int + timeout: float + + def __init__( + self, + server_url: str, + domain_name: str, + prompt: str, + image_size: int, + timeout: float, + ) -> None: + self.server_url = server_url.rstrip("/") + self.domain_name = domain_name + self.prompt = prompt + self.image_size = image_size + self.timeout = timeout + + def check_health(self) -> bool: + 
"""Check if the model server is healthy.""" + try: + resp = requests.get(f"{self.server_url}/", timeout=5.0) + return resp.status_code == 200 + except requests.RequestException: + return False + + def get_info(self) -> dict[str, str]: + """Get model server info.""" + resp = requests.get(f"{self.server_url}/info", timeout=5.0) + resp.raise_for_status() + return resp.json() + + def notify_next_episode(self) -> None: + """Notify server to advance to next episode (used with dataset action server).""" + try: + requests.post( + f"{self.server_url}/next_episode", + json={"prompt": self.prompt}, + timeout=5.0, + ) + except requests.RequestException: + pass + + def encode_image(self, image: np.ndarray) -> str: + """Encode a numpy image (H, W, 3) uint8 to base64 PNG, resizing to image_size.""" + if image.dtype != np.uint8: + if image.max() <= 1.0: + image = (image * 255.0).round().astype(np.uint8) + else: + image = image.astype(np.uint8) + pil_img = Image.fromarray(image) + if pil_img.size != (self.image_size, self.image_size): + pil_img = pil_img.resize( + (self.image_size, self.image_size), + resample=Image.Resampling.BILINEAR, + ) + buf = io.BytesIO() + pil_img.save(buf, format="PNG") + return base64.b64encode(buf.getvalue()).decode("ascii") + + def encode_image_raw(self, image: np.ndarray) -> str: + """Encode a numpy image (H, W, 3) uint8 to base64 PNG without resizing.""" + if image.dtype != np.uint8: + if image.max() <= 1.0: + image = (image * 255.0).round().astype(np.uint8) + else: + image = image.astype(np.uint8) + pil_img = Image.fromarray(image) + buf = io.BytesIO() + pil_img.save(buf, format="PNG") + return base64.b64encode(buf.getvalue()).decode("ascii") + + def resize_image(self, image: np.ndarray) -> np.ndarray: + """Resize image to model input size.""" + if image.dtype != np.uint8: + if image.max() <= 1.0: + image = (image * 255.0).round().astype(np.uint8) + else: + image = image.astype(np.uint8) + pil_img = Image.fromarray(image) + if pil_img.size != 
(self.image_size, self.image_size): + pil_img = pil_img.resize( + (self.image_size, self.image_size), + resample=Image.Resampling.BILINEAR, + ) + return np.array(pil_img) + + def concatenate_images(self, images: list[np.ndarray]) -> np.ndarray: + """Resize each image and concatenate horizontally (side-by-side). + + Args: + images: List of images with shape (H, W, 3). + + Returns: + Concatenated image with shape (image_size, image_size*num_views, 3). + """ + resized = [self.resize_image(img) for img in images] + return np.concatenate(resized, axis=1) + + def predict(self, observation: np.ndarray | list[np.ndarray]) -> dict[str, Any]: + """Send observation(s) to model server and get predicted actions. + + Args: + observation: Single image as np.ndarray or list of images for multi-view. + For multi-view, images are resized and concatenated horizontally before sending. + """ + if isinstance(observation, list): + # Multi-view: resize each, concatenate horizontally, and send as single image + concatenated = self.concatenate_images(observation) + encoded = self.encode_image_raw(concatenated) + else: + # Single view: send single image + encoded = self.encode_image(observation) + + payload = { + "image": encoded, + "prompt": self.prompt, + "domain_name": self.domain_name, + "image_size": self.image_size, + } + + resp = requests.post( + f"{self.server_url}/predict", + json=payload, + headers={"Content-Type": "application/json"}, + timeout=self.timeout, + ) + resp.raise_for_status() + + result = resp.json() + if "error" in result and result["error"]: + raise RuntimeError(f"Model server error: {result['error']}") + return result + + def predict_batch(self, observations: list[list[np.ndarray]]) -> list[list[list[float]]]: + """Batched inference: a list of per-env multi-view observations -> ONE + POST /predict_batch -> a list of action chunks (one per env). 
Used by the + vectorized eval so N parallel envs share a single diffusion forward.""" + items = [] + for obs_imgs in observations: + concat = self.concatenate_images(obs_imgs) if len(obs_imgs) > 1 else self.resize_image(obs_imgs[0]) + items.append( + { + "image": self.encode_image_raw(concat), + "prompt": self.prompt, + "domain_name": self.domain_name, + "image_size": self.image_size, + } + ) + resp = requests.post( + f"{self.server_url}/predict_batch", + json={"items": items}, + headers={"Content-Type": "application/json"}, + timeout=max(self.timeout, 300.0), + ) + resp.raise_for_status() + result = resp.json() + if "error" in result and result["error"]: + raise RuntimeError(f"Model server error: {result['error']}") + return result["actions"] + + +def _find_accessible_dri_nodes() -> list[Path]: + dri_path = Path("/dev/dri") + if not dri_path.exists(): + return [] + nodes = list(dri_path.glob("renderD*")) + list(dri_path.glob("card*")) + return [node for node in nodes if os.access(node, os.R_OK | os.W_OK)] + + +def _resolve_mujoco_backend(requested_backend: str) -> tuple[str, str]: + requested_backend = requested_backend.lower() + if requested_backend != "auto": + return requested_backend, "requested" + + env_backend = os.environ.get("MUJOCO_GL") + if env_backend: + return env_backend.lower(), "env" + + if _find_accessible_dri_nodes(): + return "egl", "auto-gpu" + return "osmesa", "auto-cpu" + + +def _configure_mujoco_env(requested_backend: str) -> str: + backend, source = _resolve_mujoco_backend(requested_backend) + if backend not in {"egl", "osmesa", "glfw"}: + raise ValueError(f"Unsupported MuJoCo GL backend: {backend!r}. 
Use auto, egl, osmesa, or glfw.") + + os.environ["MUJOCO_GL"] = backend + if backend == "egl": + os.environ["PYOPENGL_PLATFORM"] = "egl" + elif backend == "osmesa": + os.environ["PYOPENGL_PLATFORM"] = "osmesa" + return f"{backend} ({source})" + + +def _import_libero() -> None: + global benchmark, get_libero_path, OffScreenRenderEnv + try: + from libero.libero import benchmark as libero_benchmark + from libero.libero import get_libero_path as libero_get_libero_path + from libero.libero.envs import OffScreenRenderEnv as libero_offscreen_render_env + except ImportError as exc: # pragma: no cover - environment-specific dependency + raise RuntimeError( + "Failed to import LIBERO. Make sure the LIBERO environment is activated. " + f"python={sys.executable!r}, import_error={exc!r}" + ) from exc + + benchmark = libero_benchmark + get_libero_path = libero_get_libero_path + OffScreenRenderEnv = libero_offscreen_render_env + + +def _wait_for_server(client: ActionEnvironmentClient, timeout_s: float) -> None: + start = time.perf_counter() + while time.perf_counter() - start < timeout_s: + if client.check_health(): + return + time.sleep(1.0) + raise RuntimeError(f"Timed out waiting for server at {client.server_url}") + + +def _get_libero_env( + task: Any, + *, + resolution: int, + seed: int, + render_gpu_device_id: int, +) -> tuple[Any, str]: + task_description = str(task.language) + task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file) + env_args = { + "bddl_file_name": task_bddl_file, + "camera_heights": resolution, + "camera_widths": resolution, + "render_gpu_device_id": render_gpu_device_id, + } + env = OffScreenRenderEnv(**env_args) + env.seed(seed) + return env, task_description + + +def _get_libero_dummy_action() -> list[float]: + return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0] + + +def _get_libero_image( + obs: dict[str, Any], + camera: str, + *, + flip_images: bool, + rotate_180: bool, +) -> np.ndarray: + if camera == "agentview": 
+ image = obs["agentview_image"] + elif camera == "wrist": + image = obs["robot0_eye_in_hand_image"] + else: + raise ValueError(f"Unsupported camera={camera!r}. Use 'agentview' or 'wrist'.") + + if rotate_180: + image = image[::-1, ::-1] + if flip_images: + image = np.flipud(image) + return image + + +def _get_libero_images( + obs: dict[str, Any], + cameras: list[str], + *, + flip_images: bool, + rotate_180: bool, +) -> list[np.ndarray]: + """Get images from multiple cameras.""" + return [_get_libero_image(obs, camera, flip_images=flip_images, rotate_180=rotate_180) for camera in cameras] + + +def _ensure_uint8_image(image: np.ndarray) -> np.ndarray: + if image.dtype != np.uint8: + if image.max() <= 1.0: + image = (image * 255.0).round().astype(np.uint8) + else: + image = image.astype(np.uint8) + return image + + +def _save_gif(frames: list[Image.Image], output_path: Path, fps: int) -> None: + if not frames: + return + duration_ms = int(1000 / fps) if fps > 0 else 100 + output_path.parent.mkdir(parents=True, exist_ok=True) + first, *rest = frames + first.save( + output_path, + save_all=True, + append_images=rest, + duration=duration_ms, + loop=0, + ) + + +def _decode_b64_frames(b64_frames: list[str]) -> list[Image.Image]: + """Decode a list of base64-encoded PNG strings into PIL Images.""" + images: list[Image.Image] = [] + for b64 in b64_frames: + raw = base64.b64decode(b64) + images.append(Image.open(io.BytesIO(raw)).convert("RGB")) + return images + + +def _save_comparison_gif( + comparison_windows: list[tuple[list[Image.Image], list[Image.Image]]], + output_path: Path, + fps: int, + target_height: int = 256, + separator_width: int = 4, +) -> None: + """Create and save a side-by-side comparison GIF (Action prediction | env rollout). + + Each window is a (action_frames, env_frames) pair from one prediction call. + Frames are paired index-by-index; the conditioning frame (index 0) of + subsequent windows is skipped to avoid duplicating the boundary frame. 
+ """ + from PIL import ImageDraw + + combined_frames: list[Image.Image] = [] + banner_h = 16 + + for window_idx, (action_frames, env_frames) in enumerate(comparison_windows): + n = min(len(action_frames), len(env_frames)) + start = 1 if window_idx > 0 else 0 + for i in range(start, n): + action_img = action_frames[i] + env_img = env_frames[i] + + action_w = int(action_img.width * target_height / action_img.height) + env_w = int(env_img.width * target_height / env_img.height) + action_resized = action_img.resize((action_w, target_height), Image.Resampling.BILINEAR) + env_resized = env_img.resize((env_w, target_height), Image.Resampling.BILINEAR) + + total_w = action_w + separator_width + env_w + total_h = target_height + banner_h + combined = Image.new("RGB", (total_w, total_h), color=0) + + draw = ImageDraw.Draw(combined) + draw.rectangle([(0, 0), (action_w, banner_h)], fill=(30, 30, 60)) + draw.rectangle([(action_w + separator_width, 0), (total_w, banner_h)], fill=(30, 60, 30)) + draw.text((4, 1), "Action Prediction", fill=(100, 180, 255)) + draw.text((action_w + separator_width + 4, 1), "Environment", fill=(100, 255, 100)) + + combined.paste(action_resized, (0, banner_h)) + combined.paste(env_resized, (action_w + separator_width, banner_h)) + combined_frames.append(combined) + + if combined_frames: + _save_gif(combined_frames, output_path, fps) + + +def _select_action_chunk(actions: list[list[float]], action_horizon: int) -> list[list[float]]: + if action_horizon <= 0 or action_horizon >= len(actions): + return actions + return actions[:action_horizon] + + +def _format_action(action: list[float], action_dim: int) -> list[float]: + if len(action) < action_dim: + raise ValueError(f"Action dimension {len(action)} smaller than expected {action_dim}") + return action[:action_dim] + + +def _remap_gripper(action: list[float], mode: str) -> list[float]: + """Map the model's gripper command to the LIBERO env's [-1, 1] (negative = open). 
+ + The right mapping depends on the gripper convention of the dataset the policy + was trained on (the server denormalizes back to that raw convention): + + * ``zero_one`` (NVIDIA LIBERO_LeRobot_v3): raw gripper in [0, 1]; the env wants + [-1, 1] with negative=open. The i4/cosmos-rl reference BINARIZES this to hard + {-1, +1} via ``-sign(2g - 1)`` (not the continuous ``1 - 2g`` from issue #50). + For a confident policy the two agree (g~0/1), but an undertrained policy emits + g~0.5 where continuous ``1-2g``~0 never actuates the gripper -> grasps fail. + Binarizing matches the reference and is robust to weak checkpoints. + * ``pm_one`` (community ``lerobot/libero_*``): raw gripper already in {-1, +1} + (robosuite convention) -> pass through (clamped). + * ``pm_one_flip``: {-1, +1} but with inverted open/close sign. + """ + action = list(action) # avoid mutating the caller's list + g = action[-1] + if mode == "zero_one": + action[-1] = max(-1.0, min(1.0, g * 2.0 - 1.0)) * -1.0 # [0,1] -> [-1,1], negative=open (issue #50) + elif mode == "pm_one": + action[-1] = max(-1.0, min(1.0, g)) + elif mode == "pm_one_flip": + action[-1] = max(-1.0, min(1.0, -g)) + else: + raise ValueError(f"Unknown gripper_mode={mode!r}. 
Use zero_one/pm_one/pm_one_flip.") + return action + + +def _infer_rotation_space(action_dim: int, rotation_space: str) -> str: + if rotation_space != "auto": + return rotation_space + return libero_rotation_space_from_action_dim(action_dim) + + +def _obs_to_pose(obs: dict[str, Any]) -> tuple[np.ndarray, np.ndarray]: + position = np.asarray(obs["robot0_eef_pos"], dtype=np.float32) + quat = np.asarray(obs["robot0_eef_quat"], dtype=np.float32) + rotation = R.from_quat(quat).as_matrix() + return position, rotation + + +def _anchored_action_to_delta( + anchored_action: np.ndarray, + base_pose: tuple[np.ndarray, np.ndarray], + current_pose: tuple[np.ndarray, np.ndarray], + rotation_space: str, +) -> np.ndarray: + anchored_translation = anchored_action[:3] + rotation_dim = anchored_action.shape[0] - 4 + anchored_rotation = anchored_action[3 : 3 + rotation_dim] + gripper = anchored_action[3 + rotation_dim : 4 + rotation_dim] + + base_pos, base_rot = base_pose + current_pos, current_rot = current_pose + + if rotation_space == "3d": + anchored_rot = R.from_rotvec(anchored_rotation).as_matrix() + elif rotation_space == "6d": + anchored_rot = _rotation_repr_to_mat(anchored_rotation, rotation_space) + elif rotation_space == "9d": + anchored_rot = anchored_rotation.reshape(3, 3) + else: + raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.") + target_rot = base_rot @ anchored_rot + target_pos = base_pos + base_rot @ anchored_translation + delta_pos = target_pos - current_pos + delta_rot = target_rot @ current_rot.T + delta_rotvec = R.from_matrix(delta_rot).as_rotvec() + + return np.concatenate([delta_pos, delta_rotvec, gripper], axis=0) + + +def _framewise_action_to_delta( + framewise_action: np.ndarray, + rotation_space: str, +) -> np.ndarray: + """Convert a frame-wise policy action to LIBERO's 7D simulator command. 
+ + Frame-wise actions are already per-step deltas in the LIBERO controller's + convention (see ``LiberoDataset`` with ``action_space='frame_wise_relative'``), + so the only conversion required is decoding the chosen rotation + representation back to a rotation vector. No anchor/current pose is needed. + """ + if rotation_space == "3d": + return framewise_action + + translation = framewise_action[:3] + rotation_dim = framewise_action.shape[0] - 4 + rotation_repr = framewise_action[3 : 3 + rotation_dim] + gripper = framewise_action[3 + rotation_dim : 4 + rotation_dim] + rotation_delta = _rotation_repr_to_mat(rotation_repr, rotation_space) + + delta_pos = translation + delta_rotvec = R.from_matrix(rotation_delta).as_rotvec() + return np.concatenate([delta_pos, delta_rotvec, gripper], axis=0) + + +def _run_episode( + env: Any, + client: ActionEnvironmentClient, + *, + cameras: list[str], + flip_images: bool, + rotate_180: bool, + action_horizon: int, + action_dim: int, + action_space: str, + rotation_space: str, + gripper_mode: str, + max_steps: int, + warmup_steps: int, + initial_state: np.ndarray | None, + gif_path: Path | None, + gif_fps: int, + comparison_path: Path | None = None, +) -> EpisodeResult: + env.reset() + if initial_state is not None: + obs = env.set_init_state(initial_state) + else: + obs = env.get_observation() + + action_queue: list[list[float]] = [] + base_pose: tuple[np.ndarray, np.ndarray] | None = None + step = 0 + success = False + gif_frames: list[Image.Image] = [] + action_log: list[list[float]] = [] + is_multi_view = len(cameras) > 1 + resolved_rotation_space = _infer_rotation_space(action_dim, rotation_space) + + comparison_windows: list[tuple[list[Image.Image], list[Image.Image]]] = [] + + def record_frame(current_obs: dict[str, Any]) -> None: + if gif_path is None: + return + image = _get_libero_image( + current_obs, + cameras[0], + flip_images=flip_images, + rotate_180=rotate_180, + ) + image = _ensure_uint8_image(image) + 
gif_frames.append(Image.fromarray(image).convert("RGB")) + + def capture_comparison_frame(current_obs: dict[str, Any]) -> Image.Image: + """Capture an env frame matching Action's input view (multi-view concatenated if applicable).""" + if is_multi_view: + imgs = _get_libero_images(current_obs, cameras, flip_images=flip_images, rotate_180=rotate_180) + concat = client.concatenate_images(imgs) + return Image.fromarray(_ensure_uint8_image(concat)).convert("RGB") + img = _get_libero_image(current_obs, cameras[0], flip_images=flip_images, rotate_180=rotate_180) + return Image.fromarray(_ensure_uint8_image(img)).convert("RGB") + + record_frame(obs) + + while step < max_steps: + if step < warmup_steps: + dummy = _get_libero_dummy_action() + obs, _, _, _ = env.step(dummy) + action_log.append(dummy) + step += 1 + record_frame(obs) + continue + + if not action_queue: + if is_multi_view: + observation_imgs = _get_libero_images( + obs, + cameras, + flip_images=flip_images, + rotate_180=rotate_180, + ) + result = client.predict(observation_imgs) + else: + observation_img = _get_libero_image( + obs, + cameras[0], + flip_images=flip_images, + rotate_180=rotate_180, + ) + result = client.predict(observation_img) + actions = result.get("action", []) + if not actions: + return EpisodeResult(False, step, "Empty action chunk from server", action_log) + action_queue = _select_action_chunk(actions, action_horizon) + + if comparison_path is not None: + action_video_b64 = result.get("video", []) + if action_video_b64: + action_frames = _decode_b64_frames(action_video_b64) + env_comparison_frames = [capture_comparison_frame(obs)] + comparison_windows.append((action_frames, env_comparison_frames)) + + if action_space == "relative": + base_pose = _obs_to_pose(obs) + + raw_action = _format_action(action_queue.pop(0), action_dim) + if action_space == "relative": + if base_pose is None: + raise RuntimeError("Missing base pose for relative action conversion") + current_pose = _obs_to_pose(obs) + 
def _parse_args() -> argparse.Namespace:
    """Build the command-line parser for the closed-loop eval and parse ``sys.argv``.

    Returns:
        The parsed arguments. ``--server_url`` is the only required option.
    """
    parser = argparse.ArgumentParser(description="LIBERO closed-loop evaluation via Action HTTP server")
    parser.add_argument(
        "--server_url", type=str, required=True, help="Base URL for Action server (e.g., http://host:8000)"
    )
    parser.add_argument("--task_suite", type=str, default="libero_spatial", choices=sorted(TASK_MAX_STEPS.keys()))
    parser.add_argument("--num_trials_per_task", type=int, default=10)
    parser.add_argument("--task_ids", type=str, default="", help="Comma-separated task IDs to evaluate (default: all)")
    parser.add_argument("--image_size", type=int, default=256, help="Model input image size")
    parser.add_argument("--env_image_size", type=int, default=256, help="Environment render resolution")
    parser.add_argument("--action_horizon", type=int, default=0, help="Actions to execute per request (0=full chunk)")
    parser.add_argument("--action_dim", type=int, default=10, help="Action dimension for LIBERO")
    parser.add_argument(
        "--action_space",
        type=str,
        default="frame_wise_relative",
        choices=["relative", "frame_wise_relative"],
        help="Action space expected from the model (relative=anchored, frame_wise_relative=framewise deltas).",
    )
    parser.add_argument(
        "--rotation_space",
        type=str,
        default="auto",
        choices=["auto", "3d", "6d", "9d"],
        help="Rotation representation for anchored actions (auto infers from action_dim).",
    )
    parser.add_argument(
        "--gripper_mode",
        type=str,
        default="zero_one",
        choices=["zero_one", "pm_one", "pm_one_flip"],
        help="Gripper convention of the training data: 'zero_one' = [0,1] (NVIDIA "
        "LIBERO_LeRobot_v3, mapped 1-2g); 'pm_one' = {-1,+1} (community lerobot/libero_*, "
        "pass-through); 'pm_one_flip' = {-1,+1} with inverted sign.",
    )
    parser.add_argument("--domain_name", type=str, default="libero")
    parser.add_argument(
        "--camera",
        type=str,
        default="agentview",
        help="Camera(s) to use. Single camera: 'agentview' or 'wrist'. "
        "Multiple cameras: comma-separated, e.g., 'agentview,wrist'.",
    )
    parser.add_argument("--flip_images", action="store_true", help="Flip images vertically before encoding")
    parser.add_argument(
        # BooleanOptionalAction derives the negative flag from each option
        # string, so "--rotate_180" alone only yields "--no-rotate_180". The
        # hyphenated alias makes the "--no-rotate-180" spelling named in the
        # help text exist as well; dest stays "rotate_180" (first option).
        "--rotate_180",
        "--rotate-180",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Rotate images by 180 degrees before encoding (default: True; pass --no-rotate-180 to disable)",
    )
    parser.add_argument("--warmup_steps", type=int, default=10, help="Stabilization steps with dummy actions")
    parser.add_argument("--max_steps", type=int, default=0, help="Override max steps per episode (0=default)")
    parser.add_argument("--timeout", type=float, default=30.0, help="HTTP request timeout in seconds")
    parser.add_argument("--wait_timeout", type=float, default=60.0, help="Seconds to wait for server health")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--save_gifs", action="store_true", help="Save per-episode GIFs of rendered frames")
    parser.add_argument(
        "--save_comparison",
        action="store_true",
        help="Save side-by-side comparison GIFs (Action prediction vs environment rollout)",
    )
    parser.add_argument("--gif_fps", type=int, default=20, help="Frames per second for saved GIFs")
    parser.add_argument(
        "--mujoco_gl",
        type=str,
        default="auto",
        choices=["auto", "egl", "osmesa", "glfw"],
        help="MuJoCo GL backend (auto picks egl if /dev/dri is accessible, else osmesa).",
    )
    parser.add_argument(
        "--render_gpu_device_id",
        type=int,
        default=-1,
        help="GPU device index for EGL rendering (-1 uses default device).",
    )
    parser.add_argument(
        "--initial_states_path",
        type=str,
        default="DEFAULT",
        help='Path to initial states JSON. Use "DEFAULT" for benchmark defaults.',
    )
    parser.add_argument(
        "--num_envs",
        type=int,
        default=1,
        help="Number of parallel LIBERO envs (SubprocVectorEnv). >1 runs trials in waves "
        "with ONE batched /predict_batch per control step (~num_envs x faster). 1 = serial.",
    )
    parser.add_argument("--output_dir", type=str, default="", help="Directory to save evaluation summary JSON")
    return parser.parse_args()
def _run_task_vectorized(
    task: Any,
    task_description: str,
    *,
    num_trials: int,
    num_envs: int,
    env_image_size: int,
    seed: int,
    render_gpu_device_id: int,
    client: ActionEnvironmentClient,
    cameras: list[str],
    flip_images: bool,
    rotate_180: bool,
    action_horizon: int,
    action_dim: int,
    rotation_space: str,
    gripper_mode: str,
    max_steps: int,
    warmup_steps: int,
    init_states: list[np.ndarray | None],
) -> list[dict[str, Any]]:
    """Run all ``num_trials`` of one task on parallel LIBERO envs, in waves.

    Up to ``num_envs`` envs (``SubprocVectorEnv``) are created once and reused
    for successive waves of trials. In each control round the observations of
    the ACTIVE (not-done) envs go to ONE ``client.predict_batch`` call; the
    returned chunks are then executed step by step, and envs that finish are
    masked out of later steps. Policy outputs are always decoded with
    ``_framewise_action_to_delta``.

    Args:
        task: LIBERO task; ``problem_folder`` and ``bddl_file`` locate its BDDL file.
        task_description: Not used by this function.
        num_trials: Number of trials; the result list has this length.
        num_envs: Upper bound on the number of parallel envs.
        env_image_size: Render height and width passed to each env.
        seed: Seed passed to ``venv.seed``.
        render_gpu_device_id: Device id forwarded to the env factory.
        client: Policy client used for batched prediction.
        cameras: Camera names whose images form each observation.
        flip_images: Forwarded to ``_get_libero_images``.
        rotate_180: Forwarded to ``_get_libero_images``.
        action_horizon: Actions executed per prediction; ``<= 0`` executes the
            whole chunk. Capped at the shortest chunk returned for the round.
        action_dim: Number of leading action entries to keep.
        rotation_space: Rotation representation (``auto`` infers it from ``action_dim``).
        gripper_mode: Mode passed to ``_remap_gripper``.
        max_steps: Step budget per trial, warm-up steps included.
        warmup_steps: Dummy-action steps at the start of each trial.
        init_states: Per-trial initial state; ``None`` marks a trial to skip.

    Returns:
        One dict per trial, in trial order, with keys ``episode``, ``success``,
        ``steps``, ``error`` and ``elapsed_s``.
    """
    import multiprocessing as _mp

    from libero.libero.envs.venv import SubprocVectorEnv

    # LIBERO's SubprocVectorEnv defaults to the fork start method; forked children
    # inherit the parent's already-dlopen'd EGL/GL state, which corrupts per-child
    # render-context creation (EGLError / 'EGLGLContext' has no attribute '_context').
    # Force spawn so each env worker starts clean. spawn pickles env_fns, so the
    # factory below must be picklable.
    try:
        _mp.set_start_method("spawn", force=True)
    except RuntimeError:  # pragma: no cover - already set
        pass

    resolved_rotation_space = _infer_rotation_space(action_dim, rotation_space)
    bddl = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)

    results: list[dict[str, Any]] = [None] * num_trials  # type: ignore[list-item]
    for t in range(num_trials):
        if init_states[t] is None:
            results[t] = {
                "episode": t,
                "success": False,
                "steps": 0,
                "error": "Skipped due to failed expert demo",
                "elapsed_s": 0.0,
            }
    runnable = [t for t in range(num_trials) if init_states[t] is not None]
    if not runnable:
        return results

    n = min(num_envs, len(runnable))

    mujoco_gl = os.environ.get("MUJOCO_GL", "egl")
    env_fn = _LiberoEnvFactory(
        bddl_file_name=bddl,
        camera_heights=env_image_size,
        camera_widths=env_image_size,
        render_gpu_device_id=render_gpu_device_id,
        mujoco_gl=mujoco_gl,
    )
    venv = SubprocVectorEnv([env_fn for _ in range(n)])
    try:
        venv.seed(seed)
        for w0 in range(0, len(runnable), n):
            wave = runnable[w0 : w0 + n]  # trial indices for this wave
            slots = list(range(len(wave)))  # env slots in use
            t_wave0 = time.perf_counter()
            venv.reset(id=slots)
            states = np.stack([np.asarray(init_states[t], dtype=np.float64) for t in wave])
            obs_arr = venv.set_init_state(states, id=slots)
            obs_by_slot = {s: obs_arr[i] for i, s in enumerate(slots)}
            done = {s: False for s in slots}
            succ = {s: False for s in slots}
            err: dict[int, str | None] = {s: None for s in slots}
            nsteps = {s: max_steps for s in slots}
            step = 0

            # Warm-up counts against the step budget, as in the serial path,
            # so it is capped at max_steps.
            for _ in range(min(warmup_steps, max_steps)):
                act = np.stack([_get_libero_dummy_action() for _ in slots])
                obs_arr, _, _, _ = venv.step(act, id=slots)
                for i, s in enumerate(slots):
                    obs_by_slot[s] = obs_arr[i]
                step += 1

            while step < max_steps:
                active = [s for s in slots if not done[s]]
                if not active:
                    break
                obs_batch = [
                    _get_libero_images(obs_by_slot[s], cameras, flip_images=flip_images, rotate_180=rotate_180)
                    for s in active
                ]
                try:
                    chunks = client.predict_batch(obs_batch)
                except Exception as e:  # noqa: BLE001
                    for s in active:
                        done[s] = True
                        err[s] = f"server error: {e}"
                        nsteps[s] = step
                    break
                if not chunks or len(chunks) != len(active):
                    for s in active:
                        done[s] = True
                        err[s] = "bad batch response from server"
                        nsteps[s] = step
                    break

                shortest = min(len(chunk) for chunk in chunks)
                if shortest == 0:
                    # An empty chunk would execute zero steps, so `step` would
                    # never advance and this loop would never end.
                    for s in active:
                        done[s] = True
                        err[s] = "Empty action chunk from server"
                        nsteps[s] = step
                    break

                chunk_by_slot = dict(zip(active, chunks))
                # Never index past the shortest chunk, whatever action_horizon asks for.
                horizon = min(action_horizon, shortest) if action_horizon > 0 else shortest
                for h in range(horizon):
                    cur = [s for s in slots if not done[s]]
                    if not cur or step >= max_steps:
                        break
                    env_actions = []
                    for s in cur:
                        raw = _format_action(chunk_by_slot[s][h], action_dim)
                        a = _framewise_action_to_delta(np.asarray(raw, dtype=np.float32), resolved_rotation_space)
                        env_actions.append(_remap_gripper(a.tolist(), gripper_mode))
                    obs_arr, _, d, info = venv.step(np.stack(env_actions), id=cur)
                    step += 1
                    for i, s in enumerate(cur):
                        obs_by_slot[s] = obs_arr[i]
                        di = bool(d[i])
                        ii = info[i] if isinstance(info, (list, np.ndarray)) else info
                        is_succ = bool(ii.get("success")) if isinstance(ii, dict) else False
                        if is_succ:
                            done[s], succ[s], nsteps[s] = True, True, step
                        elif di:
                            # mirror serial: done w/o explicit success defaults to success
                            done[s] = True
                            succ[s] = ii.get("success", True) if isinstance(ii, dict) else True
                            nsteps[s] = step

            # Wall time is measured per wave and split evenly across its trials.
            per_ep_elapsed = round((time.perf_counter() - t_wave0) / max(1, len(wave)), 3)
            for s, t in zip(slots, wave):
                results[t] = {
                    "episode": t,
                    "success": bool(succ[s]),
                    "steps": int(nsteps[s]),
                    "error": err[s],
                    "elapsed_s": per_ep_elapsed,
                }
    finally:
        # Best-effort teardown: a failing close must not mask the real outcome.
        try:
            venv.close()
        except Exception:  # noqa: BLE001
            pass
    return results
if args.save_comparison and not args.output_dir: + raise ValueError("--save_comparison requires --output_dir to be set") + + # Parse cameras from comma-separated string + cameras = [c.strip() for c in args.camera.split(",") if c.strip()] + if not cameras: + raise ValueError("At least one camera must be specified") + for cam in cameras: + if cam not in ("agentview", "wrist"): + raise ValueError(f"Unsupported camera={cam!r}. Use 'agentview' or 'wrist'.") + + mujoco_backend = _configure_mujoco_env(args.mujoco_gl) + _import_libero() + + client = ActionEnvironmentClient( + server_url=args.server_url, + domain_name=args.domain_name, + prompt="", + image_size=args.image_size, + timeout=args.timeout, + ) + print(f"MuJoCo GL backend: {mujoco_backend}", flush=True) + print("Waiting for model server...", flush=True) + _wait_for_server(client, args.wait_timeout) + print(f"Connected to model server: {client.get_info()}", flush=True) + + benchmark_dict = benchmark.get_benchmark_dict() + task_suite = benchmark_dict[args.task_suite]() + num_tasks = int(task_suite.n_tasks) + + if args.task_ids: + selected_task_ids = [int(t) for t in args.task_ids.split(",") if t.strip()] + else: + selected_task_ids = list(range(num_tasks)) + + max_steps = args.max_steps if args.max_steps > 0 else TASK_MAX_STEPS[args.task_suite] + + total_episodes = 0 + total_successes = 0 + task_results: list[dict[str, Any]] = [] + + output_dir = Path(args.output_dir) if args.output_dir else None + gif_root = output_dir / "gifs" if output_dir and args.save_gifs else None + comparison_root = output_dir / "comparisons" if output_dir and args.save_comparison else None + + for task_id in selected_task_ids: + task = task_suite.get_task(task_id) + + # ---- Vectorized path: N parallel envs + one batched /predict_batch per step ---- + if args.num_envs > 1: + task_description = str(task.language) + client.prompt = _augment_task_prompt_with_viewpoint(task_description, cameras) + init_states = [ + _load_initial_states( + 
task_suite, + task_id, + task_description=task_description, + initial_states_path=args.initial_states_path, + episode_idx=e, + ) + for e in range(args.num_trials_per_task) + ] + episode_results = _run_task_vectorized( + task, + task_description, + num_trials=args.num_trials_per_task, + num_envs=args.num_envs, + env_image_size=args.env_image_size, + seed=args.seed, + render_gpu_device_id=args.render_gpu_device_id, + client=client, + cameras=cameras, + flip_images=args.flip_images, + rotate_180=args.rotate_180, + action_horizon=args.action_horizon, + action_dim=args.action_dim, + rotation_space=args.rotation_space, + gripper_mode=args.gripper_mode, + max_steps=max_steps, + warmup_steps=args.warmup_steps, + init_states=init_states, + ) + task_episodes = 0 + task_successes = 0 + for er in episode_results: + task_episodes += 1 + total_episodes += 1 + if er["success"]: + task_successes += 1 + total_successes += 1 + print( + f"Task {task_id} | Episode {er['episode'] + 1}/{args.num_trials_per_task} | " + f"success={er['success']} steps={er['steps']} elapsed_s={er['elapsed_s']:.1f} | " + f"task SR {task_successes}/{task_episodes} ({100.0 * task_successes / max(1, task_episodes):.1f}%) | " + f"overall SR {total_successes}/{total_episodes} " + f"({100.0 * total_successes / max(1, total_episodes):.1f}%)", + flush=True, + ) + task_success_rate = float(task_successes) / float(task_episodes) if task_episodes > 0 else 0.0 + task_results.append( + { + "task_id": task_id, + "task_description": task_description, + "episodes": task_episodes, + "successes": task_successes, + "success_rate": task_success_rate, + "episode_results": episode_results, + } + ) + print( + f"Task {task_id} summary: {task_successes}/{task_episodes} ({task_success_rate * 100:.1f}%)", + flush=True, + ) + continue + + env, task_description = _get_libero_env( + task, + resolution=args.env_image_size, + seed=args.seed, + render_gpu_device_id=args.render_gpu_device_id, + ) + + task_episodes = 0 + task_successes = 0 + 
episode_results: list[dict[str, Any]] = [] + + for episode_idx in range(args.num_trials_per_task): + episode_t0 = time.perf_counter() + client.prompt = _augment_task_prompt_with_viewpoint(task_description, cameras) + initial_state = _load_initial_states( + task_suite, + task_id, + task_description=task_description, + initial_states_path=args.initial_states_path, + episode_idx=episode_idx, + ) + if initial_state is None: + episode_elapsed_s = time.perf_counter() - episode_t0 + episode_results.append( + { + "episode": episode_idx, + "success": False, + "steps": 0, + "error": "Skipped due to failed expert demo", + "elapsed_s": round(episode_elapsed_s, 3), + } + ) + print( + f"Task {task_id} | Episode {episode_idx + 1}/{args.num_trials_per_task} | " + "success=False steps=0 " + f"elapsed_s={episode_elapsed_s:.1f} " + "error='Skipped due to failed expert demo'", + flush=True, + ) + continue + + gif_path = ( + gif_root / f"task_{task_id:03d}" / f"episode_{episode_idx:03d}.gif" if gif_root is not None else None + ) + comparison_path = ( + comparison_root / f"task_{task_id:03d}" / f"episode_{episode_idx:03d}.gif" + if comparison_root is not None + else None + ) + try: + result = _run_episode( + env, + client, + cameras=cameras, + flip_images=args.flip_images, + rotate_180=args.rotate_180, + action_horizon=args.action_horizon, + action_dim=args.action_dim, + action_space=args.action_space, + rotation_space=args.rotation_space, + gripper_mode=args.gripper_mode, + max_steps=max_steps, + warmup_steps=args.warmup_steps, + initial_state=initial_state, + gif_path=gif_path, + gif_fps=args.gif_fps, + comparison_path=comparison_path, + ) + except Exception as exc: + result = EpisodeResult(False, 0, str(exc), []) + episode_elapsed_s = time.perf_counter() - episode_t0 + + task_episodes += 1 + total_episodes += 1 + if result.success: + task_successes += 1 + total_successes += 1 + + episode_results.append( + { + "episode": episode_idx, + "success": result.success, + "steps": 
result.steps, + "error": result.error, + "elapsed_s": round(episode_elapsed_s, 3), + } + ) + + # Save per-episode action log as JSON + if output_dir is not None and result.actions: + action_log_dir = output_dir / "actions" / f"task_{task_id:03d}" + action_log_dir.mkdir(parents=True, exist_ok=True) + action_log_path = action_log_dir / f"episode_{episode_idx:03d}.json" + action_log_path.write_text( + json.dumps(result.actions, indent=2), + encoding="utf-8", + ) + + client.notify_next_episode() + + print( + f"Task {task_id} | Episode {episode_idx + 1}/{args.num_trials_per_task} | " + f"success={result.success} steps={result.steps} elapsed_s={episode_elapsed_s:.1f} | " + f"task SR {task_successes}/{task_episodes} ({100.0 * task_successes / max(1, task_episodes):.1f}%) | " + f"overall SR {total_successes}/{total_episodes} ({100.0 * total_successes / max(1, total_episodes):.1f}%)", + flush=True, + ) + + task_success_rate = float(task_successes) / float(task_episodes) if task_episodes > 0 else 0.0 + task_results.append( + { + "task_id": task_id, + "task_description": task_description, + "episodes": task_episodes, + "successes": task_successes, + "success_rate": task_success_rate, + "episode_results": episode_results, + } + ) + print( + f"Task {task_id} summary: {task_successes}/{task_episodes} ({task_success_rate * 100:.1f}%)", + flush=True, + ) + # Close the env (and its EGL/MuJoCo render context) before the next task. + # Leaving it open leaks one EGL context per task and hangs after ~8 tasks. 
+ try: + env.close() + except Exception: + pass + + overall_success_rate = float(total_successes) / float(total_episodes) if total_episodes > 0 else 0.0 + summary = { + "task_suite": args.task_suite, + "total_episodes": total_episodes, + "total_successes": total_successes, + "overall_success_rate": overall_success_rate, + "num_trials_per_task": args.num_trials_per_task, + "selected_task_ids": selected_task_ids, + "action_space": args.action_space, + "rotation_space": _infer_rotation_space(args.action_dim, args.rotation_space), + "action_dim": args.action_dim, + "task_results": task_results, + } + + print( + f"Overall success rate: {total_successes}/{total_episodes} ({overall_success_rate * 100:.1f}%)", + flush=True, + ) + + if output_dir is not None: + output_dir.mkdir(parents=True, exist_ok=True) + summary_path = output_dir / "summary.json" + summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8") + print(f"Saved summary to {summary_path}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/cosmos_framework/simulation/libero/dataset_reply_action_server.py b/cosmos_framework/simulation/libero/dataset_reply_action_server.py new file mode 100644 index 0000000..bb5d9a4 --- /dev/null +++ b/cosmos_framework/simulation/libero/dataset_reply_action_server.py @@ -0,0 +1,653 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +""" +HTTP server that serves ground-truth actions from LIBERO LeRobot datasets. + +Same HTTP interface as `cosmos3.scripts.action_policy_server` (the model-backed +server), enabling drop-in replacement for closed-loop evaluation to verify the +action pipeline with known-good GT actions. 
def _ts() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SS.ffffffZ``.

    Used as the timestamp prefix of the log lines printed by this server.
    """
    utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
def _compute_anchored_actions(
    state_raw: torch.Tensor,
    action_raw: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Re-express per-frame delta actions in the local frame of the first state.

    For every frame except the last, the target pose is obtained by applying that
    frame's delta action to that frame's state: the translation delta is added and
    the delta rotation is left-multiplied onto the state rotation. The targets are
    then mapped into the frame of ``state_raw[0]`` by subtracting its position and
    left-multiplying by the transpose of its rotation matrix.

    Intended to match ``LIBEROLeRobotDataset._compute_anchored_actions``.

    Args:
        state_raw: (T+1, 8) states [x, y, z, ax, ay, az, grip1, grip2].
        action_raw: (T+1, 7) actions [dx, dy, dz, dax, day, daz, grip].

    Returns:
        anchored_translation (T, 3), anchored_rotation (T, 3, 3), gripper (T, 1).
    """
    positions = state_raw[:, :3]
    # Axis-angle -> rotation matrices for the states and for the per-frame deltas.
    state_rot = convert_rotation(state_raw[:, 3:6], "axisangle", "matrix")
    delta_rot = convert_rotation(action_raw[:-1, 3:6], "axisangle", "matrix")

    # Anchor pose: the first state of the chunk.
    anchor_pos = positions[0]
    anchor_rot_inv = state_rot[0].T

    # Target pose per frame = state at that frame with the delta action applied.
    target_pos = positions[:-1] + action_raw[:-1, :3]
    target_rot = torch.bmm(delta_rot, state_rot[:-1])

    anchored_pos = (anchor_rot_inv @ (target_pos - anchor_pos).T).T
    num_steps = target_rot.shape[0]
    anchored_rot = torch.bmm(anchor_rot_inv.unsqueeze(0).expand(num_steps, -1, -1), target_rot)

    return anchored_pos, anchored_rot, action_raw[:-1, 6:7]
@dataclass(frozen=True)
class DatasetServerConfig:
    """Immutable settings for the dataset-backed action server, built from the CLI args in ``main``."""

    # LeRobot repo IDs; paired index-by-index with ``root`` when the datasets are loaded.
    repo_id: list[str]
    # Local dataset roots, one per entry of ``repo_id``.
    root: list[str | None]
    # "relative" (anchored to the first state of each chunk) or "frame_wise_relative".
    action_space: str
    # Rotation representation of the served actions: "3d", "6d", or "9d".
    rotation_space: str
    # Stored from the CLI; not read by the server code in this module.
    pose_coordinate_frame: str
    # Maximum number of actions returned per /predict call.
    action_chunk_size: int
    # Served actions are zero-padded on the last axis up to this width.
    max_action_dim: int
    # Stored from the CLI; not read by the server code in this module.
    split: str
    # When True, /predict responses also carry base64-encoded PNG frames.
    send_video: bool
    # Selects which image key(s) are read for video frames.
    camera_mode: str
    # Video frames are resized to image_size x image_size per view.
    image_size: int
--------------------------------------------------------------------------- + + +class DatasetActionService: + """Serves GT actions (and optionally GT video) from pre-loaded LIBERO LeRobot episodes.""" + + def __init__(self, cfg: DatasetServerConfig) -> None: + self.cfg = cfg + self.episodes_by_task: dict[str, list[EpisodeData]] = {} + self._hf_datasets: list[Any] = [] + self._lerobot_datasets: list[Any] = [] + self._task_state: dict[str, dict[str, int]] = {} + self._lock = threading.Lock() + + if cfg.camera_mode in ("concat_view", "both"): + self._image_keys = ["observation.images.image", "observation.images.wrist_image"] + elif cfg.camera_mode == "wrist_image": + self._image_keys = ["observation.images.wrist_image"] + else: + self._image_keys = ["observation.images.image"] + + self._load_datasets() + + def _load_datasets(self) -> None: + from lerobot.datasets.lerobot_dataset import LeRobotDataset + + for repo_id, root in zip(self.cfg.repo_id, self.cfg.root): + print(f"[{_ts()}] [dataset-server] loading repo_id={repo_id} root={root} ...", flush=True) + t0 = time.monotonic() + + dataset = LeRobotDataset(repo_id=repo_id, root=root) + tasks_df = dataset.meta.tasks + hf = dataset.hf_dataset + ds_ref_idx = len(self._hf_datasets) + self._hf_datasets.append(hf) + + if self.cfg.send_video: + delta_ts: dict[str, list[float]] = {k: [0.0] for k in self._image_keys} + video_dataset = LeRobotDataset(repo_id=repo_id, root=root, delta_timestamps=delta_ts) + self._lerobot_datasets.append(video_dataset) + else: + self._lerobot_datasets.append(None) + + for ep_meta in dataset.meta.episodes: + ep_idx = int(ep_meta["episode_index"]) # type: ignore[index] + start = int(ep_meta["dataset_from_index"]) # type: ignore[index] + end = int(ep_meta["dataset_to_index"]) # type: ignore[index] + + ep_slice = hf.select(range(start, end)) + actions = torch.tensor(np.array(ep_slice["action"], dtype=np.float32)) + states = torch.tensor(np.array(ep_slice["observation.state"], dtype=np.float32)) + + 
task_idx = int(ep_slice[0]["task_index"]) + matching = tasks_df[tasks_df["task_index"] == task_idx] + task_desc = str(matching.iloc[0].name) if not matching.empty else f"task_{task_idx}" + + self.episodes_by_task.setdefault(task_desc, []).append( + EpisodeData( + action_raw=actions, + state_raw=states, + task_description=task_desc, + dataset_ref_idx=ds_ref_idx, + frame_start=start, + frame_end=end, + ) + ) + + dt = time.monotonic() - t0 + print( + f"[{_ts()}] [dataset-server] loaded {repo_id}: {dataset.meta.total_episodes} episodes in {dt:.1f}s", + flush=True, + ) + + total_tasks = len(self.episodes_by_task) + total_eps = sum(len(eps) for eps in self.episodes_by_task.values()) + print( + f"[{_ts()}] [dataset-server] ready: {total_tasks} tasks, {total_eps} episodes " + f"send_video={self.cfg.send_video} camera_mode={self.cfg.camera_mode}", + flush=True, + ) + + def _load_video_frames(self, episode: EpisodeData, step: int, num_frames: int) -> list[str]: + """Load GT video frames from the dataset and encode as base64 PNGs. + + Uses the LeRobotDataset wrapper (not the raw HF dataset) so that video-backed + datasets are decoded correctly via the configured video backend. + + Args: + episode: Episode data with dataset reference. + step: Step offset within the episode (0-based). + num_frames: Number of frames to load (typically action_chunk_size + 1). + + Returns: + List of base64-encoded PNG strings. 
+ """ + lr_dataset = self._lerobot_datasets[episode.dataset_ref_idx] + if lr_dataset is None: + return [] + image_size = self.cfg.image_size + b64_frames: list[str] = [] + + for i in range(num_frames): + global_idx = episode.frame_start + step + i + if global_idx >= episode.frame_end: + break + + item = lr_dataset[global_idx] + + pil_images: list[Image.Image] = [] + for key in self._image_keys: + img_tensor = item[key] + if isinstance(img_tensor, torch.Tensor): + # LeRobot returns (T, C, H, W) with delta_timestamps=[0.0] -> (1, C, H, W) + if img_tensor.dim() == 4: + img_tensor = img_tensor[0] + # (C, H, W) float [0, 1] -> PIL + arr = (img_tensor.permute(1, 2, 0).clamp(0, 1) * 255).to(torch.uint8).numpy() + img = Image.fromarray(arr) + elif isinstance(img_tensor, Image.Image): + img = img_tensor + else: + img = Image.fromarray(np.asarray(img_tensor, dtype=np.uint8)) + img = img.convert("RGB").resize((image_size, image_size), Image.Resampling.BILINEAR) + pil_images.append(img) + + if len(pil_images) > 1: + total_w = sum(im.width for im in pil_images) + combined = Image.new("RGB", (total_w, image_size)) + x = 0 + for im in pil_images: + combined.paste(im, (x, 0)) + x += im.width + frame = combined + else: + frame = pil_images[0] + + buf = io.BytesIO() + frame.save(buf, format="PNG") + b64_frames.append(base64.b64encode(buf.getvalue()).decode("ascii")) + + return b64_frames + + # -- state management -- + + def _get_task_state(self, prompt: str) -> dict[str, int]: + if prompt not in self._task_state: + self._task_state[prompt] = {"episode_idx": 0, "step": 0} + return self._task_state[prompt] + + def _resolve_prompt(self, prompt: str) -> str: + """Resolve prompt to a known task description (exact or substring match).""" + if prompt in self.episodes_by_task: + return prompt + prompt_lower = prompt.lower().strip() + for task_desc in self.episodes_by_task: + if task_desc.lower().strip() == prompt_lower: + return task_desc + for task_desc in self.episodes_by_task: + td_lower 
= task_desc.lower().strip() + if prompt_lower in td_lower or td_lower in prompt_lower: + return task_desc + raise ValueError( + f"Task not found for prompt: {prompt!r}. Available tasks: {sorted(self.episodes_by_task.keys())}" + ) + + # -- endpoints -- + + def get_info(self) -> dict[str, Any]: + return { + "type": "dataset_action_server", + "action_space": self.cfg.action_space, + "rotation_space": self.cfg.rotation_space, + "action_chunk_size": self.cfg.action_chunk_size, + "tasks": {k: len(v) for k, v in sorted(self.episodes_by_task.items())}, + } + + def predict(self, req: dict[str, Any]) -> dict[str, Any]: + prompt = req.get("prompt") + if not isinstance(prompt, str): + raise ValueError("'prompt' must be a string") + + resolved_prompt = self._resolve_prompt(prompt) + + with self._lock: + state = self._get_task_state(resolved_prompt) + episodes = self.episodes_by_task[resolved_prompt] + + ep_idx = state["episode_idx"] % len(episodes) + episode = episodes[ep_idx] + step = state["step"] + + # Number of valid actions = num_frames - 1 (need pairs of consecutive frames) + max_actions = len(episode.action_raw) - 1 + + if step >= max_actions: + state["episode_idx"] = (ep_idx + 1) % len(episodes) + state["step"] = 0 + ep_idx = state["episode_idx"] + episode = episodes[ep_idx] + step = 0 + max_actions = len(episode.action_raw) - 1 + + chunk_size = min(self.cfg.action_chunk_size, max_actions - step) + # Slice chunk+1 frames for action computation (needs next-frame state) + raw_slice_end = step + chunk_size + 1 + action_chunk_raw = episode.action_raw[step:raw_slice_end] + state_chunk_raw = episode.state_raw[step:raw_slice_end] + + processed = _process_action_chunk( + action_chunk_raw, + state_chunk_raw, + self.cfg.action_space, + self.cfg.rotation_space, + ) + + # Pad to max_action_dim (same as the Action transform pipeline) + t, d = processed.shape + if d < self.cfg.max_action_dim: + processed = torch.cat( + [processed, torch.zeros(t, self.cfg.max_action_dim - d)], + 
dim=-1, + ) + + state["step"] += chunk_size + + action_list = processed.float().numpy().tolist() + + video_b64: list[str] = [] + if self.cfg.send_video: + video_b64 = self._load_video_frames(episode, step, num_frames=chunk_size + 1) + + print( + f"[{_ts()}] [dataset-server] predict prompt={resolved_prompt!r} " + f"ep={ep_idx} step={step}..{state['step']} actions={len(action_list)} " + f"video_frames={len(video_b64)}", + flush=True, + ) + return {"action": action_list, "video": video_b64} + + def next_episode(self, prompt: str | None = None) -> dict[str, Any]: + with self._lock: + if prompt is not None: + resolved = self._resolve_prompt(prompt) + state = self._get_task_state(resolved) + episodes = self.episodes_by_task[resolved] + state["episode_idx"] = (state["episode_idx"] + 1) % len(episodes) + state["step"] = 0 + print( + f"[{_ts()}] [dataset-server] next_episode task={resolved!r} -> ep={state['episode_idx']}", + flush=True, + ) + return {"task": resolved, "episode_idx": state["episode_idx"]} + + for task in self._task_state: + episodes = self.episodes_by_task.get(task, []) + self._task_state[task]["episode_idx"] = (self._task_state[task]["episode_idx"] + 1) % max( + len(episodes), 1 + ) + self._task_state[task]["step"] = 0 + print(f"[{_ts()}] [dataset-server] next_episode (all tasks)", flush=True) + return {"advanced_all": True} + + def reset(self) -> dict[str, str]: + with self._lock: + self._task_state.clear() + print(f"[{_ts()}] [dataset-server] reset", flush=True) + return {"status": "reset"} + + +# --------------------------------------------------------------------------- +# HTTP handler +# --------------------------------------------------------------------------- + + +class _DatasetHandler(BaseHTTPRequestHandler): + server: ThreadingHTTPServer # type: ignore[assignment] + + def _send_json(self, status_code: int, payload: dict[str, Any]) -> None: + body = json.dumps(payload).encode("utf-8") + self.send_response(status_code) + 
self.send_header("Content-Type", "application/json") + self.send_header("Cache-Control", "no-store") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + try: + self.wfile.write(body) + except (BrokenPipeError, ConnectionResetError): + return + + def _read_json_body(self) -> dict[str, Any] | None: + try: + length = int(self.headers.get("Content-Length") or "0") + except ValueError: + self._send_json(400, {"error": "Invalid Content-Length"}) + return None + body = self.rfile.read(max(0, length)) + if not body: + return {} + try: + req = json.loads(body.decode("utf-8")) + except Exception as e: + self._send_json(400, {"error": f"Invalid JSON: {e}"}) + return None + if not isinstance(req, dict): + self._send_json(400, {"error": "JSON body must be an object"}) + return None + return req + + def do_GET(self) -> None: # noqa: N802 + service: DatasetActionService = getattr(self.server, "service") + if self.path == "/info": + self._send_json(200, service.get_info()) + elif self.path == "/": + self._send_json(200, {"status": "ok"}) + else: + self._send_json(404, {"error": "Not found"}) + + def do_POST(self) -> None: # noqa: N802 + service: DatasetActionService = getattr(self.server, "service") + + if self.path in ("/", "/predict"): + req = self._read_json_body() + if req is None: + return + try: + out = service.predict(req) + except Exception as e: + print(f"[{_ts()}] [dataset-server] predict ERROR: {e}", flush=True) + self._send_json(400, {"action": [], "error": str(e)}) + return + self._send_json(200, out) + + elif self.path == "/next_episode": + req = self._read_json_body() + prompt = req.get("prompt") if req else None + try: + out = service.next_episode(prompt) + except Exception as e: + self._send_json(400, {"error": str(e)}) + return + self._send_json(200, out) + + elif self.path == "/reset": + out = service.reset() + self._send_json(200, out) + + else: + self._send_json(404, {"error": "Not found"}) + + def log_message(self, format: str, *args: 
def main() -> None:
    """Parse CLI arguments, load the dataset(s), and serve GT actions over HTTP.

    ``--repo_id`` and ``--root`` are comma-separated lists that must have the same
    number of entries. ``--pose_coordinate_frame`` and ``--split`` are stored in the
    config but are not read by the service in this module. The call blocks in
    ``serve_forever`` until the process is interrupted.

    Raises:
        ValueError: If the number of repo IDs differs from the number of roots.
        SystemExit: On invalid command-line arguments (raised by argparse).
    """
    parser = argparse.ArgumentParser(
        description="HTTP server serving ground-truth actions from LIBERO LeRobot datasets."
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        required=True,
        help="Comma-separated LeRobot repo IDs (e.g. libero_10,libero_goal)",
    )
    parser.add_argument(
        "--root",
        type=str,
        required=True,
        help="Comma-separated local paths to dataset roots (one per repo_id)",
    )
    parser.add_argument(
        "--action_space",
        type=str,
        default="frame_wise_relative",
        choices=["relative", "frame_wise_relative"],
        help="Action space (must match closed-loop eval's --action_space).",
    )
    parser.add_argument(
        "--rotation_space",
        type=str,
        default="6d",
        choices=["3d", "6d", "9d"],
        help="Rotation representation (must match closed-loop eval's action_dim).",
    )
    parser.add_argument(
        "--pose_coordinate_frame",
        type=str,
        default="native",
        choices=["native", "opencv"],
        help="Pose/action coordinate frame. Accepted for compatibility with LIBERO eval launchers.",
    )
    parser.add_argument("--action_chunk_size", type=int, default=16, help="Number of actions per predict call")
    parser.add_argument("--max_action_dim", type=int, default=32, help="Pad actions to this dimension")
    parser.add_argument("--split", type=str, default="full", help="Dataset split (train/val/full)")
    parser.add_argument(
        "--send_video",
        action="store_true",
        help="Include GT video frames (base64 PNGs) in /predict responses, same format as the Action server.",
    )
    parser.add_argument(
        "--camera_mode",
        type=str,
        # The previous default "image" was not one of the choices (argparse does not
        # validate defaults against ``choices``). "agentview" selects the same single
        # "observation.images.image" key, so the served frames are unchanged.
        default="agentview",
        choices=["agentview", "wrist_image", "concat_view", "both"],
        help="Camera view(s) to include in video frames.",
    )
    parser.add_argument("--image_size", type=int, default=256, help="Resize video frames to this height/width")
    parser.add_argument("--host", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    args = parser.parse_args()

    if args.action_chunk_size < 1:
        # A non-positive chunk size would make /predict return empty chunks forever.
        parser.error("--action_chunk_size must be >= 1")

    repo_ids = [r.strip() for r in args.repo_id.split(",") if r.strip()]
    roots = [r.strip() for r in args.root.split(",") if r.strip()]
    if len(repo_ids) != len(roots):
        raise ValueError(f"Number of repo_ids ({len(repo_ids)}) must match number of roots ({len(roots)})")

    cfg = DatasetServerConfig(
        repo_id=repo_ids,
        root=roots,
        action_space=args.action_space,
        rotation_space=args.rotation_space,
        pose_coordinate_frame=args.pose_coordinate_frame,
        action_chunk_size=int(args.action_chunk_size),
        max_action_dim=int(args.max_action_dim),
        split=args.split,
        send_video=bool(args.send_video),
        camera_mode=args.camera_mode,
        image_size=int(args.image_size),
    )

    # Loads every dataset up front; this can take a while for large roots.
    service = DatasetActionService(cfg)
    local_ip = _get_local_ip()

    print(
        f"[{_ts()}] [dataset-server] starting host={args.host} port={args.port} "
        f"action_space={cfg.action_space} rotation_space={cfg.rotation_space} "
        f"action_chunk_size={cfg.action_chunk_size}",
        flush=True,
    )
    print(f"[{_ts()}] [dataset-server] Server accessible at: http://{local_ip}:{args.port}/", flush=True)
    print(f"[{_ts()}] [dataset-server] Endpoints:", flush=True)
    print(" - GET / : Health check", flush=True)
    print(" - GET /info : Dataset info (tasks, episode counts)", flush=True)
    print(" - POST /predict : Get next GT action chunk (same interface as Action server)", flush=True)
    print(" - POST /next_episode : Advance to next episode for a task", flush=True)
    print(" - POST /reset : Reset all per-task state", flush=True)

    # Context manager closes the listening socket when serve_forever is interrupted.
    with ThreadingHTTPServer((args.host, int(args.port)), _DatasetHandler) as httpd:
        # The handler looks the service up via ``self.server.service``.
        setattr(httpd, "service", service)
        httpd.serve_forever()
- - The decision is delegated entirely to the model's own gate in - ``load_pretrained_model_if_needed``: this is a no-op unless the model was built - with ``exclude_reasoner_weights_from_checkpoint=True`` (and pretrained weights - enabled), i.e. the case where the DCP checkpoint deliberately omits the reasoner - tower so it must be re-seeded from the pretrained source. For a normal checkpoint - that already contains the reasoner, the model's gate evaluates to False and - nothing is reloaded. - - ``has_resumable_checkpoint=True`` / ``has_load_path=False`` is load-bearing: it - re-seeds the reasoner from the pretrained source while skipping the - understanding->generation copy (the generation pathway was already populated by - the DCP load). Passing ``has_load_path=True`` would instead force a reasoner - reload even for non-excluded checkpoints, clobbering any fine-tuned reasoner - weights restored from the DCP. - """ - load_pretrained_model_if_needed = getattr(model, "load_pretrained_model_if_needed") - load_pretrained_model_if_needed( - has_resumable_checkpoint=True, - has_load_path=False, - ) - - def _load_model( model: torch.nn.Module, checkpoint_path: str, @@ -234,9 +194,6 @@ def _load_model( start_time = time.time() state_dict = ModelWrapper(model).state_dict() - if any(key.startswith("net_teacher.") for key in state_dict): - log.info("Dropping net_teacher.* keys from inference load target; distillation checkpoints do not save them.") - state_dict = {key: value for key, value in state_dict.items() if not key.startswith("net_teacher.")} if checkpoint_path.startswith("s3://"): storage_reader = S3StorageReader( @@ -252,10 +209,19 @@ def _load_model( keys_to_skip_loading=keys_to_skip_loading or [], ) + # Single-rank load (e.g. the action policy inference server): force no_dist so + # ``dcp.load`` skips the collective ``gather_object`` over the load plan. 
That + # gather pickles the plan, which fails with "cannot pickle code objects" for + # training/EMA DCPs whose metadata carries non-tensor objects; a single process + # owns the full checkpoint anyway, so the collective is unnecessary. Multi-rank + # (sharded) loads keep the default distributed path. + no_dist = not (dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1) + dcp.load( state_dict=state_dict, storage_reader=storage_reader, planner=load_planner, + no_dist=no_dist, ) log.info(f"Successfully loaded model from {checkpoint_path}") @@ -394,16 +360,6 @@ def load_model_from_checkpoint( # Disable EMA for inference. config.model.config.ema.enabled = False - if hasattr(config.model.config, "load_teacher_weights"): - log.info("Setting load_teacher_weights=False for inference to skip teacher checkpoint download.") - config.model.config.load_teacher_weights = False - - if ( - config.model.config.exclude_reasoner_weights_from_checkpoint - and not config.model.config.vlm_config.pretrained_weights.enabled - ): - log.info("Enabling pretrained reasoner weights because this checkpoint excludes the reasoner tower from DCP.") - config.model.config.vlm_config.pretrained_weights.enabled = True config.validate() config.freeze() # type: ignore @@ -479,7 +435,6 @@ def load_model(checkpoint_load_path: str) -> None: if checkpoint_cache_path is None: load_model(checkpoint_path) - _reload_pretrained_reasoner_after_checkpoint_load(model) return model, config cache_lock_path = f"{checkpoint_cache_path}.lock" @@ -497,6 +452,4 @@ def load_model(checkpoint_load_path: str) -> None: if cache_action == _CheckpointCacheAction.LOAD_CACHE: load_model(checkpoint_cache_path) - _reload_pretrained_reasoner_after_checkpoint_load(model) - return model, config diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md new file mode 100644 index 0000000..02af929 --- /dev/null +++ b/docs/action_policy_libero_sft.md @@ -0,0 +1,206 @@ +# Cosmos3-Nano LIBERO 
action-policy SFT (reproduction) + +Reproduces the Cosmos3-Nano LIBERO-10 result (technical report Table 20, ~97.4% +success at checkpoint 2000) as an action policy: vision + language in, action +chunks out. Full SFT (no LoRA) on the public `nvidia/Cosmos3-Nano` base. + +Pieces: + +| Piece | Path | +| --- | --- | +| Dataset | `cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py` (`LIBEROLeRobotDataset`) | +| SFT wrapper | `get_action_libero_sft_dataset` in `.../datasets/action_sft_dataset.py` | +| Norm stats | `.../datasets/stats/libero_native_frame_wise_relative_rot6d.json` | +| Experiment | `cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py` | +| Run TOML | `examples/toml/sft_config/action_policy_libero_repro.toml` | +| Launch | `examples/launch_sft_action_policy_libero.sh` | +| Inference server | `cosmos_framework/scripts/action_policy_server_libero.py` | +| Closed-loop eval | `cosmos_framework/simulation/libero/closed_loop_eval.py` | + +## 1. Data + +`LIBEROLeRobotDataset` reads a **local** LeRobot dir directly (parquet + video, +like `DROIDLeRobotDataset`) — set `LIBERO_ROOT` to it. Use NVIDIA's **20 FPS** +conversion [`nvidia/LIBERO_LeRobot_v3`](https://huggingface.co/datasets/nvidia/LIBERO_LeRobot_v3) +(public, OpenMDW-1.1), which is what the bundled `quantile_rot` stats and the +20 Hz eval cadence assume. It ships one subdirectory per suite, so pre-sync just +`libero_10`: + +```bash +hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \ + --include 'libero_10/**' --local-dir /LIBERO_LeRobot_v3 +export LIBERO_ROOT=/LIBERO_LeRobot_v3/libero_10 +``` + +**For the Table-20 number, use `libero_10` ALONE.** Training on the full suite +mix dilutes libero_10 to ~1 pass in 2000 steps (~82%); libero_10 alone is ~2.7 +passes (~97%). For more suites, sync the other subdirs and add more +`datasets=dict(...)` entries to the experiment's dataloader. 
+ +It uses `frame_wise_relative` rot6d actions (10D = `pos(3) + rot6d(6) + +gripper(1)`), `concat_view` (third-person + wrist, each resized to 256×256, +concatenated horizontally → 256×512), normalized with `quantile_rot` against the +bundled stats. + +**FPS-agnostic loader.** It windows by frame index and decodes video at each +frame's real timestamp (no `delta_timestamps` grid), so any LeRobot LIBERO dataset +loads regardless of its `fps` label, and `conditioning_fps` is read from the +dataset's own `meta/info.json`. Prefer the 20 FPS `nvidia/LIBERO_LeRobot_v3` so +`conditioning_fps=20` matches the stats and the eval (serve with `--fps 20`). The +community `lerobot/libero_*` repos carry the *same frames* but label them 10 FPS; +see [§5](#5-fps--stats). + +**Model-input resolution = 192×320.** The 256×512 concat is aspect-2.0, so with +`resolution=None` the `ActionTransformPipeline` snaps it to the closest `"256"` +tier canvas — 16:9 → **320×192 (w×h) = 192×320 (h×w)** — by aspect-preserving +resize + bottom reflection pad. The training prompt therefore reads +`"...is of 192x320 resolution."`. Keep this; the eval server reproduces the same +snap (see §4). + +## 2. 
Train (1 node, 8 GPUs) + +```bash +export LD_LIBRARY_PATH='' # NGC/PyTorch container: avoid torch._C import error +export LIBERO_ROOT=/path/to/libero_10_lerobot # libero_10 conversion ONLY +export BASE_CHECKPOINT_PATH=/path/to/base_checkpoint +export WAN_VAE_PATH=/path/to/wan_vae +export IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root + +bash examples/launch_sft_action_policy_libero.sh +``` + +Or drive `cosmos_framework.scripts.train` directly: + +```bash +torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \ + --sft-toml examples/toml/sft_config/action_policy_libero_repro.toml +``` + +Recipe knobs live in the registered `action_policy_libero_nano` experiment (full +SFT of the generation + action heads at lr 5e-5 with a 5× LR multiplier on the +action bridge, FusedAdam, selective activation checkpointing, `quantile_rot` +actions, action heads init fresh from the base via `keys_to_skip_loading`). The +TOML sets only run-level scalars: DP=8, `max_iter=10000`, `warm_up_steps=2000`, +`grad_accum_iter=2`, `save_iter=1000`. Checkpoint 2000 is the reference. On +lower-memory GPUs reduce the per-rank batch: +`--opts dataloader_train.max_samples_per_batch=32`. + +## 3. Closed-loop eval + +Start the policy server on a **trained** checkpoint, then run the LIBERO +simulator client against it. (The base `nvidia/Cosmos3-Nano` DCP has no action +heads — use a checkpoint from §2.) + +```bash +# Server (training venv). Loads the DCP (single-rank no_dist), denormalizes with +# quantile_rot + the bundled libero rot6d stats. The experiment supplies the VAE +# path via the override (the server loads the experiment directly, no TOML). 
+python -m cosmos_framework.scripts.action_policy_server_libero \ + --experiment action_policy_libero_nano \ + --experiment-overrides "model.config.tokenizer.vae_path=$WAN_VAE_PATH" \ + --checkpoint-path \ + --action-normalization quantile_rot \ + --action-stats-path cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json \ + --raw-action-dim 10 --fps 20 --port 8000 +``` + +**Eval environment** (the LIBERO sim needs a *separate* venv — robosuite/mujoco +versions conflict with the training env, and the NGC image needs graphics +enabled). This combo is validated headless on an NVIDIA GPU: + +```bash +# 1. Enable the NVIDIA graphics libs in the container (mounts host libEGL_nvidia +# etc.); do NOT apt-install libnvidia-gl (it mismatches the mounted driver). +export NVIDIA_DRIVER_CAPABILITIES=all +apt-get install -y libegl1 libglvnd0 libgl1 libglib2.0-0 ffmpeg +mkdir -p /usr/share/glvnd/egl_vendor.d # ICD (usually already mounted) +echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \ + > /usr/share/glvnd/egl_vendor.d/10_nvidia.json + +# 2. Separate py3.10 venv with LIBERO-compatible sim pins + torch<2.6 +# (torch>=2.6 defaults weights_only=True and breaks LIBERO init-state loads). +uv venv --python 3.10 .libenv && VV=.libenv/bin/python +git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git && \ + uv pip install -p $VV -e LIBERO -r LIBERO/requirements.txt +uv pip install -p $VV "robosuite==1.4.1" "mujoco==2.3.7" "torch<2.6" loguru requests scipy pillow numpy + +# 3. LIBERO first-run config (avoids the interactive prompt) + robosuite macros +mkdir -p ~/.libero && touch ~/.libero/config.yaml +RS=$($VV -c "import robosuite,os;print(os.path.dirname(robosuite.__file__))") +$VV "$RS/scripts/setup_macros.py" +$VV -c "from libero.libero import set_libero_default_path; set_libero_default_path()" + +# 4. Run the client (concat agentview+wrist matches the 256x512 training view). 
+MUJOCO_GL=egl PYTHONPATH=$PWD:$PWD/LIBERO $VV \ + cosmos_framework/simulation/libero/closed_loop_eval.py \ + --server_url http://localhost:8000 \ + --task_suite libero_10 --num_trials_per_task 10 --action_horizon 16 \ + --camera agentview,wrist --image_size 256 \ + --action_space frame_wise_relative --rotation_space 6d --action_dim 10 \ + --save_gifs --gif_fps 20 --output_dir results/libero_closed_loop_10 +``` + +Validated end-to-end against a stub server (episode runs, `summary.json` + GIFs +written, `rc=0`); a benign `EGLError` may print during context teardown on exit. + +## 4. Gotchas (from NVIDIA/cosmos-framework#50) + +These cost real accuracy if missed; the shipped eval client already handles the +first two, but verify them against your checkpoint: + +- **Train ↔ serve parity (resolution + prompt).** Training snaps the 256×512 + concat to a **192×320** model-input canvas (see §1) and the prompt suffix + encodes that resolution + clip duration (`append_resolution_info` / + `append_duration_fps_timestamps`). The server applies the *same* snap + (`get_vision_data_resolution` + `find_closest_target_size` + reflection pad), + so parity is automatic **as long as the client sends the same 2:1 concat + layout** — run `closed_loop_eval` with `--camera agentview,wrist --image_size + 256` (agentview left, wrist right, matching training). A single-view client (or + an old server that skipped the snap) sends a different aspect → different + canvas → the reported 192×320-train vs 256×512-serve mismatch and ~62% (vs + ~97%). This is the first thing to check if numbers are low. Note the clip + *duration* string is computed slightly differently on each side (training's + rounds to `0.0s`); resolution is the dominant factor — verify both against a + `--dump_dir` server capture if accuracy is off. +- **Gripper.** The model emits gripper in `[0, 1]`; the LIBERO env wants + `[-1, 1]` with negative = open. `closed_loop_eval._remap_gripper_to_neg1_pos1` + applies `1 - 2·g`. 
If the gripper never opens, the sign is inverted for your + data — flip it. +- **Image orientation.** Sim frames are rotated 180° relative to training; + `closed_loop_eval` rotates them back (`img[::-1, ::-1]`). +- **Normalization.** Always start the server with `--action-normalization + quantile_rot` and the bundled libero rot6d stats file, or actions come out at + the wrong scale. + +## 5. FPS & stats + +`LIBEROLeRobotDataset` follows `DROIDLeRobotDataset`: it reads the LeRobot parquet +directly, windows by **frame index**, and decodes video at each frame's **real +timestamp** — so it never builds LeRobot's `delta_timestamps` grid and works at +any native FPS. (The earlier `delta_timestamps` port failed on the 10 FPS public +dataset because a 1/20 s grid doesn't land on 10 FPS frames.) + +- **Use the 20 FPS `nvidia/LIBERO_LeRobot_v3`.** LIBERO demos are recorded at + robosuite's default 20 Hz `control_freq`. NVIDIA's conversion labels them 20 FPS + (correct); the community `lerobot/libero_*` repos contain the *same frames* (e.g. + libero_10 = 379 eps / 101,469 frames in both) but label them 10 FPS. Nothing was + subsampled — only the `fps` metadata differs. +- **Why 20 FPS is the clean choice for THIS eval.** The closed-loop harness steps + the env at LIBERO's default 20 Hz and applies one predicted action per + `env.step` (no action-repeat, no `control_freq` override — see `_get_libero_env` + / `_run_episode`). So the policy's per-action cadence must be 20 Hz. Training on + the 20 FPS dataset makes `conditioning_fps=20` (read from `meta/info.json`), + matches the bundled `quantile_rot` stats, and lines up with the eval's 20 Hz — + serve with `--fps 20`, no harness change. +- **The normalization gap was never the issue.** `normalize_action(quantile)` is an + *unclamped* affine map `2(a−q01)/(q99−q01)−1`; training and the server share the + same stats file, so any scale cancels (same reason DROID is fine at its own + 15 FPS). 
The real consistency requirement is the **control rate**, which the + 20 FPS dataset satisfies by construction. +- **If you must use a differently-labelled dataset**, keep cadence consistent: + serve at the dataset's `fps`, and if its frames are genuinely sub-sampled (fewer + frames than the 20 Hz original), either run the eval env at a matching + `control_freq` or action-repeat. With `nvidia/LIBERO_LeRobot_v3` none of this is + needed. +- `fps` only sets `conditioning_fps` + prompt duration; the loader always windows + by frame index and decodes at real timestamps. diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh new file mode 100755 index 0000000..79aec05 --- /dev/null +++ b/examples/launch_sft_action_policy_libero.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Structured-TOML launch for action_policy_libero_nano — Cosmos3-Nano LIBERO +# action-policy SFT (8-GPU FSDP, full SFT, no LoRA). Reproduces the Table-20 +# LIBERO-10 result (~97.4% @ ckpt 2000). Drives cosmos_framework.scripts.train +# against examples/toml/sft_config/action_policy_libero_repro.toml. +# +# REPRODUCTION: point LIBERO_ROOT at the libero_10 suite ONLY. The full suite +# mix dilutes libero_10 to ~1 pass in 2000 steps (~82%); libero_10 alone is ~2.7 +# passes (~97%). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. See docs/action_policy_libero_sft.md. +# +# Required env vars: +# LIBERO_ROOT local LIBERO-10 LeRobot dataset dir, e.g. 
/libero_10 (no default) +# Optional env vars (defaults below; override to relocate data/checkpoints): +# BASE_CHECKPOINT_PATH default: examples/checkpoints/Cosmos3-Nano +# WAN_VAE_PATH default: examples/checkpoints/wan22_vae/Wan2.2_VAE.pth +# HF_TOKEN if any tokenizer download requires gated HF access +# OUTPUT_ROOT default: outputs/train +# +# Pre-sync the 20 FPS suite once: +# hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset --include 'libero_10/**' --local-dir +# export LIBERO_ROOT=/libero_10 +# +# Usage (8-GPU allocation, inside the training container, from the repo root): +# LIBERO_ROOT=/libero_10 bash examples/launch_sft_action_policy_libero.sh + +TOML_FILE="examples/toml/sft_config/action_policy_libero_repro.toml" +: "${BASE_CHECKPOINT_PATH:=examples/checkpoints/Cosmos3-Nano}" + +# LIBEROLeRobotDataset reads ${oc.env:LIBERO_ROOT} directly (a LOCAL LeRobot dir); +# export it so torchrun (launched in this shell) inherits it. +export LIBERO_ROOT="${LIBERO_ROOT:-}" + +EXTRA_DATASET_CHECK='[[ -f "$LIBERO_ROOT/meta/info.json" ]] || { echo "ERROR: LIBERO_ROOT must be a local LeRobot dir containing meta/info.json (got: '\''$LIBERO_ROOT'\''). Pre-sync: hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset --include '\''libero_10/**'\'' --local-dir (then LIBERO_ROOT=/libero_10). See docs/action_policy_libero_sft.md" >&2; exit 1; }' + +# Extra Hydra overrides from the environment: a space-separated string word-split into +# the TAIL_OVERRIDES array. An exported string survives `bash ` (a child +# process), unlike a TAIL_OVERRIDES array set in your shell. Use it for smoke runs, +# e.g. EXTRA_TAIL_OVERRIDES="trainer.max_iter=5 job.wandb_mode=offline". 
+TAIL_OVERRIDES=( + ${EXTRA_TAIL_OVERRIDES:-} +) + +source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh" diff --git a/examples/toml/sft_config/action_policy_libero_repro.toml b/examples/toml/sft_config/action_policy_libero_repro.toml new file mode 100644 index 0000000..7fd788d --- /dev/null +++ b/examples/toml/sft_config/action_policy_libero_repro.toml @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# ============================================================================ +# LIBERO action-policy SFT — run config for the `action_policy_libero_nano` +# experiment (Cosmos3-Nano LIBERO-10). The recipe knobs (optimizer base, count- +# based batch, action-head skip-on-load, dataset knobs) live in the registered +# experiment; this file sets run-level scalars (lr/schedule, iters, ckpt cadence, +# parallelism shape, wandb, VAE path). +# +# RECIPE (recommended): lr 5e-5, warmup 500, cycle 16000 (so LR is barely decayed +# at iter 2000, ~4.5e-5), global batch 2048, save every 500 -> sweep 500..2000. +# Best observed: ~95.2% @ iter_1500 (libero_10, 500-ep closed-loop eval), with +# task-0 success stable across the sweep (no over-fit collapse). This gentle-LR +# schedule is more robust than a higher lr (e.g. 1e-4), which peaks near iter_1000 +# then over-fits task 0 and regresses. See docs/action_policy_libero_sft.md. +# +# REPRODUCTION: train on libero_10 ALONE (point LIBERO_ROOT at the libero_10 +# LeRobot conversion only). The 4-suite mix dilutes libero_10 (~1/4 the exposure +# per step) and converges more slowly. 
+# +# Env required: +# LIBERO_ROOT=/path/to/libero_10_lerobot +# BASE_CHECKPOINT_PATH= +# WAN_VAE_PATH= +# IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root # persist checkpoints +# ============================================================================ + +[job] +task = "vfm" +experiment = "action_policy_libero_nano" +project = "cosmos3_action_libero" +group = "action_sft" +name = "action_policy_libero_repro" +wandb_mode = "online" + +[model] +precision = "bfloat16" +# Cap the packed sequence (GA-validated). Uncapped (-1) packs one very long sequence +# and OOMs even on H200. +max_num_tokens_after_packing = 74000 + +[model.parallelism] +data_parallel_shard_degree = 8 # 1-node 8-GPU shard; raise replicate for multi-node HSDP +data_parallel_replicate_degree = 1 + +[model.activation_checkpointing] +mode = "selective" # GA recipe (full is slower; selective fits 256x512) +save_ops_regex = ["fmha"] + +[model.tokenizer] +vae_path = "${oc.env:WAN_VAE_PATH}" + +[optimizer] +lr = 5.0e-05 # recommended base lr + +[scheduler] +cycle_lengths = [16000] # LR trajectory: warmup 500 -> linear decay over 16k (barely decayed at 2k) +warm_up_steps = [500] + +[trainer] +max_iter = 2000 # pause at 2k; sweep checkpoints 500/1000/1500/2000 for the peak +logging_iter = 50 +grad_accum_iter = 2 # global batch = max_samples_per_batch 128 x DP 8 x grad_accum 2 = 2048 + +[checkpoint] +load_path = "${oc.env:BASE_CHECKPOINT_PATH}" +save_iter = 500 # sweep cadence; peak is typically iter_1500 + +# NOTE (train/serve parity — see GitHub issue NVIDIA/cosmos-framework#50): the +# 256x512 concat_view is snapped to a 192x320 model canvas (resize+reflect-pad), and +# the eval server reproduces the same snap. Run the client with the same 2:1 concat +# (--camera agentview,wrist --image_size 256) so resolution + prompt suffix match, and +# use --action-normalization quantile_rot + the bundled libero rot6d stats on the +# server so denormalization matches training. See docs/action_policy_libero_sft.md. 
+# +# max_samples_per_batch is 128 in the experiment (256 OOMs: per-forward peak, not grad_accum). +# On lower-memory GPUs reduce at launch: +# --opts dataloader_train.max_samples_per_batch=64 From 3ecd0a75394703c18a4cb78dc71f3f54028ca6cb Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 21:04:17 +0800 Subject: [PATCH 2/9] libero(doc): align markdown tables (rumdl-fmt / MD060) --- docs/action_policy_libero_sft.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md index 02af929..4929f31 100644 --- a/docs/action_policy_libero_sft.md +++ b/docs/action_policy_libero_sft.md @@ -6,16 +6,16 @@ chunks out. Full SFT (no LoRA) on the public `nvidia/Cosmos3-Nano` base. Pieces: -| Piece | Path | -| --- | --- | -| Dataset | `cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py` (`LIBEROLeRobotDataset`) | -| SFT wrapper | `get_action_libero_sft_dataset` in `.../datasets/action_sft_dataset.py` | -| Norm stats | `.../datasets/stats/libero_native_frame_wise_relative_rot6d.json` | -| Experiment | `cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py` | -| Run TOML | `examples/toml/sft_config/action_policy_libero_repro.toml` | -| Launch | `examples/launch_sft_action_policy_libero.sh` | -| Inference server | `cosmos_framework/scripts/action_policy_server_libero.py` | -| Closed-loop eval | `cosmos_framework/simulation/libero/closed_loop_eval.py` | +| Piece | Path | +| ---------------- | ----------------------------------------------------------------------------------------------- | +| Dataset | `cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py` (`LIBEROLeRobotDataset`) | +| SFT wrapper | `get_action_libero_sft_dataset` in `.../datasets/action_sft_dataset.py` | +| Norm stats | `.../datasets/stats/libero_native_frame_wise_relative_rot6d.json` | +| Experiment | 
`cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py` | +| Run TOML | `examples/toml/sft_config/action_policy_libero_repro.toml` | +| Launch | `examples/launch_sft_action_policy_libero.sh` | +| Inference server | `cosmos_framework/scripts/action_policy_server_libero.py` | +| Closed-loop eval | `cosmos_framework/simulation/libero/closed_loop_eval.py` | ## 1. Data From 6b22dd5ac66cb2c7ac1b28bdf64b87f65a10605f Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 21:24:27 +0800 Subject: [PATCH 3/9] libero: trim recipe/doc comments to essentials; HSDP 2x8 ga1 canonical Lean the toml/config/launch/doc comments (drop SR numbers and experimental detail), and set the canonical recipe to HSDP 2x8 with grad_accum=1 (global batch 2048) instead of single-node grad_accum=2. --- .../action_policy_libero_nano.py | 45 ++--- .../action/datasets/libero_lerobot_dataset.py | 8 +- docs/action_policy_libero_sft.md | 183 ++++-------------- examples/launch_sft_action_policy_libero.sh | 10 +- .../action_policy_libero_repro.toml | 57 ++---- 5 files changed, 72 insertions(+), 231 deletions(-) diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py index 98b5b12..38c03d4 100644 --- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py +++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py @@ -1,29 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: OpenMDW-1.1 -"""``action_policy_libero_nano`` — Cosmos3-Nano LIBERO action-policy SFT recipe. +"""``action_policy_libero_nano`` — Cosmos3-Nano LIBERO-10 action-policy SFT recipe. -Reproduces the Cosmos3-Nano LIBERO-10 result (Table 20, 97.4% @ ckpt 2000). 
-Mirrors ``action_policy_droid_nano`` (PackingDataLoader + RankPartitionedDataLoader -+ ActionIterableShuffleDataset), but feeds ``LIBEROLeRobotDataset`` (frame-wise-relative -rot6d actions, ``quantile_rot``-normalized, concat_view third-person + wrist at -256x256 each -> 256x512) through ``ActionTransformPipeline``, and trains the -generation + action heads from the public ``nvidia/Cosmos3-Nano`` base. Full SFT -(no LoRA) — the LoRA variant is the 32B "super" tier only. - -LIBERO-10 reproduction note: the public Table-20 number is reached training on -``libero_10`` ALONE. Training on the full 4-suite mix dilutes libero_10 to ~1 pass -in 2000 steps (~82%); libero_10 alone is ~2.7 passes (~97%). Point ``LIBERO_ROOT`` -(and ``LIBERO_REPO_ID``) at the libero_10 LeRobot conversion only. - -Usage (1 node, 8 GPU):: - - LIBERO_ROOT=/path/to/libero_10_lerobot \\ - LIBERO_REPO_ID=lerobot/libero_10 \\ - BASE_CHECKPOINT_PATH= \\ - WAN_VAE_PATH= \\ - torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \\ - --sft-toml examples/toml/sft_config/action_policy_libero_repro.toml +Mirrors ``action_policy_droid_nano`` but feeds ``LIBEROLeRobotDataset`` +(frame-wise-relative rot6d, ``quantile_rot``, concat_view third-person + wrist) +and trains the generation + action heads from the public ``nvidia/Cosmos3-Nano`` +base. Train on ``libero_10`` alone (``LIBERO_ROOT``). +See docs/action_policy_libero_sft.md. """ import copy @@ -44,16 +28,9 @@ def _action_policy_libero_nano_model_config() -> dict: - """GA LIBERO model config: capped packed tokens, selective activation - checkpointing, fresh diffusion-expert init, 10x vision flow-matching loss, and - the VAE encode durations [17, 61, 73] carried by the Cosmos3 base. - - NOTE: keep ``encode_exact_durations=[17, 61, 73]`` — do NOT reduce it to [17] - even though ``mode="policy"`` only produces 17-frame windows at the data level. 
- The public Cosmos3-Nano base was pretrained with [17, 61, 73]; the reference - GA LIBERO SFT (``action_policy_sft_nano`` on ``mharrim-nv-patch-1``) retains it, - and empirically reducing it to [17] regresses the policy badly - (60.8% vs 94.6% at iter 2000).""" + """LIBERO model config: capped packed tokens, selective activation + checkpointing, fresh diffusion-expert init, 10x vision flow-matching loss. + Keep ``encode_exact_durations=[17, 61, 73]`` to match the Cosmos3-Nano base.""" cfg = copy.deepcopy(NANO_MODEL_CONFIG) # action_gen=True, max_action_dim=64 # Cap the packed sequence. Uncapped (-1) + a large max_samples_per_batch packs # one very long sequence and OOMs even on H200; 74000 keeps the GA-validated bound. @@ -219,8 +196,8 @@ def _action_policy_libero_nano_model_config() -> dict: libero=dict( ratio=1, dataset=L(get_action_libero_sft_dataset)( - # Local LeRobot dir for the libero_10 suite ONLY (Table-20 - # reproduction; full suite mix -> ~82%, see module docstring). Use the + # Local LeRobot dir for the libero_10 suite ONLY (the + # full suite mix dilutes libero_10; see module docstring). Use the # 20 FPS nvidia/LIBERO_LeRobot_v3 (matches the bundled stats + 20 Hz eval): # hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \ # --include 'libero_10/**' --local-dir # LIBERO_ROOT=/libero_10 diff --git a/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py b/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py index 146fcc1..1e5ef01 100644 --- a/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py +++ b/cosmos_framework/data/vfm/action/datasets/libero_lerobot_dataset.py @@ -16,8 +16,8 @@ NOTE on FPS / stats fidelity: the bundled ``quantile_rot`` stats were computed on a 20 FPS conversion. Per-frame deltas at 10 FPS span 2x the wall-clock motion, so -for a faithful Table-20 reproduction use a 20 FPS LIBERO dataset (or recompute -stats for the dataset's FPS). 
Loading/training is correct at any FPS regardless. +use a 20 FPS LIBERO dataset (or recompute stats for the dataset's FPS). +Loading/training is correct at any FPS regardless. """ from __future__ import annotations @@ -259,9 +259,7 @@ def get_shuffle_blocks(self) -> list[tuple[int, int]]: # ---- sample build ------------------------------------------------------ def __getitem__(self, idx: int) -> dict[str, Any]: - # Resilience: a single unreadable/corrupt video frame (e.g. a torchcodec - # decode error on the packed LeRobot-v3 mp4s) must not crash a multi-node - # run. Resample a different valid window on failure (bounded retries). + # Resample a different valid window if a frame fails to decode (bounded retries). n = len(self) last_err: Exception | None = None for _attempt in range(8): diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md index 4929f31..735c630 100644 --- a/docs/action_policy_libero_sft.md +++ b/docs/action_policy_libero_sft.md @@ -1,10 +1,7 @@ -# Cosmos3-Nano LIBERO action-policy SFT (reproduction) +# Cosmos3-Nano LIBERO-10 action-policy SFT -Reproduces the Cosmos3-Nano LIBERO-10 result (technical report Table 20, ~97.4% -success at checkpoint 2000) as an action policy: vision + language in, action -chunks out. Full SFT (no LoRA) on the public `nvidia/Cosmos3-Nano` base. - -Pieces: +Full SFT (no LoRA) of the public `nvidia/Cosmos3-Nano` base into a LIBERO-10 +action policy: vision + language in, action chunks out. | Piece | Path | | ---------------- | ----------------------------------------------------------------------------------------------- | @@ -19,12 +16,10 @@ Pieces: ## 1. Data -`LIBEROLeRobotDataset` reads a **local** LeRobot dir directly (parquet + video, -like `DROIDLeRobotDataset`) — set `LIBERO_ROOT` to it. 
Use NVIDIA's **20 FPS** -conversion [`nvidia/LIBERO_LeRobot_v3`](https://huggingface.co/datasets/nvidia/LIBERO_LeRobot_v3) -(public, OpenMDW-1.1), which is what the bundled `quantile_rot` stats and the -20 Hz eval cadence assume. It ships one subdirectory per suite, so pre-sync just -`libero_10`: +`LIBEROLeRobotDataset` reads a local LeRobot dir (`LIBERO_ROOT`). Use the 20 FPS +[`nvidia/LIBERO_LeRobot_v3`](https://huggingface.co/datasets/nvidia/LIBERO_LeRobot_v3), +which the bundled `quantile_rot` stats and the 20 Hz eval assume. Train on +`libero_10` alone: ```bash hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \ @@ -32,175 +27,79 @@ hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \ export LIBERO_ROOT=/LIBERO_LeRobot_v3/libero_10 ``` -**For the Table-20 number, use `libero_10` ALONE.** Training on the full suite -mix dilutes libero_10 to ~1 pass in 2000 steps (~82%); libero_10 alone is ~2.7 -passes (~97%). For more suites, sync the other subdirs and add more -`datasets=dict(...)` entries to the experiment's dataloader. - -It uses `frame_wise_relative` rot6d actions (10D = `pos(3) + rot6d(6) + -gripper(1)`), `concat_view` (third-person + wrist, each resized to 256×256, -concatenated horizontally → 256×512), normalized with `quantile_rot` against the -bundled stats. - -**FPS-agnostic loader.** It windows by frame index and decodes video at each -frame's real timestamp (no `delta_timestamps` grid), so any LeRobot LIBERO dataset -loads regardless of its `fps` label, and `conditioning_fps` is read from the -dataset's own `meta/info.json`. Prefer the 20 FPS `nvidia/LIBERO_LeRobot_v3` so -`conditioning_fps=20` matches the stats and the eval (serve with `--fps 20`). The -community `lerobot/libero_*` repos carry the *same frames* but label them 10 FPS; -see [§5](#5-fps--stats). 
- -**Model-input resolution = 192×320.** The 256×512 concat is aspect-2.0, so with -`resolution=None` the `ActionTransformPipeline` snaps it to the closest `"256"` -tier canvas — 16:9 → **320×192 (w×h) = 192×320 (h×w)** — by aspect-preserving -resize + bottom reflection pad. The training prompt therefore reads -`"...is of 192x320 resolution."`. Keep this; the eval server reproduces the same -snap (see §4). - -## 2. Train (1 node, 8 GPUs) +Actions are `frame_wise_relative` rot6d (10D = pos 3 + rot6d 6 + gripper 1), +`concat_view` (third-person + wrist, each 256×256 → 256×512), `quantile_rot` +normalized. The pipeline snaps the 256×512 concat to a 192×320 model canvas; the +eval server reproduces the same snap (§4). + +## 2. Train ```bash -export LD_LIBRARY_PATH='' # NGC/PyTorch container: avoid torch._C import error -export LIBERO_ROOT=/path/to/libero_10_lerobot # libero_10 conversion ONLY +export LD_LIBRARY_PATH='' # NGC container: avoid torch._C import error +export LIBERO_ROOT=/path/to/libero_10_lerobot export BASE_CHECKPOINT_PATH= export WAN_VAE_PATH= export IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root -bash examples/launch_sft_action_policy_libero.sh +bash examples/launch_sft_action_policy_libero.sh # HSDP 2x8; set NNODES/NODE_RANK/MASTER_ADDR per node ``` -Or drive `cosmos_framework.scripts.train` directly: - -```bash -torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \ - --sft-toml examples/toml/sft_config/action_policy_libero_repro.toml -``` - -Recipe knobs live in the registered `action_policy_libero_nano` experiment (full -SFT of the generation + action heads at lr 5e-5 with a 5× LR multiplier on the -action bridge, FusedAdam, selective activation checkpointing, `quantile_rot` -actions, action heads init fresh from the base via `keys_to_skip_loading`). The -TOML sets only run-level scalars: DP=8, `max_iter=10000`, `warm_up_steps=2000`, -`grad_accum_iter=2`, `save_iter=1000`. Checkpoint 2000 is the reference. 
On -lower-memory GPUs reduce the per-rank batch: -`--opts dataloader_train.max_samples_per_batch=32`. +Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars +(lr 5e-5, warmup 500, cycle 16000, global batch 2048, `save_iter=500`). Sweep the +saved checkpoints to pick the best iteration. On lower-memory GPUs reduce the +per-rank batch: `--opts dataloader_train.max_samples_per_batch=32`. ## 3. Closed-loop eval -Start the policy server on a **trained** checkpoint, then run the LIBERO -simulator client against it. (The base `nvidia/Cosmos3-Nano` DCP has no action -heads — use a checkpoint from §2.) +Start the policy server on a **trained** checkpoint (the base DCP has no action +heads), then run the LIBERO simulator client against it. ```bash -# Server (training venv). Loads the DCP (single-rank no_dist), denormalizes with -# quantile_rot + the bundled libero rot6d stats. The experiment supplies the VAE -# path via the override (the server loads the experiment directly, no TOML). python -m cosmos_framework.scripts.action_policy_server_libero \ --experiment action_policy_libero_nano \ --experiment-overrides "model.config.tokenizer.vae_path=$WAN_VAE_PATH" \ - --checkpoint-path \ + --checkpoint-path <job_dir>/checkpoints/iter_000001500 \ --action-normalization quantile_rot \ --action-stats-path cosmos_framework/data/vfm/action/datasets/stats/libero_native_frame_wise_relative_rot6d.json \ --raw-action-dim 10 --fps 20 --port 8000 ``` -**Eval environment** (the LIBERO sim needs a *separate* venv — robosuite/mujoco -versions conflict with the training env, and the NGC image needs graphics -enabled). This combo is validated headless on an NVIDIA GPU: +The LIBERO sim needs a separate venv (robosuite/mujoco pins conflict with the +training env) and graphics enabled in the container: ```bash -# 1. Enable the NVIDIA graphics libs in the container (mounts host libEGL_nvidia -# etc.); do NOT apt-install libnvidia-gl (it mismatches the mounted driver). 
export NVIDIA_DRIVER_CAPABILITIES=all apt-get install -y libegl1 libglvnd0 libgl1 libglib2.0-0 ffmpeg -mkdir -p /usr/share/glvnd/egl_vendor.d # ICD (usually already mounted) +mkdir -p /usr/share/glvnd/egl_vendor.d echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \ > /usr/share/glvnd/egl_vendor.d/10_nvidia.json -# 2. Separate py3.10 venv with LIBERO-compatible sim pins + torch<2.6 -# (torch>=2.6 defaults weights_only=True and breaks LIBERO init-state loads). uv venv --python 3.10 .libenv && VV=.libenv/bin/python git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git && \ uv pip install -p $VV -e LIBERO -r LIBERO/requirements.txt uv pip install -p $VV "robosuite==1.4.1" "mujoco==2.3.7" "torch<2.6" loguru requests scipy pillow numpy - -# 3. LIBERO first-run config (avoids the interactive prompt) + robosuite macros mkdir -p ~/.libero && touch ~/.libero/config.yaml -RS=$($VV -c "import robosuite,os;print(os.path.dirname(robosuite.__file__))") -$VV "$RS/scripts/setup_macros.py" +RS=$($VV -c "import robosuite,os;print(os.path.dirname(robosuite.__file__))"); $VV "$RS/scripts/setup_macros.py" $VV -c "from libero.libero import set_libero_default_path; set_libero_default_path()" -# 4. Run the client (concat agentview+wrist matches the 256x512 training view). MUJOCO_GL=egl PYTHONPATH=$PWD:$PWD/LIBERO $VV \ cosmos_framework/simulation/libero/closed_loop_eval.py \ --server_url http://localhost:8000 \ - --task_suite libero_10 --num_trials_per_task 10 --action_horizon 16 \ + --task_suite libero_10 --num_trials_per_task 50 --num_envs 8 \ --camera agentview,wrist --image_size 256 \ --action_space frame_wise_relative --rotation_space 6d --action_dim 10 \ - --save_gifs --gif_fps 20 --output_dir results/libero_closed_loop_10 + --output_dir results/libero_closed_loop_10 ``` -Validated end-to-end against a stub server (episode runs, `summary.json` + GIFs -written, `rc=0`); a benign `EGLError` may print during context teardown on exit. - -## 4. 
Gotchas (from NVIDIA/cosmos-framework#50) - -These cost real accuracy if missed; the shipped eval client already handles the -first two, but verify them against your checkpoint: - -- **Train ↔ serve parity (resolution + prompt).** Training snaps the 256×512 - concat to a **192×320** model-input canvas (see §1) and the prompt suffix - encodes that resolution + clip duration (`append_resolution_info` / - `append_duration_fps_timestamps`). The server applies the *same* snap - (`get_vision_data_resolution` + `find_closest_target_size` + reflection pad), - so parity is automatic **as long as the client sends the same 2:1 concat - layout** — run `closed_loop_eval` with `--camera agentview,wrist --image_size - 256` (agentview left, wrist right, matching training). A single-view client (or - an old server that skipped the snap) sends a different aspect → different - canvas → the reported 192×320-train vs 256×512-serve mismatch and ~62% (vs - ~97%). This is the first thing to check if numbers are low. Note the clip - *duration* string is computed slightly differently on each side (training's - rounds to `0.0s`); resolution is the dominant factor — verify both against a - `--dump_dir` server capture if accuracy is off. -- **Gripper.** The model emits gripper in `[0, 1]`; the LIBERO env wants - `[-1, 1]` with negative = open. `closed_loop_eval._remap_gripper_to_neg1_pos1` - applies `1 - 2·g`. If the gripper never opens, the sign is inverted for your - data — flip it. -- **Image orientation.** Sim frames are rotated 180° relative to training; - `closed_loop_eval` rotates them back (`img[::-1, ::-1]`). -- **Normalization.** Always start the server with `--action-normalization - quantile_rot` and the bundled libero rot6d stats file, or actions come out at - the wrong scale. - -## 5. 
FPS & stats - -`LIBEROLeRobotDataset` follows `DROIDLeRobotDataset`: it reads the LeRobot parquet -directly, windows by **frame index**, and decodes video at each frame's **real -timestamp** — so it never builds LeRobot's `delta_timestamps` grid and works at -any native FPS. (The earlier `delta_timestamps` port failed on the 10 FPS public -dataset because a 1/20 s grid doesn't land on 10 FPS frames.) - -- **Use the 20 FPS `nvidia/LIBERO_LeRobot_v3`.** LIBERO demos are recorded at - robosuite's default 20 Hz `control_freq`. NVIDIA's conversion labels them 20 FPS - (correct); the community `lerobot/libero_*` repos contain the *same frames* (e.g. - libero_10 = 379 eps / 101,469 frames in both) but label them 10 FPS. Nothing was - subsampled — only the `fps` metadata differs. -- **Why 20 FPS is the clean choice for THIS eval.** The closed-loop harness steps - the env at LIBERO's default 20 Hz and applies one predicted action per - `env.step` (no action-repeat, no `control_freq` override — see `_get_libero_env` - / `_run_episode`). So the policy's per-action cadence must be 20 Hz. Training on - the 20 FPS dataset makes `conditioning_fps=20` (read from `meta/info.json`), - matches the bundled `quantile_rot` stats, and lines up with the eval's 20 Hz — - serve with `--fps 20`, no harness change. -- **The normalization gap was never the issue.** `normalize_action(quantile)` is an - *unclamped* affine map `2(a−q01)/(q99−q01)−1`; training and the server share the - same stats file, so any scale cancels (same reason DROID is fine at its own - 15 FPS). The real consistency requirement is the **control rate**, which the - 20 FPS dataset satisfies by construction. -- **If you must use a differently-labelled dataset**, keep cadence consistent: - serve at the dataset's `fps`, and if its frames are genuinely sub-sampled (fewer - frames than the 20 Hz original), either run the eval env at a matching - `control_freq` or action-repeat. 
With `nvidia/LIBERO_LeRobot_v3` none of this is - needed. -- `fps` only sets `conditioning_fps` + prompt duration; the loader always windows - by frame index and decodes at real timestamps. +## 4. Eval parity + +The client/server already handle these; verify them if accuracy is low: + +- **Concat layout** — run with `--camera agentview,wrist --image_size 256` so the + 256×512 concat matches training (the server snaps it to 192×320 identically). +- **Gripper** — model emits `[0, 1]`; the env wants `[-1, 1]` (negative = open). + The client applies `1 − 2·g`; flip the sign if the gripper never opens. +- **Image orientation** — sim frames are rotated 180° vs training; the client + rotates them back. +- **Normalization** — start the server with `--action-normalization quantile_rot` + and the bundled rot6d stats, or actions come out at the wrong scale. diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh index 79aec05..4188d9d 100755 --- a/examples/launch_sft_action_policy_libero.sh +++ b/examples/launch_sft_action_policy_libero.sh @@ -3,13 +3,13 @@ # SPDX-License-Identifier: OpenMDW-1.1 # Structured-TOML launch for action_policy_libero_nano — Cosmos3-Nano LIBERO -# action-policy SFT (8-GPU FSDP, full SFT, no LoRA). Reproduces the Table-20 -# LIBERO-10 result (~97.4% @ ckpt 2000). Drives cosmos_framework.scripts.train +# action-policy SFT (HSDP, full SFT, no LoRA). Drives cosmos_framework.scripts.train # against examples/toml/sft_config/action_policy_libero_repro.toml. # -# REPRODUCTION: point LIBERO_ROOT at the libero_10 suite ONLY. The full suite -# mix dilutes libero_10 to ~1 pass in 2000 steps (~82%); libero_10 alone is ~2.7 -# passes (~97%). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. See docs/action_policy_libero_sft.md. +# Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes +# libero_10). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. 
The default recipe is +# HSDP 2x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node. +# See docs/action_policy_libero_sft.md. # # Required env vars: # LIBERO_ROOT local LIBERO-10 LeRobot dataset dir, e.g. /libero_10 (no default) diff --git a/examples/toml/sft_config/action_policy_libero_repro.toml b/examples/toml/sft_config/action_policy_libero_repro.toml index 7fd788d..d74237b 100644 --- a/examples/toml/sft_config/action_policy_libero_repro.toml +++ b/examples/toml/sft_config/action_policy_libero_repro.toml @@ -1,30 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: OpenMDW-1.1 -# ============================================================================ -# LIBERO action-policy SFT — run config for the `action_policy_libero_nano` -# experiment (Cosmos3-Nano LIBERO-10). The recipe knobs (optimizer base, count- -# based batch, action-head skip-on-load, dataset knobs) live in the registered -# experiment; this file sets run-level scalars (lr/schedule, iters, ckpt cadence, -# parallelism shape, wandb, VAE path). -# -# RECIPE (recommended): lr 5e-5, warmup 500, cycle 16000 (so LR is barely decayed -# at iter 2000, ~4.5e-5), global batch 2048, save every 500 -> sweep 500..2000. -# Best observed: ~95.2% @ iter_1500 (libero_10, 500-ep closed-loop eval), with -# task-0 success stable across the sweep (no over-fit collapse). This gentle-LR -# schedule is more robust than a higher lr (e.g. 1e-4), which peaks near iter_1000 -# then over-fits task 0 and regresses. See docs/action_policy_libero_sft.md. -# -# REPRODUCTION: train on libero_10 ALONE (point LIBERO_ROOT at the libero_10 -# LeRobot conversion only). The 4-suite mix dilutes libero_10 (~1/4 the exposure -# per step) and converges more slowly. 
-# -# Env required: -# LIBERO_ROOT=/path/to/libero_10_lerobot -# BASE_CHECKPOINT_PATH= -# WAN_VAE_PATH= -# IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root # persist checkpoints -# ============================================================================ +# LIBERO-10 action-policy SFT run config for the `action_policy_libero_nano` +# experiment. Train on libero_10 alone (HSDP 2x8, global batch 2048). +# Env: LIBERO_ROOT, BASE_CHECKPOINT_PATH, WAN_VAE_PATH, IMAGINAIRE_OUTPUT_ROOT. +# See docs/action_policy_libero_sft.md. [job] task = "vfm" @@ -36,44 +16,31 @@ wandb_mode = "online" [model] precision = "bfloat16" -# Cap the packed sequence (GA-validated). Uncapped (-1) packs one very long sequence -# and OOMs even on H200. max_num_tokens_after_packing = 74000 [model.parallelism] -data_parallel_shard_degree = 8 # 1-node 8-GPU shard; raise replicate for multi-node HSDP -data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 8 +data_parallel_replicate_degree = 2 # HSDP 2x8 = 16 ranks (2 nodes) [model.activation_checkpointing] -mode = "selective" # GA recipe (full is slower; selective fits 256x512) +mode = "selective" save_ops_regex = ["fmha"] [model.tokenizer] vae_path = "${oc.env:WAN_VAE_PATH}" [optimizer] -lr = 5.0e-05 # recommended base lr +lr = 5.0e-05 [scheduler] -cycle_lengths = [16000] # LR trajectory: warmup 500 -> linear decay over 16k (barely decayed at 2k) +cycle_lengths = [16000] warm_up_steps = [500] [trainer] -max_iter = 2000 # pause at 2k; sweep checkpoints 500/1000/1500/2000 for the peak +max_iter = 2000 logging_iter = 50 -grad_accum_iter = 2 # global batch = max_samples_per_batch 128 x DP 8 x grad_accum 2 = 2048 +grad_accum_iter = 1 # global batch = 128 x (8 x 2) x 1 = 2048 [checkpoint] load_path = "${oc.env:BASE_CHECKPOINT_PATH}" -save_iter = 500 # sweep cadence; peak is typically iter_1500 - -# NOTE (train/serve parity — see GitHub issue NVIDIA/cosmos-framework#50): the -# 256x512 concat_view is snapped to a 192x320 model canvas 
(resize+reflect-pad), and -# the eval server reproduces the same snap. Run the client with the same 2:1 concat -# (--camera agentview,wrist --image_size 256) so resolution + prompt suffix match, and -# use --action-normalization quantile_rot + the bundled libero rot6d stats on the -# server so denormalization matches training. See docs/action_policy_libero_sft.md. -# -# max_samples_per_batch is 128 in the experiment (256 OOMs: per-forward peak, not grad_accum). -# On lower-memory GPUs reduce at launch: -# --opts dataloader_train.max_samples_per_batch=64 +save_iter = 500 From ffca1a1960f9d842742a0a4bb7bd975769badd60 Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 21:39:16 +0800 Subject: [PATCH 4/9] libero: fix clean-branch deps + drop droid/LoRA/reply-server mentions - action_sft_dataset.py: rebuild as origin/main + libero-only (drop the speedup-era ShardedDROIDLeRobotDataset import that broke config load on a clean main). - remove dataset_reply_action_server.py (GT-replay debug tool, not part of the recipe). - drop DROID/LoRA references from libero docstrings/comments/doc/launch. 
--- cosmos_framework/configs/base/config.py | 1 - .../action_policy_libero_nano.py | 9 +- .../vfm/action/datasets/action_sft_dataset.py | 49 +- .../data/vfm/action/libero_pose_utils.py | 2 +- .../simulation/libero/closed_loop_eval.py | 2 +- .../libero/dataset_reply_action_server.py | 653 ------------------ docs/action_policy_libero_sft.md | 4 +- examples/launch_sft_action_policy_libero.sh | 2 +- 8 files changed, 19 insertions(+), 703 deletions(-) delete mode 100644 cosmos_framework/simulation/libero/dataset_reply_action_server.py diff --git a/cosmos_framework/configs/base/config.py b/cosmos_framework/configs/base/config.py index 1fb0514..5ac2b41 100644 --- a/cosmos_framework/configs/base/config.py +++ b/cosmos_framework/configs/base/config.py @@ -98,5 +98,4 @@ def make_config() -> Config: import cosmos_framework.configs.base.experiment.sft.vision_sft_super # noqa: F401 import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_droid_nano # noqa: F401 import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_libero_nano # noqa: F401 - import cosmos_framework.configs.base.experiment.action.posttrain_config.action_policy_libero_nano_4suite # noqa: F401 return c diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py index 38c03d4..e5b5696 100644 --- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py +++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py @@ -3,11 +3,10 @@ """``action_policy_libero_nano`` — Cosmos3-Nano LIBERO-10 action-policy SFT recipe. 
-Mirrors ``action_policy_droid_nano`` but feeds ``LIBEROLeRobotDataset`` -(frame-wise-relative rot6d, ``quantile_rot``, concat_view third-person + wrist) -and trains the generation + action heads from the public ``nvidia/Cosmos3-Nano`` -base. Train on ``libero_10`` alone (``LIBERO_ROOT``). -See docs/action_policy_libero_sft.md. +Feeds ``LIBEROLeRobotDataset`` (frame-wise-relative rot6d, ``quantile_rot``, +concat_view third-person + wrist) and trains the generation + action heads from +the public ``nvidia/Cosmos3-Nano`` base. Train on ``libero_10`` alone +(``LIBERO_ROOT``). See docs/action_policy_libero_sft.md. """ import copy diff --git a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py index afe76da..7776875 100644 --- a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py +++ b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py @@ -18,10 +18,7 @@ from torch.utils.data import Dataset, IterableDataset, get_worker_info -from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import ( - DROIDLeRobotDataset, - ShardedDROIDLeRobotDataset, -) +from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import DROIDLeRobotDataset from cosmos_framework.data.vfm.action.datasets.libero_lerobot_dataset import LIBEROLeRobotDataset from cosmos_framework.data.vfm.action.transforms import ActionTransformPipeline @@ -102,7 +99,6 @@ def get_action_droid_sft_dataset( action_normalization: str | None = None, viewpoint: str = "concat_view", use_image_augmentation: bool = False, - apply_color_jitter: bool = True, use_filter_dict: bool = False, filter_dict_path: str | None = None, resolution: str | int = "256", @@ -115,24 +111,11 @@ def get_action_droid_sft_dataset( append_idle_frames: bool = False, iterable_shuffle: bool = False, episode_shuffle_seed: int = 42, - sharded: bool = False, - lerobot_roots: list[str] | None = None, - use_success_only: bool = True, 
) -> Dataset: """Build the DROID action SFT dataset: ``action_space='joint_pos'`` (8D) + - ``use_state`` (raw/un-normalized), concat_view, chunk_length 32. - - ``sharded=True`` consumes the per-lab sharded layout (``/success/``) - via :class:`ShardedDROIDLeRobotDataset` — one ``DROIDLeRobotDataset`` per lab - concatenated into one flat index — reproducing the internal sharded run's - per-shard index construction. ``sharded=False`` (default) reads ``root`` as a - single flat LeRobot dataset (the prior behavior). ``lerobot_roots`` optionally - pins the shard sub-paths (relative to ``root``); otherwise they are - auto-discovered.""" - # ``sharded`` may arrive as a string from env-var config resolution. - if isinstance(sharded, str): - sharded = sharded.strip().lower() in ("1", "true", "yes", "on") - shard_kwargs = dict( + ``use_state`` (raw/un-normalized), concat_view, chunk_length 32.""" + dataset = DROIDLeRobotDataset( + root=root, fps=fps, chunk_length=chunk_length, viewpoint=viewpoint, @@ -141,19 +124,9 @@ def get_action_droid_sft_dataset( use_state=use_state, action_normalization=action_normalization, use_image_augmentation=use_image_augmentation, - apply_color_jitter=apply_color_jitter, use_filter_dict=use_filter_dict, filter_dict_path=filter_dict_path, ) - if sharded: - dataset: Dataset = ShardedDROIDLeRobotDataset( - root=root, - lerobot_roots=lerobot_roots, - use_success_only=use_success_only, - **shard_kwargs, - ) - else: - dataset = DROIDLeRobotDataset(root=root, **shard_kwargs) transform = ActionTransformPipeline( tokenizer_config=tokenizer_config, cfg_dropout_rate=cfg_dropout_rate, @@ -198,14 +171,12 @@ def get_action_libero_sft_dataset( ) -> Dataset: """Build the LIBERO action-policy SFT dataset (GA reproduction defaults). 
- Mirrors :func:`get_action_droid_sft_dataset` but feeds ``LIBEROLeRobotDataset`` - (frame-wise-relative rot6d actions, ``quantile_rot``-normalized, concat_view - third-person + wrist at 256x256 each → 256x512) through - ``ActionTransformPipeline``. ``root`` is a LOCAL LeRobot dir (read parquet + - video directly, like DROID); pre-sync the HF dataset once, e.g. - ``hf download lerobot/libero_10 --repo-type dataset --local-dir ``. For - the Table-20 LIBERO-10 reproduction point ``root`` at libero_10 alone (the - 4-suite mix dilutes libero_10 to ~1 pass in 2000 steps → ~82% vs ~97%). The + Feeds ``LIBEROLeRobotDataset`` (frame-wise-relative rot6d actions, + ``quantile_rot``-normalized, concat_view third-person + wrist at 256x256 each + → 256x512) through ``ActionTransformPipeline``. ``root`` is a LOCAL LeRobot dir + (read parquet + video directly); pre-sync the HF dataset once, e.g. + ``hf download lerobot/libero_10 --repo-type dataset --local-dir ``. Point + ``root`` at libero_10 alone (the all-suites mix dilutes libero_10 per step). The dataset is FPS-agnostic (decodes at real frame timestamps); ``fps`` is metadata for ``conditioning_fps`` / prompt duration. """ diff --git a/cosmos_framework/data/vfm/action/libero_pose_utils.py b/cosmos_framework/data/vfm/action/libero_pose_utils.py index 5cc9fff..3a4fd8e 100644 --- a/cosmos_framework/data/vfm/action/libero_pose_utils.py +++ b/cosmos_framework/data/vfm/action/libero_pose_utils.py @@ -13,7 +13,7 @@ build_abs_pose_from_components, ) -# Same local-frame post-rotation pattern used by DROID/Bridge/Fractal: +# Local-frame post-rotation pattern: # R_opencv = R_native @ *_TO_OPENCV. 
LIBERO_TO_OPENCV: np.ndarray = np.array( [[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], diff --git a/cosmos_framework/simulation/libero/closed_loop_eval.py b/cosmos_framework/simulation/libero/closed_loop_eval.py index 0205f9f..660be36 100644 --- a/cosmos_framework/simulation/libero/closed_loop_eval.py +++ b/cosmos_framework/simulation/libero/closed_loop_eval.py @@ -99,7 +99,7 @@ def _concat_view_layout_description(cameras: list[str]) -> str: def _augment_task_prompt_with_viewpoint(task_description: str, cameras: list[str]) -> str: - """Mirror DROID-style concat-view caption augmentation for closed-loop LIBERO eval.""" + """Concat-view caption augmentation for closed-loop LIBERO eval.""" if len(cameras) <= 1: return task_description prompt = _append_prompt_sentence(task_description, DEFAULT_VIEWPOINT_TEMPLATES["concat_view"]) diff --git a/cosmos_framework/simulation/libero/dataset_reply_action_server.py b/cosmos_framework/simulation/libero/dataset_reply_action_server.py deleted file mode 100644 index bb5d9a4..0000000 --- a/cosmos_framework/simulation/libero/dataset_reply_action_server.py +++ /dev/null @@ -1,653 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: OpenMDW-1.1 - -""" -HTTP server that serves ground-truth actions from LIBERO LeRobot datasets. - -Same HTTP interface as `cosmos3.scripts.action_policy_server` (the model-backed -server), enabling drop-in replacement for closed-loop evaluation to verify the -action pipeline with known-good GT actions. 
- -Endpoints: -- POST /predict: Return next chunk of GT actions for the given task (matched by prompt) -- GET /info: Return dataset info (tasks, episode counts) -- POST /next_episode: Advance to next episode for the task specified in request body -- POST /reset: Reset all per-task episode/step tracking - -Episode advancement: - The server auto-advances to the next episode when the current episode's actions - are exhausted. For early-termination cases (e.g. success before all actions are - consumed), call POST /next_episode with {"prompt": ""} between episodes. - -Example usage: - - -PYTHONPATH=. python cosmos_framework/simulation/libero/dataset_reply_action_server.py \ - --repo_id libero_10 \ - --root /path/to/libero_10_no_noops_1.0.0_lerobot_aligned \ - --action_space frame_wise_relative \ - --rotation_space 6d \ - --pose_coordinate_frame opencv \ - --action_chunk_size 16 \ - --send_video \ - --camera_mode agentview \ - --port 8000 - -# Multiple datasets: -PYTHONPATH=. python cosmos_framework/simulation/libero/dataset_reply_action_server.py \ - --repo_id libero_10,libero_goal \ - --root /path/to/libero_10,/path/to/libero_goal \ - --action_space relative \ - --rotation_space 6d \ - --pose_coordinate_frame opencv \ - --action_chunk_size 16 \ - --port 8000 -""" - -from __future__ import annotations - -import argparse -import base64 -import datetime -import io -import json -import socket -import threading -import time -from dataclasses import dataclass -from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer -from typing import Any - -import numpy as np -import torch -from PIL import Image - -from cosmos_framework.data.vfm.action.libero_pose_utils import ( - libero_rotation_format, -) -from cosmos_framework.data.vfm.action.pose_utils import convert_rotation - - -def _ts() -> str: - return datetime.datetime.now(tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ") - - -def _get_local_ip() -> str: - try: - with socket.socket(socket.AF_INET, 
socket.SOCK_DGRAM) as s: - s.connect(("8.8.8.8", 80)) - return str(s.getsockname()[0]) - except Exception: - return socket.gethostbyname(socket.gethostname()) - - -# --------------------------------------------------------------------------- -# Action processing (mirrors LIBEROLeRobotDataset.__getitem__ logic) -# --------------------------------------------------------------------------- - - -def _compute_anchored_actions( - state_raw: torch.Tensor, - action_raw: torch.Tensor, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute anchored relative actions, same as LIBEROLeRobotDataset._compute_anchored_actions. - - Actions are expressed in state_raw[0]'s local coordinate frame. - - Args: - state_raw: (T+1, 8) states [x, y, z, ax, ay, az, grip1, grip2]. - action_raw: (T+1, 7) actions [dx, dy, dz, dax, day, daz, grip]. - - Returns: - anchored_translation (T, 3), anchored_rotation (T, 3, 3), gripper (T, 1). - """ - p_states = state_raw[:, :3] - rotvec_states = state_raw[:, 3:6] - delta_p = action_raw[:-1, :3] - delta_rotvec = action_raw[:-1, 3:6] - gripper = action_raw[:-1, 6:7] - - R_states = convert_rotation(rotvec_states, "axisangle", "matrix") - R_deltas = convert_rotation(delta_rotvec, "axisangle", "matrix") - - p_0 = p_states[0] - R_0_T = R_states[0].T - - p_t = p_states[:-1] - R_t = R_states[:-1] - - p_target = p_t + delta_p - R_target = torch.bmm(R_deltas, R_t) - - anchored_p = (R_0_T @ (p_target - p_0).T).T - R_0_T_expanded = R_0_T.unsqueeze(0).expand(R_target.shape[0], -1, -1) - anchored_R = torch.bmm(R_0_T_expanded, R_target) - - return anchored_p, anchored_R, gripper - - -def _convert_rotation_to_repr(rotation_matrix: torch.Tensor, rotation_space: str) -> torch.Tensor: - return convert_rotation(rotation_matrix, "matrix", libero_rotation_format(rotation_space)) - - -def _process_action_chunk( - action_raw: torch.Tensor, - state_raw: torch.Tensor, - action_space: str, - rotation_space: str, -) -> torch.Tensor: - """Process a chunk of raw 
actions with the same logic as LIBEROLeRobotDataset.__getitem__. - - Args: - action_raw: (chunk+1, 7) raw actions covering chunk+1 consecutive frames. - state_raw: (chunk+1, 8) raw states covering chunk+1 consecutive frames. - action_space: "relative" or "frame_wise_relative". - rotation_space: "3d", "6d", or "9d". - - Returns: - Processed actions (chunk, D) where D depends on rotation_space. - """ - if action_space == "relative": - translation, rotation_matrix, gripper = _compute_anchored_actions(state_raw, action_raw) - elif action_space == "frame_wise_relative": - action = action_raw[:-1].clone() - translation = action[:, :3] - rotation_matrix = convert_rotation(action[:, 3:6], "axisangle", "matrix") - gripper = action[:, 6:] - else: - raise ValueError(f"Unsupported action_space: {action_space}") - - rotation = _convert_rotation_to_repr(rotation_matrix, rotation_space) - return torch.cat([translation, rotation, gripper], dim=-1) - - -# --------------------------------------------------------------------------- -# Data structures -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class EpisodeData: - action_raw: torch.Tensor # (N, 7) per-frame raw actions for the full episode - state_raw: torch.Tensor # (N, 8) per-frame raw states for the full episode - task_description: str - dataset_ref_idx: int # index into DatasetActionService._hf_datasets - frame_start: int # first global frame index in the HF dataset - frame_end: int # one-past-last global frame index - - -@dataclass(frozen=True) -class DatasetServerConfig: - repo_id: list[str] - root: list[str | None] - action_space: str - rotation_space: str - pose_coordinate_frame: str - action_chunk_size: int - max_action_dim: int - split: str - send_video: bool - camera_mode: str - image_size: int - - -# --------------------------------------------------------------------------- -# Service -# 
--------------------------------------------------------------------------- - - -class DatasetActionService: - """Serves GT actions (and optionally GT video) from pre-loaded LIBERO LeRobot episodes.""" - - def __init__(self, cfg: DatasetServerConfig) -> None: - self.cfg = cfg - self.episodes_by_task: dict[str, list[EpisodeData]] = {} - self._hf_datasets: list[Any] = [] - self._lerobot_datasets: list[Any] = [] - self._task_state: dict[str, dict[str, int]] = {} - self._lock = threading.Lock() - - if cfg.camera_mode in ("concat_view", "both"): - self._image_keys = ["observation.images.image", "observation.images.wrist_image"] - elif cfg.camera_mode == "wrist_image": - self._image_keys = ["observation.images.wrist_image"] - else: - self._image_keys = ["observation.images.image"] - - self._load_datasets() - - def _load_datasets(self) -> None: - from lerobot.datasets.lerobot_dataset import LeRobotDataset - - for repo_id, root in zip(self.cfg.repo_id, self.cfg.root): - print(f"[{_ts()}] [dataset-server] loading repo_id={repo_id} root={root} ...", flush=True) - t0 = time.monotonic() - - dataset = LeRobotDataset(repo_id=repo_id, root=root) - tasks_df = dataset.meta.tasks - hf = dataset.hf_dataset - ds_ref_idx = len(self._hf_datasets) - self._hf_datasets.append(hf) - - if self.cfg.send_video: - delta_ts: dict[str, list[float]] = {k: [0.0] for k in self._image_keys} - video_dataset = LeRobotDataset(repo_id=repo_id, root=root, delta_timestamps=delta_ts) - self._lerobot_datasets.append(video_dataset) - else: - self._lerobot_datasets.append(None) - - for ep_meta in dataset.meta.episodes: - ep_idx = int(ep_meta["episode_index"]) # type: ignore[index] - start = int(ep_meta["dataset_from_index"]) # type: ignore[index] - end = int(ep_meta["dataset_to_index"]) # type: ignore[index] - - ep_slice = hf.select(range(start, end)) - actions = torch.tensor(np.array(ep_slice["action"], dtype=np.float32)) - states = torch.tensor(np.array(ep_slice["observation.state"], dtype=np.float32)) - - 
task_idx = int(ep_slice[0]["task_index"]) - matching = tasks_df[tasks_df["task_index"] == task_idx] - task_desc = str(matching.iloc[0].name) if not matching.empty else f"task_{task_idx}" - - self.episodes_by_task.setdefault(task_desc, []).append( - EpisodeData( - action_raw=actions, - state_raw=states, - task_description=task_desc, - dataset_ref_idx=ds_ref_idx, - frame_start=start, - frame_end=end, - ) - ) - - dt = time.monotonic() - t0 - print( - f"[{_ts()}] [dataset-server] loaded {repo_id}: {dataset.meta.total_episodes} episodes in {dt:.1f}s", - flush=True, - ) - - total_tasks = len(self.episodes_by_task) - total_eps = sum(len(eps) for eps in self.episodes_by_task.values()) - print( - f"[{_ts()}] [dataset-server] ready: {total_tasks} tasks, {total_eps} episodes " - f"send_video={self.cfg.send_video} camera_mode={self.cfg.camera_mode}", - flush=True, - ) - - def _load_video_frames(self, episode: EpisodeData, step: int, num_frames: int) -> list[str]: - """Load GT video frames from the dataset and encode as base64 PNGs. - - Uses the LeRobotDataset wrapper (not the raw HF dataset) so that video-backed - datasets are decoded correctly via the configured video backend. - - Args: - episode: Episode data with dataset reference. - step: Step offset within the episode (0-based). - num_frames: Number of frames to load (typically action_chunk_size + 1). - - Returns: - List of base64-encoded PNG strings. 
- """ - lr_dataset = self._lerobot_datasets[episode.dataset_ref_idx] - if lr_dataset is None: - return [] - image_size = self.cfg.image_size - b64_frames: list[str] = [] - - for i in range(num_frames): - global_idx = episode.frame_start + step + i - if global_idx >= episode.frame_end: - break - - item = lr_dataset[global_idx] - - pil_images: list[Image.Image] = [] - for key in self._image_keys: - img_tensor = item[key] - if isinstance(img_tensor, torch.Tensor): - # LeRobot returns (T, C, H, W) with delta_timestamps=[0.0] -> (1, C, H, W) - if img_tensor.dim() == 4: - img_tensor = img_tensor[0] - # (C, H, W) float [0, 1] -> PIL - arr = (img_tensor.permute(1, 2, 0).clamp(0, 1) * 255).to(torch.uint8).numpy() - img = Image.fromarray(arr) - elif isinstance(img_tensor, Image.Image): - img = img_tensor - else: - img = Image.fromarray(np.asarray(img_tensor, dtype=np.uint8)) - img = img.convert("RGB").resize((image_size, image_size), Image.Resampling.BILINEAR) - pil_images.append(img) - - if len(pil_images) > 1: - total_w = sum(im.width for im in pil_images) - combined = Image.new("RGB", (total_w, image_size)) - x = 0 - for im in pil_images: - combined.paste(im, (x, 0)) - x += im.width - frame = combined - else: - frame = pil_images[0] - - buf = io.BytesIO() - frame.save(buf, format="PNG") - b64_frames.append(base64.b64encode(buf.getvalue()).decode("ascii")) - - return b64_frames - - # -- state management -- - - def _get_task_state(self, prompt: str) -> dict[str, int]: - if prompt not in self._task_state: - self._task_state[prompt] = {"episode_idx": 0, "step": 0} - return self._task_state[prompt] - - def _resolve_prompt(self, prompt: str) -> str: - """Resolve prompt to a known task description (exact or substring match).""" - if prompt in self.episodes_by_task: - return prompt - prompt_lower = prompt.lower().strip() - for task_desc in self.episodes_by_task: - if task_desc.lower().strip() == prompt_lower: - return task_desc - for task_desc in self.episodes_by_task: - td_lower 
= task_desc.lower().strip() - if prompt_lower in td_lower or td_lower in prompt_lower: - return task_desc - raise ValueError( - f"Task not found for prompt: {prompt!r}. Available tasks: {sorted(self.episodes_by_task.keys())}" - ) - - # -- endpoints -- - - def get_info(self) -> dict[str, Any]: - return { - "type": "dataset_action_server", - "action_space": self.cfg.action_space, - "rotation_space": self.cfg.rotation_space, - "action_chunk_size": self.cfg.action_chunk_size, - "tasks": {k: len(v) for k, v in sorted(self.episodes_by_task.items())}, - } - - def predict(self, req: dict[str, Any]) -> dict[str, Any]: - prompt = req.get("prompt") - if not isinstance(prompt, str): - raise ValueError("'prompt' must be a string") - - resolved_prompt = self._resolve_prompt(prompt) - - with self._lock: - state = self._get_task_state(resolved_prompt) - episodes = self.episodes_by_task[resolved_prompt] - - ep_idx = state["episode_idx"] % len(episodes) - episode = episodes[ep_idx] - step = state["step"] - - # Number of valid actions = num_frames - 1 (need pairs of consecutive frames) - max_actions = len(episode.action_raw) - 1 - - if step >= max_actions: - state["episode_idx"] = (ep_idx + 1) % len(episodes) - state["step"] = 0 - ep_idx = state["episode_idx"] - episode = episodes[ep_idx] - step = 0 - max_actions = len(episode.action_raw) - 1 - - chunk_size = min(self.cfg.action_chunk_size, max_actions - step) - # Slice chunk+1 frames for action computation (needs next-frame state) - raw_slice_end = step + chunk_size + 1 - action_chunk_raw = episode.action_raw[step:raw_slice_end] - state_chunk_raw = episode.state_raw[step:raw_slice_end] - - processed = _process_action_chunk( - action_chunk_raw, - state_chunk_raw, - self.cfg.action_space, - self.cfg.rotation_space, - ) - - # Pad to max_action_dim (same as the Action transform pipeline) - t, d = processed.shape - if d < self.cfg.max_action_dim: - processed = torch.cat( - [processed, torch.zeros(t, self.cfg.max_action_dim - d)], - 
dim=-1, - ) - - state["step"] += chunk_size - - action_list = processed.float().numpy().tolist() - - video_b64: list[str] = [] - if self.cfg.send_video: - video_b64 = self._load_video_frames(episode, step, num_frames=chunk_size + 1) - - print( - f"[{_ts()}] [dataset-server] predict prompt={resolved_prompt!r} " - f"ep={ep_idx} step={step}..{state['step']} actions={len(action_list)} " - f"video_frames={len(video_b64)}", - flush=True, - ) - return {"action": action_list, "video": video_b64} - - def next_episode(self, prompt: str | None = None) -> dict[str, Any]: - with self._lock: - if prompt is not None: - resolved = self._resolve_prompt(prompt) - state = self._get_task_state(resolved) - episodes = self.episodes_by_task[resolved] - state["episode_idx"] = (state["episode_idx"] + 1) % len(episodes) - state["step"] = 0 - print( - f"[{_ts()}] [dataset-server] next_episode task={resolved!r} -> ep={state['episode_idx']}", - flush=True, - ) - return {"task": resolved, "episode_idx": state["episode_idx"]} - - for task in self._task_state: - episodes = self.episodes_by_task.get(task, []) - self._task_state[task]["episode_idx"] = (self._task_state[task]["episode_idx"] + 1) % max( - len(episodes), 1 - ) - self._task_state[task]["step"] = 0 - print(f"[{_ts()}] [dataset-server] next_episode (all tasks)", flush=True) - return {"advanced_all": True} - - def reset(self) -> dict[str, str]: - with self._lock: - self._task_state.clear() - print(f"[{_ts()}] [dataset-server] reset", flush=True) - return {"status": "reset"} - - -# --------------------------------------------------------------------------- -# HTTP handler -# --------------------------------------------------------------------------- - - -class _DatasetHandler(BaseHTTPRequestHandler): - server: ThreadingHTTPServer # type: ignore[assignment] - - def _send_json(self, status_code: int, payload: dict[str, Any]) -> None: - body = json.dumps(payload).encode("utf-8") - self.send_response(status_code) - 
self.send_header("Content-Type", "application/json") - self.send_header("Cache-Control", "no-store") - self.send_header("Content-Length", str(len(body))) - self.end_headers() - try: - self.wfile.write(body) - except (BrokenPipeError, ConnectionResetError): - return - - def _read_json_body(self) -> dict[str, Any] | None: - try: - length = int(self.headers.get("Content-Length") or "0") - except ValueError: - self._send_json(400, {"error": "Invalid Content-Length"}) - return None - body = self.rfile.read(max(0, length)) - if not body: - return {} - try: - req = json.loads(body.decode("utf-8")) - except Exception as e: - self._send_json(400, {"error": f"Invalid JSON: {e}"}) - return None - if not isinstance(req, dict): - self._send_json(400, {"error": "JSON body must be an object"}) - return None - return req - - def do_GET(self) -> None: # noqa: N802 - service: DatasetActionService = getattr(self.server, "service") - if self.path == "/info": - self._send_json(200, service.get_info()) - elif self.path == "/": - self._send_json(200, {"status": "ok"}) - else: - self._send_json(404, {"error": "Not found"}) - - def do_POST(self) -> None: # noqa: N802 - service: DatasetActionService = getattr(self.server, "service") - - if self.path in ("/", "/predict"): - req = self._read_json_body() - if req is None: - return - try: - out = service.predict(req) - except Exception as e: - print(f"[{_ts()}] [dataset-server] predict ERROR: {e}", flush=True) - self._send_json(400, {"action": [], "error": str(e)}) - return - self._send_json(200, out) - - elif self.path == "/next_episode": - req = self._read_json_body() - prompt = req.get("prompt") if req else None - try: - out = service.next_episode(prompt) - except Exception as e: - self._send_json(400, {"error": str(e)}) - return - self._send_json(200, out) - - elif self.path == "/reset": - out = service.reset() - self._send_json(200, out) - - else: - self._send_json(404, {"error": "Not found"}) - - def log_message(self, format: str, *args: 
Any) -> None: # noqa: A002 - return - - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- - - -def main() -> None: - parser = argparse.ArgumentParser( - description="HTTP server serving ground-truth actions from LIBERO LeRobot datasets." - ) - parser.add_argument( - "--repo_id", - type=str, - required=True, - help="Comma-separated LeRobot repo IDs (e.g. libero_10,libero_goal)", - ) - parser.add_argument( - "--root", - type=str, - required=True, - help="Comma-separated local paths to dataset roots (one per repo_id)", - ) - parser.add_argument( - "--action_space", - type=str, - default="frame_wise_relative", - choices=["relative", "frame_wise_relative"], - help="Action space (must match closed-loop eval's --action_space).", - ) - parser.add_argument( - "--rotation_space", - type=str, - default="6d", - choices=["3d", "6d", "9d"], - help="Rotation representation (must match closed-loop eval's action_dim).", - ) - parser.add_argument( - "--pose_coordinate_frame", - type=str, - default="native", - choices=["native", "opencv"], - help="Pose/action coordinate frame. 
Accepted for compatibility with LIBERO eval launchers.", - ) - parser.add_argument("--action_chunk_size", type=int, default=16, help="Number of actions per predict call") - parser.add_argument("--max_action_dim", type=int, default=32, help="Pad actions to this dimension") - parser.add_argument("--split", type=str, default="full", help="Dataset split (train/val/full)") - parser.add_argument( - "--send_video", - action="store_true", - help="Include GT video frames (base64 PNGs) in /predict responses, same format as the Action server.", - ) - parser.add_argument( - "--camera_mode", - type=str, - default="image", - choices=["agentview", "wrist_image", "concat_view", "both"], - help="Camera view(s) to include in video frames.", - ) - parser.add_argument("--image_size", type=int, default=256, help="Resize video frames to this height/width") - parser.add_argument("--host", type=str, default="0.0.0.0") - parser.add_argument("--port", type=int, default=8000) - args = parser.parse_args() - - repo_ids = [r.strip() for r in args.repo_id.split(",") if r.strip()] - roots = [r.strip() for r in args.root.split(",") if r.strip()] - if len(repo_ids) != len(roots): - raise ValueError(f"Number of repo_ids ({len(repo_ids)}) must match number of roots ({len(roots)})") - - cfg = DatasetServerConfig( - repo_id=repo_ids, - root=roots, - action_space=args.action_space, - rotation_space=args.rotation_space, - pose_coordinate_frame=args.pose_coordinate_frame, - action_chunk_size=int(args.action_chunk_size), - max_action_dim=int(args.max_action_dim), - split=args.split, - send_video=bool(args.send_video), - camera_mode=args.camera_mode, - image_size=int(args.image_size), - ) - - service = DatasetActionService(cfg) - local_ip = _get_local_ip() - - print( - f"[{_ts()}] [dataset-server] starting host={args.host} port={args.port} " - f"action_space={cfg.action_space} rotation_space={cfg.rotation_space} " - f"action_chunk_size={cfg.action_chunk_size}", - flush=True, - ) - print(f"[{_ts()}] 
[dataset-server] Server accessible at: http://{local_ip}:{args.port}/", flush=True) - print(f"[{_ts()}] [dataset-server] Endpoints:", flush=True) - print(f" - GET / : Health check", flush=True) - print(f" - GET /info : Dataset info (tasks, episode counts)", flush=True) - print(f" - POST /predict : Get next GT action chunk (same interface as Action server)", flush=True) - print(f" - POST /next_episode : Advance to next episode for a task", flush=True) - print(f" - POST /reset : Reset all per-task state", flush=True) - - httpd = ThreadingHTTPServer((args.host, int(args.port)), _DatasetHandler) - setattr(httpd, "service", service) - httpd.serve_forever() - - -if __name__ == "__main__": - main() diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md index 735c630..386f7c6 100644 --- a/docs/action_policy_libero_sft.md +++ b/docs/action_policy_libero_sft.md @@ -1,7 +1,7 @@ # Cosmos3-Nano LIBERO-10 action-policy SFT -Full SFT (no LoRA) of the public `nvidia/Cosmos3-Nano` base into a LIBERO-10 -action policy: vision + language in, action chunks out. +Full SFT of the public `nvidia/Cosmos3-Nano` base into a LIBERO-10 action +policy: vision + language in, action chunks out. | Piece | Path | | ---------------- | ----------------------------------------------------------------------------------------------- | diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh index 4188d9d..7ec4ccc 100755 --- a/examples/launch_sft_action_policy_libero.sh +++ b/examples/launch_sft_action_policy_libero.sh @@ -3,7 +3,7 @@ # SPDX-License-Identifier: OpenMDW-1.1 # Structured-TOML launch for action_policy_libero_nano — Cosmos3-Nano LIBERO -# action-policy SFT (HSDP, full SFT, no LoRA). Drives cosmos_framework.scripts.train +# action-policy SFT (HSDP, full SFT). Drives cosmos_framework.scripts.train # against examples/toml/sft_config/action_policy_libero_repro.toml. 
# # Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes From dd78c68a1c8a86ac412edd87f87e3e63c5928c3d Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 21:43:14 +0800 Subject: [PATCH 5/9] libero: model_loader = origin/main + no_dist only (drop unrelated deletions); EGL setup optional in doc --- cosmos_framework/utils/vfm/model_loader.py | 68 +++++++++++++++++++--- docs/action_policy_libero_sft.md | 13 +++-- 2 files changed, 68 insertions(+), 13 deletions(-) diff --git a/cosmos_framework/utils/vfm/model_loader.py b/cosmos_framework/utils/vfm/model_loader.py index b94817a..51140b3 100644 --- a/cosmos_framework/utils/vfm/model_loader.py +++ b/cosmos_framework/utils/vfm/model_loader.py @@ -18,7 +18,21 @@ try: from filelock import SoftReadWriteLock except ImportError: # Older filelock versions in some inference containers. - from filelock import ReadWriteLock as SoftReadWriteLock + try: + from filelock import ReadWriteLock as SoftReadWriteLock + except ImportError: + from filelock import FileLock + + class SoftReadWriteLock: + """Compatibility adapter for filelock versions without read/write locks.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + self._lock = FileLock(*args, **kwargs) + + def write_lock(self) -> FileLock: + return self._lock + + from torch.distributed.checkpoint.filesystem import FileSystemReader, FileSystemWriter from cosmos_framework.checkpoint.s3_filesystem import S3StorageReader @@ -171,6 +185,32 @@ def _checkpoint_cache_group_lock( yield action +def _reload_pretrained_reasoner_after_checkpoint_load(model: torch.nn.Module) -> None: + """Re-seed the reasoner pathway after a DCP load, mirroring the LoadPretrained + callback that runs during training (inference does not run training callbacks). 
+ + The decision is delegated entirely to the model's own gate in + ``load_pretrained_model_if_needed``: this is a no-op unless the model was built + with ``exclude_reasoner_weights_from_checkpoint=True`` (and pretrained weights + enabled), i.e. the case where the DCP checkpoint deliberately omits the reasoner + tower so it must be re-seeded from the pretrained source. For a normal checkpoint + that already contains the reasoner, the model's gate evaluates to False and + nothing is reloaded. + + ``has_resumable_checkpoint=True`` / ``has_load_path=False`` is load-bearing: it + re-seeds the reasoner from the pretrained source while skipping the + understanding->generation copy (the generation pathway was already populated by + the DCP load). Passing ``has_load_path=True`` would instead force a reasoner + reload even for non-excluded checkpoints, clobbering any fine-tuned reasoner + weights restored from the DCP. + """ + load_pretrained_model_if_needed = getattr(model, "load_pretrained_model_if_needed") + load_pretrained_model_if_needed( + has_resumable_checkpoint=True, + has_load_path=False, + ) + + def _load_model( model: torch.nn.Module, checkpoint_path: str, @@ -194,6 +234,9 @@ def _load_model( start_time = time.time() state_dict = ModelWrapper(model).state_dict() + if any(key.startswith("net_teacher.") for key in state_dict): + log.info("Dropping net_teacher.* keys from inference load target; distillation checkpoints do not save them.") + state_dict = {key: value for key, value in state_dict.items() if not key.startswith("net_teacher.")} if checkpoint_path.startswith("s3://"): storage_reader = S3StorageReader( @@ -209,12 +252,10 @@ def _load_model( keys_to_skip_loading=keys_to_skip_loading or [], ) - # Single-rank load (e.g. the action policy inference server): force no_dist so - # ``dcp.load`` skips the collective ``gather_object`` over the load plan. 
That - # gather pickles the plan, which fails with "cannot pickle code objects" for - # training/EMA DCPs whose metadata carries non-tensor objects; a single process - # owns the full checkpoint anyway, so the collective is unnecessary. Multi-rank - # (sharded) loads keep the default distributed path. + # Single-rank load (e.g. the action-policy inference server): force no_dist so + # ``dcp.load`` skips the collective ``gather_object`` over the load plan, which + # pickles the plan and can fail on training/EMA DCPs. Multi-rank loads keep the + # default distributed path. no_dist = not (dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1) dcp.load( @@ -360,6 +401,16 @@ def load_model_from_checkpoint( # Disable EMA for inference. config.model.config.ema.enabled = False + if hasattr(config.model.config, "load_teacher_weights"): + log.info("Setting load_teacher_weights=False for inference to skip teacher checkpoint download.") + config.model.config.load_teacher_weights = False + + if ( + config.model.config.exclude_reasoner_weights_from_checkpoint + and not config.model.config.vlm_config.pretrained_weights.enabled + ): + log.info("Enabling pretrained reasoner weights because this checkpoint excludes the reasoner tower from DCP.") + config.model.config.vlm_config.pretrained_weights.enabled = True config.validate() config.freeze() # type: ignore @@ -435,6 +486,7 @@ def load_model(checkpoint_load_path: str) -> None: if checkpoint_cache_path is None: load_model(checkpoint_path) + _reload_pretrained_reasoner_after_checkpoint_load(model) return model, config cache_lock_path = f"{checkpoint_cache_path}.lock" @@ -452,4 +504,6 @@ def load_model(checkpoint_load_path: str) -> None: if cache_action == _CheckpointCacheAction.LOAD_CACHE: load_model(checkpoint_cache_path) + _reload_pretrained_reasoner_after_checkpoint_load(model) + return model, config diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md index 386f7c6..c239499 
100644 --- a/docs/action_policy_libero_sft.md +++ b/docs/action_policy_libero_sft.md @@ -65,14 +65,15 @@ python -m cosmos_framework.scripts.action_policy_server_libero \ ``` The LIBERO sim needs a separate venv (robosuite/mujoco pins conflict with the -training env) and graphics enabled in the container: +training env): ```bash -export NVIDIA_DRIVER_CAPABILITIES=all -apt-get install -y libegl1 libglvnd0 libgl1 libglib2.0-0 ffmpeg -mkdir -p /usr/share/glvnd/egl_vendor.d -echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \ - > /usr/share/glvnd/egl_vendor.d/10_nvidia.json +# Optional — only on a headless container without working GPU EGL: +# export NVIDIA_DRIVER_CAPABILITIES=all +# apt-get install -y libegl1 libglvnd0 libgl1 libglib2.0-0 ffmpeg +# mkdir -p /usr/share/glvnd/egl_vendor.d +# echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \ +# > /usr/share/glvnd/egl_vendor.d/10_nvidia.json uv venv --python 3.10 .libenv && VV=.libenv/bin/python git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git && \ From 5f1847e894a9ca203b4b76b682d96f9dc106ee1b Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 21:49:31 +0800 Subject: [PATCH 6/9] libero: canonical recipe = HSDP 8x8 (replicate 8, max_samples 32 in launch -> gbs 2048) --- docs/action_policy_libero_sft.md | 8 ++++---- examples/launch_sft_action_policy_libero.sh | 3 ++- examples/toml/sft_config/action_policy_libero_repro.toml | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md index c239499..fa41346 100644 --- a/docs/action_policy_libero_sft.md +++ b/docs/action_policy_libero_sft.md @@ -41,13 +41,13 @@ export BASE_CHECKPOINT_PATH= export WAN_VAE_PATH= export IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root -bash examples/launch_sft_action_policy_libero.sh # HSDP 2x8; set NNODES/NODE_RANK/MASTER_ADDR per node +bash 
examples/launch_sft_action_policy_libero.sh # HSDP 8x8; set NNODES/NODE_RANK/MASTER_ADDR per node ``` Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars -(lr 5e-5, warmup 500, cycle 16000, global batch 2048, `save_iter=500`). Sweep the -saved checkpoints to pick the best iteration. On lower-memory GPUs reduce the -per-rank batch: `--opts dataloader_train.max_samples_per_batch=32`. +(lr 5e-5, warmup 500, cycle 16000, global batch 2048, `save_iter=500`). The launch +sets `max_samples_per_batch=32` (32 × 64 ranks = gbs 2048); reduce it further on +lower-memory GPUs. Sweep the saved checkpoints to pick the best iteration. ## 3. Closed-loop eval diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh index 7ec4ccc..1c40954 100755 --- a/examples/launch_sft_action_policy_libero.sh +++ b/examples/launch_sft_action_policy_libero.sh @@ -8,7 +8,7 @@ # # Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes # libero_10). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. The default recipe is -# HSDP 2x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node. +# HSDP 8x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node. # See docs/action_policy_libero_sft.md. # # Required env vars: @@ -40,6 +40,7 @@ EXTRA_DATASET_CHECK='[[ -f "$LIBERO_ROOT/meta/info.json" ]] || { echo "ERROR: LI # process), unlike a TAIL_OVERRIDES array set in your shell. Use it for smoke runs, # e.g. EXTRA_TAIL_OVERRIDES="trainer.max_iter=5 job.wandb_mode=offline". 
TAIL_OVERRIDES=( + dataloader_train.max_samples_per_batch=32 # 32 x (shard 8 x replicate 8) x ga1 = global batch 2048 ${EXTRA_TAIL_OVERRIDES:-} ) diff --git a/examples/toml/sft_config/action_policy_libero_repro.toml b/examples/toml/sft_config/action_policy_libero_repro.toml index d74237b..63ab032 100644 --- a/examples/toml/sft_config/action_policy_libero_repro.toml +++ b/examples/toml/sft_config/action_policy_libero_repro.toml @@ -2,7 +2,7 @@ # SPDX-License-Identifier: OpenMDW-1.1 # LIBERO-10 action-policy SFT run config for the `action_policy_libero_nano` -# experiment. Train on libero_10 alone (HSDP 2x8, global batch 2048). +# experiment. Train on libero_10 alone (HSDP 8x8, global batch 2048). # Env: LIBERO_ROOT, BASE_CHECKPOINT_PATH, WAN_VAE_PATH, IMAGINAIRE_OUTPUT_ROOT. # See docs/action_policy_libero_sft.md. @@ -20,7 +20,7 @@ max_num_tokens_after_packing = 74000 [model.parallelism] data_parallel_shard_degree = 8 -data_parallel_replicate_degree = 2 # HSDP 2x8 = 16 ranks (2 nodes) +data_parallel_replicate_degree = 8 # HSDP 8x8 = 64 ranks (8 nodes) [model.activation_checkpointing] mode = "selective" @@ -39,7 +39,7 @@ warm_up_steps = [500] [trainer] max_iter = 2000 logging_iter = 50 -grad_accum_iter = 1 # global batch = 128 x (8 x 2) x 1 = 2048 +grad_accum_iter = 1 # global batch = max_samples 32 x (shard 8 x replicate 8) x 1 = 2048 [checkpoint] load_path = "${oc.env:BASE_CHECKPOINT_PATH}" From 21d34caa733ec8f71fb384c9d41ac15a62622392 Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 21:52:57 +0800 Subject: [PATCH 7/9] libero: recipe = minimum HSDP 2x8 (gbs 2048, grad_accum 1); doc/launch synced --- docs/action_policy_libero_sft.md | 9 +++++---- examples/launch_sft_action_policy_libero.sh | 3 +-- examples/toml/sft_config/action_policy_libero_repro.toml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md index fa41346..63abb4b 100644 --- 
a/docs/action_policy_libero_sft.md +++ b/docs/action_policy_libero_sft.md @@ -41,13 +41,14 @@ export BASE_CHECKPOINT_PATH= export WAN_VAE_PATH= export IMAGINAIRE_OUTPUT_ROOT=/path/to/output_root -bash examples/launch_sft_action_policy_libero.sh # HSDP 8x8; set NNODES/NODE_RANK/MASTER_ADDR per node +bash examples/launch_sft_action_policy_libero.sh # HSDP 2x8; set NNODES/NODE_RANK/MASTER_ADDR per node ``` Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars -(lr 5e-5, warmup 500, cycle 16000, global batch 2048, `save_iter=500`). The launch -sets `max_samples_per_batch=32` (32 × 64 ranks = gbs 2048); reduce it further on -lower-memory GPUs. Sweep the saved checkpoints to pick the best iteration. +(lr 5e-5, warmup 500, cycle 16000, `save_iter=500`, HSDP 2x8). Global batch is +2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1; on lower-memory GPUs +reduce it: `--opts dataloader_train.max_samples_per_batch=64`. Sweep the saved +checkpoints to pick the best iteration. ## 3. Closed-loop eval diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh index 1c40954..7ec4ccc 100755 --- a/examples/launch_sft_action_policy_libero.sh +++ b/examples/launch_sft_action_policy_libero.sh @@ -8,7 +8,7 @@ # # Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes # libero_10). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. The default recipe is -# HSDP 8x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node. +# HSDP 2x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node. # See docs/action_policy_libero_sft.md. # # Required env vars: @@ -40,7 +40,6 @@ EXTRA_DATASET_CHECK='[[ -f "$LIBERO_ROOT/meta/info.json" ]] || { echo "ERROR: LI # process), unlike a TAIL_OVERRIDES array set in your shell. Use it for smoke runs, # e.g. EXTRA_TAIL_OVERRIDES="trainer.max_iter=5 job.wandb_mode=offline". 
TAIL_OVERRIDES=( - dataloader_train.max_samples_per_batch=32 # 32 x (shard 8 x replicate 8) x ga1 = global batch 2048 ${EXTRA_TAIL_OVERRIDES:-} ) diff --git a/examples/toml/sft_config/action_policy_libero_repro.toml b/examples/toml/sft_config/action_policy_libero_repro.toml index 63ab032..a0c49c7 100644 --- a/examples/toml/sft_config/action_policy_libero_repro.toml +++ b/examples/toml/sft_config/action_policy_libero_repro.toml @@ -2,7 +2,7 @@ # SPDX-License-Identifier: OpenMDW-1.1 # LIBERO-10 action-policy SFT run config for the `action_policy_libero_nano` -# experiment. Train on libero_10 alone (HSDP 8x8, global batch 2048). +# experiment. Train on libero_10 alone (HSDP 2x8, global batch 2048). # Env: LIBERO_ROOT, BASE_CHECKPOINT_PATH, WAN_VAE_PATH, IMAGINAIRE_OUTPUT_ROOT. # See docs/action_policy_libero_sft.md. @@ -20,7 +20,7 @@ max_num_tokens_after_packing = 74000 [model.parallelism] data_parallel_shard_degree = 8 -data_parallel_replicate_degree = 8 # HSDP 8x8 = 64 ranks (8 nodes) +data_parallel_replicate_degree = 2 # HSDP 2x8 = 16 ranks (2 nodes); minimum for gbs 2048 at grad_accum 1 [model.activation_checkpointing] mode = "selective" @@ -39,7 +39,7 @@ warm_up_steps = [500] [trainer] max_iter = 2000 logging_iter = 50 -grad_accum_iter = 1 # global batch = max_samples 32 x (shard 8 x replicate 8) x 1 = 2048 +grad_accum_iter = 1 # global batch = max_samples 128 x (shard 8 x replicate 2) x 1 = 2048 [checkpoint] load_path = "${oc.env:BASE_CHECKPOINT_PATH}" From 82a5a840a8dcdc79517ac4e25693e049002b86b4 Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 21:58:16 +0800 Subject: [PATCH 8/9] libero: move lower-mem caveat to Heads-up section; drop all-suites mention --- .../posttrain_config/action_policy_libero_nano.py | 3 +-- .../data/vfm/action/datasets/action_sft_dataset.py | 2 +- docs/action_policy_libero_sft.md | 11 +++++++---- examples/launch_sft_action_policy_libero.sh | 6 +++--- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git 
a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py index e5b5696..b05d3ea 100644 --- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py +++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_libero_nano.py @@ -195,8 +195,7 @@ def _action_policy_libero_nano_model_config() -> dict: libero=dict( ratio=1, dataset=L(get_action_libero_sft_dataset)( - # Local LeRobot dir for the libero_10 suite ONLY (the - # full suite mix dilutes libero_10; see module docstring). Use the + # Local LeRobot dir for the libero_10 suite ONLY. Use the # 20 FPS nvidia/LIBERO_LeRobot_v3 (matches the bundled stats + 20 Hz eval): # hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset \ # --include 'libero_10/**' --local-dir # LIBERO_ROOT=/libero_10 diff --git a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py index 7776875..96a5219 100644 --- a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py +++ b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py @@ -176,7 +176,7 @@ def get_action_libero_sft_dataset( → 256x512) through ``ActionTransformPipeline``. ``root`` is a LOCAL LeRobot dir (read parquet + video directly); pre-sync the HF dataset once, e.g. ``hf download lerobot/libero_10 --repo-type dataset --local-dir ``. Point - ``root`` at libero_10 alone (the all-suites mix dilutes libero_10 per step). The + ``root`` at libero_10 alone. The dataset is FPS-agnostic (decodes at real frame timestamps); ``fps`` is metadata for ``conditioning_fps`` / prompt duration. 
""" diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md index 63abb4b..84a9cbc 100644 --- a/docs/action_policy_libero_sft.md +++ b/docs/action_policy_libero_sft.md @@ -46,8 +46,7 @@ bash examples/launch_sft_action_policy_libero.sh # HSDP 2x8; set NNODES/NODE_R Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars (lr 5e-5, warmup 500, cycle 16000, `save_iter=500`, HSDP 2x8). Global batch is -2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1; on lower-memory GPUs -reduce it: `--opts dataloader_train.max_samples_per_batch=64`. Sweep the saved +2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1. Sweep the saved checkpoints to pick the best iteration. ## 3. Closed-loop eval @@ -93,9 +92,13 @@ MUJOCO_GL=egl PYTHONPATH=$PWD:$PWD/LIBERO $VV \ --output_dir results/libero_closed_loop_10 ``` -## 4. Eval parity +## 4. Heads-up -The client/server already handle these; verify them if accuracy is low: +- **Lower-memory GPUs** — reduce the per-rank batch: + `--opts dataloader_train.max_samples_per_batch=64` (scale `replicate` to keep + global batch 2048). + +Eval parity — the client/server already handle these; verify if accuracy is low: - **Concat layout** — run with `--camera agentview,wrist --image_size 256` so the 256×512 concat matches training (the server snaps it to 192×320 identically). diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh index 7ec4ccc..24ab760 100755 --- a/examples/launch_sft_action_policy_libero.sh +++ b/examples/launch_sft_action_policy_libero.sh @@ -6,9 +6,9 @@ # action-policy SFT (HSDP, full SFT). Drives cosmos_framework.scripts.train # against examples/toml/sft_config/action_policy_libero_repro.toml. # -# Point LIBERO_ROOT at the libero_10 suite ONLY (the full suite mix dilutes -# libero_10). Use the 20 FPS nvidia/LIBERO_LeRobot_v3. 
The default recipe is -# HSDP 2x8 (global batch 2048); set NNODES/NODE_RANK/MASTER_ADDR per node. +# Point LIBERO_ROOT at the libero_10 suite ONLY. Use the 20 FPS +# nvidia/LIBERO_LeRobot_v3. The default recipe is HSDP 2x8 (global batch 2048); +# set NNODES/NODE_RANK/MASTER_ADDR per node. # See docs/action_policy_libero_sft.md. # # Required env vars: From 4d351ddcbb1a6fea9d62d5020829b1caa16f6908 Mon Sep 17 00:00:00 2001 From: Liang Hao Date: Fri, 26 Jun 2026 22:09:40 +0800 Subject: [PATCH 9/9] libero: lint launch headers (drop GPU counts), drop sweep mention --- docs/action_policy_libero_sft.md | 3 +-- examples/launch_sft_action_policy_libero.sh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/action_policy_libero_sft.md b/docs/action_policy_libero_sft.md index 84a9cbc..660de11 100644 --- a/docs/action_policy_libero_sft.md +++ b/docs/action_policy_libero_sft.md @@ -46,8 +46,7 @@ bash examples/launch_sft_action_policy_libero.sh # HSDP 2x8; set NNODES/NODE_R Recipe knobs live in `action_policy_libero_nano`; the TOML sets run-level scalars (lr 5e-5, warmup 500, cycle 16000, `save_iter=500`, HSDP 2x8). Global batch is -2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1. Sweep the saved -checkpoints to pick the best iteration. +2048 = `max_samples_per_batch` 128 × 16 ranks × grad_accum 1. ## 3. 
Closed-loop eval diff --git a/examples/launch_sft_action_policy_libero.sh b/examples/launch_sft_action_policy_libero.sh index 24ab760..29a9a16 100755 --- a/examples/launch_sft_action_policy_libero.sh +++ b/examples/launch_sft_action_policy_libero.sh @@ -23,7 +23,7 @@ # hf download nvidia/LIBERO_LeRobot_v3 --repo-type dataset --include 'libero_10/**' --local-dir # export LIBERO_ROOT=/libero_10 # -# Usage (8-GPU allocation, inside the training container, from the repo root): +# Usage (HSDP 2x8; set NNODES/NODE_RANK/MASTER_ADDR per node): # LIBERO_ROOT=/libero_10 bash examples/launch_sft_action_policy_libero.sh TOML_FILE="examples/toml/sft_config/action_policy_libero_repro.toml"