README.md: 3 additions & 0 deletions

@@ -54,6 +54,7 @@ SimpleTuner provides comprehensive training support across multiple diffusion mo

- **Multi-GPU training** - Distributed training across multiple GPUs with automatic optimization
- **Advanced caching** - Image, video, audio, and caption embeddings cached to disk for faster training
- **Aspect bucketing** - Support for varied image/video sizes and aspect ratios
- **Concept sliders** - Slider-friendly targeting for LoRA/LyCORIS/full (via LyCORIS `full`) with positive/negative/neutral sampling and per-prompt strength; see the [Slider LoRA guide](/documentation/SLIDER_LORA.md)
- **Memory optimization** - Most models trainable on 24G GPU, many on 16G with optimizations
- **DeepSpeed & FSDP2 integration** - Train large models on smaller GPUs with optim/grad/parameter sharding, context parallel attention, gradient checkpointing, and optimizer state offload

documentation/DREAMBOOTH.md: 24 additions & 0 deletions

@@ -222,6 +222,30 @@ Alternatively, one might use the real name of their subject, or a 'similar enoug

After a number of training experiments, it seems as though a 'similar enough' celebrity is the best choice, especially if prompting the model for the person's real name ends up looking dissimilar.

# Scheduled Sampling (Rollout)

When training on small datasets like in Dreambooth, models can quickly overfit to the "perfect" noise added during training. This leads to **exposure bias**: the model learns to denoise perfect inputs but fails when faced with its own slightly imperfect outputs during inference.

**Scheduled Sampling (Rollout)** addresses this by occasionally letting the model generate its own noisy latents for a few steps during the training loop. Instead of training on pure Gaussian noise + signal, it trains on "rollout" samples that contain the model's own previous errors. This teaches the model to correct itself, leading to more robust and stable subject generation.

> 🟢 This feature is experimental but highly recommended for small datasets where overfitting or "frying" is common.

> ⚠️ Enabling rollout increases compute requirements, as the model must perform extra inference steps during the training loop.

To enable it, add these keys to your `config.json`:

```json
{
  "scheduled_sampling_max_step_offset": 10,
  "scheduled_sampling_probability": 1.0,
  "scheduled_sampling_ramp_steps": 1000,
  "scheduled_sampling_sampler": "unipc"
}
```

* `scheduled_sampling_max_step_offset`: How many steps to generate. A small value (e.g., 5-10) is often enough.
* `scheduled_sampling_probability`: How often to apply this technique (0.0 to 1.0).
* `scheduled_sampling_ramp_steps`: Ramp up the probability over the first N steps to avoid destabilizing early training.

# Exponential moving average (EMA)
A second model can be trained in parallel to your checkpoint, nearly for free - only the resulting system memory (by default) is consumed, rather than more VRAM.

documentation/OPTIONS.md: 80 additions & 4 deletions

@@ -619,6 +619,74 @@ See the [DATALOADER.md](DATALOADER.md#automatic-dataset-oversubscription) guide

- **What**: Train a model using a more gradual weighting on the loss landscape.
- **Why**: When training pixel diffusion models, they will simply degrade without using a specific loss weighting schedule. This is the case with DeepFloyd, where soft-min-snr-gamma was found to essentially be mandatory for good results. You may find success with latent diffusion model training, but in small experiments, it was found to potentially produce blurry results.

### `--diff2flow_enabled`

- **What**: Enable the Diffusion-to-Flow bridge for epsilon or v-prediction models.
- **Why**: Allows models trained with standard diffusion objectives to use flow-matching targets (noise - latents) without changing the model architecture.
- **Note**: Experimental feature.

### `--diff2flow_loss`

- **What**: Train with Flow Matching loss instead of the native prediction loss.
- **Why**: When enabled alongside `--diff2flow_enabled`, this calculates the loss against the flow target (noise - latents) instead of the model's native target (epsilon or velocity).
- **Note**: Requires `--diff2flow_enabled`.
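
If both options are set, training targets the flow objective directly. A minimal `config.json` sketch (assuming the config keys mirror the CLI flag names, as the scheduled sampling keys do; the `true` values are illustrative, not defaults):

```json
{
  "diff2flow_enabled": true,
  "diff2flow_loss": true
}
```
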
### `--scheduled_sampling_max_step_offset`

- **What**: Maximum number of steps to "roll out" during training.
- **Why**: Enables Scheduled Sampling (Rollout), where the model generates its own inputs for a few steps during training. This helps the model learn to correct its own errors and reduces exposure bias.
- **Default**: 0 (disabled). Set to a positive integer (e.g., 5 or 10) to enable.

### `--scheduled_sampling_strategy`

- **What**: Strategy for choosing the rollout offset.
- **Why**: Controls the distribution of rollout lengths. `uniform` samples evenly; `biased_early` favors shorter rollouts; `biased_late` favors longer rollouts.

### `--scheduled_sampling_probability`

- **What**: Probability of applying a non-zero rollout offset for a given sample.
- **Default**: 0.0.
- **Why**: Controls how often scheduled sampling is applied. A value of 0.0 disables it even if `max_step_offset` is > 0. A value of 1.0 applies it to every sample.
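
Combining the three options above, a minimal illustrative `config.json` sketch that rolls out up to 5 steps on every sample, with uniformly sampled offsets (key names assumed to mirror the flags; the values are examples only):

```json
{
  "scheduled_sampling_max_step_offset": 5,
  "scheduled_sampling_strategy": "uniform",
  "scheduled_sampling_probability": 1.0
}
```
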
### `--scheduled_sampling_prob_start`

- **What**: Initial probability for scheduled sampling at the start of the ramp.
- **Default**: 0.0.

### `--scheduled_sampling_prob_end`

- **What**: Final probability for scheduled sampling at the end of the ramp.
- **Default**: 0.5.

### `--scheduled_sampling_ramp_steps`

- **What**: Number of steps to ramp the probability from `prob_start` to `prob_end`.
- **Default**: 0 (no ramp).

### `--scheduled_sampling_start_step`

- **What**: Global step to start the scheduled sampling ramp.
- **Default**: 0.0.

### `--scheduled_sampling_ramp_shape`

- **What**: Shape of the probability ramp.
- **Choices**: `linear`, `cosine`.
- **Default**: `linear`.
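
The five ramp options above work as a unit. A hedged `config.json` sketch that ramps the rollout probability from 0.0 to 0.5 over the first 1000 steps with a cosine shape (key names assumed to mirror the flags; values are examples only):

```json
{
  "scheduled_sampling_max_step_offset": 10,
  "scheduled_sampling_prob_start": 0.0,
  "scheduled_sampling_prob_end": 0.5,
  "scheduled_sampling_ramp_steps": 1000,
  "scheduled_sampling_ramp_shape": "cosine",
  "scheduled_sampling_start_step": 0
}
```

Ramping the probability keeps early training on standard noise and only gradually introduces rollout samples, which helps avoid destabilizing the run at the start, as noted in the Dreambooth guide above.
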
### `--scheduled_sampling_sampler`

- **What**: The solver used for the rollout generation steps.
- **Choices**: `unipc`, `euler`, `dpm`, `rk4`.
- **Default**: `unipc`.
### `--scheduled_sampling_order`

- **What**: The order of the solver used for rollout.