From 992ed807125a27a38849298e74f49f83e62b75db Mon Sep 17 00:00:00 2001
From: Chunyu Li
Date: Fri, 20 Dec 2024 00:34:00 +0800
Subject: [PATCH] Released training code for U-Net and SyncNet

---
 README.md                                     | 28 ++++-
 configs/syncnet/syncnet_16_pixel.yaml         | 4 +-
 configs/unet/first_stage.yaml                 | 102 ++++++++++++++++++
 ...nt_16_diffusion.yaml => second_stage.yaml} | 0
 inference.sh                                  | 2 +-
 train_unet.sh                                 | 2 +-
 6 files changed, 130 insertions(+), 8 deletions(-)
 create mode 100644 configs/unet/first_stage.yaml
 rename configs/unet/{unet_latent_16_diffusion.yaml => second_stage.yaml} (100%)

diff --git a/README.md b/README.md
index d38ee6e..9d2a762 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ We present LatentSync, an end-to-end lip sync framework based on audio condition
 - [x] Inference code and checkpoints
 - [x] Data processing pipeline
-- [ ] Training code
+- [x] Training code
 
 ## 🔧 Setting up the Environment
@@ -114,11 +114,11 @@ The complete data processing pipeline includes the following steps:
 1. Remove the broken video files.
 2. Resample the video FPS to 25, and resample the audio to 16000 Hz.
-3. Scene detect.
+3. Detect scenes via [PySceneDetect](https://github.com/Breakthrough/PySceneDetect).
 4. Split each video into 5-10 second segments.
 5. Remove videos where the face is smaller than 256 $\times$ 256, as well as videos with more than one face.
-6. Affine transform the faces according to landmarks, then resize to 256 $\times$ 256.
-7. Remove videos with sync conf lower than 3, and adjust the audio-visual offset to 0.
+6. Affine transform the faces according to the landmarks detected by [face-alignment](https://github.com/1adrianb/face-alignment), then resize to 256 $\times$ 256.
+7. Remove videos with a [sync confidence score](https://www.robots.ox.ac.uk/~vgg/publications/2016/Chung16a/chung16a.pdf) lower than 3, and adjust the audio-visual offset to 0.
 8. Calculate [hyperIQA](https://openaccess.thecvf.com/content_CVPR_2020/papers/Su_Blindly_Assess_Image_Quality_in_the_Wild_Guided_by_a_CVPR_2020_paper.pdf) score, and remove videos with scores lower than 40.
 
 Run the script to execute the data processing pipeline:
 
 ```
 ./data_processing_pipeline.sh
 ```
 
 You can change the parameter `input_dir` in the script to specify the data directory to be processed. The processed data will be saved in the same directory. Each step will generate a new directory to prevent the need to redo the entire pipeline in case the process is interrupted by an unexpected error.
+
+## 🏋️‍♂️ Training U-Net
+
+Before training, you must process the data as described above and download all the checkpoints. We released a pretrained SyncNet with 94% accuracy on the VoxCeleb2 dataset to supervise U-Net training. Note that this SyncNet is trained on affine-transformed videos, so when using or evaluating it, you need to apply the affine transformation to the videos first (the affine transformation code is included in the data processing pipeline).
+
+If all the preparations are complete, you can train the U-Net with the following script:
+
+```bash
+./train_unet.sh
+```
+
+You should change the parameters in the U-Net config file to specify the data directory, checkpoint save path, and other training hyperparameters.
+
+## 🏋️‍♂️ Training SyncNet
+
+If you want to train SyncNet on your own datasets, you can run the following script:
+
+```bash
+./train_syncnet.sh
+```
diff --git a/configs/syncnet/syncnet_16_pixel.yaml b/configs/syncnet/syncnet_16_pixel.yaml
index aec5365..1aabb3e 100644
--- a/configs/syncnet/syncnet_16_pixel.yaml
+++ b/configs/syncnet/syncnet_16_pixel.yaml
@@ -15,7 +15,7 @@ model:
 ckpt:
   resume_ckpt_path: ""
   inference_ckpt_path: checkpoints/latentsync_syncnet.pt
-  save_ckpt_steps: 20
+  save_ckpt_steps: 2500
 
 data:
   train_output_dir: debug/syncnet
@@ -40,6 +40,6 @@ optimizer:
 
 run:
   max_train_steps: 10000000
-  validation_steps: 20
+  validation_steps: 2500
   mixed_precision_training: true
   seed: 42
diff --git a/configs/unet/first_stage.yaml b/configs/unet/first_stage.yaml
new file mode 100644
index 0000000..9dbff62
--- /dev/null
+++ b/configs/unet/first_stage.yaml
@@ -0,0 +1,102 @@
+data:
+  syncnet_config_path: configs/syncnet/syncnet_16_pixel.yaml
+  train_output_dir: debug/unet
+  train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
+  train_data_dir: ""
+  audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/whisper_new
+
+  val_video_path: assets/demo1_video.mp4
+  val_audio_path: assets/demo1_audio.wav
+  batch_size: 8 # 8
+  num_workers: 11 # 11
+  num_frames: 16
+  resolution: 256
+  mask: fix_mask
+  audio_sample_rate: 16000
+  video_fps: 25
+
+ckpt:
+  resume_ckpt_path: checkpoints/latentsync_unet.pt
+  save_ckpt_steps: 5000
+
+run:
+  pixel_space_supervise: false
+  use_syncnet: true
+  sync_loss_weight: 0.05 # 1/283
+  perceptual_loss_weight: 0.1 # 0.1
+  recon_loss_weight: 1 # 1
+  guidance_scale: 1.0 # 1.5 or 1.0
+  trepa_loss_weight: 10
+  inference_steps: 20
+  seed: 1247
+  use_mixed_noise: true
+  mixed_noise_alpha: 1 # 1
+  mixed_precision_training: true
+  enable_gradient_checkpointing: false
+  enable_xformers_memory_efficient_attention: true
+  max_train_steps: 10000000
+  max_train_epochs: -1
+
+optimizer:
+  lr: 1e-5
+  scale_lr: false
+  max_grad_norm: 1.0
+  lr_scheduler: constant
+  lr_warmup_steps: 0
+
+model:
+  act_fn: silu
+  add_audio_layer: true
+  custom_audio_layer: false
+  audio_condition_method: cross_attn # Choose between [cross_attn, group_norm]
+  attention_head_dim: 8
+  block_out_channels: [320, 640, 1280, 1280]
+  center_input_sample: false
+  cross_attention_dim: 384
+  down_block_types:
+    [
+      "CrossAttnDownBlock3D",
+      "CrossAttnDownBlock3D",
+      "CrossAttnDownBlock3D",
+      "DownBlock3D",
+    ]
+  mid_block_type: UNetMidBlock3DCrossAttn
+  up_block_types:
+    [
+      "UpBlock3D",
+      "CrossAttnUpBlock3D",
+      "CrossAttnUpBlock3D",
+      "CrossAttnUpBlock3D",
+    ]
+  downsample_padding: 1
+  flip_sin_to_cos: true
+  freq_shift: 0
+  in_channels: 13 # 49
+  layers_per_block: 2
+  mid_block_scale_factor: 1
+  norm_eps: 1e-5
+  norm_num_groups: 32
+  out_channels: 4 # 16
+  sample_size: 64
+  resnet_time_scale_shift: default # Choose between [default, scale_shift]
+  unet_use_cross_frame_attention: false
+  unet_use_temporal_attention: false
+
+  # We do not use the motion module in the final version of LatentSync.
+  # When we started the project, we built on the AnimateDiff codebase and tried the motion module,
+  # but the results were poor, so we decided to leave the code here for possible future use.
+  use_motion_module: false
+  motion_module_resolutions: [1, 2, 4, 8]
+  motion_module_mid_block: false
+  motion_module_decoder_only: false
+  motion_module_type: Vanilla
+  motion_module_kwargs:
+    num_attention_heads: 8
+    num_transformer_block: 1
+    attention_block_types:
+      - Temporal_Self
+      - Temporal_Self
+    temporal_position_encoding: true
+    temporal_position_encoding_max_len: 16
+    temporal_attention_dim_div: 1
+    zero_initialize: true
diff --git a/configs/unet/unet_latent_16_diffusion.yaml b/configs/unet/second_stage.yaml
similarity index 100%
rename from configs/unet/unet_latent_16_diffusion.yaml
rename to configs/unet/second_stage.yaml
diff --git a/inference.sh b/inference.sh
index c9cdeeb..36dc9b2 100755
--- a/inference.sh
+++ b/inference.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 python -m scripts.inference \
-    --unet_config_path "configs/unet/unet_latent_16_diffusion.yaml" \
+    --unet_config_path "configs/unet/second_stage.yaml" \
     --inference_ckpt_path "checkpoints/latentsync_unet.pt" \
     --video_path "assets/demo1_video.mp4" \
     --audio_path "assets/demo1_audio.wav" \
diff --git a/train_unet.sh b/train_unet.sh
index 474fdc9..28f6344 100755
--- a/train_unet.sh
+++ b/train_unet.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 
 torchrun --nnodes=1 --nproc_per_node=1 --master_port=25678 -m scripts.train_unet \
-    --unet_config_path "configs/unet/unet_latent_16_diffusion.yaml"
+    --unet_config_path "configs/unet/first_stage.yaml"
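
As a usage note: before running `./train_unet.sh`, the fields most users will need to edit in `configs/unet/first_stage.yaml` are the data paths and checkpoint settings introduced in the patch above. The snippet below is a minimal sketch of that subset; the `/path/to/...` values are placeholders for your own processed data, not defaults shipped with the repo, and the comments reflect the key names rather than confirmed behavior.

```yaml
data:
  train_output_dir: debug/unet                      # default from the patch; training outputs land here
  train_fileslist: /path/to/your_fileslist.txt      # placeholder: list of your processed video files
  audio_cache_dir: /path/to/your_audio_cache        # placeholder: cache directory for audio features
  batch_size: 8                                     # default from the patch; adjust to your GPU memory

ckpt:
  resume_ckpt_path: checkpoints/latentsync_unet.pt  # checkpoint to start from (default from the patch)
  save_ckpt_steps: 5000                             # interval in steps between checkpoint saves
```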