Released inference code and checkpoints
chunyu-li committed Dec 19, 2024
1 parent 779c065 commit bf00e09
Showing 106 changed files with 115,408 additions and 3 deletions.
46 changes: 45 additions & 1 deletion .gitignore
@@ -1 +1,45 @@
.DS_Store
# PyCharm files
.idea/

# macOS dir files
.DS_Store

# VS Code configuration dir
.vscode/

# Jupyter Notebook cache files
.ipynb_checkpoints/
*.ipynb

# Python cache files
__pycache__/

# folders
wandb/
*debug*
/debug
/output
/validation
/test
/models/
/venv/
/detect_results/
/temp

# checkpoint files
*.safetensors
*.ckpt
*.pt

# data files
*.mp4
*.avi
*.wav
*.png
*.jpg
*.jpeg
*.csv

!/latentsync/utils/mask.png
/checkpoints/
!/assets/*
41 changes: 39 additions & 2 deletions README.md
@@ -65,8 +65,45 @@ We present LatentSync, an end-to-end lip sync framework based on audio condition

(Photorealistic videos are filmed by contracted models, and anime videos are from [VASA-1](https://www.microsoft.com/en-us/research/project/vasa-1/) and [EMO](https://humanaigc.github.io/emote-portrait-alive/))

## 📑 Open-Source Plan
## 📑 Open-source Plan

- [ ] Inference code and checkpoints
- [x] Inference code and checkpoints
- [ ] Data processing pipeline
- [ ] Training code

## Setting up the Environment

Install the required packages and download the checkpoints via:

```bash
source setup_env.sh
```

If the download is successful, the checkpoints should appear as follows:

```
./checkpoints/
|-- latentsync_unet.pt
|-- latentsync_syncnet.pt
|-- whisper
| `-- tiny.pt
|-- auxiliary
| |-- 2DFAN4-cd938726ad.zip
| |-- i3d_torchscript.pt
| |-- koniq_pretrained.pkl
| |-- s3fd-619a316812.pth
| |-- sfd_face.pth
| |-- syncnet_v2.model
| |-- vgg16-397923af.pth
| `-- vit_g_hybrid_pt_1200e_ssv2_ft.pth
```

These already include all the checkpoints required for LatentSync training and inference. If you only want to try inference, you only need to download `latentsync_unet.pt` and `tiny.pt` from our [HuggingFace repo](https://huggingface.co/chunyu-li/LatentSync).
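
If you prefer to fetch just those two files programmatically, here is a minimal sketch using `huggingface_hub`; the in-repo path `whisper/tiny.pt` is an assumption based on the local layout shown above.

```python
from huggingface_hub import hf_hub_download

# Assumption: filenames in the HuggingFace repo mirror the local ./checkpoints/ tree shown above.
hf_hub_download(repo_id="chunyu-li/LatentSync", filename="latentsync_unet.pt", local_dir="checkpoints")
hf_hub_download(repo_id="chunyu-li/LatentSync", filename="whisper/tiny.pt", local_dir="checkpoints")
```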

## Inference

Run the following script for inference; it requires about 6.5 GB of GPU memory.

```bash
./inference.sh
```
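
Before launching, you can quickly check whether your GPU meets the ~6.5 GB requirement; a small sketch using PyTorch (not part of the repo):

```python
import torch

# Print the total memory of the first CUDA device to confirm it covers the ~6.5 GB needed for inference.
props = torch.cuda.get_device_properties(0)
print(f"{props.name}: {props.total_memory / 1024**3:.1f} GB total")
```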
Binary file added assets/demo1_audio.wav
Binary file not shown.
Binary file added assets/demo1_video.mp4
Binary file not shown.
Binary file added assets/demo2_audio.wav
Binary file not shown.
Binary file added assets/demo2_video.mp4
Binary file not shown.
Binary file added assets/demo3_audio.wav
Binary file not shown.
Binary file added assets/demo3_video.mp4
Binary file not shown.
23 changes: 23 additions & 0 deletions configs/audio.yaml
@@ -0,0 +1,23 @@
audio:
num_mels: 80 # Number of mel-spectrogram channels and local conditioning dimensionality
rescale: true # Whether to rescale audio prior to preprocessing
rescaling_max: 0.9 # Rescaling value
use_lws: false # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
# It's preferred to set this to true for use with https://github.com/r9y9/wavenet_vocoder
# Does not work if n_fft is not a multiple of hop_size!!
n_fft: 800 # Extra window size is filled with 0 paddings to match this parameter
hop_size: 200 # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
win_size: 800 # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
sample_rate: 16000 # 16000Hz (corresponding to librispeech) (sox --i <filename>)
frame_shift_ms: null
signal_normalization: true
allow_clipping_in_normalization: true
symmetric_mels: true
max_abs_value: 4.0
preemphasize: true # whether to apply filter
preemphasis: 0.97 # filter coefficient.
min_level_db: -100
ref_level_db: 20
fmin: 55
fmax: 7600
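
For reference, these STFT/mel parameters describe a standard 80-bin mel spectrogram at 16 kHz; below is a minimal sketch of the equivalent `librosa` call (an assumption — the repo's own mel extraction, including rescaling, pre-emphasis, and dB normalization, may differ).

```python
import librosa
import numpy as np

# Assumption: mirrors the n_fft/hop_size/win_size/fmin/fmax values from configs/audio.yaml.
wav, sr = librosa.load("assets/demo1_audio.wav", sr=16000)
mel = librosa.feature.melspectrogram(
    y=wav, sr=sr, n_fft=800, hop_length=200, win_length=800, n_mels=80, fmin=55, fmax=7600
)
mel_db = librosa.power_to_db(mel, ref=np.max)
print(mel_db.shape)  # (80, num_frames); a 200-sample hop at 16 kHz is 12.5 ms per frame
```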
13 changes: 13 additions & 0 deletions configs/scheduler_config.json
@@ -0,0 +1,13 @@
{
"_class_name": "DDIMScheduler",
"_diffusers_version": "0.6.0.dev0",
"beta_end": 0.012,
"beta_schedule": "scaled_linear",
"beta_start": 0.00085,
"clip_sample": false,
"num_train_timesteps": 1000,
"set_alpha_to_one": false,
"steps_offset": 1,
"trained_betas": null,
"skip_prk_steps": true
}
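
This is a standard `diffusers` DDIM scheduler config; a minimal sketch of instantiating it from the file (the repo's inference code may construct the scheduler differently):

```python
import json
from diffusers import DDIMScheduler

# Load the scheduler configuration shipped in this commit.
with open("configs/scheduler_config.json") as f:
    scheduler = DDIMScheduler.from_config(json.load(f))
print(scheduler.config.num_train_timesteps)  # 1000
```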
46 changes: 46 additions & 0 deletions configs/syncnet/syncnet_16_latent.yaml
@@ -0,0 +1,46 @@
model:
audio_encoder: # input (1, 80, 52)
in_channels: 1
block_out_channels: [32, 64, 128, 256, 512, 1024]
downsample_factors: [[2, 1], 2, 2, 2, 2, [2, 3]]
attn_blocks: [0, 0, 0, 0, 0, 0]
dropout: 0.0
visual_encoder: # input (64, 32, 32)
in_channels: 64
block_out_channels: [64, 128, 256, 256, 512, 1024]
downsample_factors: [2, 2, 2, 1, 2, 2]
attn_blocks: [0, 0, 0, 0, 0, 0]
dropout: 0.0

ckpt:
resume_ckpt_path: ""
inference_ckpt_path: ""
save_ckpt_steps: 2500

data:
train_output_dir: output/syncnet
num_val_samples: 1200
batch_size: 120 # 40
num_workers: 11 # 11
latent_space: true
num_frames: 16
resolution: 256
train_fileslist: ""
train_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/train
val_fileslist: ""
val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
lower_half: false
pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
audio_sample_rate: 16000
video_fps: 25

optimizer:
lr: 1e-5
max_grad_norm: 1.0

run:
max_train_steps: 10000000
validation_steps: 2500
mixed_precision_training: true
seed: 42
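
The training entry points presumably consume these YAML files; a minimal sketch of reading one with OmegaConf (an assumption — the actual loader in the repo may differ):

```python
from omegaconf import OmegaConf

# Assumption: configs are plain YAML readable by OmegaConf; keys follow the file shown above.
config = OmegaConf.load("configs/syncnet/syncnet_16_latent.yaml")
print(config.model.audio_encoder.block_out_channels)  # [32, 64, 128, 256, 512, 1024]
print(config.data.num_frames, config.data.resolution)  # 16 256
```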
45 changes: 45 additions & 0 deletions configs/syncnet/syncnet_16_pixel.yaml
@@ -0,0 +1,45 @@
model:
audio_encoder: # input (1, 80, 52)
in_channels: 1
block_out_channels: [32, 64, 128, 256, 512, 1024, 2048]
downsample_factors: [[2, 1], 2, 2, 1, 2, 2, [2, 3]]
attn_blocks: [0, 0, 0, 0, 0, 0, 0]
dropout: 0.0
visual_encoder: # input (48, 128, 256)
in_channels: 48
block_out_channels: [64, 128, 256, 256, 512, 1024, 2048, 2048]
downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
attn_blocks: [0, 0, 0, 0, 0, 0, 0, 0]
dropout: 0.0

ckpt:
resume_ckpt_path: ""
inference_ckpt_path: checkpoints/latentsync_syncnet.pt
save_ckpt_steps: 20

data:
train_output_dir: debug/syncnet
num_val_samples: 2048
batch_size: 128 # 128
num_workers: 11 # 11
latent_space: false
num_frames: 16
resolution: 256
train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
train_data_dir: ""
val_fileslist: ""
val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
lower_half: true
audio_sample_rate: 16000
video_fps: 25

optimizer:
lr: 1e-5
max_grad_norm: 1.0

run:
max_train_steps: 10000000
validation_steps: 20
mixed_precision_training: true
seed: 42
45 changes: 45 additions & 0 deletions configs/syncnet/syncnet_25_pixel.yaml
@@ -0,0 +1,45 @@
model:
audio_encoder: # input (1, 80, 80)
in_channels: 1
block_out_channels: [64, 128, 256, 256, 512, 1024]
downsample_factors: [2, 2, 2, 2, 2, 2]
dropout: 0.0
visual_encoder: # input (75, 128, 256)
in_channels: 75
block_out_channels: [128, 128, 256, 256, 512, 512, 1024, 1024]
downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
dropout: 0.0

ckpt:
resume_ckpt_path: ""
inference_ckpt_path: ""
save_ckpt_steps: 2500

data:
train_output_dir: debug/syncnet
num_val_samples: 2048
batch_size: 64 # 64
num_workers: 11 # 11
latent_space: false
num_frames: 25
resolution: 256
train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_vox_avatars_ads_affine.txt
# /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_voxceleb_avatars_affine.txt
train_data_dir: ""
val_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/vox_affine_val.txt
# /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/voxceleb_val.txt
val_data_dir: ""
audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel
lower_half: true
pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
audio_sample_rate: 16000
video_fps: 25

optimizer:
lr: 1e-5
max_grad_norm: 1.0

run:
max_train_steps: 10000000
mixed_precision_training: true
seed: 42
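
Across the three SyncNet configs, the visual encoder's `in_channels` is presumably frames × per-frame channels stacked along the channel axis; a small sketch of that arithmetic (the 4-channel latent is an assumption based on the standard Stable Diffusion VAE):

```python
# Visual encoder input channels = num_frames * channels_per_frame (frames stacked channel-wise).
latent_16 = 16 * 4  # 64: 16 frames of 4-channel VAE latents (syncnet_16_latent.yaml)
pixel_16 = 16 * 3   # 48: 16 frames of RGB pixels            (syncnet_16_pixel.yaml)
pixel_25 = 25 * 3   # 75: 25 frames of RGB pixels            (syncnet_25_pixel.yaml)
print(latent_16, pixel_16, pixel_25)  # 64 48 75
```

With `lower_half: true`, only the bottom half of each 256×256 frame is used, which presumably explains the 128×256 pixel-space input shapes noted in the comments.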
102 changes: 102 additions & 0 deletions configs/unet/unet_latent_16_diffusion.yaml
@@ -0,0 +1,102 @@
data:
syncnet_config_path: configs/syncnet/syncnet_16_pixel.yaml
train_output_dir: debug/unet
train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
train_data_dir: ""
audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/whisper_new

val_video_path: assets/demo1_video.mp4
val_audio_path: assets/demo1_audio.wav
batch_size: 2 # 8
num_workers: 11 # 11
num_frames: 16
resolution: 256
mask: fix_mask
audio_sample_rate: 16000
video_fps: 25

ckpt:
resume_ckpt_path: checkpoints/latentsync_unet.pt
save_ckpt_steps: 5000

run:
pixel_space_supervise: true
use_syncnet: true
sync_loss_weight: 0.05 # 1/283
perceptual_loss_weight: 0.1 # 0.1
recon_loss_weight: 1 # 1
guidance_scale: 1.0 # 1.5 or 1.0
trepa_loss_weight: 10
inference_steps: 20
seed: 1247
use_mixed_noise: true
mixed_noise_alpha: 1 # 1
mixed_precision_training: true
enable_gradient_checkpointing: false
enable_xformers_memory_efficient_attention: true
max_train_steps: 10000000
max_train_epochs: -1

optimizer:
lr: 1e-5
scale_lr: false
max_grad_norm: 1.0
lr_scheduler: constant
lr_warmup_steps: 0

model:
act_fn: silu
add_audio_layer: true
custom_audio_layer: false
audio_condition_method: cross_attn # Choose between [cross_attn, group_norm]
attention_head_dim: 8
block_out_channels: [320, 640, 1280, 1280]
center_input_sample: false
cross_attention_dim: 384
down_block_types:
[
"CrossAttnDownBlock3D",
"CrossAttnDownBlock3D",
"CrossAttnDownBlock3D",
"DownBlock3D",
]
mid_block_type: UNetMidBlock3DCrossAttn
up_block_types:
[
"UpBlock3D",
"CrossAttnUpBlock3D",
"CrossAttnUpBlock3D",
"CrossAttnUpBlock3D",
]
downsample_padding: 1
flip_sin_to_cos: true
freq_shift: 0
in_channels: 13 # 49
layers_per_block: 2
mid_block_scale_factor: 1
norm_eps: 1e-5
norm_num_groups: 32
out_channels: 4 # 16
sample_size: 64
resnet_time_scale_shift: default # Choose between [default, scale_shift]
unet_use_cross_frame_attention: false
unet_use_temporal_attention: false

# Actually we don't use the motion module in the final version of LatentSync.
# When we started the project, we used the AnimateDiff codebase and tried the motion module,
# but the results were poor, and we decided to leave the code here for possible future use.
use_motion_module: false
motion_module_resolutions: [1, 2, 4, 8]
motion_module_mid_block: false
motion_module_decoder_only: false
motion_module_type: Vanilla
motion_module_kwargs:
num_attention_heads: 8
num_transformer_block: 1
attention_block_types:
- Temporal_Self
- Temporal_Self
temporal_position_encoding: true
temporal_position_encoding_max_len: 16
temporal_attention_dim_div: 1
zero_initialize: true
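
A minimal sketch of loading this config and inspecting the released checkpoint (the actual model class lives in the repo's `latentsync` package and is not instantiated here; the checkpoint's internal layout is an assumption):

```python
import torch
from omegaconf import OmegaConf

# Inspect the UNet config and checkpoint shipped with this release.
config = OmegaConf.load("configs/unet/unet_latent_16_diffusion.yaml")
print(config.model.cross_attention_dim, config.model.in_channels)  # 384 13

# Assumption: the .pt file is a torch-serialized object; its keys may differ from this guess.
state = torch.load("checkpoints/latentsync_unet.pt", map_location="cpu")
print(type(state))
```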
9 changes: 9 additions & 0 deletions data_processing_pipeline.sh
@@ -0,0 +1,9 @@
#!/bin/bash

python -m preprocess.data_processing_pipeline \
--total_num_workers 20 \
--per_gpu_num_workers 20 \
--resolution 256 \
--sync_conf_threshold 3 \
--temp_dir temp \
--input_dir /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/raw
3 changes: 3 additions & 0 deletions eval/detectors/README.md
@@ -0,0 +1,3 @@
# Face detector

This face detector is adapted from [face-detection-pytorch](https://github.com/cs-giung/face-detection-pytorch).
1 change: 1 addition & 0 deletions eval/detectors/__init__.py
@@ -0,0 +1 @@
from .s3fd import S3FD