Wan 2.2 LoRA Training

Harry Yang
**Select ~50 high-quality videos with captions.**

1. Generate JSONL

python generate_jsonl.py

{"video_path": "*.mp4", "caption": "*"}
{"video_path": "*.mp4", "caption": "*"}

2. Generate TOML

python generate_toml.py

[general]
batch_size = 1
enable_bucket = true
bucket_no_upscale = false

[[datasets]]
video_jsonl_file = "jsonl_file_from_step_1"
cache_directory = "cache_dir"
target_frames = [129]

3. Cache Latents and Text Encoder Outputs

CUDA_VISIBLE_DEVICES=0 python -m src.musubi_tuner.wan_cache_latents \
--dataset_config toml_file \
--vae Wan2.1_VAE.pth \
--batch_size 2 \
--i2v

CUDA_VISIBLE_DEVICES=7 python -m src.musubi_tuner.wan_cache_text_encoder_outputs \
--dataset_config toml_file \
--t5 models_t5_umt5-xxl-enc-bf16.pth \
--batch_size 2

4. Fine-tune

# High noise
CUDA_VISIBLE_DEVICES=0,1,2 accelerate launch \
    --num_processes 3 \
    --num_cpu_threads_per_process 2 \
    --mixed_precision fp16 \
    --main_process_port 55558 \
    src/musubi_tuner/wan_train_network.py \
    --task i2v-A14B \
    --dit wan2.2_i2v_high_noise_14B_fp16.safetensors \
    --dataset_config "*.toml" \
    --sdpa \
    --optimizer_type adamw \
    --learning_rate 1e-4 \
    --gradient_checkpointing \
    --max_data_loader_n_workers 2 \
    --persistent_data_loader_workers \
    --network_module networks.lora_wan \
    --network_dim 8 \
    --network_alpha 4 \
    --timestep_sampling shift \
    --discrete_flow_shift 5.0 \
    --min_timestep 875 \
    --max_timestep 1000 \
    --max_train_epochs 100 \
    --save_every_n_epochs 5 \
    --seed 42 \
    --output_dir high_output_dir \
    --output_name high_output_name \
    --log_with wandb \
    --logging_dir high_log_dir \
    --lr_scheduler cosine \
    --lr_warmup_steps 1000 \
    --optimizer_args weight_decay=0.05 betas=0.9,0.999 

# Low noise
CUDA_VISIBLE_DEVICES=3,4,5 accelerate launch \
    --num_processes 3 \
    --num_cpu_threads_per_process 2 \
    --mixed_precision fp16 \
    --main_process_port 55557 \
    src/musubi_tuner/wan_train_network.py \
    --task i2v-A14B \
    --dit wan2.2_i2v_low_noise_14B_fp16.safetensors \
    --dataset_config "*.toml" \
    --sdpa \
    --optimizer_type adamw \
    --learning_rate 1e-4 \
    --gradient_checkpointing \
    --max_data_loader_n_workers 2 \
    --persistent_data_loader_workers \
    --network_module networks.lora_wan \
    --network_dim 8 \
    --network_alpha 4 \
    --timestep_sampling shift \
    --discrete_flow_shift 5.0 \
    --min_timestep 0 \
    --max_timestep 875 \
    --max_train_epochs 100 \
    --save_every_n_epochs 5 \
    --seed 42 \
    --output_dir low_output_dir \
    --output_name low_output_name \
    --log_with wandb \
    --logging_dir low_log_dir \
    --lr_scheduler cosine \
    --lr_warmup_steps 1000 \
    --optimizer_args weight_decay=0.05 betas=0.9,0.999 

5. Inference

**Use epoch 80:**

CUDA_VISIBLE_DEVICES=7 python src/musubi_tuner/wan_generate_video.py \
    --task i2v-A14B \
    --infer_steps 30 \
    --save_path save_path.mp4 \
    --output_type both \
    --dit wan2.2_i2v_low_noise_14B_fp16.safetensors \
    --dit_high_noise wan2.2_i2v_high_noise_14B_fp16.safetensors \
    --lora_weight 000080.safetensors \
    --lora_weight_high_noise high-000080.safetensors \
    --vae Wan2.1_VAE.pth \
    --t5 models_t5_umt5-xxl-enc-bf16.pth \
    --attn_mode sageattn \
    --video_length 129 \
    --fps 24 \
    --flow_shift 5.0 \
    --timestep_boundary 0.900 \
    --image_path image_path \
    --video_size 480 832 \
    --prompt prompt_text

**Fast inference:**

CUDA_VISIBLE_DEVICES=6,7 torchrun \
    --nproc_per_node=2 \
    generate.py \
    --task i2v-A14B \
    --ckpt_dir ./Wan2.2-I2V-A14B \
    --lora_dir ./Wan2.2-Lightning/Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1 \
    --dit_fsdp \
    --t5_fsdp \
    --ulysses_size 2 \
    --base_seed 42 \
    --prompt prompt \
    --image_path image_path \
    --lora_weight_low_noise 000080.safetensors \
    --lora_weight_high_noise high-000080.safetensors \
    --save_dir ./test \
    --size 832*480