Harry Yang

**Select ~50 high-quality videos with captions.**

1. Generate JSONL (one JSON object per line)

python generate_jsonl.py

{"video_path": "*.mp4", "caption": "*"}
{"video_path": "*.mp4", "caption": "*"}

2. Generate TOML

python generate_toml.py

[general]
batch_size = 1
enable_bucket = true
bucket_no_upscale = false

[[datasets]]
video_jsonl_file = jsonl_file_from_step_1
cache_directory = cache_dir
target_frames = [129]

3. Cache latents and text encoder outputs

CUDA_VISIBLE_DEVICES=0 python -m src.musubi_tuner.wan_cache_latents \\
--dataset_config toml_file \\
--vae Wan2.1_VAE.pth \\
--batch_size 2 \\
--i2v

CUDA_VISIBLE_DEVICES=7 python -m src.musubi_tuner.wan_cache_text_encoder_outputs \\
--dataset_config toml_file \\
--t5 models_t5_umt5-xxl-enc-bf16.pth \\
--batch_size 2

4. Fine-tune

# High-noise model LoRA (timesteps 875–1000)
CUDA_VISIBLE_DEVICES=0,1,2 accelerate launch \\
    --num_processes 3 \\
    --num_cpu_threads_per_process 2 \\
    --mixed_precision fp16 \\
    --main_process_port 55558 \\
    src/musubi_tuner/wan_train_network.py \\
    --task i2v-A14B \\
    --dit wan2.2_i2v_high_noise_14B_fp16.safetensors \\
    **--dataset_config "*.toml" \\**
    --sdpa \\
    --optimizer_type adamw \\
    --learning_rate 1e-4 \\
    --gradient_checkpointing \\
    --max_data_loader_n_workers 2 \\
    --persistent_data_loader_workers \\
    --network_module networks.lora_wan \\
    --network_dim 8 \\
    --network_alpha 4 \\
    --timestep_sampling shift \\
    --discrete_flow_shift 5.0 \\
    --min_timestep 875 \\
    --max_timestep 1000 \\
    --max_train_epochs 100 \\
    --save_every_n_epochs 5 \\
    --seed 42 \\
    **--output_dir high_output_dir \\
    --output_name high_output_name \\**
    --log_with wandb \\
    **--logging_dir high_log_dir \\**
    --lr_scheduler cosine \\
    --lr_warmup_steps 1000 \\
    --optimizer_args weight_decay=0.05 betas=0.9,0.999 

# Low-noise model LoRA (timesteps 0–875)
CUDA_VISIBLE_DEVICES=3,4,5 accelerate launch \\
    --num_processes 3 \\
    --num_cpu_threads_per_process 2 \\
    --mixed_precision fp16 \\
    --main_process_port 55557 \\
    src/musubi_tuner/wan_train_network.py \\
    --task i2v-A14B \\
    --dit wan2.2_i2v_low_noise_14B_fp16.safetensors \\
    **--dataset_config "*.toml" \\**
    --sdpa \\
    --optimizer_type adamw \\
    --learning_rate 1e-4 \\
    --gradient_checkpointing \\
    --max_data_loader_n_workers 2 \\
    --persistent_data_loader_workers \\
    --network_module networks.lora_wan \\
    --network_dim 8 \\
    --network_alpha 4 \\
    --timestep_sampling shift \\
    --discrete_flow_shift 5.0 \\
    --min_timestep 0 \\
    --max_timestep 875 \\
    --max_train_epochs 100 \\
    --save_every_n_epochs 5 \\
    --seed 42 \\
    **--output_dir low_output_dir \\
    --output_name low_output_name \\**
    --log_with wandb \\
    **--logging_dir low_log_dir \\**
    --lr_scheduler cosine \\
    --lr_warmup_steps 1000 \\
    --optimizer_args weight_decay=0.05 betas=0.9,0.999 

5. Inference

**Use the epoch-80 LoRA checkpoints (low-noise and high-noise):**

CUDA_VISIBLE_DEVICES=7 python src/musubi_tuner/wan_generate_video.py \\
    --task i2v-A14B \\
    --infer_steps 30 \\
    --save_path save_path.mp4 \\
    --output_type both \\
    --dit wan2.2_i2v_low_noise_14B_fp16.safetensors \\
    --dit_high_noise wan2.2_i2v_high_noise_14B_fp16.safetensors \\
    **--lora_weight 000080.safetensors \\
    --lora_weight_high_noise high-000080.safetensors \\**
    --vae Wan2.1_VAE.pth \\
    --t5 models_t5_umt5-xxl-enc-bf16.pth \\
    --attn_mode sageattn \\
    --video_length 129 \\
    --fps 24 \\
    --flow_shift 5.0 \\
    --timestep_boundary 0.900 \\
    **--image_path image_path \\**
    --video_size 480 832 \\
    **--prompt prompt_text**

**Fast inference (with the 4-step Lightning LoRA):**

CUDA_VISIBLE_DEVICES=6,7 torchrun \\
    --nproc_per_node=2 \\
    generate.py \\
    --task i2v-A14B \\
    --ckpt_dir ./Wan2.2-I2V-A14B \\
    --lora_dir ./Wan2.2-Lightning/Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1 \\
    --dit_fsdp \\
    --t5_fsdp \\
    --ulysses_size 2 \\
    --base_seed 42 \\
    **--prompt prompt \\
    --image_path image_path \\**
    **--lora_weight_low_noise 000080.safetensors \\
    --lora_weight_high_noise high-000080.safetensors \\**
    --save_dir ./test \\
    --size 832*480