Harry Yang
**Select ~50 high-quality videos with captions.**
1. Generate JSONL
python generate_jsonl.py
{"video_path": "*.mp4", "caption": "*"}
{"video_path": "*.mp4", "caption": "*"}
2. Generate TOML
python generate_toml.py
[general]
batch_size = 1
enable_bucket = true
bucket_no_upscale = false
[[datasets]]
video_jsonl_file = "jsonl_file_from_step_1"
cache_directory = "cache_dir"
target_frames = [129]
3. Cache Latents and Text Encoder Outputs
CUDA_VISIBLE_DEVICES=0 python -m src.musubi_tuner.wan_cache_latents \
--dataset_config toml_file \
--vae Wan2.1_VAE.pth \
--batch_size 2 \
--i2v
CUDA_VISIBLE_DEVICES=7 python -m src.musubi_tuner.wan_cache_text_encoder_outputs \
--dataset_config toml_file \
--t5 models_t5_umt5-xxl-enc-bf16.pth \
--batch_size 2
4. Fine-tune
# High noise
CUDA_VISIBLE_DEVICES=0,1,2 accelerate launch \
--num_processes 3 \
--num_cpu_threads_per_process 2 \
--mixed_precision fp16 \
--main_process_port 55558 \
src/musubi_tuner/wan_train_network.py \
--task i2v-A14B \
--dit wan2.2_i2v_high_noise_14B_fp16.safetensors \
--dataset_config "*.toml" \
--sdpa \
--optimizer_type adamw \
--learning_rate 1e-4 \
--gradient_checkpointing \
--max_data_loader_n_workers 2 \
--persistent_data_loader_workers \
--network_module networks.lora_wan \
--network_dim 8 \
--network_alpha 4 \
--timestep_sampling shift \
--discrete_flow_shift 5.0 \
--min_timestep 875 \
--max_timestep 1000 \
--max_train_epochs 100 \
--save_every_n_epochs 5 \
--seed 42 \
--output_dir high_output_dir \
--output_name high_output_name \
--log_with wandb \
--logging_dir high_log_dir \
--lr_scheduler cosine \
--lr_warmup_steps 1000 \
--optimizer_args weight_decay=0.05 betas=0.9,0.999
# Low noise
CUDA_VISIBLE_DEVICES=3,4,5 accelerate launch \
--num_processes 3 \
--num_cpu_threads_per_process 2 \
--mixed_precision fp16 \
--main_process_port 55557 \
src/musubi_tuner/wan_train_network.py \
--task i2v-A14B \
--dit wan2.2_i2v_low_noise_14B_fp16.safetensors \
--dataset_config "*.toml" \
--sdpa \
--optimizer_type adamw \
--learning_rate 1e-4 \
--gradient_checkpointing \
--max_data_loader_n_workers 2 \
--persistent_data_loader_workers \
--network_module networks.lora_wan \
--network_dim 8 \
--network_alpha 4 \
--timestep_sampling shift \
--discrete_flow_shift 5.0 \
--min_timestep 0 \
--max_timestep 875 \
--max_train_epochs 100 \
--save_every_n_epochs 5 \
--seed 42 \
--output_dir low_output_dir \
--output_name low_output_name \
--log_with wandb \
--logging_dir low_log_dir \
--lr_scheduler cosine \
--lr_warmup_steps 1000 \
--optimizer_args weight_decay=0.05 betas=0.9,0.999
5. Inference
**Use epoch 80:**
CUDA_VISIBLE_DEVICES=7 python src/musubi_tuner/wan_generate_video.py \
--task i2v-A14B \
--infer_steps 30 \
--save_path save_path.mp4 \
--output_type both \
--dit wan2.2_i2v_low_noise_14B_fp16.safetensors \
--dit_high_noise wan2.2_i2v_high_noise_14B_fp16.safetensors \
--lora_weight 000080.safetensors \
--lora_weight_high_noise high-000080.safetensors \
--vae Wan2.1_VAE.pth \
--t5 models_t5_umt5-xxl-enc-bf16.pth \
--attn_mode sageattn \
--video_length 129 \
--fps 24 \
--flow_shift 5.0 \
--timestep_boundary 0.900 \
--image_path image_path \
--video_size 480 832 \
--prompt prompt_text
**Fast inference:**
CUDA_VISIBLE_DEVICES=6,7 torchrun \
--nproc_per_node=2 \
generate.py \
--task i2v-A14B \
--ckpt_dir ./Wan2.2-I2V-A14B \
--lora_dir ./Wan2.2-Lightning/Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1 \
--dit_fsdp \
--t5_fsdp \
--ulysses_size 2 \
--base_seed 42 \
--prompt prompt \
--image_path image_path \
--lora_weight_low_noise 000080.safetensors \
--lora_weight_high_noise high-000080.safetensors \
--save_dir ./test \
--size 832*480