
Commit

Merge branch 'VideoVerses:main' into mochi
yzxing87 authored Nov 29, 2024
2 parents 4692778 + b8e1b9f commit 2db7517
Showing 8 changed files with 180 additions and 64 deletions.
14 changes: 11 additions & 3 deletions README.md
@@ -478,12 +478,20 @@ We thank the following repos for sharing their awesome models and codes!
</a>

## 📋 License
Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact [email protected] and [email protected].
Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact the project leads Yingqing He ([email protected]) and Yazhou Xing ([email protected]).

## 😊 Citation

```bibtex
@software{videotuna,
author = {Yingqing He and Yazhou Xing and Zhefan Rao and Haoyu Wu and Zhaoyang Liu and Jingye Chen and Pengjun Fang and Jiajun Li and Liya Ji and Runtao Liu and Xiaowei Chi and Yang Fei and Guocheng Shao and Yue Ma and Qifeng Chen},
title = {VideoTuna: A Powerful Toolkit for Video Generation with Model Fine-Tuning and Post-Training},
month = {Nov},
year = {2024},
url = {https://github.com/VideoVerses/VideoTuna}
}
```
To be updated...
```


## Star History

configs/004_cogvideox/cogvideo2b.yaml
@@ -2,66 +2,43 @@ model:
# loading from HF and resuming from PL may differ
# pretrained_checkpoint: "THUDM/CogVideoX-2b"
base_learning_rate: 6e-6
target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkflow
target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow
params:
# first stage model; cond stage model ; denoising model ; scheduler
first_stage_config:
target: diffusers.AutoencoderKLCogVideoX
params:
pretrained_model_name_or_path: THUDM/CogVideoX-2b
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
subfolder: "vae"
# revision: null
# variant: null
cond_stage_config:
target: src.lvdm.modules.encoders.condition.FrozenT5Embedder
params:
version: "checkpoints/cogvideo/t5-v1_1-xxl"
version: "DeepFloyd/t5-v1_1-xxl"
device: "cuda"
max_length: 226
freeze: True
# cond_stage_config:
# target: src.cogvideo_hf.cogvideo_pl.FrozenT5CondModel
# params:
# tokenizer_config:
# target: transformers.AutoTokenizer
# params:
# pretrained_model_name_or_path: THUDM/CogVideoX-2b
# subfolder: "tokenizer"
# encoder_config:
# target: transformers.T5EncoderModel
# params:
# subfolder: "text_encoder"
# pretrained_model_name_or_path: THUDM/CogVideoX-2b
# # max_length: 226
# freeze: True
# denoiser config, equivalent to the unet config in VC
denoiser_config:
target: diffusers.CogVideoXTransformer3DModel
params:
pretrained_model_name_or_path: THUDM/CogVideoX-2b
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
subfolder: "transformer"
load_dtype: fp16 # bf16 for 5B, fp16 for 2B
# revision: null
# variant: null
adapter_config: # the whole dict is removable
target: peft.HRAConfig
params:
r: 8
init_weights: True
target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
target: peft.LoraConfig
params:
r: 4
lora_alpha: 1.0
init_lora_weights: True
target_modules: ["to_k", "to_q", "to_v", "to_out.0"]

# adapter_config: # the whole dict is removable
# target: peft.HRAConfig
# params:
# r: 4
# lora_alpha: 1.0
# init_lora_weights: True
# target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
# sampler config. Wrap it.
scheduler_config:
target: diffusers.CogVideoXDPMScheduler
params:
pretrained_model_name_or_path: THUDM/CogVideoX-2b
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
subfolder: scheduler

## training config
@@ -93,8 +70,6 @@ data:
lightning:
trainer:
benchmark: True
batch_size: 3
num_workers: 38
num_nodes: 1
accumulate_grad_batches: 2
max_epochs: 2000
@@ -108,12 +83,6 @@ lightning:
to_local: True # save videos into files
log_images_kwargs:
unconditional_guidance_scale: 12 # needed; otherwise the output is grey
modelcheckpoint:
# target: pytorch_lightning.callbacks.ModelCheckpoin
target: pytorch_lightning.callbacks.ModelCheckpoint
params:
every_n_epochs: 1
filename: "{epoch:04}-{step:06}"
metrics_over_trainsteps_checkpoint:
target: pytorch_lightning.callbacks.ModelCheckpoint
params:
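For context on the adapter change above (peft.HRAConfig with r: 8 replaced by peft.LoraConfig with r: 4), the sketch below shows roughly what that config block amounts to at runtime. It is a minimal illustration only: the repository's own `inject_adapter` method in `src/cogvideo_hf/cogvideo_pl.py` is not part of this diff, so the peft-based injection shown here is an assumption, not the project's exact code.

```python
# Minimal sketch (assumption, not the repo's inject_adapter) of applying the
# adapter_config above to the CogVideoX denoiser with peft.
from diffusers import CogVideoXTransformer3DModel
from peft import LoraConfig, inject_adapter_in_model

# Denoiser as configured in the YAML; assumes the checkpoint has already been
# downloaded to checkpoints/cogvideo/CogVideoX-2b.
transformer = CogVideoXTransformer3DModel.from_pretrained(
    "checkpoints/cogvideo/CogVideoX-2b", subfolder="transformer"
)

lora_config = LoraConfig(
    r=4,                      # low-rank dimension, as in the YAML
    lora_alpha=1.0,           # LoRA scaling factor
    init_lora_weights=True,
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],  # attention projections
)
transformer = inject_adapter_in_model(lora_config, transformer)

# The injected modules carry "lora" in their parameter names.
lora_params = [n for n, _ in transformer.named_parameters() if "lora" in n]
print(f"injected {len(lora_params)} LoRA tensors, e.g. {lora_params[:2]}")
```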
94 changes: 94 additions & 0 deletions configs/004_cogvideox/cogvideo5b.yaml
@@ -0,0 +1,94 @@
model:
# loading from HF and resuming from PL may differ
# pretrained_checkpoint: "THUDM/CogVideoX-2b"
base_learning_rate: 6e-6
target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow
params:
# first stage model; cond stage model ; denoising model ; scheduler
first_stage_config:
target: diffusers.AutoencoderKLCogVideoX
params:
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
subfolder: "vae"
cond_stage_config:
target: src.lvdm.modules.encoders.condition.FrozenT5Embedder
params:
version: "DeepFloyd/t5-v1_1-xxl"
device: "cuda"
max_length: 226
freeze: True
# denoiser config, equivalent to the unet config in VC
denoiser_config:
target: diffusers.CogVideoXTransformer3DModel
params:
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
subfolder: "transformer"
load_dtype: fp16 # bf16 for 5B, fp16 for 2B
# revision: null
# variant: null
adapter_config: # the whole dict is removable
target: peft.LoraConfig
params:
r: 4
lora_alpha: 1.0
init_lora_weights: True
target_modules: ["to_k", "to_q", "to_v", "to_out.0"]

# sampler config. Wrap it.
scheduler_config:
target: diffusers.CogVideoXDPMScheduler
params:
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
subfolder: scheduler

## training config
### data; a toy dataset can be given
data:
target: src.data.lightning_data.DataModuleFromConfig
params:
batch_size: 2
num_workers: 16
wrap: false
train:
target: src.data.cogvideo_dataset.VideoDataset
params:
instance_data_root: "inputs/t2v/cogvideo/elon_musk_video"
dataset_name: null
dataset_config_name: null
caption_column: "labels.txt"
video_column: "videos.txt"
height: 480
width: 720
fps: 28
max_num_frames: 2
skip_frames_start: 0
skip_frames_end: 0
cache_dir: ~/.cache
id_token: null

### training_step in cogvideoxft
lightning:
trainer:
benchmark: True
num_nodes: 1
accumulate_grad_batches: 2
max_epochs: 2000
precision: 32 # training precision
callbacks:
image_logger:
target: src.utils.callbacks.ImageLogger
params:
batch_frequency: 100000
max_images: 2
to_local: True # save videos into files
log_images_kwargs:
unconditional_guidance_scale: 12 # needed; otherwise the output is grey
metrics_over_trainsteps_checkpoint:
target: pytorch_lightning.callbacks.ModelCheckpoint
params:
filename: "{epoch:06}-{step:09}"
save_weights_only: False
# every_n_epochs: 300
every_n_train_steps: 10
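
For readers unfamiliar with the target/params layout used throughout these configs, the sketch below shows how a block from cogvideo5b.yaml can be resolved generically. It is illustrative only: the repository's actual `instantiate_from_config` helper (used in cogvideo_pl.py, e.g. for the scheduler) may behave differently, in particular for diffusers entries that carry `pretrained_model_name_or_path` and are presumably loaded via `from_pretrained` rather than a plain constructor call.

```python
# Illustrative resolution of a "target"/"params" block from cogvideo5b.yaml.
# This is not the repo's instantiate_from_config; diffusers components with
# pretrained_model_name_or_path are presumably routed through from_pretrained.
import importlib
from omegaconf import OmegaConf

def build_from_target(block):
    module_path, cls_name = block.target.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    params = OmegaConf.to_container(block.params, resolve=True) if "params" in block else {}
    return cls(**params)

cfg = OmegaConf.load("configs/004_cogvideox/cogvideo5b.yaml")
adapter = build_from_target(cfg.model.params.adapter_config)  # -> peft.LoraConfig
print(type(adapter).__name__, adapter.r)  # LoraConfig 4
```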


3 changes: 2 additions & 1 deletion scripts/inference_cogvideo.py
@@ -11,7 +11,8 @@

import torch
from pytorch_lightning import seed_everything

from typing import List, Union
from omegaconf import ListConfig

sys.path.insert(0, os.getcwd())
sys.path.insert(1, f'{os.getcwd()}/src')
16 changes: 16 additions & 0 deletions shscripts/inference_cogvideo_lora.sh
@@ -0,0 +1,16 @@
# ----------------------diffusers based pl inference ----------------------
# 'configs/004_cogvideox/cogvideo2b.yaml' or 'configs/004_cogvideox/cogvideo5b.yaml'
config='configs/004_cogvideox/cogvideo2b.yaml'
prompt_file="inputs/t2v/prompts.txt"
current_time=$(date +%Y%m%d%H%M%S)
savedir="results/t2v/$current_time-cogvideo"
ckpt="{YOUR_CKPT_PATH}"

python3 scripts/inference_cogvideo.py \
--ckpt_path $ckpt \
--config $config \
--prompt_file $prompt_file \
--savedir $savedir \
--bs 1 --height 480 --width 720 \
--fps 16 \
--seed 6666 \
17 changes: 17 additions & 0 deletions shscripts/train_cogvideox_lora.sh
@@ -0,0 +1,17 @@
export TOKENIZERS_PARALLELISM=false

# exp settings
EXPNAME="004_cogvideox" # experiment name
CONFIG='configs/004_cogvideox/cogvideo2b.yaml' # experiment config: 'configs/004_cogvideox/cogvideo2b.yaml' or 'configs/004_cogvideox/cogvideo5b.yaml'
RESROOT="results/cogvideo_train" # experiment saving directory

# run
current_time=$(date +%Y%m%d%H%M%S)
python scripts/train.py \
-t \
--name "$current_time"_$EXPNAME \
--base $CONFIG \
--logdir $RESROOT \
--devices '0,' \
lightning.trainer.num_nodes=1 \
--auto_resume False
45 changes: 28 additions & 17 deletions src/cogvideo_hf/cogvideo_pl.py
@@ -91,7 +91,7 @@ def retrieve_timesteps(
timesteps = scheduler.timesteps
return timesteps, num_inference_steps

class CogVideoXWorkflow(pl.LightningModule):
class CogVideoXWorkFlow(pl.LightningModule):
def __init__(self,
first_stage_config,
cond_stage_config,
@@ -129,6 +129,7 @@ def __init__(self,
# are most schduler
self.scheduler = instantiate_from_config(scheduler_config)
# add adapter config (Support Lora and HRA )
self.lora_args = []
if adapter_config is not None:
self.inject_adapter(adapter_config)
def inject_adapter(self, adapter_config):
@@ -422,22 +423,22 @@ def _prepare_rotary_positional_embeddings(
height: int,
width: int,
num_frames: int,
device: torch.device,
vae_scale_factor_spatial: int = 8,
patch_size: int = 2,
attention_head_dim: int = 64,
device: Optional[torch.device] = None,
base_height: int = 480,
base_width: int = 720,
) -> Tuple[torch.Tensor, torch.Tensor]:
# a merge of _prepare_rotary_positional_embeddings from the CogVideoX finetune script and the diffusers implementation
# add base_height and base_width to make it more flexible
grid_height = height // (self.vae_scale_factor_spatial * self.model.config.patch_size)
grid_width = width // (self.vae_scale_factor_spatial * self.model.config.patch_size)
base_size_width = base_width // (self.vae_scale_factor_spatial * self.model.config.patch_size)
base_size_height = base_height // (self.vae_scale_factor_spatial * self.model.config.patch_size)

grid_crops_coords = get_resize_crop_region_for_grid(
(grid_height, grid_width), base_size_width, base_size_height
)

grid_height = height // (vae_scale_factor_spatial * patch_size)
grid_width = width // (vae_scale_factor_spatial * patch_size)
base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
base_size_height = base_height // (vae_scale_factor_spatial * patch_size)

grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.model.config.attention_head_dim,
embed_dim=attention_head_dim,
crops_coords=grid_crops_coords,
grid_size=(grid_height, grid_width),
temporal_size=num_frames,
@@ -446,6 +447,7 @@ def _prepare_rotary_positional_embeddings(
freqs_cos = freqs_cos.to(device=device)
freqs_sin = freqs_sin.to(device=device)
return freqs_cos, freqs_sin

@torch.no_grad()
def sample(
self,
@@ -624,7 +626,15 @@ def sample(

# 7. Create rotary embeds if required
image_rotary_emb = (
self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
self._prepare_rotary_positional_embeddings(
height=height,
width=width,
num_frames=latents.shape[1],
vae_scale_factor_spatial=self.vae_scale_factor_spatial,
patch_size=self.model.config.patch_size,
attention_head_dim=self.model.config.attention_head_dim,
device=self.device,
)
if self.model.config.use_rotary_positional_embeddings
else None
)
@@ -641,7 +651,7 @@ def sample(

latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
print(i,latent_model_input.max(),latent_model_input.min(),has_nan(latent_model_input))
# print(i,latent_model_input.max(),latent_model_input.min(),has_nan(latent_model_input))
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
timestep = t.expand(latent_model_input.shape[0])
# print(i,num_inference_steps)
@@ -741,6 +751,7 @@ def get_batch_input(self, batch):
"videos": videos,
"prompts": prompts,
}

def training_step(self, batch, batch_idx):
# print(type(batch),batch.keys(),type(batch['instance_video']),batch['instance_video'].shape);exit(); # <class 'dict'> dict_keys(['instance_prompt', 'instance_video'])
batch = self.get_batch_input(batch)
@@ -772,8 +783,8 @@ def training_step(self, batch, batch_idx):
image_rotary_emb = (
# in the first place, we assume this function is the same during inference and train.
self._prepare_rotary_positional_embeddings(
height=height,
width=width,
height=height*self.vae_scale_factor_spatial,
width=width*self.vae_scale_factor_spatial,
num_frames=num_frames,
vae_scale_factor_spatial=self.vae_scale_factor_spatial,
patch_size=self.model.config.patch_size,
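The refactor above makes `_prepare_rotary_positional_embeddings` take `vae_scale_factor_spatial`, `patch_size`, and `attention_head_dim` as explicit arguments instead of reading them from `self`, and `training_step` now multiplies its height and width by `vae_scale_factor_spatial` before calling it, apparently because it works with latent-space sizes. A small worked example of the arithmetic, using the 480x720 resolution from the configs and the signature defaults (`vae_scale_factor_spatial=8`, `patch_size=2`):

```python
# Worked example of the grid arithmetic in _prepare_rotary_positional_embeddings,
# using 480x720 from the configs and the defaults vae_scale_factor_spatial=8,
# patch_size=2 shown in the new signature.
height, width = 480, 720
vae_scale_factor_spatial, patch_size = 8, 2

grid_height = height // (vae_scale_factor_spatial * patch_size)  # 480 // 16 = 30
grid_width = width // (vae_scale_factor_spatial * patch_size)    # 720 // 16 = 45

# If training_step holds latent-space sizes (height/8, width/8), they must be
# scaled back to pixel space before calling the helper, which is what the
# `height*self.vae_scale_factor_spatial` change does.
latent_height, latent_width = height // 8, width // 8            # 60, 90
assert latent_height * vae_scale_factor_spatial == height
assert latent_width * vae_scale_factor_spatial == width
print(grid_height, grid_width)  # 30 45
```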
2 changes: 1 addition & 1 deletion src/utils/callbacks.py
@@ -14,7 +14,7 @@
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.utilities import rank_zero_info
from utils.save_video import log_local, prepare_to_log
from .save_video import log_local, prepare_to_log


class LoraModelCheckpoint(pl.callbacks.ModelCheckpoint):
