diff --git a/README.md b/README.md
index 91ffc2e..42bc602 100644
--- a/README.md
+++ b/README.md
@@ -478,12 +478,20 @@ We thank the following repos for sharing their awesome models and codes!
 
 ## 📋 License
 
-Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact yhebm@connect.ust.hk and yxingag@connect.ust.hk.
+Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact the project leads Yingqing He (yhebm@connect.ust.hk) and Yazhou Xing (yxingag@connect.ust.hk).
 
 ## 😊 Citation
+
+```bibtex
+@software{videotuna,
+  author = {Yingqing He and Yazhou Xing and Zhefan Rao and Haoyu Wu and Zhaoyang Liu and Jingye Chen and Pengjun Fang and Jiajun Li and Liya Ji and Runtao Liu and Xiaowei Chi and Yang Fei and Guocheng Shao and Yue Ma and Qifeng Chen},
+  title = {VideoTuna: A Powerful Toolkit for Video Generation with Model Fine-Tuning and Post-Training},
+  month = {Nov},
+  year = {2024},
+  url = {https://github.com/VideoVerses/VideoTuna}
+}
 ```
-To be updated...
-```
+
 
 ## Star History
diff --git a/configs/004_cogvideox/cogvideo.yaml b/configs/004_cogvideox/cogvideo2b.yaml
similarity index 60%
rename from configs/004_cogvideox/cogvideo.yaml
rename to configs/004_cogvideox/cogvideo2b.yaml
index de87f5f..f615eca 100644
--- a/configs/004_cogvideox/cogvideo.yaml
+++ b/configs/004_cogvideox/cogvideo2b.yaml
@@ -2,66 +2,43 @@ model:
   # there might be differet to load from hf and resume from pl
   # pretrained_checkpoint: "THUDM/CogVideoX-2b"
   base_learning_rate: 6e-6
-  target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkflow
+  target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow
   params:
     # first stage model; cond stage model ; denoising model ; scheduler
     first_stage_config:
       target: diffusers.AutoencoderKLCogVideoX
       params:
-        pretrained_model_name_or_path: THUDM/CogVideoX-2b
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
         subfolder: "vae"
-        # revision: null
-        # variant: null
     cond_stage_config:
       target: src.lvdm.modules.encoders.condition.FrozenT5Embedder
       params:
-        version: "checkpoints/cogvideo/t5-v1_1-xxl"
+        version: "DeepFloyd/t5-v1_1-xxl"
         device: "cuda"
         max_length: 226
         freeze: True
-    # cond_stage_config:
-    #   target: src.cogvideo_hf.cogvideo_pl.FrozenT5CondModel
-    #   params:
-    #     tokenizer_config:
-    #       target: transformers.AutoTokenizer
-    #       params:
-    #         pretrained_model_name_or_path: THUDM/CogVideoX-2b
-    #         subfolder: "tokenizer"
-    #     encoder_config:
-    #       target: transformers.T5EncoderModel
-    #       params:
-    #         subfolder: "text_encoder"
-    #         pretrained_model_name_or_path: THUDM/CogVideoX-2b
-    #     # max_length: 226
-    #     freeze: True
    # denosier config equal to unet config in vc
    denoiser_config:
      target: diffusers.CogVideoXTransformer3DModel
      params:
-        pretrained_model_name_or_path: THUDM/CogVideoX-2b
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
        subfolder: "transformer"
        load_dtype: fp16 # bf16 5b fp16 2B
        # revision: null
        # variant: null
    adapter_config: # the whole dict is remoable
-      target: peft.HRAConfig
-      params:
-        r: 8
-        init_weights: True
-        target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
+      target: peft.LoraConfig
+      params:
+        r: 4
+        lora_alpha: 1.0
+        init_lora_weights: True
+        target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
 
-    # adapter_config: # the whole dict is remoable
-    #   target: peft.HRAConfig
-    #   params:
-    #     r: 4
-    #     lora_alpha: 1.0
-    #     init_lora_weights: True
-    #     target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
    # sampler config. Wrap it.
    scheduler_config:
      target: diffusers.CogVideoXDPMScheduler
      params:
-        pretrained_model_name_or_path: THUDM/CogVideoX-2b
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
        subfolder: scheduler
 
 ## training config
@@ -93,8 +70,6 @@ data:
 lightning:
   trainer:
     benchmark: True
-    batch_size: 3
-    num_workers: 38
     num_nodes: 1
     accumulate_grad_batches: 2
     max_epochs: 2000
@@ -108,12 +83,6 @@ lightning:
         to_local: True # save videos into files
         log_images_kwargs:
           unconditional_guidance_scale: 12 # need this, otherwise it is grey
-    modelcheckpoint:
-      # target: pytorch_lightning.callbacks.ModelCheckpoin
-      target: pytorch_lightning.callbacks.ModelCheckpoint
-      params:
-        every_n_epochs: 1
-        filename: "{epoch:04}-{step:06}"
     metrics_over_trainsteps_checkpoint:
       target: pytorch_lightning.callbacks.ModelCheckpoint
       params:
diff --git a/configs/004_cogvideox/cogvideo5b.yaml b/configs/004_cogvideox/cogvideo5b.yaml
new file mode 100644
index 0000000..3b56b35
--- /dev/null
+++ b/configs/004_cogvideox/cogvideo5b.yaml
@@ -0,0 +1,94 @@
+model:
+  # there might be differences between loading from hf and resuming from pl
+  # pretrained_checkpoint: "THUDM/CogVideoX-2b"
+  base_learning_rate: 6e-6
+  target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow
+  params:
+    # first stage model; cond stage model; denoising model; scheduler
+    first_stage_config:
+      target: diffusers.AutoencoderKLCogVideoX
+      params:
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
+        subfolder: "vae"
+    cond_stage_config:
+      target: src.lvdm.modules.encoders.condition.FrozenT5Embedder
+      params:
+        version: "DeepFloyd/t5-v1_1-xxl"
+        device: "cuda"
+        max_length: 226
+        freeze: True
+    # denoiser config, equivalent to the unet config in vc
+    denoiser_config:
+      target: diffusers.CogVideoXTransformer3DModel
+      params:
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
+        subfolder: "transformer"
+        load_dtype: fp16 # bf16 for 5b, fp16 for 2b
+        # revision: null
+        # variant: null
+    adapter_config: # the whole dict is removable
+      target: peft.LoraConfig
+      params:
+        r: 4
+        lora_alpha: 1.0
+        init_lora_weights: True
+        target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
+
+    # sampler config. Wrap it.
+    scheduler_config:
+      target: diffusers.CogVideoXDPMScheduler
+      params:
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
+        subfolder: scheduler
+
+## training config
+### data; a toy dataset can be given
+data:
+  target: src.data.lightning_data.DataModuleFromConfig
+  params:
+    batch_size: 2
+    num_workers: 16
+    wrap: false
+    train:
+      target: src.data.cogvideo_dataset.VideoDataset
+      params:
+        instance_data_root: "inputs/t2v/cogvideo/elon_musk_video"
+        dataset_name: null
+        dataset_config_name: null
+        caption_column: "labels.txt"
+        video_column: "videos.txt"
+        height: 480
+        width: 720
+        fps: 28
+        max_num_frames: 2
+        skip_frames_start: 0
+        skip_frames_end: 0
+        cache_dir: ~/.cache
+        id_token: null
+
+### training_step in cogvideoxft
+lightning:
+  trainer:
+    benchmark: True
+    num_nodes: 1
+    accumulate_grad_batches: 2
+    max_epochs: 2000
+    precision: 32 # training precision
+  callbacks:
+    image_logger:
+      target: src.utils.callbacks.ImageLogger
+      params:
+        batch_frequency: 100000
+        max_images: 2
+        to_local: True # save videos into files
+        log_images_kwargs:
+          unconditional_guidance_scale: 12 # need this, otherwise it is grey
+    metrics_over_trainsteps_checkpoint:
+      target: pytorch_lightning.callbacks.ModelCheckpoint
+      params:
+        filename: "{epoch:06}-{step:09}"
+        save_weights_only: False
+        # every_n_epochs: 300
+        every_n_train_steps: 10
+
+
diff --git a/scripts/inference_cogvideo.py b/scripts/inference_cogvideo.py
index f601009..0602469 100644
--- a/scripts/inference_cogvideo.py
+++ b/scripts/inference_cogvideo.py
@@ -11,7 +11,8 @@ import torch
 from pytorch_lightning import seed_everything
-
+from typing import List,Union
+from omegaconf import ListConfig
 sys.path.insert(0, os.getcwd())
 sys.path.insert(1, f'{os.getcwd()}/src')
diff --git a/shscripts/inference_cogvideo_lora.sh b/shscripts/inference_cogvideo_lora.sh
new file mode 100644
index 0000000..8c3705b
--- /dev/null
+++ b/shscripts/inference_cogvideo_lora.sh
@@ -0,0 +1,16 @@
+# ----------------------diffusers based pl inference ----------------------
+# 'configs/004_cogvideox/cogvideo2b.yaml' or 'configs/004_cogvideox/cogvideo5b.yaml'
+config='configs/004_cogvideox/cogvideo2b.yaml'
+prompt_file="inputs/t2v/prompts.txt"
+current_time=$(date +%Y%m%d%H%M%S)
+savedir="results/t2v/$current_time-cogvideo"
+ckpt="{YOUR_CKPT_PATH}"
+
+python3 scripts/inference_cogvideo.py \
+--ckpt_path $ckpt \
+--config $config \
+--prompt_file $prompt_file \
+--savedir $savedir \
+--bs 1 --height 480 --width 720 \
+--fps 16 \
+--seed 6666
\ No newline at end of file
diff --git a/shscripts/train_cogvideox_lora.sh b/shscripts/train_cogvideox_lora.sh
new file mode 100644
index 0000000..d6efa12
--- /dev/null
+++ b/shscripts/train_cogvideox_lora.sh
@@ -0,0 +1,17 @@
+export TOKENIZERS_PARALLELISM=false
+
+# exp settings
+EXPNAME="004_cogvideox"                          # experiment name
+CONFIG='configs/004_cogvideox/cogvideo2b.yaml'   # experiment config: 'configs/004_cogvideox/cogvideo2b.yaml' or 'configs/004_cogvideox/cogvideo5b.yaml'
+RESROOT="results/cogvideo_train"                 # experiment saving directory
+
+# run
+current_time=$(date +%Y%m%d%H%M%S)
+python scripts/train.py \
+-t \
+--name "$current_time"_$EXPNAME \
+--base $CONFIG \
+--logdir $RESROOT \
+--devices '0,' \
+lightning.trainer.num_nodes=1 \
+--auto_resume False
\ No newline at end of file
diff --git a/src/cogvideo_hf/cogvideo_pl.py b/src/cogvideo_hf/cogvideo_pl.py
index 27671e4..4e6b571 100644
--- a/src/cogvideo_hf/cogvideo_pl.py
+++ b/src/cogvideo_hf/cogvideo_pl.py
@@ -91,7 +91,7 @@ def retrieve_timesteps(
         timesteps = scheduler.timesteps
     return timesteps, num_inference_steps
 
-class CogVideoXWorkflow(pl.LightningModule):
+class CogVideoXWorkFlow(pl.LightningModule):
     def __init__(self,
                  first_stage_config,
                  cond_stage_config,
@@ -129,6 +129,7 @@ def __init__(self,
         # are most schduler
         self.scheduler = instantiate_from_config(scheduler_config)
         # add adapter config (Support Lora and HRA )
+        self.lora_args = []
         if adapter_config is not None:
             self.inject_adapter(adapter_config)
     def inject_adapter(self, adapter_config):
@@ -422,22 +423,22 @@ def _prepare_rotary_positional_embeddings(
         height: int,
         width: int,
         num_frames: int,
-        device: torch.device,
+        vae_scale_factor_spatial: int = 8,
+        patch_size: int = 2,
+        attention_head_dim: int = 64,
+        device: Optional[torch.device] = None,
         base_height: int = 480,
         base_width: int = 720,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # a merge of _prepare_rotary_positional_embeddings from cogvideoX.finetune.py and diffusers implementation.
-        # add base_height and base_width to make it more flexible
-        grid_height = height // (self.vae_scale_factor_spatial * self.model.config.patch_size)
-        grid_width = width // (self.vae_scale_factor_spatial * self.model.config.patch_size)
-        base_size_width = base_width // (self.vae_scale_factor_spatial * self.model.config.patch_size)
-        base_size_height = base_height // (self.vae_scale_factor_spatial * self.model.config.patch_size)
-
-        grid_crops_coords = get_resize_crop_region_for_grid(
-            (grid_height, grid_width), base_size_width, base_size_height
-        )
+
+        grid_height = height // (vae_scale_factor_spatial * patch_size)
+        grid_width = width // (vae_scale_factor_spatial * patch_size)
+        base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
+        base_size_height = base_height // (vae_scale_factor_spatial * patch_size)
+
+        grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
         freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
-            embed_dim=self.model.config.attention_head_dim,
+            embed_dim=attention_head_dim,
             crops_coords=grid_crops_coords,
             grid_size=(grid_height, grid_width),
             temporal_size=num_frames,
@@ -446,6 +447,7 @@ def _prepare_rotary_positional_embeddings(
         freqs_cos = freqs_cos.to(device=device)
         freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin
+
     @torch.no_grad()
     def sample(
         self,
@@ -624,7 +626,15 @@ def sample(
         # 7. Create rotary embeds if required
         image_rotary_emb = (
-            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+            self._prepare_rotary_positional_embeddings(
+                height=height,
+                width=width,
+                num_frames=latents.shape[1],
+                vae_scale_factor_spatial=self.vae_scale_factor_spatial,
+                patch_size=self.model.config.patch_size,
+                attention_head_dim=self.model.config.attention_head_dim,
+                device=self.device,
+            )
             if self.model.config.use_rotary_positional_embeddings
             else None
         )
 
@@ -641,7 +651,7 @@ def sample(
 
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                print(i,latent_model_input.max(),latent_model_input.min(),has_nan(latent_model_input))
+                # print(i,latent_model_input.max(),latent_model_input.min(),has_nan(latent_model_input))
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0])
                 # print(i,num_inference_steps)
@@ -741,6 +751,7 @@ def get_batch_input(self, batch):
             "videos": videos,
             "prompts": prompts,
         }
+
     def training_step(self, batch, batch_idx):
         # print(type(batch),batch.keys(),type(batch['instance_video']),batch['instance_video'].shape);exit(); # dict_keys(['instance_prompt', 'instance_video'])
         batch = self.get_batch_input(batch)
@@ -772,8 +783,8 @@ def training_step(self, batch, batch_idx):
         image_rotary_emb = (
             # in the first place, we assume this function is the same during inference and train.
             self._prepare_rotary_positional_embeddings(
-                height=height,
-                width=width,
+                height=height*self.vae_scale_factor_spatial,
+                width=width*self.vae_scale_factor_spatial,
                 num_frames=num_frames,
                 vae_scale_factor_spatial=self.vae_scale_factor_spatial,
                 patch_size=self.model.config.patch_size,
diff --git a/src/utils/callbacks.py b/src/utils/callbacks.py
index 5aa7891..ccf505f 100755
--- a/src/utils/callbacks.py
+++ b/src/utils/callbacks.py
@@ -14,7 +14,7 @@ from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.utilities import rank_zero_only
 from pytorch_lightning.utilities import rank_zero_info
-from utils.save_video import log_local, prepare_to_log
+from .save_video import log_local, prepare_to_log
 
 
 class LoraModelCheckpoint(pl.callbacks.ModelCheckpoint):
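
For reference, the reworked _prepare_rotary_positional_embeddings above divides pixel-space height/width by vae_scale_factor_spatial * patch_size, which is why training_step now scales its latent-space sizes by self.vae_scale_factor_spatial before calling it. Below is a minimal standalone sketch of that grid arithmetic, not code from this patch, assuming the defaults in the new signature (vae_scale_factor_spatial=8, patch_size=2) and the 480x720 resolution used in the configs:

    # Standalone sketch of the RoPE grid arithmetic (assumes the defaults above; not from the patch).
    def rope_grid(height: int, width: int, vae_scale_factor_spatial: int = 8, patch_size: int = 2):
        grid_height = height // (vae_scale_factor_spatial * patch_size)
        grid_width = width // (vae_scale_factor_spatial * patch_size)
        return grid_height, grid_width

    # Inference passes pixel-space sizes: 480x720 -> a 30x45 spatial token grid.
    assert rope_grid(480, 720) == (30, 45)

    # training_step holds latent-space sizes (480//8=60, 720//8=90), so it multiplies by
    # vae_scale_factor_spatial first; otherwise the grid would shrink to 3x5 (60//16, 90//16).
    latent_height, latent_width = 60, 90
    assert rope_grid(latent_height * 8, latent_width * 8) == (30, 45)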