
Commit

Merge branch 'VideoVerses:main' into mochi
yzxing87 authored Nov 29, 2024
2 parents 4692778 + b8e1b9f commit 2db7517
Showing 8 changed files with 180 additions and 64 deletions.
14 changes: 11 additions & 3 deletions README.md
@@ -478,12 +478,20 @@ We thank the following repos for sharing their awesome models and codes!
</a>

## 📋 License
Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact [email protected] and [email protected].
Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact the project leads Yingqing He ([email protected]) and Yazhou Xing ([email protected]).

## 😊 Citation

```bibtex
@software{videotuna,
author = {Yingqing He and Yazhou Xing and Zhefan Rao and Haoyu Wu and Zhaoyang Liu and Jingye Chen and Pengjun Fang and Jiajun Li and Liya Ji and Runtao Liu and Xiaowei Chi and Yang Fei and Guocheng Shao and Yue Ma and Qifeng Chen},
title = {VideoTuna: A Powerful Toolkit for Video Generation with Model Fine-Tuning and Post-Training},
month = {Nov},
year = {2024},
url = {https://github.com/VideoVerses/VideoTuna}
}
```
To be updated...
```


## Star History

configs/004_cogvideox/cogvideo2b.yaml
@@ -2,66 +2,43 @@ model:
# loading from HF and resuming from PL may differ
# pretrained_checkpoint: "THUDM/CogVideoX-2b"
base_learning_rate: 6e-6
target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkflow
target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow
params:
# first stage model; cond stage model ; denoising model ; scheduler
first_stage_config:
target: diffusers.AutoencoderKLCogVideoX
params:
pretrained_model_name_or_path: THUDM/CogVideoX-2b
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
subfolder: "vae"
# revision: null
# variant: null
cond_stage_config:
target: src.lvdm.modules.encoders.condition.FrozenT5Embedder
params:
version: "checkpoints/cogvideo/t5-v1_1-xxl"
version: "DeepFloyd/t5-v1_1-xxl"
device: "cuda"
max_length: 226
freeze: True
# cond_stage_config:
# target: src.cogvideo_hf.cogvideo_pl.FrozenT5CondModel
# params:
# tokenizer_config:
# target: transformers.AutoTokenizer
# params:
# pretrained_model_name_or_path: THUDM/CogVideoX-2b
# subfolder: "tokenizer"
# encoder_config:
# target: transformers.T5EncoderModel
# params:
# subfolder: "text_encoder"
# pretrained_model_name_or_path: THUDM/CogVideoX-2b
# # max_length: 226
# freeze: True
# denoiser config, equivalent to the unet config in VC
denoiser_config:
target: diffusers.CogVideoXTransformer3DModel
params:
pretrained_model_name_or_path: THUDM/CogVideoX-2b
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
subfolder: "transformer"
load_dtype: fp16 # bf16 for 5B, fp16 for 2B
# revision: null
# variant: null
adapter_config: # the whole dict is removable
target: peft.HRAConfig
params:
r: 8
init_weights: True
target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
target: peft.LoraConfig
params:
r: 4
lora_alpha: 1.0
init_lora_weights: True
target_modules: ["to_k", "to_q", "to_v", "to_out.0"]

# adapter_config: # the whole dict is removable
# target: peft.HRAConfig
# params:
# r: 4
# lora_alpha: 1.0
# init_lora_weights: True
# target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
# sampler config. Wrap it.
scheduler_config:
target: diffusers.CogVideoXDPMScheduler
params:
pretrained_model_name_or_path: THUDM/CogVideoX-2b
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
subfolder: scheduler

## training config
@@ -93,8 +70,6 @@ data:
lightning:
trainer:
benchmark: True
batch_size: 3
num_workers: 38
num_nodes: 1
accumulate_grad_batches: 2
max_epochs: 2000
@@ -108,12 +83,6 @@ lightning:
to_local: True # save videos into files
log_images_kwargs:
unconditional_guidance_scale: 12 # needed; otherwise the output is grey
modelcheckpoint:
# target: pytorch_lightning.callbacks.ModelCheckpoin
target: pytorch_lightning.callbacks.ModelCheckpoint
params:
every_n_epochs: 1
filename: "{epoch:04}-{step:06}"
metrics_over_trainsteps_checkpoint:
target: pytorch_lightning.callbacks.ModelCheckpoint
params:
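For context on the adapter change above (peft.HRAConfig with r: 8 replaced by peft.LoraConfig with r: 4), the sketch below shows roughly what that config block amounts to at runtime. It is a minimal illustration only: the repository's own `inject_adapter` method in `src/cogvideo_hf/cogvideo_pl.py` is not part of this diff, so the peft-based injection shown here is an assumption, not the project's exact code.

```python
# Minimal sketch (assumption, not the repo's inject_adapter) of applying the
# adapter_config above to the CogVideoX denoiser with peft.
from diffusers import CogVideoXTransformer3DModel
from peft import LoraConfig, inject_adapter_in_model

# Denoiser as configured in the YAML; assumes the checkpoint has already been
# downloaded to checkpoints/cogvideo/CogVideoX-2b.
transformer = CogVideoXTransformer3DModel.from_pretrained(
    "checkpoints/cogvideo/CogVideoX-2b", subfolder="transformer"
)

lora_config = LoraConfig(
    r=4,                      # low-rank dimension, as in the YAML
    lora_alpha=1.0,           # LoRA scaling factor
    init_lora_weights=True,
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],  # attention projections
)
transformer = inject_adapter_in_model(lora_config, transformer)

# The injected modules carry "lora" in their parameter names.
lora_params = [n for n, _ in transformer.named_parameters() if "lora" in n]
print(f"injected {len(lora_params)} LoRA tensors, e.g. {lora_params[:2]}")
```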
94 changes: 94 additions & 0 deletions configs/004_cogvideox/cogvideo5b.yaml
@@ -0,0 +1,94 @@
model:
# loading from HF and resuming from PL may differ
# pretrained_checkpoint: "THUDM/CogVideoX-2b"
base_learning_rate: 6e-6
target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow
params:
# first stage model; cond stage model ; denoising model ; scheduler
first_stage_config:
target: diffusers.AutoencoderKLCogVideoX
params:
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
subfolder: "vae"
cond_stage_config:
target: src.lvdm.modules.encoders.condition.FrozenT5Embedder
params:
version: "DeepFloyd/t5-v1_1-xxl"
device: "cuda"
max_length: 226
freeze: True
# denoiser config, equivalent to the unet config in VC
denoiser_config:
target: diffusers.CogVideoXTransformer3DModel
params:
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
subfolder: "transformer"
load_dtype: fp16 # bf16 for 5B, fp16 for 2B
# revision: null
# variant: null
adapter_config: # the whole dict is removable
target: peft.LoraConfig
params:
r: 4
lora_alpha: 1.0
init_lora_weights: True
target_modules: ["to_k", "to_q", "to_v", "to_out.0"]

# sampler config. Wrap it.
scheduler_config:
target: diffusers.CogVideoXDPMScheduler
params:
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
subfolder: scheduler

## training config
### data; a toy dataset can be given
data:
target: src.data.lightning_data.DataModuleFromConfig
params:
batch_size: 2
num_workers: 16
wrap: false
train:
target: src.data.cogvideo_dataset.VideoDataset
params:
instance_data_root: "inputs/t2v/cogvideo/elon_musk_video"
dataset_name: null
dataset_config_name: null
caption_column: "labels.txt"
video_column: "videos.txt"
height: 480
width: 720
fps: 28
max_num_frames: 2
skip_frames_start: 0
skip_frames_end: 0
cache_dir: ~/.cache
id_token: null

### training_step in cogvideoxft
lightning:
trainer:
benchmark: True
num_nodes: 1
accumulate_grad_batches: 2
max_epochs: 2000
precision: 32 # training precision
callbacks:
image_logger:
target: src.utils.callbacks.ImageLogger
params:
batch_frequency: 100000
max_images: 2
to_local: True # save videos into files
log_images_kwargs:
unconditional_guidance_scale: 12 # needed; otherwise the output is grey
metrics_over_trainsteps_checkpoint:
target: pytorch_lightning.callbacks.ModelCheckpoint
params:
filename: "{epoch:06}-{step:09}"
save_weights_only: False
# every_n_epochs: 300
every_n_train_steps: 10
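
For readers unfamiliar with the target/params layout used throughout these configs, the sketch below shows how a block from cogvideo5b.yaml can be resolved generically. It is illustrative only: the repository's actual `instantiate_from_config` helper (used in cogvideo_pl.py, e.g. for the scheduler) may behave differently, in particular for diffusers entries that carry `pretrained_model_name_or_path` and are presumably loaded via `from_pretrained` rather than a plain constructor call.

```python
# Illustrative resolution of a "target"/"params" block from cogvideo5b.yaml.
# This is not the repo's instantiate_from_config; diffusers components with
# pretrained_model_name_or_path are presumably routed through from_pretrained.
import importlib
from omegaconf import OmegaConf

def build_from_target(block):
    module_path, cls_name = block.target.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    params = OmegaConf.to_container(block.params, resolve=True) if "params" in block else {}
    return cls(**params)

cfg = OmegaConf.load("configs/004_cogvideox/cogvideo5b.yaml")
adapter = build_from_target(cfg.model.params.adapter_config)  # -> peft.LoraConfig
print(type(adapter).__name__, adapter.r)  # LoraConfig 4
```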


3 changes: 2 additions & 1 deletion scripts/inference_cogvideo.py
@@ -11,7 +11,8 @@

import torch
from pytorch_lightning import seed_everything

from typing import List, Union
from omegaconf import ListConfig

sys.path.insert(0, os.getcwd())
sys.path.insert(1, f'{os.getcwd()}/src')
16 changes: 16 additions & 0 deletions shscripts/inference_cogvideo_lora.sh
@@ -0,0 +1,16 @@
# ----------------------diffusers based pl inference ----------------------
# 'configs/004_cogvideox/cogvideo2b.yaml' or 'configs/004_cogvideox/cogvideo5b.yaml'
config='configs/004_cogvideox/cogvideo2b.yaml'
prompt_file="inputs/t2v/prompts.txt"
current_time=$(date +%Y%m%d%H%M%S)
savedir="results/t2v/$current_time-cogvideo"
ckpt="{YOUR_CKPT_PATH}"

python3 scripts/inference_cogvideo.py \
--ckpt_path $ckpt \
--config $config \
--prompt_file $prompt_file \
--savedir $savedir \
--bs 1 --height 480 --width 720 \
--fps 16 \
--seed 6666 \
17 changes: 17 additions & 0 deletions shscripts/train_cogvideox_lora.sh
@@ -0,0 +1,17 @@
export TOKENIZERS_PARALLELISM=false

# exp settings
EXPNAME="004_cogvideox" # experiment name
CONFIG='configs/004_cogvideox/cogvideo2b.yaml' # experiment config: 'configs/004_cogvideox/cogvideo2b.yaml' or 'configs/004_cogvideox/cogvideo5b.yaml'
RESROOT="results/cogvideo_train" # experiment saving directory

# run
current_time=$(date +%Y%m%d%H%M%S)
python scripts/train.py \
-t \
--name "$current_time"_$EXPNAME \
--base $CONFIG \
--logdir $RESROOT \
--devices '0,' \
lightning.trainer.num_nodes=1 \
--auto_resume False
45 changes: 28 additions & 17 deletions src/cogvideo_hf/cogvideo_pl.py
@@ -91,7 +91,7 @@ def retrieve_timesteps(
timesteps = scheduler.timesteps
return timesteps, num_inference_steps

class CogVideoXWorkflow(pl.LightningModule):
class CogVideoXWorkFlow(pl.LightningModule):
def __init__(self,
first_stage_config,
cond_stage_config,
@@ -129,6 +129,7 @@ def __init__(self,
# are most schduler
self.scheduler = instantiate_from_config(scheduler_config)
# add adapter config (Support Lora and HRA )
self.lora_args = []
if adapter_config is not None:
self.inject_adapter(adapter_config)
def inject_adapter(self, adapter_config):
@@ -422,22 +423,22 @@ def _prepare_rotary_positional_embeddings(
height: int,
width: int,
num_frames: int,
device: torch.device,
vae_scale_factor_spatial: int = 8,
patch_size: int = 2,
attention_head_dim: int = 64,
device: Optional[torch.device] = None,
base_height: int = 480,
base_width: int = 720,
) -> Tuple[torch.Tensor, torch.Tensor]:
# a merge of _prepare_rotary_positional_embeddings from the CogVideoX finetune script and the diffusers implementation
# add base_height and base_width to make it more flexible
grid_height = height // (self.vae_scale_factor_spatial * self.model.config.patch_size)
grid_width = width // (self.vae_scale_factor_spatial * self.model.config.patch_size)
base_size_width = base_width // (self.vae_scale_factor_spatial * self.model.config.patch_size)
base_size_height = base_height // (self.vae_scale_factor_spatial * self.model.config.patch_size)

grid_crops_coords = get_resize_crop_region_for_grid(
(grid_height, grid_width), base_size_width, base_size_height
)

grid_height = height // (vae_scale_factor_spatial * patch_size)
grid_width = width // (vae_scale_factor_spatial * patch_size)
base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
base_size_height = base_height // (vae_scale_factor_spatial * patch_size)

grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
embed_dim=self.model.config.attention_head_dim,
embed_dim=attention_head_dim,
crops_coords=grid_crops_coords,
grid_size=(grid_height, grid_width),
temporal_size=num_frames,
@@ -446,6 +447,7 @@ def _prepare_rotary_positional_embeddings(
freqs_cos = freqs_cos.to(device=device)
freqs_sin = freqs_sin.to(device=device)
return freqs_cos, freqs_sin

@torch.no_grad()
def sample(
self,
@@ -624,7 +626,15 @@ def sample(

# 7. Create rotary embeds if required
image_rotary_emb = (
self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
self._prepare_rotary_positional_embeddings(
height=height,
width=width,
num_frames=latents.shape[1],
vae_scale_factor_spatial=self.vae_scale_factor_spatial,
patch_size=self.model.config.patch_size,
attention_head_dim=self.model.config.attention_head_dim,
device=self.device,
)
if self.model.config.use_rotary_positional_embeddings
else None
)
@@ -641,7 +651,7 @@ def sample(

latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
print(i,latent_model_input.max(),latent_model_input.min(),has_nan(latent_model_input))
# print(i,latent_model_input.max(),latent_model_input.min(),has_nan(latent_model_input))
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
timestep = t.expand(latent_model_input.shape[0])
# print(i,num_inference_steps)
@@ -741,6 +751,7 @@ def get_batch_input(self, batch):
"videos": videos,
"prompts": prompts,
}

def training_step(self, batch, batch_idx):
# print(type(batch),batch.keys(),type(batch['instance_video']),batch['instance_video'].shape);exit(); # <class 'dict'> dict_keys(['instance_prompt', 'instance_video'])
batch = self.get_batch_input(batch)
@@ -772,8 +783,8 @@ def training_step(self, batch, batch_idx):
image_rotary_emb = (
# in the first place, we assume this function is the same during inference and train.
self._prepare_rotary_positional_embeddings(
height=height,
width=width,
height=height*self.vae_scale_factor_spatial,
width=width*self.vae_scale_factor_spatial,
num_frames=num_frames,
vae_scale_factor_spatial=self.vae_scale_factor_spatial,
patch_size=self.model.config.patch_size,
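The refactor above makes `_prepare_rotary_positional_embeddings` take `vae_scale_factor_spatial`, `patch_size`, and `attention_head_dim` as explicit arguments instead of reading them from `self`, and `training_step` now multiplies its height and width by `vae_scale_factor_spatial` before calling it, apparently because it works with latent-space sizes. A small worked example of the arithmetic, using the 480x720 resolution from the configs and the signature defaults (`vae_scale_factor_spatial=8`, `patch_size=2`):

```python
# Worked example of the grid arithmetic in _prepare_rotary_positional_embeddings,
# using 480x720 from the configs and the defaults vae_scale_factor_spatial=8,
# patch_size=2 shown in the new signature.
height, width = 480, 720
vae_scale_factor_spatial, patch_size = 8, 2

grid_height = height // (vae_scale_factor_spatial * patch_size)  # 480 // 16 = 30
grid_width = width // (vae_scale_factor_spatial * patch_size)    # 720 // 16 = 45

# If training_step holds latent-space sizes (height/8, width/8), they must be
# scaled back to pixel space before calling the helper, which is what the
# `height*self.vae_scale_factor_spatial` change does.
latent_height, latent_width = height // 8, width // 8            # 60, 90
assert latent_height * vae_scale_factor_spatial == height
assert latent_width * vae_scale_factor_spatial == width
print(grid_height, grid_width)  # 30 45
```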
2 changes: 1 addition & 1 deletion src/utils/callbacks.py
@@ -14,7 +14,7 @@
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.utilities import rank_zero_info
from utils.save_video import log_local, prepare_to_log
from .save_video import log_local, prepare_to_log


class LoraModelCheckpoint(pl.callbacks.ModelCheckpoint):
