diff --git a/README.md b/README.md
index 91ffc2e..42bc602 100644
--- a/README.md
+++ b/README.md
@@ -478,12 +478,20 @@ We thank the following repos for sharing their awesome models and codes!
 
 ## 📋 License
 
-Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact yhebm@connect.ust.hk and yxingag@connect.ust.hk.
+Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact the project leads Yingqing He (yhebm@connect.ust.hk) and Yazhou Xing (yxingag@connect.ust.hk).
 
 ## 😊 Citation
+
+```bibtex
+@software{videotuna,
+  author = {Yingqing He and Yazhou Xing and Zhefan Rao and Haoyu Wu and Zhaoyang Liu and Jingye Chen and Pengjun Fang and Jiajun Li and Liya Ji and Runtao Liu and Xiaowei Chi and Yang Fei and Guocheng Shao and Yue Ma and Qifeng Chen},
+  title = {VideoTuna: A Powerful Toolkit for Video Generation with Model Fine-Tuning and Post-Training},
+  month = {Nov},
+  year = {2024},
+  url = {https://github.com/VideoVerses/VideoTuna}
+}
 ```
-To be updated...
-```
+
 
 ## Star History
diff --git a/configs/004_cogvideox/cogvideo.yaml b/configs/004_cogvideox/cogvideo2b.yaml
similarity index 60%
rename from configs/004_cogvideox/cogvideo.yaml
rename to configs/004_cogvideox/cogvideo2b.yaml
index de87f5f..f615eca 100644
--- a/configs/004_cogvideox/cogvideo.yaml
+++ b/configs/004_cogvideox/cogvideo2b.yaml
@@ -2,66 +2,43 @@ model:
   # there might be differet to load from hf and resume from pl
   # pretrained_checkpoint: "THUDM/CogVideoX-2b"
   base_learning_rate: 6e-6
-  target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkflow
+  target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow
   params:
     # first stage model; cond stage model ; denoising model ; scheduler
     first_stage_config:
       target: diffusers.AutoencoderKLCogVideoX
       params:
-        pretrained_model_name_or_path: THUDM/CogVideoX-2b
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
         subfolder: "vae"
-        # revision: null
-        # variant: null
     cond_stage_config:
       target: src.lvdm.modules.encoders.condition.FrozenT5Embedder
       params:
-        version: "checkpoints/cogvideo/t5-v1_1-xxl"
+        version: "DeepFloyd/t5-v1_1-xxl"
         device: "cuda"
         max_length: 226
         freeze: True
-    # cond_stage_config:
-    #   target: src.cogvideo_hf.cogvideo_pl.FrozenT5CondModel
-    #   params:
-    #     tokenizer_config:
-    #       target: transformers.AutoTokenizer
-    #       params:
-    #         pretrained_model_name_or_path: THUDM/CogVideoX-2b
-    #         subfolder: "tokenizer"
-    #     encoder_config:
-    #       target: transformers.T5EncoderModel
-    #       params:
-    #         subfolder: "text_encoder"
-    #         pretrained_model_name_or_path: THUDM/CogVideoX-2b
-    #     # max_length: 226
-    #     freeze: True
    # denosier config equal to unet config in vc
    denoiser_config:
      target: diffusers.CogVideoXTransformer3DModel
      params:
-        pretrained_model_name_or_path: THUDM/CogVideoX-2b
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
        subfolder: "transformer"
        load_dtype: fp16 # bf16 5b fp16 2B
        # revision: null
        # variant: null
    adapter_config: # the whole dict is remoable
-      target: peft.HRAConfig
-      params:
-        r: 8
-        init_weights: True
-        target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
+      target: peft.LoraConfig
+      params:
+        r: 4
+        lora_alpha: 1.0
+        init_lora_weights: True
+        target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
 
-    # adapter_config: # the whole dict is remoable
-    #   target: peft.HRAConfig
-    #   params:
-    #     r: 4
-    #     lora_alpha: 1.0
-    #     init_lora_weights: True
-    #     target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
    # sampler config. Wrap it.
    scheduler_config:
      target: diffusers.CogVideoXDPMScheduler
      params:
-        pretrained_model_name_or_path: THUDM/CogVideoX-2b
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-2b
        subfolder: scheduler
 
 ## training config
@@ -93,8 +70,6 @@ data:
 lightning:
   trainer:
     benchmark: True
-    batch_size: 3
-    num_workers: 38
     num_nodes: 1
     accumulate_grad_batches: 2
     max_epochs: 2000
@@ -108,12 +83,6 @@ lightning:
         to_local: True # save videos into files
         log_images_kwargs:
           unconditional_guidance_scale: 12 # need this, otherwise it is grey
-    modelcheckpoint:
-      # target: pytorch_lightning.callbacks.ModelCheckpoin
-      target: pytorch_lightning.callbacks.ModelCheckpoint
-      params:
-        every_n_epochs: 1
-        filename: "{epoch:04}-{step:06}"
     metrics_over_trainsteps_checkpoint:
       target: pytorch_lightning.callbacks.ModelCheckpoint
       params:
diff --git a/configs/004_cogvideox/cogvideo5b.yaml b/configs/004_cogvideox/cogvideo5b.yaml
new file mode 100644
index 0000000..3b56b35
--- /dev/null
+++ b/configs/004_cogvideox/cogvideo5b.yaml
@@ -0,0 +1,94 @@
+model:
+  # there might be differences between loading from hf and resuming from pl
+  # pretrained_checkpoint: "THUDM/CogVideoX-2b"
+  base_learning_rate: 6e-6
+  target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow
+  params:
+    # first stage model; cond stage model; denoising model; scheduler
+    first_stage_config:
+      target: diffusers.AutoencoderKLCogVideoX
+      params:
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
+        subfolder: "vae"
+    cond_stage_config:
+      target: src.lvdm.modules.encoders.condition.FrozenT5Embedder
+      params:
+        version: "DeepFloyd/t5-v1_1-xxl"
+        device: "cuda"
+        max_length: 226
+        freeze: True
+    # denoiser config, equivalent to the unet config in vc
+    denoiser_config:
+      target: diffusers.CogVideoXTransformer3DModel
+      params:
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
+        subfolder: "transformer"
+        load_dtype: fp16 # bf16 for 5b, fp16 for 2b
+        # revision: null
+        # variant: null
+    adapter_config: # the whole dict is removable
+      target: peft.LoraConfig
+      params:
+        r: 4
+        lora_alpha: 1.0
+        init_lora_weights: True
+        target_modules: ["to_k", "to_q", "to_v", "to_out.0"]
+
+    # sampler config. Wrap it.
+    scheduler_config:
+      target: diffusers.CogVideoXDPMScheduler
+      params:
+        pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b
+        subfolder: scheduler
+
+## training config
+### data; a toy dataset can be given
+data:
+  target: src.data.lightning_data.DataModuleFromConfig
+  params:
+    batch_size: 2
+    num_workers: 16
+    wrap: false
+    train:
+      target: src.data.cogvideo_dataset.VideoDataset
+      params:
+        instance_data_root: "inputs/t2v/cogvideo/elon_musk_video"
+        dataset_name: null
+        dataset_config_name: null
+        caption_column: "labels.txt"
+        video_column: "videos.txt"
+        height: 480
+        width: 720
+        fps: 28
+        max_num_frames: 2
+        skip_frames_start: 0
+        skip_frames_end: 0
+        cache_dir: ~/.cache
+        id_token: null
+
+### training_step in cogvideoxft
+lightning:
+  trainer:
+    benchmark: True
+    num_nodes: 1
+    accumulate_grad_batches: 2
+    max_epochs: 2000
+    precision: 32 # training precision
+  callbacks:
+    image_logger:
+      target: src.utils.callbacks.ImageLogger
+      params:
+        batch_frequency: 100000
+        max_images: 2
+        to_local: True # save videos into files
+        log_images_kwargs:
+          unconditional_guidance_scale: 12 # need this, otherwise it is grey
+    metrics_over_trainsteps_checkpoint:
+      target: pytorch_lightning.callbacks.ModelCheckpoint
+      params:
+        filename: "{epoch:06}-{step:09}"
+        save_weights_only: False
+        # every_n_epochs: 300
+        every_n_train_steps: 10
+
+
diff --git a/scripts/inference_cogvideo.py b/scripts/inference_cogvideo.py
index f601009..0602469 100644
--- a/scripts/inference_cogvideo.py
+++ b/scripts/inference_cogvideo.py
@@ -11,7 +11,8 @@ import torch
 from pytorch_lightning import seed_everything
-
+from typing import List,Union
+from omegaconf import ListConfig
 sys.path.insert(0, os.getcwd())
 sys.path.insert(1, f'{os.getcwd()}/src')
diff --git a/shscripts/inference_cogvideo_lora.sh b/shscripts/inference_cogvideo_lora.sh
new file mode 100644
index 0000000..8c3705b
--- /dev/null
+++ b/shscripts/inference_cogvideo_lora.sh
@@ -0,0 +1,16 @@
+# ----------------------diffusers based pl inference ----------------------
+# 'configs/004_cogvideox/cogvideo2b.yaml' or 'configs/004_cogvideox/cogvideo5b.yaml'
+config='configs/004_cogvideox/cogvideo2b.yaml'
+prompt_file="inputs/t2v/prompts.txt"
+current_time=$(date +%Y%m%d%H%M%S)
+savedir="results/t2v/$current_time-cogvideo"
+ckpt="{YOUR_CKPT_PATH}"
+
+python3 scripts/inference_cogvideo.py \
+--ckpt_path $ckpt \
+--config $config \
+--prompt_file $prompt_file \
+--savedir $savedir \
+--bs 1 --height 480 --width 720 \
+--fps 16 \
+--seed 6666
\ No newline at end of file
diff --git a/shscripts/train_cogvideox_lora.sh b/shscripts/train_cogvideox_lora.sh
new file mode 100644
index 0000000..d6efa12
--- /dev/null
+++ b/shscripts/train_cogvideox_lora.sh
@@ -0,0 +1,17 @@
+export TOKENIZERS_PARALLELISM=false
+
+# exp settings
+EXPNAME="004_cogvideox"                          # experiment name
+CONFIG='configs/004_cogvideox/cogvideo2b.yaml'   # experiment config: 'configs/004_cogvideox/cogvideo2b.yaml' or 'configs/004_cogvideox/cogvideo5b.yaml'
+RESROOT="results/cogvideo_train"                 # experiment saving directory
+
+# run
+current_time=$(date +%Y%m%d%H%M%S)
+python scripts/train.py \
+-t \
+--name "$current_time"_$EXPNAME \
+--base $CONFIG \
+--logdir $RESROOT \
+--devices '0,' \
+lightning.trainer.num_nodes=1 \
+--auto_resume False
\ No newline at end of file
diff --git a/src/cogvideo_hf/cogvideo_pl.py b/src/cogvideo_hf/cogvideo_pl.py
index 27671e4..4e6b571 100644
--- a/src/cogvideo_hf/cogvideo_pl.py
+++ b/src/cogvideo_hf/cogvideo_pl.py
@@ -91,7 +91,7 @@ def retrieve_timesteps(
         timesteps = scheduler.timesteps
     return timesteps, num_inference_steps
 
-class CogVideoXWorkflow(pl.LightningModule):
+class CogVideoXWorkFlow(pl.LightningModule):
     def __init__(self,
                  first_stage_config,
                  cond_stage_config,
@@ -129,6 +129,7 @@ def __init__(self,
         # are most schduler
         self.scheduler = instantiate_from_config(scheduler_config)
         # add adapter config (Support Lora and HRA )
+        self.lora_args = []
         if adapter_config is not None:
             self.inject_adapter(adapter_config)
     def inject_adapter(self, adapter_config):
@@ -422,22 +423,22 @@ def _prepare_rotary_positional_embeddings(
         height: int,
         width: int,
         num_frames: int,
-        device: torch.device,
+        vae_scale_factor_spatial: int = 8,
+        patch_size: int = 2,
+        attention_head_dim: int = 64,
+        device: Optional[torch.device] = None,
         base_height: int = 480,
         base_width: int = 720,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # a merge of _prepare_rotary_positional_embeddings from cogvideoX.finetune.py and diffusers implementation.
-        # add base_height and base_width to make it more flexible
-        grid_height = height // (self.vae_scale_factor_spatial * self.model.config.patch_size)
-        grid_width = width // (self.vae_scale_factor_spatial * self.model.config.patch_size)
-        base_size_width = base_width // (self.vae_scale_factor_spatial * self.model.config.patch_size)
-        base_size_height = base_height // (self.vae_scale_factor_spatial * self.model.config.patch_size)
-
-        grid_crops_coords = get_resize_crop_region_for_grid(
-            (grid_height, grid_width), base_size_width, base_size_height
-        )
+
+        grid_height = height // (vae_scale_factor_spatial * patch_size)
+        grid_width = width // (vae_scale_factor_spatial * patch_size)
+        base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
+        base_size_height = base_height // (vae_scale_factor_spatial * patch_size)
+
+        grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
         freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
-            embed_dim=self.model.config.attention_head_dim,
+            embed_dim=attention_head_dim,
             crops_coords=grid_crops_coords,
             grid_size=(grid_height, grid_width),
             temporal_size=num_frames,
@@ -446,6 +447,7 @@ def _prepare_rotary_positional_embeddings(
         freqs_cos = freqs_cos.to(device=device)
         freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin
+
     @torch.no_grad()
     def sample(
         self,
@@ -624,7 +626,15 @@ def sample(
         # 7. Create rotary embeds if required
         image_rotary_emb = (
-            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+            self._prepare_rotary_positional_embeddings(
+                height=height,
+                width=width,
+                num_frames=latents.shape[1],
+                vae_scale_factor_spatial=self.vae_scale_factor_spatial,
+                patch_size=self.model.config.patch_size,
+                attention_head_dim=self.model.config.attention_head_dim,
+                device=self.device,
+            )
             if self.model.config.use_rotary_positional_embeddings
             else None
         )
 
@@ -641,7 +651,7 @@ def sample(
 
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                print(i,latent_model_input.max(),latent_model_input.min(),has_nan(latent_model_input))
+                # print(i,latent_model_input.max(),latent_model_input.min(),has_nan(latent_model_input))
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0])
                 # print(i,num_inference_steps)
@@ -741,6 +751,7 @@ def get_batch_input(self, batch):
             "videos": videos,
             "prompts": prompts,
         }
+
     def training_step(self, batch, batch_idx):
         # print(type(batch),batch.keys(),type(batch['instance_video']),batch['instance_video'].shape);exit(); # dict_keys(['instance_prompt', 'instance_video'])
         batch = self.get_batch_input(batch)
@@ -772,8 +783,8 @@ def training_step(self, batch, batch_idx):
         image_rotary_emb = (
             # in the first place, we assume this function is the same during inference and train.
             self._prepare_rotary_positional_embeddings(
-                height=height,
-                width=width,
+                height=height*self.vae_scale_factor_spatial,
+                width=width*self.vae_scale_factor_spatial,
                 num_frames=num_frames,
                 vae_scale_factor_spatial=self.vae_scale_factor_spatial,
                 patch_size=self.model.config.patch_size,
diff --git a/src/utils/callbacks.py b/src/utils/callbacks.py
index 5aa7891..ccf505f 100755
--- a/src/utils/callbacks.py
+++ b/src/utils/callbacks.py
@@ -14,7 +14,7 @@ from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.utilities import rank_zero_only
 from pytorch_lightning.utilities import rank_zero_info
-from utils.save_video import log_local, prepare_to_log
+from .save_video import log_local, prepare_to_log
 
 
 class LoraModelCheckpoint(pl.callbacks.ModelCheckpoint):
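
For reference, the reworked _prepare_rotary_positional_embeddings above divides pixel-space height/width by vae_scale_factor_spatial * patch_size, which is why training_step now scales its latent-space sizes by self.vae_scale_factor_spatial before calling it. Below is a minimal standalone sketch of that grid arithmetic, not code from this patch, assuming the defaults in the new signature (vae_scale_factor_spatial=8, patch_size=2) and the 480x720 resolution used in the configs:

    # Standalone sketch of the RoPE grid arithmetic (assumes the defaults above; not from the patch).
    def rope_grid(height: int, width: int, vae_scale_factor_spatial: int = 8, patch_size: int = 2):
        grid_height = height // (vae_scale_factor_spatial * patch_size)
        grid_width = width // (vae_scale_factor_spatial * patch_size)
        return grid_height, grid_width

    # Inference passes pixel-space sizes: 480x720 -> a 30x45 spatial token grid.
    assert rope_grid(480, 720) == (30, 45)

    # training_step holds latent-space sizes (480//8=60, 720//8=90), so it multiplies by
    # vae_scale_factor_spatial first; otherwise the grid would shrink to 3x5 (60//16, 90//16).
    latent_height, latent_width = 60, 90
    assert rope_grid(latent_height * 8, latent_width * 8) == (30, 45)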