-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'VideoVerses:main' into mochi
- Loading branch information
Showing
8 changed files
with
180 additions
and
64 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -478,12 +478,20 @@ We thank the following repos for sharing their awesome models and codes! | |
</a> | ||
|
||
## 📋 License | ||
Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact [email protected] and [email protected]. | ||
Please follow [CC-BY-NC-ND](./LICENSE). If you want a license authorization, please contact the project leads Yingqing He ([email protected]) and Yazhou Xing ([email protected]). | ||
|
||
## 😊 Citation | ||
|
||
```bibtex | ||
@software{videotuna, | ||
author = {Yingqing He and Yazhou Xing and Zhefan Rao and Haoyu Wu and Zhaoyang Liu and Jingye Chen and Pengjun Fang and Jiajun Li and Liya Ji and Runtao Liu and Xiaowei Chi and Yang Fei and Guocheng Shao and Yue Ma and Qifeng Chen}, | ||
title = {VideoTuna: A Powerful Toolkit for Video Generation with Model Fine-Tuning and Post-Training}, | ||
month = {Nov}, | ||
year = {2024}, | ||
url = {https://github.com/VideoVerses/VideoTuna} | ||
} | ||
``` | ||
To be updated... | ||
``` | ||
|
||
|
||
## Star History | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
model: | ||
# loading from hf may differ from resuming from pl | ||
# pretrained_checkpoint: "THUDM/CogVideoX-2b" | ||
base_learning_rate: 6e-6 | ||
target: src.cogvideo_hf.cogvideo_pl.CogVideoXWorkFlow | ||
params: | ||
# first stage model; cond stage model ; denoising model ; scheduler | ||
first_stage_config: | ||
target: diffusers.AutoencoderKLCogVideoX | ||
params: | ||
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b | ||
subfolder: "vae" | ||
cond_stage_config: | ||
target: src.lvdm.modules.encoders.condition.FrozenT5Embedder | ||
params: | ||
version: "DeepFloyd/t5-v1_1-xxl" | ||
device: "cuda" | ||
max_length: 226 | ||
freeze: True | ||
# denoiser config; equivalent to the unet config in vc | ||
denoiser_config: | ||
target: diffusers.CogVideoXTransformer3DModel | ||
params: | ||
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b | ||
subfolder: "transformer" | ||
load_dtype: fp16 # bf16 5b fp16 2B | ||
# revision: null | ||
# variant: null | ||
adapter_config: # the whole dict is removable | ||
target: peft.LoraConfig | ||
params: | ||
r: 4 | ||
lora_alpha: 1.0 | ||
init_lora_weights: True | ||
target_modules: ["to_k", "to_q", "to_v", "to_out.0"] | ||
|
||
# sampler config. Wrap it. | ||
scheduler_config: | ||
target: diffusers.CogVideoXDPMScheduler | ||
params: | ||
pretrained_model_name_or_path: checkpoints/cogvideo/CogVideoX-5b | ||
subfolder: scheduler | ||
|
||
## training config | ||
### data (a toy dataset can be given) | ||
data: | ||
target: src.data.lightning_data.DataModuleFromConfig | ||
params: | ||
batch_size: 2 | ||
num_workers: 16 | ||
wrap: false | ||
train: | ||
target: src.data.cogvideo_dataset.VideoDataset | ||
params: | ||
instance_data_root: "inputs/t2v/cogvideo/elon_musk_video" | ||
dataset_name: null | ||
dataset_config_name: null | ||
caption_column: "labels.txt" | ||
video_column: "videos.txt" | ||
height: 480 | ||
width: 720 | ||
fps: 28 | ||
max_num_frames: 2 | ||
skip_frames_start: 0 | ||
skip_frames_end: 0 | ||
cache_dir: ~/.cache | ||
id_token: null | ||
|
||
### training_step in cogvideoxft | ||
lightning: | ||
trainer: | ||
benchmark: True | ||
num_nodes: 1 | ||
accumulate_grad_batches: 2 | ||
max_epochs: 2000 | ||
precision: 32 # training precision | ||
callbacks: | ||
image_logger: | ||
target: src.utils.callbacks.ImageLogger | ||
params: | ||
batch_frequency: 100000 | ||
max_images: 2 | ||
to_local: True # save videos into files | ||
log_images_kwargs: | ||
unconditional_guidance_scale: 12 # need this, otherwise it is grey | ||
metrics_over_trainsteps_checkpoint: | ||
target: pytorch_lightning.callbacks.ModelCheckpoint | ||
params: | ||
filename: "{epoch:06}-{step:09}" | ||
save_weights_only: False | ||
# every_n_epochs: 300 | ||
every_n_train_steps: 10 | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# ---------------------- diffusers-based pl inference ----------------------
# Invokes scripts/inference_cogvideo.py with the config, prompt file,
# checkpoint and a timestamped --savedir defined below.
#
# config: 'configs/004_cogvideox/cogvideo2b.yaml' or
#         'configs/004_cogvideox/cogvideo5b.yaml'
config='configs/004_cogvideox/cogvideo2b.yaml'
prompt_file="inputs/t2v/prompts.txt"
current_time=$(date +%Y%m%d%H%M%S)
savedir="results/t2v/${current_time}-cogvideo"
# Placeholder: replace with the path to your checkpoint before running.
ckpt="{YOUR_CKPT_PATH}"

# Every expansion is quoted so a path containing spaces or glob characters
# reaches the script as a single argument. The last option carries no
# trailing backslash, so lines appended later are not swallowed as arguments.
python3 scripts/inference_cogvideo.py \
  --ckpt_path "$ckpt" \
  --config "$config" \
  --prompt_file "$prompt_file" \
  --savedir "$savedir" \
  --bs 1 --height 480 --width 720 \
  --fps 16 \
  --seed 6666
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Launches scripts/train.py for the CogVideoX experiment using the settings
# defined below.
export TOKENIZERS_PARALLELISM=false

# exp settings
EXPNAME="004_cogvideox"             # experiment name
# experiment config: 'configs/004_cogvideox/cogvideo2b.yaml' or
#                    'configs/004_cogvideox/cogvideo5b.yaml'
CONFIG='configs/004_cogvideox/cogvideo2b.yaml'
RESROOT="results/cogvideo_train"    # experiment saving directory

# run
current_time=$(date +%Y%m%d%H%M%S)
# Expansions are quoted so values containing spaces or glob characters are
# passed through as single arguments.
python scripts/train.py \
  -t \
  --name "${current_time}_${EXPNAME}" \
  --base "$CONFIG" \
  --logdir "$RESROOT" \
  --devices '0,' \
  lightning.trainer.num_nodes=1 \
  --auto_resume False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters