Merge pull request #10 from VideoVerses/mochi

feat: add mochi inference
VideoVerses · Dec 11, 2024 · ffc6dfe · ffc6dfe
2 parents 04da47b + 5841d26
commit ffc6dfe
Show file tree

Hide file tree

Showing 7 changed files with 87 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -295,6 +295,7 @@ VideoTuna/
 
 |T2V-Models|HxWxL|Checkpoints|
 |:---------|:---------|:--------|
+|Mochi|848x480, 3s|[Hugging Face](https://huggingface.co/genmo/mochi-1-preview)
 |CogVideoX-2B|720x480, 6s|[Hugging Face](https://huggingface.co/THUDM/CogVideoX-2b)
 |CogVideoX-5B|720x480, 6s|[Hugging Face](https://huggingface.co/THUDM/CogVideoX-5b)
 |Open-Sora 1.0|512×512x16|[Hugging Face](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth)
@@ -361,6 +362,7 @@ After downloading, the model checkpoints should be placed as [Checkpoint Structu
 
 Task|Model|Command|Length (#frames)|Resolution|Inference Time (s)|GPU Memory (GiB)|
 |:---------|:---------|:---------|:---------|:---------|:---------|:---------|
+|T2V|Mochi|`bash shscripts/inference_mochi.sh`|84|480x848|109.0|26|
 |I2V|CogVideoX-5b-I2V|`bash shscripts/inference_cogVideo_i2v_diffusers.sh`|49|576x1024|310.4|4.78|
 |T2V|CogVideoX-2b|`bash shscripts/inference_cogVideo_t2v_diffusers.sh`|49|576x1024|107.6|2.32|
 |T2V|Open Sora V1.0|`bash shscripts/inference_opensora_v10_16x256x256.sh`|16|256x256|11.2|23.99|
@@ -442,6 +444,7 @@ We support video alignment post-training to align human perference for video dif
 
 ## Acknowledgement
 We thank the following repos for sharing their awesome models and codes!
+* [Mochi](https://www.genmo.ai/blog): A new SOTA in open-source video generation models
 * [VideoCrafter2](https://github.com/AILab-CVC/VideoCrafter): Overcoming Data Limitations for High-Quality Video Diffusion Models
 * [VideoCrafter1](https://github.com/AILab-CVC/VideoCrafter): Open Diffusion Models for High-Quality Video Generation
 * [DynamiCrafter](https://github.com/Doubiiu/DynamiCrafter): Animating Open-domain Images with Video Diffusion Priors

diff --git a/requirements.txt b/requirements.txt
@@ -38,4 +38,4 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn
 git+https://github.com/huggingface/diffusers
 open_clip_torch==2.12.0
 lmdeploy
-moviepy
+moviepy==1.0.2
diff --git a/scripts/inference_mochi.py b/scripts/inference_mochi.py
@@ -0,0 +1,39 @@
+import torch
+from diffusers import MochiPipeline
+from diffusers.utils import export_to_video
+import argparse
+import os
+
+# Mochi text-to-video inference: writes one mp4 per non-blank line of --prompt_file.
+parser = argparse.ArgumentParser()
+parser.add_argument("--ckpt_path", type=str, default="genmo/mochi-1-preview")
+parser.add_argument("--prompt_file", type=str, default="inputs/t2v/prompts.txt")
+parser.add_argument("--savedir", type=str, default="results/t2v/")
+parser.add_argument("--height", type=int, default=480)
+parser.add_argument("--width", type=int, default=848)
+parser.add_argument("--bs", type=int, default=1)  # accepted for CLI parity; prompts are run one at a time
+parser.add_argument("--fps", type=int, default=28)
+parser.add_argument("--seed", type=int, default=123)
+args = parser.parse_args()
+os.makedirs(args.savedir, exist_ok=True)
+
+# Load the weights named by --ckpt_path (hub id or local directory), not a hard-coded repo.
+pipe = MochiPipeline.from_pretrained(args.ckpt_path, variant="bf16", torch_dtype=torch.bfloat16)
+# Enable memory savings
+pipe.enable_model_cpu_offload()
+pipe.enable_vae_tiling()
+
+# One prompt per line; strip the trailing newline so it is not fed to the text encoder.
+with open(args.prompt_file, "r", encoding="utf-8") as file:
+    prompts = [line.strip() for line in file]
+
+# set seed
+torch.manual_seed(args.seed)
+
+# Honor --height/--width/--fps instead of the pipeline defaults and a fixed 30 fps.
+for index, prompt in enumerate(prompts):
+    if not prompt:
+        continue  # blank line: nothing to generate; index still tracks the file's line number
+    with torch.autocast("cuda", torch.bfloat16, cache_enabled=False):
+        frames = pipe(prompt, height=args.height, width=args.width, num_frames=84).frames[0]
+    export_to_video(frames, os.path.join(args.savedir, f"mochi_{index}.mp4"), fps=args.fps)
diff --git a/shscripts/inference_mochi.sh b/shscripts/inference_mochi.sh
@@ -0,0 +1,14 @@
+# Run Mochi text-to-video inference for the prompts listed in $prompt_file.
+ckpt='checkpoints/mochi-1-preview'
+prompt_file="inputs/t2v/prompts.txt"
+savedir="results/t2v/mochi2"
+height=480
+width=848
+
+python3 scripts/inference_mochi.py \
+    --ckpt_path "$ckpt" \
+    --prompt_file "$prompt_file" \
+    --savedir "$savedir" \
+    --bs 1 --height "$height" --width "$width" \
+    --fps 28 \
+    --seed 124
diff --git a/tools/video_comparison/check_input.py b/tools/video_comparison/check_input.py
@@ -3,6 +3,7 @@
 
 parser = argparse.ArgumentParser(description='Check the input directory')
 parser.add_argument('--input_dir', type=str, help='The input should be a directory', required=True)
+parser.add_argument('--seed', type=int, help='The seed for the random number generator', default=42)
 args = parser.parse_args()
 
 # check if there are images in the input directory, jpg/png...
@@ -26,13 +27,15 @@
     for index, line in enumerate(lines):
         prompt = line.strip()
         print(f'creating image {index} using prompt: {prompt}')
+
         out = pipe(
             prompt=prompt,
             guidance_scale=0.,
             height=576,
             width=1024,
             num_inference_steps=4,
             max_sequence_length=256,
+            generator=torch.Generator("cuda").manual_seed(args.seed)
         ).images[0]
         index_str = str(index).zfill(5)
         out.save(f"{args.input_dir}/prompt_{index_str}.png")
diff --git a/tools/video_comparison/combine.py b/tools/video_comparison/combine.py
@@ -1,13 +1,14 @@
 import os
 import argparse
 import glob
-from moviepy.editor import VideoFileClip, clips_array, vfx, TextClip
+from moviepy.editor import VideoFileClip, clips_array
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 
 parser = argparse.ArgumentParser(description='Check the input directory')
 parser.add_argument('--input_dir', type=str, help='The input should be a directory', required=True)
 parser.add_argument('--save_dir', type=str, help='The directory of saving results', required=True)
+parser.add_argument('--unified_height', type=int, help='The height of the unified video', default=320)
 args = parser.parse_args()
 
 methods = glob.glob(f'{args.save_dir}/*/*')
@@ -40,12 +41,8 @@ def add_text_to_frame(frame, text='hi', position=(0,0)):
     max_duration = max([clip.duration for clip in clips])
     clips = [clip.set_end(max_duration).set_fps(max_fps) for clip in clips]
 
-    # txt_clip = TextClip('hello world', color='orange', size=(100, 100))
-    # txt_clip = txt_clip.set_position('center').set_duration(max_duration)
-    # clips = [clip.resize(height=1080) for clip in clips]
-    # video_heights = [clip.size[1] for clip in clips] 
-    # print(methods)
-    # print(len(clips))
+    clips = [clip.resize(height=args.unified_height) for clip in clips]
+
     clips_with_name = []
     for index, clip in enumerate(clips):
         method = methods[index].split('/')[-1]

diff --git a/tools/video_comparison/compare.sh b/tools/video_comparison/compare.sh
@@ -4,7 +4,7 @@ input_dir='inputs/t2v'
 save_dir='results/compare1/'
 seed=42
 unified_visualization_height=320
-inference_methods="videocrafter2;dynamicrafter;cogvideo—t2v;cogvideo—i2v;opensora"
+inference_methods="videocrafter2;dynamicrafter;cogvideo—t2v;cogvideo—i2v;opensora;mochi"
 
 #### check input ####
 # Check if the directory exists
@@ -26,7 +26,7 @@ python tools/video_comparison/check_input.py --input_dir=$input_dir --seed=$seed
 
 ################################ videocrafter2 ################################
 ckpt='checkpoints/videocrafter/t2v_v2_512/model.ckpt'
-config='configs/train/000_videocrafter2ft/config.yaml'
+config='configs/001_videocrafter2/vc2_t2v_320x512.yaml'
 prompt_file="${input_dir}/prompts.txt"
 height=320
 width=512
@@ -46,7 +46,7 @@ fi
 
 ################################ dynamicrafter ################################
 ckpt=checkpoints/dynamicrafter/i2v_576x1024/model.ckpt
-config=configs/train/002_dynamicrafterft_1024/config.yaml
+config=configs/002_dynamicrafter/dc_i2v_1024.yaml
 prompt_dir="${input_dir}"
 height=576
 width=1024
@@ -93,7 +93,7 @@ fi
 
 ################################ opensora ################################
 ckpt="checkpoints/open-sora/t2v_v10/OpenSora-v1-HQ-16x256x256.pth"
-config='configs/train/001_opensorav10/config_opensorav10.yaml'
+config='configs/003_opensora/opensorav10_256x256.yaml'
 height=256
 width=256
 fps=8
@@ -116,5 +116,24 @@ if [[ $inference_methods == *"opensora"* ]]; then
       --frames 16
 fi
 
+################################ mochi ################################
+if [[ $inference_methods == *"mochi"* ]]; then
+  ckpt='genmo/mochi-1-preview'
+  prompt_file="${input_dir}/prompts.txt"
+  height=480
+  width=848
+  savedir="${save_dir}/t2v/mochi-${width}x${height}-28fps"
+  # Quote expansions so paths containing spaces reach the script as single arguments.
+  python3 scripts/inference_mochi.py \
+      --ckpt_path "$ckpt" \
+      --prompt_file "$prompt_file" \
+      --savedir "$savedir" \
+      --bs 1 --height "$height" --width "$width" \
+      --fps 28 \
+      --seed "${seed}"
+fi
+
+
+
 #### combine video
 python3 tools/video_comparison/combine.py --save_dir=$save_dir --input_dir=$input_dir --unified_height=$unified_visualization_height