Skip to content

Commit

Permalink
Merge pull request #10 from VideoVerses/mochi
Browse files Browse the repository at this point in the history
feat: add mochi inference
  • Loading branch information
yzxing87 authored Dec 11, 2024
2 parents 04da47b + 5841d26 commit ffc6dfe
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 12 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ VideoTuna/

|T2V-Models|HxWxL|Checkpoints|
|:---------|:---------|:--------|
|Mochi|848x480, 3s|[Hugging Face](https://huggingface.co/genmo/mochi-1-preview)
|CogVideoX-2B|720x480, 6s|[Hugging Face](https://huggingface.co/THUDM/CogVideoX-2b)
|CogVideoX-5B|720x480, 6s|[Hugging Face](https://huggingface.co/THUDM/CogVideoX-5b)
|Open-Sora 1.0|512×512x16|[Hugging Face](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth)
Expand Down Expand Up @@ -361,6 +362,7 @@ After downloading, the model checkpoints should be placed as [Checkpoint Structu

Task|Model|Command|Length (#frames)|Resolution|Inference Time (s)|GPU Memory (GiB)|
|:---------|:---------|:---------|:---------|:---------|:---------|:---------|
|T2V|Mochi|`bash shscripts/inference_mochi.sh`|84|480x848|109.0|26|
|I2V|CogVideoX-5b-I2V|`bash shscripts/inference_cogVideo_i2v_diffusers.sh`|49|576x1024|310.4|4.78|
|T2V|CogVideoX-2b|`bash shscripts/inference_cogVideo_t2v_diffusers.sh`|49|576x1024|107.6|2.32|
|T2V|Open Sora V1.0|`bash shscripts/inference_opensora_v10_16x256x256.sh`|16|256x256|11.2|23.99|
Expand Down Expand Up @@ -442,6 +444,7 @@ We support video alignment post-training to align human perference for video dif

## Acknowledgement
We thank the following repos for sharing their awesome models and code!
* [Mochi](https://www.genmo.ai/blog): A new SOTA in open-source video generation models
* [VideoCrafter2](https://github.com/AILab-CVC/VideoCrafter): Overcoming Data Limitations for High-Quality Video Diffusion Models
* [VideoCrafter1](https://github.com/AILab-CVC/VideoCrafter): Open Diffusion Models for High-Quality Video Generation
* [DynamiCrafter](https://github.com/Doubiiu/DynamiCrafter): Animating Open-domain Images with Video Diffusion Priors
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn
git+https://github.com/huggingface/diffusers
open_clip_torch==2.12.0
lmdeploy
moviepy
moviepy==1.0.2
39 changes: 39 additions & 0 deletions scripts/inference_mochi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import torch
from diffusers import MochiPipeline
from diffusers.utils import export_to_video
import argparse
import os

# Mochi text-to-video inference: generate one video per non-empty line of the
# prompt file and write it to <savedir>/mochi_<line index>.mp4.

# create arg parser
parser = argparse.ArgumentParser()
parser.add_argument("--ckpt_path", type=str, default="genmo/mochi-1-preview",
                    help="Model id or local directory passed to MochiPipeline.from_pretrained")
parser.add_argument("--prompt_file", type=str, default="inputs/t2v/prompts.txt",
                    help="Text file with one prompt per line")
parser.add_argument("--savedir", type=str, default="results/t2v/",
                    help="Directory the generated videos are written to")
parser.add_argument("--height", type=int, default=480,
                    help="Height passed to the pipeline")
parser.add_argument("--width", type=int, default=848,
                    help="Width passed to the pipeline")
# NOTE: --bs is accepted so existing launch scripts keep working, but prompts
# are processed one at a time and the value is not used.
parser.add_argument("--bs", type=int, default=1)
parser.add_argument("--fps", type=int, default=28,
                    help="Frame rate used when exporting the video")
parser.add_argument("--seed", type=int, default=123,
                    help="Seed given to torch.manual_seed before generation")

args = parser.parse_args()

os.makedirs(args.savedir, exist_ok=True)

# Load from the path given on the command line instead of a hard-coded model id.
pipe = MochiPipeline.from_pretrained(args.ckpt_path, variant="bf16", torch_dtype=torch.bfloat16)
# Enable memory savings
pipe.enable_model_cpu_offload()
pipe.enable_vae_tiling()

# there are many prompts in the prompt_file, we need to read them all
with open(args.prompt_file, "r", encoding="utf-8") as file:
    prompt_lines = file.readlines()

# set seed
torch.manual_seed(args.seed)

for index, line in enumerate(prompt_lines):
    # readlines() keeps the trailing newline; strip it and skip blank lines so
    # they are not sent to the model as prompts. The original line index is
    # kept so output names for non-blank prompts do not shift.
    prompt = line.strip()
    if not prompt:
        continue

    with torch.autocast("cuda", torch.bfloat16, cache_enabled=False):
        frames = pipe(
            prompt,
            height=args.height,
            width=args.width,
            num_frames=84,
        ).frames[0]

    export_to_video(frames, os.path.join(args.savedir, f"mochi_{index}.mp4"), fps=args.fps)
14 changes: 14 additions & 0 deletions shscripts/inference_mochi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Launch Mochi text-to-video inference with the settings below.
ckpt='checkpoints/mochi-1-preview'
prompt_file="inputs/t2v/prompts.txt"
savedir="results/t2v/mochi2"
height=480
width=848

# Expansions are quoted so that a path containing spaces or glob characters
# reaches the Python script as a single argument.
python3 scripts/inference_mochi.py \
    --ckpt_path "$ckpt" \
    --prompt_file "$prompt_file" \
    --savedir "$savedir" \
    --bs 1 --height "$height" --width "$width" \
    --fps 28 \
    --seed 124

3 changes: 3 additions & 0 deletions tools/video_comparison/check_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

parser = argparse.ArgumentParser(description='Check the input directory')
parser.add_argument('--input_dir', type=str, help='The input should be a directory', required=True)
parser.add_argument('--seed', type=int, help='The seed for the random number generator', default=42)
args = parser.parse_args()

# check if there are images in the input directory, jpg/png...
Expand All @@ -26,13 +27,15 @@
for index, line in enumerate(lines):
prompt = line.strip()
print(f'creating image {index} using prompt: {prompt}')

out = pipe(
prompt=prompt,
guidance_scale=0.,
height=576,
width=1024,
num_inference_steps=4,
max_sequence_length=256,
generator=torch.Generator("cuda").manual_seed(args.seed)
).images[0]
index_str = str(index).zfill(5)
out.save(f"{args.input_dir}/prompt_{index_str}.png")
11 changes: 4 additions & 7 deletions tools/video_comparison/combine.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os
import argparse
import glob
from moviepy.editor import VideoFileClip, clips_array, vfx, TextClip
from moviepy.editor import VideoFileClip, clips_array
from PIL import Image, ImageDraw, ImageFont
import numpy as np

parser = argparse.ArgumentParser(description='Check the input directory')
parser.add_argument('--input_dir', type=str, help='The input should be a directory', required=True)
parser.add_argument('--save_dir', type=str, help='The directory of saving results', required=True)
parser.add_argument('--unified_height', type=int, help='The height of the unified video', default=320)
args = parser.parse_args()

methods = glob.glob(f'{args.save_dir}/*/*')
Expand Down Expand Up @@ -40,12 +41,8 @@ def add_text_to_frame(frame, text='hi', position=(0,0)):
max_duration = max([clip.duration for clip in clips])
clips = [clip.set_end(max_duration).set_fps(max_fps) for clip in clips]

# txt_clip = TextClip('hello world', color='orange', size=(100, 100))
# txt_clip = txt_clip.set_position('center').set_duration(max_duration)
# clips = [clip.resize(height=1080) for clip in clips]
# video_heights = [clip.size[1] for clip in clips]
# print(methods)
# print(len(clips))
clips = [clip.resize(height=args.unified_height) for clip in clips]

clips_with_name = []
for index, clip in enumerate(clips):
method = methods[index].split('/')[-1]
Expand Down
27 changes: 23 additions & 4 deletions tools/video_comparison/compare.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ input_dir='inputs/t2v'
save_dir='results/compare1/'
seed=42
unified_visualization_height=320
inference_methods="videocrafter2;dynamicrafter;cogvideo—t2v;cogvideo—i2v;opensora"
inference_methods="videocrafter2;dynamicrafter;cogvideo—t2v;cogvideo—i2v;opensora;mochi"

#### check input ####
# Check if the directory exists
Expand All @@ -26,7 +26,7 @@ python tools/video_comparison/check_input.py --input_dir=$input_dir --seed=$seed

################################ videocrafter2 ################################
ckpt='checkpoints/videocrafter/t2v_v2_512/model.ckpt'
config='configs/train/000_videocrafter2ft/config.yaml'
config='configs/001_videocrafter2/vc2_t2v_320x512.yaml'
prompt_file="${input_dir}/prompts.txt"
height=320
width=512
Expand All @@ -46,7 +46,7 @@ fi

################################ dynamicrafter ################################
ckpt=checkpoints/dynamicrafter/i2v_576x1024/model.ckpt
config=configs/train/002_dynamicrafterft_1024/config.yaml
config=configs/002_dynamicrafter/dc_i2v_1024.yaml
prompt_dir="${input_dir}"
height=576
width=1024
Expand Down Expand Up @@ -93,7 +93,7 @@ fi

################################ opensora ################################
ckpt="checkpoints/open-sora/t2v_v10/OpenSora-v1-HQ-16x256x256.pth"
config='configs/train/001_opensorav10/config_opensorav10.yaml'
config='configs/003_opensora/opensorav10_256x256.yaml'
height=256
width=256
fps=8
Expand All @@ -116,5 +116,24 @@ if [[ $inference_methods == *"opensora"* ]]; then
--frames 16
fi

################################ mochi ################################
# Run Mochi inference only when "mochi" appears in the method list.
if [[ $inference_methods == *"mochi"* ]]; then
    ckpt='genmo/mochi-1-preview'
    prompt_file="${input_dir}/prompts.txt"
    height=480
    width=848
    savedir="${save_dir}/t2v/mochi-${width}x${height}-28fps"

    # Expansions are quoted so that input/save directories containing spaces
    # are passed to the Python script as single arguments.
    python3 scripts/inference_mochi.py \
        --ckpt_path "$ckpt" \
        --prompt_file "$prompt_file" \
        --savedir "$savedir" \
        --bs 1 --height "$height" --width "$width" \
        --fps 28 \
        --seed "${seed}"
fi



#### combine video
python3 tools/video_comparison/combine.py --save_dir=$save_dir --input_dir=$input_dir --unified_height=$unified_visualization_height

0 comments on commit ffc6dfe

Please sign in to comment.