diff --git a/README.md b/README.md
index 1ab61da9..4e1d4678 100644
--- a/README.md
+++ b/README.md
@@ -323,10 +323,6 @@ Please check [docs/CHECKPOINTS.md](docs/CHECKPOINTS.md) to download all the mode
 ``` shell
 conda create --name videotuna python=3.10 -y
 conda activate videotuna
-pip install -U poetry pip
-poetry config virtualenvs.create false
-poetry install
-pip install optimum-quanto==0.2.1
 pip install -r requirements.txt
 git clone https://github.com/JingyeChen/SwissArmyTransformer
 pip install -e SwissArmyTransformer/
@@ -335,6 +331,8 @@ git clone https://github.com/tgxs002/HPSv2.git
 cd ./HPSv2
 pip install -e .
 cd ..
+conda config --add channels conda-forge
+conda install ffmpeg
 ```

 ### 2.Prepare checkpoints
diff --git a/docs/CHECKPOINTS.md b/docs/CHECKPOINTS.md
index e801bccf..8ee7cbd3 100644
--- a/docs/CHECKPOINTS.md
+++ b/docs/CHECKPOINTS.md
@@ -12,9 +12,10 @@ mkdir checkpoints

 # ---- CogVideo (diffusers) ----
 mkdir -p checkpoints/cogvideo; cd checkpoints/cogvideo
-git clone https://huggingface.co/THUDM/CogVideoX-2b
-git clone https://huggingface.co/THUDM/CogVideoX-5b
-git clone https://huggingface.co/THUDM/CogVideoX-5b-I2V
+git clone https://huggingface.co/THUDM/CogVideoX-2b          # checkpoints for CogVideoX T2V-2B
+git clone https://huggingface.co/THUDM/CogVideoX-5b          # checkpoints for CogVideoX T2V-5B
+git clone https://huggingface.co/THUDM/CogVideoX-5b-I2V      # checkpoints for CogVideoX I2V-5B
+git clone https://huggingface.co/THUDM/CogVideoX1.5-5B-SAT   # checkpoints for CogVideoX 1.5-5B (both T2V and I2V)


 # ---- Open-Sora ----
@@ -53,19 +54,26 @@ mkdir checkpoints/dynamicrafter/i2v_576x1024
 wget https://huggingface.co/Doubiiu/DynamiCrafter_1024/resolve/main/model.ckpt -P checkpoints/dynamicrafter/i2v_576x1024  # dynamicrafter-i2v-1024

 # ---- Videocrafter ----
-mkdir checkpoints/videocrafter/
-mkdir checkpoints/videocrafter/i2v_v1_512
+mkdir -p checkpoints/videocrafter/i2v_v1_512
 wget https://huggingface.co/VideoCrafter/Image2Video-512/resolve/main/model.ckpt -P checkpoints/videocrafter/i2v_v1_512  # videocrafter1-i2v-512

 # ---- Stable Diffusion checkpoint for VC2 Training ----
-mkdir checkpoints/stablediffusion/
-mkdir checkpoints/stablediffusion/v2-1_512-ema
+mkdir -p checkpoints/stablediffusion/v2-1_512-ema
+wget https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.ckpt -P checkpoints/stablediffusion/v2-1_512-ema
+
-wget https://huggingface.co/stabilityai/stable-diffusion-2-1-base/blob/main/v2-1_512-ema-pruned.ckpt -P checkpoints/stablediffusion/v2-1_512-ema
+
+# ---------------------------- V2V ----------------------------
+# ---- ModelScope Video-to-Video ----
+cd checkpoints
+# Please make sure git-lfs is installed. If not, install it with the following command:
+git lfs install
+# After installing git-lfs, clone the Video-to-Video checkpoints:
+git clone https://www.modelscope.cn/iic/Video-to-Video.git
 ```
+
 ### Checkpoint Orgnization Structure
 After downloading, the model checkpoints should be placed as follows:
diff --git a/inputs/v2v/001/00.mp4 b/inputs/v2v/001/00.mp4
new file mode 100644
index 00000000..8b5e504f
Binary files /dev/null and b/inputs/v2v/001/00.mp4 differ
diff --git a/inputs/v2v/001/prompts.txt b/inputs/v2v/001/prompts.txt
new file mode 100644
index 00000000..b3127584
--- /dev/null
+++ b/inputs/v2v/001/prompts.txt
@@ -0,0 +1 @@
+a cartoon dog is running in the forest.
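A quick way to confirm that the downloads above landed where the inference code expects them is a small check like the following (a sketch, not part of this patch; the paths simply follow from the `git clone` / `wget -P` destinations shown in docs/CHECKPOINTS.md):

```python
# Sketch: confirm a few of the checkpoint paths implied by docs/CHECKPOINTS.md exist locally.
import os

expected_paths = [
    "checkpoints/cogvideo/CogVideoX-2b",                                  # CogVideoX T2V-2B clone
    "checkpoints/stablediffusion/v2-1_512-ema/v2-1_512-ema-pruned.ckpt",  # SD 2.1 base weights
    "checkpoints/Video-to-Video",                                         # ModelScope V2V clone (needs git-lfs)
]
for path in expected_paths:
    status = "OK" if os.path.exists(path) else "MISSING"
    print(f"{status:8s}{path}")
```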
diff --git a/scripts/inference_utils.py b/scripts/inference_utils.py
index 097144ed..ece8d07e 100644
--- a/scripts/inference_utils.py
+++ b/scripts/inference_utils.py
@@ -108,6 +108,41 @@ def load_inputs_i2v(input_dir, video_size=(256,256), video_frames=16):
     return filename_list, image_list, prompt_list

+def load_inputs_v2v(input_dir, video_size=None, video_frames=None):
+    """
+    Load prompt list and input videos for v2v from an input directory.
+    """
+    # load prompt files
+    prompt_files = get_target_filelist(input_dir, ext='txt')
+    if len(prompt_files) > 1:
+        # only use the first one (sorted by name) if multiple exist
+        print(f"Warning: multiple prompt files found. Using {os.path.split(prompt_files[0])[1]}.")
+        prompt_file = prompt_files[0]
+    elif len(prompt_files) == 1:
+        prompt_file = prompt_files[0]
+    else:
+        raise ValueError(f"Error: found NO prompt file in {input_dir}")
+    prompt_list = load_prompts(prompt_file)
+    n_samples = len(prompt_list)
+
+    ## load videos
+    video_filepaths = get_target_filelist(input_dir, ext='[m][p][4]')
+    video_filenames = [os.path.split(video_filepath)[-1] for video_filepath in video_filepaths]
+
+    return prompt_list, video_filepaths, video_filenames
+
+def open_video_to_tensor(filepath, video_width=None, video_height=None):
+    # decode all frames with decord; resize only when a target size is given
+    if video_width is None and video_height is None:
+        vidreader = VideoReader(filepath, ctx=cpu(0))
+    else:
+        vidreader = VideoReader(filepath, ctx=cpu(0), width=video_width, height=video_height)
+    frame_indices = list(range(len(vidreader)))
+    frames = vidreader.get_batch(frame_indices)
+    # (T, H, W, C) uint8 -> (1, C, T, H, W) float scaled to [-1, 1]
+    frame_tensor = torch.tensor(frames.asnumpy()).permute(3, 0, 1, 2).float()
+    frame_tensor = (frame_tensor / 255. - 0.5) * 2
+    return frame_tensor.unsqueeze(0)
+
 def load_video_batch(filepath_list, frame_stride, video_size=(256,256), video_frames=16):
     '''
     Notice about some special cases:
diff --git a/scripts/inference_v2v_ms.py b/scripts/inference_v2v_ms.py
new file mode 100644
index 00000000..400bfb46
--- /dev/null
+++ b/scripts/inference_v2v_ms.py
@@ -0,0 +1,39 @@
+import os, sys
+import argparse
+sys.path.insert(0, os.getcwd())
+
+from modelscope.models import Model
+from modelscope.pipelines import pipeline
+from modelscope.outputs import OutputKeys
+
+from scripts.inference_utils import load_inputs_v2v
+
+def get_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ckpt_path", type=str, default="checkpoints/Video-to-Video", help="Checkpoint path of the model")
+    parser.add_argument("--input_dir", type=str, default=None, help="An input directory containing videos and prompts for video-to-video enhancement")
+    parser.add_argument("--output_dir", type=str, default=None, help="Results saving directory")
+    return parser
+
+# prepare arguments, model, pipeline
+args = get_parser().parse_args()
+model = Model.from_pretrained(args.ckpt_path)
+pipe = pipeline(task="video-to-video", model=model, model_revision='v1.1.0', device='cuda:0')
+print(f"Successfully loaded model from {args.ckpt_path}")
+
+os.makedirs(args.output_dir, exist_ok=True)
+
+# load input prompts, video paths, video filenames
+prompt_list, video_filepaths, video_filenames = load_inputs_v2v(input_dir=args.input_dir)
+
+# video-to-video enhancement
+for i, (prompt, videofilepath, videofilename) in enumerate(zip(prompt_list, video_filepaths, video_filenames)):
+    print(f"[{i:03d}] input path: {videofilepath}")
+    print(f"[{i:03d}] input name: {videofilename}")
+    print(f"[{i:03d}] prompt: {prompt}")
+    p_input = {
+        'video_path': videofilepath,
+        'text': prompt
+    }
+    output_video_path = pipe(p_input, output_video=os.path.join(args.output_dir, videofilename))[OutputKeys.OUTPUT_VIDEO]
+    print(f"Successfully processed {videofilename} and saved to {output_video_path}")
diff --git a/shscripts/inference_v2v_ms.sh b/shscripts/inference_v2v_ms.sh
new file mode 100644
index 00000000..1d34b420
--- /dev/null
+++ b/shscripts/inference_v2v_ms.sh
@@ -0,0 +1,6 @@
+input_dir="inputs/v2v/001"
+current_time=$(date +%Y%m%d%H%M%S)
+output_dir="results/v2v/$current_time-v2v-modelscope-001"
+
+python3 scripts/inference_v2v_ms.py \
+    --input_dir $input_dir --output_dir $output_dir
\ No newline at end of file
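For an end-to-end sanity check of the new pieces, a minimal sketch along the following lines should work (not part of the patch; it assumes the environment from the README, the `checkpoints/Video-to-Video` weights from docs/CHECKPOINTS.md, the sample `inputs/v2v/001` directory added here, and a CUDA device, and it mirrors the calls made in `scripts/inference_v2v_ms.py` for a single clip):

```python
# Sketch: exercise the new v2v input helpers, then enhance the first clip with the ModelScope pipeline.
import os, sys
sys.path.insert(0, os.getcwd())

from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys

from scripts.inference_utils import load_inputs_v2v, open_video_to_tensor

input_dir, output_dir = "inputs/v2v/001", "results/v2v/sanity-check"
os.makedirs(output_dir, exist_ok=True)

# Prompts and videos are paired by their order within the input directory.
prompt_list, video_filepaths, video_filenames = load_inputs_v2v(input_dir)

# Optional check: decode the first clip into a (1, C, T, H, W) tensor scaled to [-1, 1].
print(open_video_to_tensor(video_filepaths[0]).shape)

# Enhance the first clip only; the full script loops over all prompt/video pairs.
model = Model.from_pretrained("checkpoints/Video-to-Video")
pipe = pipeline(task="video-to-video", model=model, model_revision="v1.1.0", device="cuda:0")
p_input = {"video_path": video_filepaths[0], "text": prompt_list[0]}
out_path = pipe(p_input, output_video=os.path.join(output_dir, video_filenames[0]))[OutputKeys.OUTPUT_VIDEO]
print(f"Saved enhanced video to {out_path}")
```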