Merge pull request #5 from VideoVerses/feat/v2v
add: v2v-modelscope
YingqingHe authored Nov 22, 2024
2 parents 319cbe6 + c42b53f commit 7d19e72
Showing 7 changed files with 99 additions and 12 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -323,10 +323,6 @@ Please check [docs/CHECKPOINTS.md](docs/CHECKPOINTS.md) to download all the mode
``` shell
conda create --name videotuna python=3.10 -y
conda activate videotuna
pip install -U poetry pip
poetry config virtualenvs.create false
poetry install
pip install optimum-quanto==0.2.1
pip install -r requirements.txt
git clone https://github.com/JingyeChen/SwissArmyTransformer
pip install -e SwissArmyTransformer/
@@ -335,6 +331,8 @@ git clone https://github.com/tgxs002/HPSv2.git
cd ./HPSv2
pip install -e .
cd ..
conda config --add channels conda-forge
conda install ffmpeg
```
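
A quick post-install sanity check (a minimal sketch; it assumes only that `torch` is among the packages pulled in by `requirements.txt`):

```python
import shutil

import torch

# ffmpeg comes from the conda-forge install above and must be on PATH
assert shutil.which("ffmpeg") is not None, "ffmpeg not found on PATH"
print(f"torch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
```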

### 2. Prepare checkpoints
24 changes: 16 additions & 8 deletions docs/CHECKPOINTS.md
@@ -12,9 +12,10 @@ mkdir checkpoints
# ---- CogVideo (diffusers) ----
mkdir -p checkpoints/cogvideo; cd checkpoints/cogvideo
git clone https://huggingface.co/THUDM/CogVideoX-2b
git clone https://huggingface.co/THUDM/CogVideoX-5b
git clone https://huggingface.co/THUDM/CogVideoX-5b-I2V
git clone https://huggingface.co/THUDM/CogVideoX-2b # These are checkpoints for CogVideoX T2V-2B
git clone https://huggingface.co/THUDM/CogVideoX-5b # These are checkpoints for CogVideoX T2V-5B
git clone https://huggingface.co/THUDM/CogVideoX-5b-I2V # These are checkpoints for CogVideoX I2V-5B
git clone https://huggingface.co/THUDM/CogVideoX1.5-5B-SAT # These are checkpoints for CogVideoX 1.5-5B (both T2V and I2V)
# ---- Open-Sora ----
@@ -53,19 +54,26 @@ mkdir checkpoints/dynamicrafter/i2v_576x1024
wget https://huggingface.co/Doubiiu/DynamiCrafter_1024/resolve/main/model.ckpt -P checkpoints/dynamicrafter/i2v_576x1024 # dynamicrafter-i2v-1024
# ---- Videocrafter ----
mkdir checkpoints/videocrafter/
mkdir checkpoints/videocrafter/i2v_v1_512
mkdir -p checkpoints/videocrafter/i2v_v1_512
wget https://huggingface.co/VideoCrafter/Image2Video-512/resolve/main/model.ckpt -P checkpoints/videocrafter/i2v_v1_512 # videocrafter1-i2v-512
# ---- Stable Diffusion checkpoint for VC2 Training ----
mkdir checkpoints/stablediffusion/
mkdir checkpoints/stablediffusion/v2-1_512-ema
mkdir -p checkpoints/stablediffusion/v2-1_512-ema
wget https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.ckpt -P checkpoints/stablediffusion/v2-1_512-ema
wget https://huggingface.co/stabilityai/stable-diffusion-2-1-base/blob/main/v2-1_512-ema-pruned.ckpt -P checkpoints/stablediffusion/v2-1_512-ema
# ---------------------------- V2V ----------------------------
# ---- ModelScope Video-to-Video ----
cd checkpoints
# git-lfs is required to fetch the large checkpoint files; set it up first:
git lfs install
# then clone the Video-to-Video checkpoints
git clone https://www.modelscope.cn/iic/Video-to-Video.git
```
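
If `git lfs` is not available, the same ModelScope checkpoint can also be fetched with the SDK's `snapshot_download` (a hedged sketch; the download layout may differ from a plain clone, so point `--ckpt_path` at wherever the files actually land):

```python
from modelscope import snapshot_download

# fetches iic/Video-to-Video, the same model as the git clone above
local_dir = snapshot_download("iic/Video-to-Video", cache_dir="checkpoints")
print(f"checkpoints downloaded to: {local_dir}")
```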


### Checkpoint Organization Structure
After downloading, the model checkpoints should be placed as follows:

Binary file added inputs/v2v/001/00.mp4
1 change: 1 addition & 0 deletions inputs/v2v/001/prompts.txt
@@ -0,0 +1 @@
a cartoon dog is running in the forest.
35 changes: 35 additions & 0 deletions scripts/inference_utils.py
@@ -108,6 +108,41 @@ def load_inputs_i2v(input_dir, video_size=(256,256), video_frames=16):

    return filename_list, image_list, prompt_list

def load_inputs_v2v(input_dir, video_size=None, video_frames=None):
    """
    Load prompt list and input videos for v2v from an input directory.
    """
    # load the prompt file (if multiple exist, the first one sorted by name is used)
    prompt_files = get_target_filelist(input_dir, ext='txt')
    if len(prompt_files) == 0:
        raise ValueError(f"Error: found NO prompt file in {input_dir}")
    if len(prompt_files) > 1:
        print(f"Warning: multiple prompt files exist. The one {os.path.split(prompt_files[0])[1]} is used.")
    prompt_file = prompt_files[0]
    prompt_list = load_prompts(prompt_file)

    ## load videos
    video_filepaths = get_target_filelist(input_dir, ext='[m][p][4]')  # glob character classes matching *.mp4
    video_filenames = [os.path.split(video_filepath)[-1] for video_filepath in video_filepaths]

    return prompt_list, video_filepaths, video_filenames

def open_video_to_tensor(filepath, video_width=None, video_height=None):
    # only pass width/height to the reader when a resize is actually requested
    if video_width is None and video_height is None:
        vidreader = VideoReader(filepath, ctx=cpu(0))
    else:
        vidreader = VideoReader(filepath, ctx=cpu(0), width=video_width, height=video_height)
    frame_indices = list(range(len(vidreader)))
    frames = vidreader.get_batch(frame_indices)
    # (T, H, W, C) -> (C, T, H, W), rescaled from [0, 255] to [-1, 1]
    frame_tensor = torch.tensor(frames.asnumpy()).permute(3, 0, 1, 2).float()
    frame_tensor = (frame_tensor / 255. - 0.5) * 2
    return frame_tensor.unsqueeze(0)

def load_video_batch(filepath_list, frame_stride, video_size=(256,256), video_frames=16):
    '''
    Notice about some special cases:
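Taken together, the two new helpers can be exercised as follows (a usage sketch; it assumes `decord` provides the `VideoReader`/`cpu` used above and that an `.mp4` plus a `prompts.txt` sit in the input directory, as in `inputs/v2v/001`):

```python
from scripts.inference_utils import load_inputs_v2v, open_video_to_tensor

prompt_list, video_filepaths, video_filenames = load_inputs_v2v("inputs/v2v/001")
for prompt, path, name in zip(prompt_list, video_filepaths, video_filenames):
    video = open_video_to_tensor(path)  # shape (1, C, T, H, W), values in [-1, 1]
    print(name, prompt, tuple(video.shape))
```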
39 changes: 39 additions & 0 deletions scripts/inference_v2v_ms.py
@@ -0,0 +1,39 @@
import os, sys
import argparse
sys.path.insert(0, os.getcwd())

from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys

from scripts.inference_utils import load_inputs_v2v

def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt_path", type=str, default="checkpoints/Video-to-Video", help="Checkpoint path of the model")
    parser.add_argument("--input_dir", type=str, default=None, help="An input directory containing videos and prompts for video-to-video enhancement")
    parser.add_argument("--output_dir", type=str, default=None, help="Results saving directory")
    return parser

# prepare arguments, model, pipeline.
args = get_parser().parse_args()
model = Model.from_pretrained(args.ckpt_path)
pipe = pipeline(task="video-to-video", model=model, model_revision='v1.1.0', device='cuda:0')
print(f"Successfully loaded model from {args.ckpt_path}")

os.makedirs(args.output_dir, exist_ok=True)

# load input prompts, video paths, video filenames
prompt_list, video_filepaths, video_filenames = load_inputs_v2v(input_dir=args.input_dir)

# video-to-video enhancement
for i, (prompt, videofilepath, videofilename) in enumerate(zip(prompt_list, video_filepaths, video_filenames)):
    print(f"[{i:03d}] input path: {videofilepath}")
    print(f"[{i:03d}] input name: {videofilename}")
    print(f"[{i:03d}] prompt: {prompt}")
    p_input = {
        'video_path': videofilepath,
        'text': prompt
    }
    output_video_path = pipe(p_input, output_video=os.path.join(args.output_dir, videofilename))[OutputKeys.OUTPUT_VIDEO]
    print(f"Successfully processed {videofilename} and saved to {output_video_path}")
6 changes: 6 additions & 0 deletions shscripts/inference_v2v_ms.sh
@@ -0,0 +1,6 @@
input_dir="inputs/v2v/001"
current_time=$(date +%Y%m%d%H%M%S)
output_dir="results/v2v/$current_time-v2v-modelscope-001"

python3 scripts/inference_v2v_ms.py \
    --input_dir "$input_dir" --output_dir "$output_dir"
