@insafim I have uploaded the correct version of demo.py now. I realized that it was the older version that I pushed previously. Please let me know if you face any issues.
I have written code for single-video inference, but I am not getting the expected output.
User Query: Please explain the video in detail.
Koala Response: What query are you answering?
Here is the code:
`"""
Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/demo.py
Koala Single Video Inference
"""
import argparse
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from koala.common.config import Config
from koala.common.dist_utils import get_rank
from koala.common.registry import registry
from koala.conversation.conversation_video import Chat, Conversation, default_conversation, SeparatorStyle, conv_llava_llama_2
import decord
decord.bridge.set_bridge('torch')
import copy
# Imports modules for registration
from koala.datasets.builders import *
from koala.models import *
from koala.processors import *
from koala.runners import *
from koala.tasks import *
def parse_args():
    parser = argparse.ArgumentParser(description="Koala Inference")
    parser.add_argument("--cfg-path", default="./eval_configs/conversation_demo.yaml", help="Path to configuration file.")
    parser.add_argument("--video-path", default="/share/softwares/uzair/datasets/LoViQA/test_video3_converted/excercise_trick.mp4", help="Path to the input video file.")
    parser.add_argument("--gpu-id", type=int, default=0, help="Specify the GPU to load the model.")
    parser.add_argument("--model_type", type=str, default='vicuna', help="Specify LLM.")
    parser.add_argument("--pretrained_weight_path", type=str, default="./koala_model.pth", help="Path to the pretrained weights.")
    parser.add_argument("--num_frames_per_clip", type=int, default=16, help="Number of frames to use per clip.")
    parser.add_argument("--num_segments", type=int, default=4, help="Number of video segments.")
    parser.add_argument("--hierarchical_agg_function", type=str, default="without-top-final-global-prompts-region-segment-full-dis-spatiotemporal-prompts-attn-early-attn-linear-learned", help="Function to merge global and clip visual representations.")
    parser.add_argument("--num-beams", type=int, default=1, help="Beam search width.")
    parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature.")
    parser.add_argument("--prompt", type=str, default="Please explain the video in detail.", help="User query for video inference.")
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead.",
    )
    args = parser.parse_args()
    return args
# ========================================
# Model Initialization
# ========================================
print('Initializing Chat')
args = parse_args()
cfg = Config(args)
model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
model.num_frames_per_clip = args.num_frames_per_clip #*
model.num_segments = args.num_segments #*
model.hierarchical_agg_function = args.hierarchical_agg_function #*
model.global_region_embed_weight = None #*
model.initialize_visual_agg_function() #*
best_checkpoint = torch.load(args.pretrained_weight_path, map_location='cpu')['model_state_dict'] #*
pretrained_dict = {} #*
for k, v in best_checkpoint.items(): #*
    pretrained_dict[k.replace('module.', '')] = v #*
model_dict = model.state_dict() #*
model_dict.update(pretrained_dict) #*
model.load_state_dict(model_dict) #*
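# Optional sanity check: updating a copy of model.state_dict() and loading it
# back silently ignores checkpoint keys that do not match the model. One way
# to surface mismatches (assuming a standard torch.nn.Module) would be:
#   missing, unexpected = model.load_state_dict(pretrained_dict, strict=False)
#   print('Missing keys:', missing)
#   print('Unexpected keys:', unexpected)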
model.cuda().eval() #*
#vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
print('Initialization Finished')
# ========================================
# Prepare conversation
# ========================================
if args.model_type == 'vicuna':
    chat_state = default_conversation.copy()
else:
    chat_state = conv_llava_llama_2.copy()
# Upload the single video
chat_state.system = "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail."
img_list = []
llm_message = chat.upload_video_without_audio(args.video_path, chat_state, img_list)
chat_state_separate = copy.deepcopy(chat_state)
user_question = args.prompt  # TEXT_INPUT
chat.ask(user_question, chat_state_separate)
llm_message = chat.answer(conv=chat_state_separate,
                          img_list=img_list,
                          num_beams=args.num_beams,
                          temperature=args.temperature,
                          max_new_tokens=300,
                          max_length=2000)[0]
# Display the response
print("User Query:", args.prompt)
print("Koala Response:", llm_message)
```
Can you please help me correct this?
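One way to narrow this down, since the reply reads as if no query reached the model: inspect the conversation state right before generation, and make decoding greedier so a prompt bug is distinguishable from sampling noise. A minimal sketch, assuming the `Conversation` object from `koala.conversation.conversation_video` keeps a `messages` list as in the MiniGPT-4 code this demo is adapted from:

```python
# Hypothetical debugging snippet; assumes chat_state_separate.messages exists,
# as in the MiniGPT-4 Conversation dataclass this code is adapted from.
print(chat_state_separate.messages)  # the user prompt should appear here

# Greedier decoding: beam search with a low temperature reduces sampling noise,
# so a persistent off-topic reply points at the prompt/state, not the sampler.
llm_message = chat.answer(conv=chat_state_separate,
                          img_list=img_list,
                          num_beams=5,
                          temperature=0.2,
                          max_new_tokens=300,
                          max_length=2000)[0]
print("Koala Response (low temperature):", llm_message)
```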