diff --git a/demo/demo_bodymocap.py b/demo/demo_bodymocap.py
index 704b77f..eabeacb 100644
--- a/demo/demo_bodymocap.py
+++ b/demo/demo_bodymocap.py
@@ -1,92 +1,23 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 
-import os
-import sys
-import os.path as osp
 import torch
-from torchvision.transforms import Normalize
 import numpy as np
-import cv2
-import argparse
-import json
-import pickle
-from datetime import datetime
-from demo.demo_options import DemoOptions
 from bodymocap.body_mocap_api import BodyMocap
 from bodymocap.body_bbox_detector import BodyPoseEstimator
 import mocap_utils.demo_utils as demo_utils
-import mocap_utils.general_utils as gnu
-from mocap_utils.timer import Timer
-import renderer.image_utils as imu
-from renderer.viewer2D import ImShow
+import demo.demo_common
+from demo.demo_options import DemoOptions
 
 
 def run_body_mocap(args, body_bbox_detector, body_mocap, visualizer):
-    #Setup input data to handle different types of inputs
-    input_type, input_data = demo_utils.setup_input(args)
-
-    cur_frame = args.start_frame
-    video_frame = 0
-    timer = Timer()
-    while True:
-        timer.tic()
-        # load data
-        load_bbox = False
-
-        if input_type =='image_dir':
-            if cur_frame < len(input_data):
-                image_path = input_data[cur_frame]
-                img_original_bgr = cv2.imread(image_path)
-            else:
-                img_original_bgr = None
-
-        elif input_type == 'bbox_dir':
-            if cur_frame < len(input_data):
-                print("Use pre-computed bounding boxes")
-                image_path = input_data[cur_frame]['image_path']
-                hand_bbox_list = input_data[cur_frame]['hand_bbox_list']
-                body_bbox_list = input_data[cur_frame]['body_bbox_list']
-                img_original_bgr = cv2.imread(image_path)
-                load_bbox = True
-            else:
-                img_original_bgr = None
-
-        elif input_type == 'video':
-            _, img_original_bgr = input_data.read()
-            if video_frame < cur_frame:
-                video_frame += 1
-                continue
-            # save the obtained video frames
-            image_path = osp.join(args.out_dir, "frames", f"{cur_frame:05d}.jpg")
-            if img_original_bgr is not None:
-                video_frame += 1
-                if args.save_frame:
-                    gnu.make_subdir(image_path)
-                    cv2.imwrite(image_path, img_original_bgr)
-
-        elif input_type == 'webcam':
-            _, img_original_bgr = input_data.read()
-
-            if video_frame < cur_frame:
-                video_frame += 1
-                continue
-            # save the obtained video frames
-            image_path = osp.join(args.out_dir, "frames", f"scene_{cur_frame:05d}.jpg")
-            if img_original_bgr is not None:
-                video_frame += 1
-                if args.save_frame:
-                    gnu.make_subdir(image_path)
-                    cv2.imwrite(image_path, img_original_bgr)
-        else:
-            assert False, "Unknown input_type"
-
-        cur_frame +=1
-        if img_original_bgr is None or cur_frame > args.end_frame:
-            break
-        print("--------------------------------------")
+    for input_frame_and_metadata in demo.demo_common.input_frame_and_metadata_iterator(args):
+        image_path = input_frame_and_metadata.image_path
+        img_original_bgr = input_frame_and_metadata.img_original_bgr
+        load_bbox = input_frame_and_metadata.load_bbox
 
         if load_bbox:
+            body_bbox_list = input_frame_and_metadata.body_bbox_list
             body_pose_list = None
         else:
             body_pose_list, body_bbox_list = body_bbox_detector.detect_body_pose(
@@ -113,40 +44,10 @@ def run_body_mocap(args, body_bbox_detector, body_mocap, visualizer):
         pred_output_list = body_mocap.regress(img_original_bgr, body_bbox_list)
         assert len(body_bbox_list) == len(pred_output_list)
 
-        # extract mesh for rendering (vertices in image space and faces) from pred_output_list
-        pred_mesh_list = demo_utils.extract_mesh_from_output(pred_output_list)
-
-        # visualization
-        res_img = visualizer.visualize(
-            img_original_bgr,
-            pred_mesh_list = pred_mesh_list,
-            body_bbox_list = body_bbox_list)
-
-        # show result in the screen
-        if not args.no_display:
-            res_img = res_img.astype(np.uint8)
-            ImShow(res_img)
-
-        # save result image
-        if args.out_dir is not None:
-            demo_utils.save_res_img(args.out_dir, image_path, res_img)
-
-        # save predictions to pkl
-        if args.save_pred_pkl:
-            demo_type = 'body'
-            demo_utils.save_pred_to_pkl(
-                args, demo_type, image_path, body_bbox_list, hand_bbox_list, pred_output_list)
-
-        timer.toc(bPrint=True,title="Time")
-        print(f"Processed : {image_path}")
-
-    #save images as a video
-    if not args.no_video_out and input_type in ['video', 'webcam']:
-        demo_utils.gen_video_out(args.out_dir, args.seq_name)
-
-    if input_type =='webcam' and input_data is not None:
-        input_data.release()
-    cv2.destroyAllWindows()
+        input_frame_and_metadata.body_bbox_list = body_bbox_list
+        demo.demo_common.show_and_save_result(
+            args, 'body', input_frame_and_metadata, visualizer, pred_output_list
+        )
 
 
 def main():
diff --git a/demo/demo_common.py b/demo/demo_common.py
new file mode 100644
index 0000000..bf690db
--- /dev/null
+++ b/demo/demo_common.py
@@ -0,0 +1,206 @@
+import argparse
+import os.path
+
+import cv2
+import numpy as np
+
+import mocap_utils.demo_utils as demo_utils
+import mocap_utils.general_utils as gnu
+from mocap_utils.timer import Timer
+
+from renderer.viewer2D import ImShow
+
+
+def input_frame_and_metadata_iterator(args):
+    # Setup input data to handle different types of inputs
+    input_type, input_data = demo_utils.setup_input(args)
+
+    assert args.out_dir is not None, "Please specify output dir to store the results"
+    cur_frame = args.start_frame
+    video_frame = 0
+    timer = Timer()
+
+    while True:
+        timer.tic()
+
+        # load data
+        load_bbox = False
+        hand_bbox_list = None
+        body_bbox_list = None
+        image_path = None
+
+        if input_type == 'image_dir':
+            if cur_frame < len(input_data):
+                image_path = input_data[cur_frame]
+                img_original_bgr = cv2.imread(image_path)
+            else:
+                img_original_bgr = None
+
+        elif input_type == 'bbox_dir':
+            if cur_frame < len(input_data):
+                print("Use pre-computed bounding boxes")
+                image_path = input_data[cur_frame]['image_path']
+                hand_bbox_list = input_data[cur_frame]['hand_bbox_list']
+                body_bbox_list = input_data[cur_frame]['body_bbox_list']
+                img_original_bgr = cv2.imread(image_path)
+                load_bbox = True
+            else:
+                img_original_bgr = None
+
+        elif input_type == 'video':
+            _, img_original_bgr = input_data.read()
+            if video_frame < cur_frame:
+                video_frame += 1
+                continue
+            # save the obtained video frames
+            image_path = os.path.join(args.out_dir, "frames", f"{cur_frame:05d}.jpg")
+            if img_original_bgr is not None:
+                video_frame += 1
+                if args.save_frame:
+                    gnu.make_subdir(image_path)
+                    cv2.imwrite(image_path, img_original_bgr)
+
+        elif input_type == 'webcam':
+            _, img_original_bgr = input_data.read()
+
+            if video_frame < cur_frame:
+                video_frame += 1
+                continue
+            # save the obtained video frames
+            image_path = os.path.join(args.out_dir, "frames", f"scene_{cur_frame:05d}.jpg")
+            if img_original_bgr is not None:
+                video_frame += 1
+                if args.save_frame:
+                    gnu.make_subdir(image_path)
+                    cv2.imwrite(image_path, img_original_bgr)
+        else:
+            assert False, "Unknown input_type"
+
+        cur_frame += 1
+        if img_original_bgr is None or cur_frame > args.end_frame:
+            break
+
+        input_frame_and_metadata = argparse.Namespace(
+            image_path=image_path,
+            img_original_bgr=img_original_bgr,
+            load_bbox=load_bbox,
+        )
+
+        print("--------------------------------------")
+
+        if load_bbox:
+            # attach pre-computed bboxes to the namespace the demo scripts consume
+            input_frame_and_metadata.body_bbox_list = body_bbox_list
+            input_frame_and_metadata.hand_bbox_list = hand_bbox_list
+        yield input_frame_and_metadata
+
+        timer.toc(bPrint=True, title="Time")
+        print(f"Processed : {image_path}")
+
+    # save images as a video
+    if not args.no_video_out and input_type in ['video', 'webcam']:
+        demo_utils.gen_video_out(args.out_dir, args.seq_name)
+
+    # When everything done, release the capture
+    if input_type == 'webcam' and input_data is not None:
+        input_data.release()
+    cv2.destroyAllWindows()
+
+
+def detect_hand_bbox_and_save_it_into_frame_and_metadata(args, input_frame_and_metadata, bbox_detector_method):
+    image_path = input_frame_and_metadata.image_path
+    img_original_bgr = input_frame_and_metadata.img_original_bgr
+    load_bbox = input_frame_and_metadata.load_bbox
+
+    # bbox detection
+    body_bbox_list = None
+    if load_bbox:
+        body_bbox_list = input_frame_and_metadata.body_bbox_list
+        hand_bbox_list = input_frame_and_metadata.hand_bbox_list
+        body_pose_list = None
+        raw_hand_bboxes = None
+    elif args.crop_type == 'hand_crop':
+        # hand already cropped, therefore, no need for detection
+        img_h, img_w = img_original_bgr.shape[:2]
+        body_pose_list = None
+        raw_hand_bboxes = None
+        hand_bbox_list = [dict(right_hand=np.array([0, 0, img_w, img_h]))]
+    else:
+        # Input image has other body parts, or the hand is not cropped.
+        # Use hand detection model & body detector for hand detection
+        assert args.crop_type == 'no_crop'
+        detect_output = bbox_detector_method(img_original_bgr.copy())
+        body_pose_list, body_bbox_list, hand_bbox_list, raw_hand_bboxes = detect_output
+
+    # save the obtained body & hand bbox to json file
+    if args.save_bbox_output:
+        demo_utils.save_info_to_json(args, image_path, body_bbox_list, hand_bbox_list)
+
+    if len(hand_bbox_list) < 1:
+        print(f"No hand detected: {image_path}")
+        return False
+
+    input_frame_and_metadata.body_bbox_list = body_bbox_list
+    input_frame_and_metadata.hand_bbox_list = hand_bbox_list
+    input_frame_and_metadata.body_pose_list = body_pose_list
+    input_frame_and_metadata.raw_hand_bboxes = raw_hand_bboxes
+    return True
+
+
+def show_and_save_result(
+    args, demo_type, input_frame_and_metadata, visualizer=None,
+    pred_output_list=None, transformed_image=None, image_category="rendered"
+):
+    image_path = input_frame_and_metadata.image_path
+    img_original_bgr = input_frame_and_metadata.img_original_bgr
+    body_bbox_list = getattr(input_frame_and_metadata, 'body_bbox_list', None)  # may be unset for some demo types
+    hand_bbox_list = getattr(input_frame_and_metadata, 'hand_bbox_list', None)  # may be unset for some demo types
+
+    # extract mesh for rendering (vertices in image space and faces) from pred_output_list
+    pred_mesh_list = None
+    if pred_output_list is not None:
+        pred_mesh_list = demo_utils.extract_mesh_from_output(pred_output_list)
+
+    res_img = None
+    if transformed_image is not None:
+        res_img = transformed_image
+    elif visualizer is not None:
+        # visualization
+        if demo_type == 'frank':
+            res_img = visualizer.visualize(
+                img_original_bgr,
+                pred_mesh_list=pred_mesh_list,
+                body_bbox_list=body_bbox_list,
+                hand_bbox_list=hand_bbox_list
+            )
+        elif demo_type == 'body':
+            res_img = visualizer.visualize(
+                img_original_bgr,
+                pred_mesh_list=pred_mesh_list,
+                body_bbox_list=body_bbox_list
+            )
+        elif demo_type == 'hand':
+            res_img = visualizer.visualize(
+                img_original_bgr,
+                pred_mesh_list=pred_mesh_list,
+                hand_bbox_list=hand_bbox_list
+            )
+        else:
+            raise ValueError("Unknown demo_type")
+
+    if res_img is not None:
+        # show result in the screen
+        if not args.no_display:
+            res_img = res_img.astype(np.uint8)
+            ImShow(res_img)
+
+        # save result image (we can make an option here)
+        if args.out_dir is not None:
+            demo_utils.save_res_img(
+                args.out_dir, image_path, res_img, image_category=image_category
+            )
+
+    # save predictions to pkl
+    if args.save_pred_pkl:
+        demo_utils.save_pred_to_pkl(
+            args, demo_type, image_path, body_bbox_list, hand_bbox_list, pred_output_list
+        )
diff --git a/demo/demo_frankmocap.py b/demo/demo_frankmocap.py
index e920261..0534786 100644
--- a/demo/demo_frankmocap.py
+++ b/demo/demo_frankmocap.py
@@ -1,31 +1,17 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 
-import os
-import sys
-import os.path as osp
 import torch
-from torchvision.transforms import Normalize
 import numpy as np
-import cv2
-import argparse
-import json
-import pickle
-############# input parameters #############
-from demo.demo_options import DemoOptions
 from bodymocap.body_mocap_api import BodyMocap
 from handmocap.hand_mocap_api import HandMocap
 import mocap_utils.demo_utils as demo_utils
-import mocap_utils.general_utils as gnu
-from mocap_utils.timer import Timer
-from datetime import datetime
-from bodymocap.body_bbox_detector import BodyPoseEstimator
 from handmocap.hand_bbox_detector import HandBboxDetector
 from integration.copy_and_paste import integration_copy_paste
-import renderer.image_utils as imu
-from renderer.viewer2D import ImShow
+import demo.demo_common
+from demo.demo_options import DemoOptions
 
 
 def __filter_bbox_list(body_bbox_list, hand_bbox_list, single_person):
@@ -108,66 +94,11 @@ def run_regress(
 
 
 def run_frank_mocap(args, bbox_detector, body_mocap, hand_mocap, visualizer):
-    #Setup input data to handle different types of inputs
-    input_type, input_data = demo_utils.setup_input(args)
-
-    cur_frame = args.start_frame
-    video_frame = 0
-    while True:
-        # load data
-        load_bbox = False
-
-        if input_type =='image_dir':
-            if cur_frame < len(input_data):
-                image_path = input_data[cur_frame]
-                img_original_bgr = cv2.imread(image_path)
-            else:
-                img_original_bgr = None
-
-        elif input_type == 'bbox_dir':
-            if cur_frame < len(input_data):
-                image_path = input_data[cur_frame]['image_path']
-                hand_bbox_list = input_data[cur_frame]['hand_bbox_list']
-                body_bbox_list = input_data[cur_frame]['body_bbox_list']
-                img_original_bgr = cv2.imread(image_path)
-                load_bbox = True
-            else:
-                img_original_bgr = None
-
-        elif input_type == 'video':
-            _, img_original_bgr = input_data.read()
-            if video_frame < cur_frame:
-                video_frame += 1
-                continue
-            # save the obtained video frames
-            image_path = osp.join(args.out_dir, "frames", f"{cur_frame:05d}.jpg")
-            if img_original_bgr is not None:
-                video_frame += 1
-                if args.save_frame:
-                    gnu.make_subdir(image_path)
-                    cv2.imwrite(image_path, img_original_bgr)
-
-        elif input_type == 'webcam':
-            _, img_original_bgr = input_data.read()
-
-            if video_frame < cur_frame:
-                video_frame += 1
-                continue
-            # save the obtained video frames
-            image_path = osp.join(args.out_dir, "frames", f"scene_{cur_frame:05d}.jpg")
-            if img_original_bgr is not None:
-                video_frame += 1
-                if args.save_frame:
-                    gnu.make_subdir(image_path)
-                    cv2.imwrite(image_path, img_original_bgr)
-        else:
-            assert False, "Unknown input_type"
+    for input_frame_and_metadata in demo.demo_common.input_frame_and_metadata_iterator(args):
+        image_path = input_frame_and_metadata.image_path
+        img_original_bgr = input_frame_and_metadata.img_original_bgr
+        load_bbox = input_frame_and_metadata.load_bbox
 
-        cur_frame +=1
-        if img_original_bgr is None or cur_frame > args.end_frame:
-            break
-        print("--------------------------------------")
-
         # bbox detection
         if not load_bbox:
             body_bbox_list, hand_bbox_list = list(), list()
@@ -186,39 +117,11 @@ def run_frank_mocap(args, bbox_detector, body_mocap, hand_mocap, visualizer):
             print(f"No body deteced: {image_path}")
             continue
 
-        pred_mesh_list = demo_utils.extract_mesh_from_output(pred_output_list)
-
-        # visualization
-        res_img = visualizer.visualize(
-            img_original_bgr,
-            pred_mesh_list = pred_mesh_list,
-            body_bbox_list = body_bbox_list,
-            hand_bbox_list = hand_bbox_list)
-
-        # show result in the screen
-        if not args.no_display:
-            res_img = res_img.astype(np.uint8)
-            ImShow(res_img)
-
-        # save result image
-        if args.out_dir is not None:
-            demo_utils.save_res_img(args.out_dir, image_path, res_img)
-
-        # save predictions to pkl
-        if args.save_pred_pkl:
-            demo_type = 'frank'
-            demo_utils.save_pred_to_pkl(
-                args, demo_type, image_path, body_bbox_list, hand_bbox_list, pred_output_list)
-
-        print(f"Processed : {image_path}")
-
-    # save images as a video
-    if not args.no_video_out and input_type in ['video', 'webcam']:
-        demo_utils.gen_video_out(args.out_dir, args.seq_name)
-
-    if input_type =='webcam' and input_data is not None:
-        input_data.release()
-    cv2.destroyAllWindows()
+        input_frame_and_metadata.body_bbox_list = body_bbox_list
+        input_frame_and_metadata.hand_bbox_list = hand_bbox_list
+        demo.demo_common.show_and_save_result(
+            args, 'frank', input_frame_and_metadata, visualizer, pred_output_list
+        )
 
 
 def main():
     args = DemoOptions().parse()
diff --git a/demo/demo_handmocap.py b/demo/demo_handmocap.py
index 2262c33..573b72c 100644
--- a/demo/demo_handmocap.py
+++ b/demo/demo_handmocap.py
@@ -1,156 +1,37 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 
-import os, sys, shutil
-import os.path as osp
-import numpy as np
-import cv2
-import json
 import torch
-from torchvision.transforms import Normalize
-
-from demo.demo_options import DemoOptions
-import mocap_utils.general_utils as gnu
-import mocap_utils.demo_utils as demo_utils
 
 from handmocap.hand_mocap_api import HandMocap
 from handmocap.hand_bbox_detector import HandBboxDetector
 
-import renderer.image_utils as imu
-from renderer.viewer2D import ImShow
-import time
+import demo.demo_common
+from demo.demo_options import DemoOptions
 
 
 def run_hand_mocap(args, bbox_detector, hand_mocap, visualizer):
-    #Set up input data (images or webcam)
-    input_type, input_data = demo_utils.setup_input(args)
-
-    assert args.out_dir is not None, "Please specify output dir to store the results"
-    cur_frame = args.start_frame
-    video_frame = 0
-
-    while True:
-        # load data
-        load_bbox = False
-
-        if input_type =='image_dir':
-            if cur_frame < len(input_data):
-                image_path = input_data[cur_frame]
-                img_original_bgr = cv2.imread(image_path)
-            else:
-                img_original_bgr = None
-
-        elif input_type == 'bbox_dir':
-            if cur_frame < len(input_data):
-                print("Use pre-computed bounding boxes")
-                image_path = input_data[cur_frame]['image_path']
-                hand_bbox_list = input_data[cur_frame]['hand_bbox_list']
-                body_bbox_list = input_data[cur_frame]['body_bbox_list']
-                img_original_bgr = cv2.imread(image_path)
-                load_bbox = True
-            else:
-                img_original_bgr = None
-
-        elif input_type == 'video':
-            _, img_original_bgr = input_data.read()
-            if video_frame < cur_frame:
-                video_frame += 1
-                continue
-            # save the obtained video frames
-            image_path = osp.join(args.out_dir, "frames", f"{cur_frame:05d}.jpg")
-            if img_original_bgr is not None:
-                video_frame += 1
-                if args.save_frame:
-                    gnu.make_subdir(image_path)
-                    cv2.imwrite(image_path, img_original_bgr)
-
-        elif input_type == 'webcam':
-            _, img_original_bgr = input_data.read()
-
-            if video_frame < cur_frame:
-                video_frame += 1
-                continue
-            # save the obtained video frames
-            image_path = osp.join(args.out_dir, "frames", f"scene_{cur_frame:05d}.jpg")
-            if img_original_bgr is not None:
-                video_frame += 1
-                if args.save_frame:
-                    gnu.make_subdir(image_path)
-                    cv2.imwrite(image_path, img_original_bgr)
-        else:
-            assert False, "Unknown input_type"
-
-        cur_frame +=1
-        if img_original_bgr is None or cur_frame > args.end_frame:
-            break
-        print("--------------------------------------")
-
-        # bbox detection
-        if load_bbox:
-            body_pose_list = None
-            raw_hand_bboxes = None
-        elif args.crop_type == 'hand_crop':
-            # hand already cropped, thererore, no need for detection
-            img_h, img_w = img_original_bgr.shape[:2]
-            body_pose_list = None
-            raw_hand_bboxes = None
-            hand_bbox_list = [ dict(right_hand = np.array([0, 0, img_w, img_h])) ]
-        else:
-            # Input images has other body part or hand not cropped.
-            # Use hand detection model & body detector for hand detection
-            assert args.crop_type == 'no_crop'
-            detect_output = bbox_detector.detect_hand_bbox(img_original_bgr.copy())
-            body_pose_list, body_bbox_list, hand_bbox_list, raw_hand_bboxes = detect_output
-
-        # save the obtained body & hand bbox to json file
-        if args.save_bbox_output:
-            demo_utils.save_info_to_json(args, image_path, body_bbox_list, hand_bbox_list)
-
-        if len(hand_bbox_list) < 1:
-            print(f"No hand deteced: {image_path}")
+    for input_frame_and_metadata in demo.demo_common.input_frame_and_metadata_iterator(args):
+        img_original_bgr = input_frame_and_metadata.img_original_bgr
+
+        if not demo.demo_common.detect_hand_bbox_and_save_it_into_frame_and_metadata(
+            args, input_frame_and_metadata, bbox_detector.detect_hand_bbox
+        ):
             continue
-
+
+        body_bbox_list = input_frame_and_metadata.body_bbox_list
+        hand_bbox_list = input_frame_and_metadata.hand_bbox_list
+
         # Hand Pose Regression
         pred_output_list = hand_mocap.regress(
                 img_original_bgr, hand_bbox_list, add_margin=True)
         assert len(hand_bbox_list) == len(body_bbox_list)
         assert len(body_bbox_list) == len(pred_output_list)
 
-        # extract mesh for rendering (vertices in image space and faces) from pred_output_list
-        pred_mesh_list = demo_utils.extract_mesh_from_output(pred_output_list)
-
-        # visualize
-        res_img = visualizer.visualize(
-            img_original_bgr,
-            pred_mesh_list = pred_mesh_list,
-            hand_bbox_list = hand_bbox_list)
-
-        # show result in the screen
-        if not args.no_display:
-            res_img = res_img.astype(np.uint8)
-            ImShow(res_img)
-
-        # save the image (we can make an option here)
-        if args.out_dir is not None:
-            demo_utils.save_res_img(args.out_dir, image_path, res_img)
-
-        # save predictions to pkl
-        if args.save_pred_pkl:
-            demo_type = 'hand'
-            demo_utils.save_pred_to_pkl(
-                args, demo_type, image_path, body_bbox_list, hand_bbox_list, pred_output_list)
-
-        print(f"Processed : {image_path}")
-
-    #save images as a video
-    if not args.no_video_out and input_type in ['video', 'webcam']:
-        demo_utils.gen_video_out(args.out_dir, args.seq_name)
-
-    # When everything done, release the capture
-    if input_type =='webcam' and input_data is not None:
-        input_data.release()
-    cv2.destroyAllWindows()
-
-
+        demo.demo_common.show_and_save_result(
+            args, 'hand', input_frame_and_metadata, visualizer, pred_output_list
+        )
+
+
 def main():
     args = DemoOptions().parse()
     args.use_smplx = True