From 3b7eac2ac73e4dc6e22d39d126051251495e26b0 Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 2 Jan 2025 16:23:55 +0800 Subject: [PATCH] update yolov8 det seg pose cls obb examples (#5863) --- examples/CMakeLists.txt | 6 +- examples/yolov8.cpp | 494 ++++++++++++++++++++++--------- examples/yolov8_cls.cpp | 325 ++++++++++++++++++++ examples/yolov8_obb.cpp | 522 ++++++++++++++++++++++++++++++++ examples/yolov8_pose.cpp | 561 +++++++++++++++++++++++++++++++++++ examples/yolov8_seg.cpp | 624 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 2395 insertions(+), 137 deletions(-) create mode 100644 examples/yolov8_cls.cpp create mode 100644 examples/yolov8_obb.cpp create mode 100644 examples/yolov8_pose.cpp create mode 100644 examples/yolov8_seg.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index bf3017dbe68..75b9d8de269 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -52,6 +52,10 @@ if(NCNN_PIXEL) ncnn_add_example(yolov5_pnnx) ncnn_add_example(yolov7_pnnx) ncnn_add_example(yolov7) + ncnn_add_example(yolov8) + ncnn_add_example(yolov8_seg) + ncnn_add_example(yolov8_pose) + ncnn_add_example(yolov8_cls) ncnn_add_example(yolox) ncnn_add_example(mobilenetv2ssdlite) ncnn_add_example(mobilenetssd) @@ -67,9 +71,9 @@ if(NCNN_PIXEL) ncnn_add_example(scrfd_crowdhuman) if(OpenCV_FOUND) ncnn_add_example(yolov4) + ncnn_add_example(yolov8_obb) ncnn_add_example(rvm) ncnn_add_example(p2pnet) - ncnn_add_example(yolov8) endif() else() message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built") diff --git a/examples/yolov8.cpp b/examples/yolov8.cpp index e166e6c1d17..3a5068b5f79 100644 --- a/examples/yolov8.cpp +++ b/examples/yolov8.cpp @@ -2,8 +2,6 @@ // // Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. // -// Copyright (C) 2024 whyb(https://github.com/whyb). All rights reserved. 
-// // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // @@ -14,49 +12,61 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -// ReadMe -// Convert yolov8 model to ncnn model workflow: -// -// step 1: -// If you don't want to train the model yourself. You should go to the ultralytics website download the pretrained model file. -// original pretrained model from https://docs.ultralytics.com/models/yolov8/#supported-tasks-and-modes +// 1. install +// pip3 install -U ultralytics pnnx ncnn +// 2. export yolov8 torchscript +// yolo export model=yolov8n.pt format=torchscript +// 3. convert torchscript with static shape +// pnnx yolov8n.torchscript +// 4. modify yolov8n_pnnx.py for dynamic shape inference +// A. modify reshape to support dynamic image sizes +// B. permute tensor before concat and adjust concat axis +// C. drop post-process part +// before: +// v_165 = v_142.view(1, 144, 6400) +// v_166 = v_153.view(1, 144, 1600) +// v_167 = v_164.view(1, 144, 400) +// v_168 = torch.cat((v_165, v_166, v_167), dim=2) +// ... +// after: +// v_165 = v_142.view(1, 144, -1).transpose(1, 2) +// v_166 = v_153.view(1, 144, -1).transpose(1, 2) +// v_167 = v_164.view(1, 144, -1).transpose(1, 2) +// v_168 = torch.cat((v_165, v_166, v_167), dim=1) +// return v_168 +// 5. re-export yolov8 torchscript +// python3 -c 'import yolov8n_pnnx; yolov8n_pnnx.export_torchscript()' +// 6. convert new torchscript with dynamic shape +// pnnx yolov8n_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] +// 7. now you get ncnn model files +// mv yolov8n_pnnx.py.ncnn.param yolov8n.ncnn.param +// mv yolov8n_pnnx.py.ncnn.bin yolov8n.ncnn.bin + +// the out blob would be a 2-dim tensor with w=144 h=8400 // -// step 2: -// run this command. 
-// conda create --name yolov8 python=3.11 -// conda activate yolov8 -// pip install ultralytics onnx numpy protobuf +// | bbox-reg 16 x 4 | per-class scores(80) | +// +-----+-----+-----+-----+----------------------+ +// | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......| +// all /| | | | | . | +// boxes | .. | .. | .. | .. |0.0 0.9 0.0 0.0 ......| +// (8400)| | | | | . | +// \| | | | | . | +// +-----+-----+-----+-----+----------------------+ // -// step 3: -// save source code file(export_model_to_ncnn.py): -// from ultralytics import YOLO -// detection_models = [ -// ["./Detection-pt/yolov8n.pt", "./Detection-pt/"], -// ["./Detection-pt/yolov8s.pt", "./Detection-pt/"], -// ["./Detection-pt/yolov8m.pt", "./Detection-pt/"], -// ["./Detection-pt/yolov8l.pt", "./Detection-pt/"], -// ["./Detection-pt/yolov8x.pt", "./Detection-pt/"] -// ] -// for model_dict in detection_models: -// model = YOLO(model_dict[0]) # load an official pretrained weight model -// model.export(format="ncnn", dynamic=True, save_dir=model_dict[1], simplify=True) -// -// step 4: -// run command: python export_model_to_ncnn.py -#include -#include -#include #include "layer.h" #include "net.h" -#include +#if defined(USE_NCNN_SIMPLEOCV) +#include "simpleocv.h" +#else #include #include +#include +#endif #include #include - -#define MAX_STRIDE 32 +#include struct Object { @@ -95,13 +105,13 @@ static void qsort_descent_inplace(std::vector& objects, int left, int ri } } - #pragma omp parallel sections + // #pragma omp parallel sections { - #pragma omp section + // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } - #pragma omp section + // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } @@ -116,26 +126,26 @@ static void qsort_descent_inplace(std::vector& objects) qsort_descent_inplace(objects, 0, objects.size() - 1); } -static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) 
+static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); - const int n = faceobjects.size(); + const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { - areas[i] = faceobjects[i].rect.area(); + areas[i] = objects[i].rect.area(); } for (int i = 0; i < n; i++) { - const Object& a = faceobjects[i]; + const Object& a = objects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { - const Object& b = faceobjects[picked[j]]; + const Object& b = objects[picked[j]]; if (!agnostic && a.label != b.label) continue; @@ -155,66 +165,146 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto static inline float sigmoid(float x) { - return static_cast(1.f / (1.f + exp(-x))); + return 1.0f / (1.0f + expf(-x)); } -static inline float clampf(float d, float min, float max) +static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { - const float t = d < min ? min : d; - return t > max ? max : t; -} + const int w = in_pad.w; + const int h = in_pad.h; -static void parse_yolov8_detections( - float* inputs, float confidence_threshold, - int num_channels, int num_anchors, int num_labels, - int infer_img_width, int infer_img_height, - std::vector& objects) -{ - std::vector detections; - cv::Mat output = cv::Mat((int)num_channels, (int)num_anchors, CV_32F, inputs).t(); + const int num_grid_x = w / stride; + const int num_grid_y = h / stride; - for (int i = 0; i < num_anchors; i++) + const int reg_max_1 = 16; + const int num_class = pred.w - reg_max_1 * 4; // number of classes. 
80 for COCO + + for (int y = 0; y < num_grid_y; y++) { - const float* row_ptr = output.row(i).ptr(); - const float* bboxes_ptr = row_ptr; - const float* scores_ptr = row_ptr + 4; - const float* max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels); - float score = *max_s_ptr; - if (score > confidence_threshold) + for (int x = 0; x < num_grid_x; x++) { - float x = *bboxes_ptr++; - float y = *bboxes_ptr++; - float w = *bboxes_ptr++; - float h = *bboxes_ptr; - - float x0 = clampf((x - 0.5f * w), 0.f, (float)infer_img_width); - float y0 = clampf((y - 0.5f * h), 0.f, (float)infer_img_height); - float x1 = clampf((x + 0.5f * w), 0.f, (float)infer_img_width); - float y1 = clampf((y + 0.5f * h), 0.f, (float)infer_img_height); - - cv::Rect_ bbox; - bbox.x = x0; - bbox.y = y0; - bbox.width = x1 - x0; - bbox.height = y1 - y0; - Object object; - object.label = max_s_ptr - scores_ptr; - object.prob = score; - object.rect = bbox; - detections.push_back(object); + const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); + + // find label with max score + int label = -1; + float score = -FLT_MAX; + { + const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class); + + for (int k = 0; k < num_class; k++) + { + float s = pred_score[k]; + if (s > score) + { + label = k; + score = s; + } + } + + score = sigmoid(score); + } + + if (score >= prob_threshold) + { + ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4); + + { + ncnn::Layer* softmax = ncnn::create_layer("Softmax"); + + ncnn::ParamDict pd; + pd.set(0, 1); // axis + pd.set(1, 1); + softmax->load_param(pd); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + + softmax->create_pipeline(opt); + + softmax->forward_inplace(pred_bbox, opt); + + softmax->destroy_pipeline(opt); + + delete softmax; + } + + float pred_ltrb[4]; + for (int k = 0; k < 4; k++) + { + float dis = 0.f; + const float* dis_after_sm = pred_bbox.row(k); + for (int l = 0; l < 
reg_max_1; l++) + { + dis += l * dis_after_sm[l]; + } + + pred_ltrb[k] = dis * stride; + } + + float pb_cx = (x + 0.5f) * stride; + float pb_cy = (y + 0.5f) * stride; + + float x0 = pb_cx - pred_ltrb[0]; + float y0 = pb_cy - pred_ltrb[1]; + float x1 = pb_cx + pred_ltrb[2]; + float y1 = pb_cy + pred_ltrb[3]; + + Object obj; + obj.rect.x = x0; + obj.rect.y = y0; + obj.rect.width = x1 - x0; + obj.rect.height = y1 - y0; + obj.label = label; + obj.prob = score; + + objects.push_back(obj); + } } } - objects = detections; +} + +static void generate_proposals(const ncnn::Mat& pred, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) +{ + const int w = in_pad.w; + const int h = in_pad.h; + + int pred_row_offset = 0; + for (size_t i = 0; i < strides.size(); i++) + { + const int stride = strides[i]; + + const int num_grid_x = w / stride; + const int num_grid_y = h / stride; + const int num_grid = num_grid_x * num_grid_y; + + generate_proposals(pred.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects); + pred_row_offset += num_grid; + } } static int detect_yolov8(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov8; - yolov8.opt.use_vulkan_compute = true; // if you want detect in hardware, then enable it - - yolov8.load_param("yolov8n.param"); - yolov8.load_model("yolov8n.bin"); + yolov8.opt.use_vulkan_compute = true; + // yolov8.opt.use_bf16_storage = true; + + // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets + yolov8.load_param("yolov8n_pnnx.py.ncnn.param"); + yolov8.load_model("yolov8n_pnnx.py.ncnn.bin"); + // yolov8.load_param("yolov8s_pnnx.py.ncnn.param"); + // yolov8.load_model("yolov8s_pnnx.py.ncnn.bin"); + // yolov8.load_param("yolov8m_pnnx.py.ncnn.param"); + // yolov8.load_model("yolov8m_pnnx.py.ncnn.bin"); + + // if you use oiv7 models, you shall call draw_objects_oiv() instead + // yolov8.load_param("yolov8n_oiv7_pnnx.py.ncnn.param"); + // 
yolov8.load_model("yolov8n_oiv7_pnnx.py.ncnn.bin"); + // yolov8.load_param("yolov8s_oiv7_pnnx.py.ncnn.param"); + // yolov8.load_model("yolov8s_oiv7_pnnx.py.ncnn.bin"); + // yolov8.load_param("yolov8m_oiv7_pnnx.py.ncnn.param"); + // yolov8.load_model("yolov8m_oiv7_pnnx.py.ncnn.bin"); const int target_size = 640; const float prob_threshold = 0.25f; @@ -223,7 +313,14 @@ static int detect_yolov8(const cv::Mat& bgr, std::vector& objects) int img_w = bgr.cols; int img_h = bgr.rows; - // letterbox pad to multiple of MAX_STRIDE + // ultralytics/cfg/models/v8/yolov8.yaml + std::vector strides(3); + strides[0] = 8; + strides[1] = 16; + strides[2] = 32; + const int max_stride = 32; + + // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; @@ -242,8 +339,9 @@ static int detect_yolov8(const cv::Mat& bgr, std::vector& objects) ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); - int wpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w; - int hpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h; + // letterbox pad to target_size rectangle + int wpad = (w + max_stride - 1) / max_stride * max_stride - w; + int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); @@ -254,22 +352,11 @@ static int detect_yolov8(const cv::Mat& bgr, std::vector& objects) ex.input("in0", in_pad); - std::vector proposals; + ncnn::Mat out; + ex.extract("out0", out); - // stride 32 - { - ncnn::Mat out; - ex.extract("out0", out); - - std::vector objects32; - const int num_labels = 80; // COCO has detect 80 object labels. 
- parse_yolov8_detections( - (float*)out.data, prob_threshold, - out.h, out.w, num_labels, - in_pad.w, in_pad.h, - objects32); - proposals.insert(proposals.end(), objects32.begin(), objects32.end()); - } + std::vector proposals; + generate_proposals(out, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); @@ -306,7 +393,7 @@ static int detect_yolov8(const cv::Mat& bgr, std::vector& objects) return 0; } -static void draw_objects(const cv::Mat& bgr, const std::vector& objects) +static void draw_objects_coco(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", @@ -320,45 +407,179 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "hair drier", "toothbrush" }; - static const unsigned char colors[19][3] = { - {54, 67, 244}, - {99, 30, 233}, - {176, 39, 156}, - {183, 58, 103}, - {181, 81, 63}, - {243, 150, 33}, - {244, 169, 3}, - {212, 188, 0}, - {136, 150, 0}, - {80, 175, 76}, - {74, 195, 139}, - {57, 220, 205}, - {59, 235, 255}, - {7, 193, 255}, - {0, 152, 255}, - {34, 87, 255}, - {72, 85, 121}, - {158, 158, 158}, - {139, 125, 96} + static cv::Scalar colors[] = { + cv::Scalar(244, 67, 54), + cv::Scalar(233, 30, 99), + cv::Scalar(156, 39, 176), + cv::Scalar(103, 58, 183), + cv::Scalar(63, 81, 181), + cv::Scalar(33, 150, 243), + cv::Scalar(3, 169, 244), + cv::Scalar(0, 188, 212), + cv::Scalar(0, 150, 136), + cv::Scalar(76, 175, 80), + cv::Scalar(139, 195, 74), + cv::Scalar(205, 220, 57), + cv::Scalar(255, 235, 59), + cv::Scalar(255, 193, 7), + cv::Scalar(255, 152, 0), + cv::Scalar(255, 87, 34), + cv::Scalar(121, 85, 72), + cv::Scalar(158, 158, 158), + cv::Scalar(96, 125, 139) }; - int color_index = 0; - cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; - const unsigned char* 
color = colors[color_index % 19]; - color_index++; + const cv::Scalar& color = colors[i % 19]; + + fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, + obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); + + cv::rectangle(image, obj.rect, color); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); + + int baseLine = 0; + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = obj.rect.x; + int y = obj.rect.y - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 255), -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + } + + cv::imshow("image", image); + cv::waitKey(0); +} + +static void draw_objects_oiv(const cv::Mat& bgr, const std::vector& objects) +{ + static const char* class_names[] = { + "Accordion", "Adhesive tape", "Aircraft", "Airplane", "Alarm clock", "Alpaca", "Ambulance", "Animal", + "Ant", "Antelope", "Apple", "Armadillo", "Artichoke", "Auto part", "Axe", "Backpack", "Bagel", + "Baked goods", "Balance beam", "Ball", "Balloon", "Banana", "Band-aid", "Banjo", "Barge", "Barrel", + "Baseball bat", "Baseball glove", "Bat (Animal)", "Bathroom accessory", "Bathroom cabinet", "Bathtub", + "Beaker", "Bear", "Bed", "Bee", "Beehive", "Beer", "Beetle", "Bell pepper", "Belt", "Bench", "Bicycle", + "Bicycle helmet", "Bicycle wheel", "Bidet", "Billboard", "Billiard table", "Binoculars", "Bird", + "Blender", "Blue jay", "Boat", "Bomb", "Book", "Bookcase", "Boot", "Bottle", "Bottle opener", + "Bow and arrow", "Bowl", "Bowling equipment", "Box", "Boy", "Brassiere", "Bread", "Briefcase", + "Broccoli", "Bronze sculpture", "Brown bear", "Building", "Bull", "Burrito", "Bus", 
"Bust", "Butterfly", + "Cabbage", "Cabinetry", "Cake", "Cake stand", "Calculator", "Camel", "Camera", "Can opener", "Canary", + "Candle", "Candy", "Cannon", "Canoe", "Cantaloupe", "Car", "Carnivore", "Carrot", "Cart", "Cassette deck", + "Castle", "Cat", "Cat furniture", "Caterpillar", "Cattle", "Ceiling fan", "Cello", "Centipede", + "Chainsaw", "Chair", "Cheese", "Cheetah", "Chest of drawers", "Chicken", "Chime", "Chisel", "Chopsticks", + "Christmas tree", "Clock", "Closet", "Clothing", "Coat", "Cocktail", "Cocktail shaker", "Coconut", + "Coffee", "Coffee cup", "Coffee table", "Coffeemaker", "Coin", "Common fig", "Common sunflower", + "Computer keyboard", "Computer monitor", "Computer mouse", "Container", "Convenience store", "Cookie", + "Cooking spray", "Corded phone", "Cosmetics", "Couch", "Countertop", "Cowboy hat", "Crab", "Cream", + "Cricket ball", "Crocodile", "Croissant", "Crown", "Crutch", "Cucumber", "Cupboard", "Curtain", + "Cutting board", "Dagger", "Dairy Product", "Deer", "Desk", "Dessert", "Diaper", "Dice", "Digital clock", + "Dinosaur", "Dishwasher", "Dog", "Dog bed", "Doll", "Dolphin", "Door", "Door handle", "Doughnut", + "Dragonfly", "Drawer", "Dress", "Drill (Tool)", "Drink", "Drinking straw", "Drum", "Duck", "Dumbbell", + "Eagle", "Earrings", "Egg (Food)", "Elephant", "Envelope", "Eraser", "Face powder", "Facial tissue holder", + "Falcon", "Fashion accessory", "Fast food", "Fax", "Fedora", "Filing cabinet", "Fire hydrant", + "Fireplace", "Fish", "Flag", "Flashlight", "Flower", "Flowerpot", "Flute", "Flying disc", "Food", + "Food processor", "Football", "Football helmet", "Footwear", "Fork", "Fountain", "Fox", "French fries", + "French horn", "Frog", "Fruit", "Frying pan", "Furniture", "Garden Asparagus", "Gas stove", "Giraffe", + "Girl", "Glasses", "Glove", "Goat", "Goggles", "Goldfish", "Golf ball", "Golf cart", "Gondola", + "Goose", "Grape", "Grapefruit", "Grinder", "Guacamole", "Guitar", "Hair dryer", "Hair spray", "Hamburger", + "Hammer", 
"Hamster", "Hand dryer", "Handbag", "Handgun", "Harbor seal", "Harmonica", "Harp", + "Harpsichord", "Hat", "Headphones", "Heater", "Hedgehog", "Helicopter", "Helmet", "High heels", + "Hiking equipment", "Hippopotamus", "Home appliance", "Honeycomb", "Horizontal bar", "Horse", "Hot dog", + "House", "Houseplant", "Human arm", "Human beard", "Human body", "Human ear", "Human eye", "Human face", + "Human foot", "Human hair", "Human hand", "Human head", "Human leg", "Human mouth", "Human nose", + "Humidifier", "Ice cream", "Indoor rower", "Infant bed", "Insect", "Invertebrate", "Ipod", "Isopod", + "Jacket", "Jacuzzi", "Jaguar (Animal)", "Jeans", "Jellyfish", "Jet ski", "Jug", "Juice", "Kangaroo", + "Kettle", "Kitchen & dining room table", "Kitchen appliance", "Kitchen knife", "Kitchen utensil", + "Kitchenware", "Kite", "Knife", "Koala", "Ladder", "Ladle", "Ladybug", "Lamp", "Land vehicle", + "Lantern", "Laptop", "Lavender (Plant)", "Lemon", "Leopard", "Light bulb", "Light switch", "Lighthouse", + "Lily", "Limousine", "Lion", "Lipstick", "Lizard", "Lobster", "Loveseat", "Luggage and bags", "Lynx", + "Magpie", "Mammal", "Man", "Mango", "Maple", "Maracas", "Marine invertebrates", "Marine mammal", + "Measuring cup", "Mechanical fan", "Medical equipment", "Microphone", "Microwave oven", "Milk", + "Miniskirt", "Mirror", "Missile", "Mixer", "Mixing bowl", "Mobile phone", "Monkey", "Moths and butterflies", + "Motorcycle", "Mouse", "Muffin", "Mug", "Mule", "Mushroom", "Musical instrument", "Musical keyboard", + "Nail (Construction)", "Necklace", "Nightstand", "Oboe", "Office building", "Office supplies", "Orange", + "Organ (Musical Instrument)", "Ostrich", "Otter", "Oven", "Owl", "Oyster", "Paddle", "Palm tree", + "Pancake", "Panda", "Paper cutter", "Paper towel", "Parachute", "Parking meter", "Parrot", "Pasta", + "Pastry", "Peach", "Pear", "Pen", "Pencil case", "Pencil sharpener", "Penguin", "Perfume", "Person", + "Personal care", "Personal flotation device", "Piano", "Picnic 
basket", "Picture frame", "Pig", + "Pillow", "Pineapple", "Pitcher (Container)", "Pizza", "Pizza cutter", "Plant", "Plastic bag", "Plate", + "Platter", "Plumbing fixture", "Polar bear", "Pomegranate", "Popcorn", "Porch", "Porcupine", "Poster", + "Potato", "Power plugs and sockets", "Pressure cooker", "Pretzel", "Printer", "Pumpkin", "Punching bag", + "Rabbit", "Raccoon", "Racket", "Radish", "Ratchet (Device)", "Raven", "Rays and skates", "Red panda", + "Refrigerator", "Remote control", "Reptile", "Rhinoceros", "Rifle", "Ring binder", "Rocket", + "Roller skates", "Rose", "Rugby ball", "Ruler", "Salad", "Salt and pepper shakers", "Sandal", + "Sandwich", "Saucer", "Saxophone", "Scale", "Scarf", "Scissors", "Scoreboard", "Scorpion", + "Screwdriver", "Sculpture", "Sea lion", "Sea turtle", "Seafood", "Seahorse", "Seat belt", "Segway", + "Serving tray", "Sewing machine", "Shark", "Sheep", "Shelf", "Shellfish", "Shirt", "Shorts", + "Shotgun", "Shower", "Shrimp", "Sink", "Skateboard", "Ski", "Skirt", "Skull", "Skunk", "Skyscraper", + "Slow cooker", "Snack", "Snail", "Snake", "Snowboard", "Snowman", "Snowmobile", "Snowplow", + "Soap dispenser", "Sock", "Sofa bed", "Sombrero", "Sparrow", "Spatula", "Spice rack", "Spider", + "Spoon", "Sports equipment", "Sports uniform", "Squash (Plant)", "Squid", "Squirrel", "Stairs", + "Stapler", "Starfish", "Stationary bicycle", "Stethoscope", "Stool", "Stop sign", "Strawberry", + "Street light", "Stretcher", "Studio couch", "Submarine", "Submarine sandwich", "Suit", "Suitcase", + "Sun hat", "Sunglasses", "Surfboard", "Sushi", "Swan", "Swim cap", "Swimming pool", "Swimwear", + "Sword", "Syringe", "Table", "Table tennis racket", "Tablet computer", "Tableware", "Taco", "Tank", + "Tap", "Tart", "Taxi", "Tea", "Teapot", "Teddy bear", "Telephone", "Television", "Tennis ball", + "Tennis racket", "Tent", "Tiara", "Tick", "Tie", "Tiger", "Tin can", "Tire", "Toaster", "Toilet", + "Toilet paper", "Tomato", "Tool", "Toothbrush", "Torch", "Tortoise", 
"Towel", "Tower", "Toy", + "Traffic light", "Traffic sign", "Train", "Training bench", "Treadmill", "Tree", "Tree house", + "Tripod", "Trombone", "Trousers", "Truck", "Trumpet", "Turkey", "Turtle", "Umbrella", "Unicycle", + "Van", "Vase", "Vegetable", "Vehicle", "Vehicle registration plate", "Violin", "Volleyball (Ball)", + "Waffle", "Waffle iron", "Wall clock", "Wardrobe", "Washing machine", "Waste container", "Watch", + "Watercraft", "Watermelon", "Weapon", "Whale", "Wheel", "Wheelchair", "Whisk", "Whiteboard", "Willow", + "Window", "Window blind", "Wine", "Wine glass", "Wine rack", "Winter melon", "Wok", "Woman", + "Wood-burning stove", "Woodpecker", "Worm", "Wrench", "Zebra", "Zucchini" + }; + + static cv::Scalar colors[] = { + cv::Scalar(244, 67, 54), + cv::Scalar(233, 30, 99), + cv::Scalar(156, 39, 176), + cv::Scalar(103, 58, 183), + cv::Scalar(63, 81, 181), + cv::Scalar(33, 150, 243), + cv::Scalar(3, 169, 244), + cv::Scalar(0, 188, 212), + cv::Scalar(0, 150, 136), + cv::Scalar(76, 175, 80), + cv::Scalar(139, 195, 74), + cv::Scalar(205, 220, 57), + cv::Scalar(255, 235, 59), + cv::Scalar(255, 193, 7), + cv::Scalar(255, 152, 0), + cv::Scalar(255, 87, 34), + cv::Scalar(121, 85, 72), + cv::Scalar(158, 158, 158), + cv::Scalar(96, 125, 139) + }; + + cv::Mat image = bgr.clone(); + + for (size_t i = 0; i < objects.size(); i++) + { + const Object& obj = objects[i]; - cv::Scalar cc(color[0], color[1], color[2]); + const cv::Scalar& color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); - cv::rectangle(image, obj.rect, cc, 2); + cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); @@ -374,10 +595,10 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height 
+ baseLine)), - cc, -1); + cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), - cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255)); + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); @@ -404,7 +625,8 @@ int main(int argc, char** argv) std::vector objects; detect_yolov8(m, objects); - draw_objects(m, objects); + draw_objects_coco(m, objects); + // draw_objects_oiv(m, objects); return 0; } diff --git a/examples/yolov8_cls.cpp b/examples/yolov8_cls.cpp new file mode 100644 index 00000000000..d682a7e5be2 --- /dev/null +++ b/examples/yolov8_cls.cpp @@ -0,0 +1,325 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +// 1. install +// pip3 install -U ultralytics pnnx ncnn +// 2. export yolov8-cls torchscript +// yolo export model=yolov8n-cls.pt format=torchscript +// 3. convert torchscript with static shape +// pnnx yolov8n-cls.torchscript +// 4. 
now you get ncnn model files +// yolov8n_cls.ncnn.param +// yolov8n_cls.ncnn.bin + +#include "net.h" + +#if defined(USE_NCNN_SIMPLEOCV) +#include "simpleocv.h" +#else +#include +#include +#include +#endif +#include +#include +#include + +struct Object +{ + int label; + float prob; +}; + +static void get_topk(const ncnn::Mat& cls_scores, int topk, std::vector& objects) +{ + // partial sort topk with index + int size = cls_scores.w; + std::vector > vec; + vec.resize(size); + for (int i = 0; i < size; i++) + { + vec[i] = std::make_pair(cls_scores[i], i); + } + + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), + std::greater >()); + + objects.resize(topk); + for (int i = 0; i < topk; i++) + { + objects[i].label = vec[i].second; + objects[i].prob = vec[i].first; + } +} + +static int detect_yolov8_cls(const cv::Mat& bgr, std::vector& objects) +{ + ncnn::Net yolov8; + + yolov8.opt.use_vulkan_compute = true; + // yolov8.opt.use_bf16_storage = true; + + // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets + yolov8.load_param("yolov8n_cls.ncnn.param"); + yolov8.load_model("yolov8n_cls.ncnn.bin"); + // yolov8.load_param("yolov8s_cls.ncnn.param"); + // yolov8.load_model("yolov8s_cls.ncnn.bin"); + // yolov8.load_param("yolov8m_cls.ncnn.param"); + // yolov8.load_model("yolov8m_cls.ncnn.bin"); + + const int target_size = 224; + const int topk = 5; + + int img_w = bgr.cols; + int img_h = bgr.rows; + + // letterbox pad + int w = img_w; + int h = img_h; + float scale = 1.f; + if (w > h) + { + scale = (float)target_size / w; + w = target_size; + h = h * scale; + } + else + { + scale = (float)target_size / h; + h = target_size; + w = w * scale; + } + + ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); + + // letterbox pad to target_size rectangle + int wpad = target_size - w; + int hpad = target_size - h; + ncnn::Mat in_pad; + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad 
/ 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); + + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; + in_pad.substract_mean_normalize(0, norm_vals); + + ncnn::Extractor ex = yolov8.create_extractor(); + + ex.input("in0", in_pad); + + ncnn::Mat out; + ex.extract("out0", out); + + // return top-5 + get_topk(out, topk, objects); + + return 0; +} + +static void draw_objects(const cv::Mat& bgr, const std::vector& objects) +{ + static const char* class_names[] = { + "tench", "goldfish", "great white shark", "tiger shark", "hammerhead", "electric ray", "stingray", "cock", + "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "robin", "bulbul", + "jay", "magpie", "chickadee", "water ouzel", "kite", "bald eagle", "vulture", "great grey owl", + "European fire salamander", "common newt", "eft", "spotted salamander", "axolotl", "bullfrog", "tree frog", + "tailed frog", "loggerhead", "leatherback turtle", "mud turtle", "terrapin", "box turtle", "banded gecko", + "common iguana", "American chameleon", "whiptail", "agama", "frilled lizard", "alligator lizard", + "Gila monster", "green lizard", "African chameleon", "Komodo dragon", "African crocodile", + "American alligator", "triceratops", "thunder snake", "ringneck snake", "hognose snake", "green snake", + "king snake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "rock python", + "Indian cobra", "green mamba", "sea snake", "horned viper", "diamondback", "sidewinder", "trilobite", + "harvestman", "scorpion", "black and gold garden spider", "barn spider", "garden spider", "black widow", + "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse", + "prairie chicken", "peacock", "quail", "partridge", "African grey", "macaw", "sulphur-crested cockatoo", + "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "drake", + "red-breasted merganser", "goose", "black swan", "tusker", 
"echidna", "platypus", "wallaby", "koala", + "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug", + "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", "fiddler crab", "king crab", + "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white stork", "black stork", + "spoonbill", "flamingo", "little blue heron", "American egret", "bittern", "crane (bird)", "limpkin", + "European gallinule", "American coot", "bustard", "ruddy turnstone", "red-backed sandpiper", "redshank", + "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale", + "dugong", "sea lion", "Chihuahua", "Japanese spaniel", "Maltese dog", "Pekinese", "Shih-Tzu", + "Blenheim spaniel", "papillon", "toy terrier", "Rhodesian ridgeback", "Afghan hound", "basset", "beagle", + "bloodhound", "bluetick", "black-and-tan coonhound", "Walker hound", "English foxhound", "redbone", + "borzoi", "Irish wolfhound", "Italian greyhound", "whippet", "Ibizan hound", "Norwegian elkhound", + "otterhound", "Saluki", "Scottish deerhound", "Weimaraner", "Staffordshire bullterrier", + "American Staffordshire terrier", "Bedlington terrier", "Border terrier", "Kerry blue terrier", + "Irish terrier", "Norfolk terrier", "Norwich terrier", "Yorkshire terrier", "wire-haired fox terrier", + "Lakeland terrier", "Sealyham terrier", "Airedale", "cairn", "Australian terrier", "Dandie Dinmont", + "Boston bull", "miniature schnauzer", "giant schnauzer", "standard schnauzer", "Scotch terrier", + "Tibetan terrier", "silky terrier", "soft-coated wheaten terrier", "West Highland white terrier", + "Lhasa", "flat-coated retriever", "curly-coated retriever", "golden retriever", "Labrador retriever", + "Chesapeake Bay retriever", "German short-haired pointer", "vizsla", "English setter", "Irish setter", + "Gordon setter", "Brittany spaniel", "clumber", "English springer", "Welsh springer spaniel", + "cocker 
spaniel", "Sussex spaniel", "Irish water spaniel", "kuvasz", "schipperke", "groenendael", + "malinois", "briard", "kelpie", "komondor", "Old English sheepdog", "Shetland sheepdog", "collie", + "Border collie", "Bouvier des Flandres", "Rottweiler", "German shepherd", "Doberman", + "miniature pinscher", "Greater Swiss Mountain dog", "Bernese mountain dog", "Appenzeller", "EntleBucher", + "boxer", "bull mastiff", "Tibetan mastiff", "French bulldog", "Great Dane", "Saint Bernard", + "Eskimo dog", "malamute", "Siberian husky", "dalmatian", "affenpinscher", "basenji", "pug", "Leonberg", + "Newfoundland", "Great Pyrenees", "Samoyed", "Pomeranian", "chow", "keeshond", "Brabancon griffon", + "Pembroke", "Cardigan", "toy poodle", "miniature poodle", "standard poodle", "Mexican hairless", + "timber wolf", "white wolf", "red wolf", "coyote", "dingo", "dhole", "African hunting dog", "hyena", + "red fox", "kit fox", "Arctic fox", "grey fox", "tabby", "tiger cat", "Persian cat", "Siamese cat", + "Egyptian cat", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah", + "brown bear", "American black bear", "ice bear", "sloth bear", "mongoose", "meerkat", "tiger beetle", + "ladybug", "ground beetle", "long-horned beetle", "leaf beetle", "dung beetle", "rhinoceros beetle", + "weevil", "fly", "bee", "ant", "grasshopper", "cricket", "walking stick", "cockroach", "mantis", + "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "admiral", "ringlet", "monarch", + "cabbage butterfly", "sulphur butterfly", "lycaenid", "starfish", "sea urchin", "sea cucumber", + "wood rabbit", "hare", "Angora", "hamster", "porcupine", "fox squirrel", "marmot", "beaver", + "guinea pig", "sorrel", "zebra", "hog", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo", + "bison", "ram", "bighorn", "ibex", "hartebeest", "impala", "gazelle", "Arabian camel", "llama", + "weasel", "mink", "polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo", + 
"three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas", + "baboon", "macaque", "langur", "colobus", "proboscis monkey", "marmoset", "capuchin", "howler monkey", + "titi", "spider monkey", "squirrel monkey", "Madagascar cat", "indri", "Indian elephant", + "African elephant", "lesser panda", "giant panda", "barracouta", "eel", "coho", "rock beauty", + "anemone fish", "sturgeon", "gar", "lionfish", "puffer", "abacus", "abaya", "academic gown", + "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance", + "amphibian", "analog clock", "apiary", "apron", "ashcan", "assault rifle", "backpack", "bakery", + "balance beam", "balloon", "ballpoint", "Band Aid", "banjo", "bannister", "barbell", "barber chair", + "barbershop", "barn", "barometer", "barrel", "barrow", "baseball", "basketball", "bassinet", "bassoon", + "bathing cap", "bath towel", "bathtub", "beach wagon", "beacon", "beaker", "bearskin", "beer bottle", + "beer glass", "bell cote", "bib", "bicycle-built-for-two", "bikini", "binder", "binoculars", + "birdhouse", "boathouse", "bobsled", "bolo tie", "bonnet", "bookcase", "bookshop", "bottlecap", "bow", + "bow tie", "brass", "brassiere", "breakwater", "breastplate", "broom", "bucket", "buckle", + "bulletproof vest", "bullet train", "butcher shop", "cab", "caldron", "candle", "cannon", "canoe", + "can opener", "cardigan", "car mirror", "carousel", "carpenter's kit", "carton", "car wheel", + "cash machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello", + "cellular telephone", "chain", "chainlink fence", "chain mail", "chain saw", "chest", "chiffonier", + "chime", "china cabinet", "Christmas stocking", "church", "cinema", "cleaver", "cliff dwelling", + "cloak", "clog", "cocktail shaker", "coffee mug", "coffeepot", "coil", "combination lock", + "computer keyboard", "confectionery", "container ship", "convertible", "corkscrew", "cornet", + "cowboy boot", "cowboy 
hat", "cradle", "crane (machine)", "crash helmet", "crate", "crib", + "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "dial telephone", + "diaper", "digital clock", "digital watch", "dining table", "dishrag", "dishwasher", "disk brake", + "dock", "dogsled", "dome", "doormat", "drilling platform", "drum", "drumstick", "dumbbell", + "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center", + "envelope", "espresso maker", "face powder", "feather boa", "file", "fireboat", "fire engine", + "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain", + "fountain pen", "four-poster", "freight car", "French horn", "frying pan", "fur coat", "garbage truck", + "gasmask", "gas pump", "goblet", "go-kart", "golf ball", "golfcart", "gondola", "gong", "gown", + "grand piano", "greenhouse", "grille", "grocery store", "guillotine", "hair slide", "hair spray", + "half track", "hammer", "hamper", "hand blower", "hand-held computer", "handkerchief", "hard disc", + "harmonica", "harp", "harvester", "hatchet", "holster", "home theater", "honeycomb", "hook", + "hoopskirt", "horizontal bar", "horse cart", "hourglass", "iPod", "iron", "jack-o'-lantern", "jean", + "jeep", "jersey", "jigsaw puzzle", "jinrikisha", "joystick", "kimono", "knee pad", "knot", "lab coat", + "ladle", "lampshade", "laptop", "lawn mower", "lens cap", "letter opener", "library", "lifeboat", + "lighter", "limousine", "liner", "lipstick", "Loafer", "lotion", "loudspeaker", "loupe", "lumbermill", + "magnetic compass", "mailbag", "mailbox", "maillot (tights)", "maillot (tank suit)", "manhole cover", + "maraca", "marimba", "mask", "matchstick", "maypole", "maze", "measuring cup", "medicine chest", + "megalith", "microphone", "microwave", "military uniform", "milk can", "minibus", "miniskirt", + "minivan", "missile", "mitten", "mixing bowl", "mobile home", "Model T", "modem", "monastery", + "monitor", "moped", 
"mortar", "mortarboard", "mosque", "mosquito net", "motor scooter", "mountain bike", + "mountain tent", "mouse", "mousetrap", "moving van", "muzzle", "nail", "neck brace", "necklace", + "nipple", "notebook", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "organ", "oscilloscope", + "overskirt", "oxcart", "oxygen mask", "packet", "paddle", "paddlewheel", "padlock", "paintbrush", + "pajama", "palace", "panpipe", "paper towel", "parachute", "parallel bars", "park bench", + "parking meter", "passenger car", "patio", "pay-phone", "pedestal", "pencil box", "pencil sharpener", + "perfume", "Petri dish", "photocopier", "pick", "pickelhaube", "picket fence", "pickup", "pier", + "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate", "pitcher", "plane", + "planetarium", "plastic bag", "plate rack", "plow", "plunger", "Polaroid camera", "pole", + "police van", "poncho", "pool table", "pop bottle", "pot", "potter's wheel", "power drill", + "prayer rug", "printer", "prison", "projectile", "projector", "puck", "punching bag", "purse", + "quill", "quilt", "racer", "racket", "radiator", "radio", "radio telescope", "rain barrel", + "recreational vehicle", "reel", "reflex camera", "refrigerator", "remote control", "restaurant", + "revolver", "rifle", "rocking chair", "rotisserie", "rubber eraser", "rugby ball", "rule", + "running shoe", "safe", "safety pin", "saltshaker", "sandal", "sarong", "sax", "scabbard", "scale", + "school bus", "schooner", "scoreboard", "screen", "screw", "screwdriver", "seat belt", "sewing machine", + "shield", "shoe shop", "shoji", "shopping basket", "shopping cart", "shovel", "shower cap", + "shower curtain", "ski", "ski mask", "sleeping bag", "slide rule", "sliding door", "slot", "snorkel", + "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar dish", "sombrero", + "soup bowl", "space bar", "space heater", "space shuttle", "spatula", "speedboat", "spider web", + "spindle", "sports car", "spotlight", 
"stage", "steam locomotive", "steel arch bridge", "steel drum", + "stethoscope", "stole", "stone wall", "stopwatch", "stove", "strainer", "streetcar", "stretcher", + "studio couch", "stupa", "submarine", "suit", "sundial", "sunglass", "sunglasses", "sunscreen", + "suspension bridge", "swab", "sweatshirt", "swimming trunks", "swing", "switch", "syringe", + "table lamp", "tank", "tape player", "teapot", "teddy", "television", "tennis ball", "thatch", + "theater curtain", "thimble", "thresher", "throne", "tile roof", "toaster", "tobacco shop", + "toilet seat", "torch", "totem pole", "tow truck", "toyshop", "tractor", "trailer truck", "tray", + "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "tub", + "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright", "vacuum", "vase", "vault", + "velvet", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock", + "wallet", "wardrobe", "warplane", "washbasin", "washer", "water bottle", "water jug", "water tower", + "whiskey jug", "whistle", "wig", "window screen", "window shade", "Windsor tie", "wine bottle", "wing", + "wok", "wooden spoon", "wool", "worm fence", "wreck", "yawl", "yurt", "web site", "comic book", + "crossword puzzle", "street sign", "traffic light", "book jacket", "menu", "plate", "guacamole", + "consomme", "hot pot", "trifle", "ice cream", "ice lolly", "French loaf", "bagel", "pretzel", + "cheeseburger", "hotdog", "mashed potato", "head cabbage", "broccoli", "cauliflower", "zucchini", + "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper", + "cardoon", "mushroom", "Granny Smith", "strawberry", "orange", "lemon", "fig", "pineapple", "banana", + "jackfruit", "custard apple", "pomegranate", "hay", "carbonara", "chocolate sauce", "dough", + "meat loaf", "pizza", "potpie", "burrito", "red wine", "espresso", "cup", "eggnog", "alp", "bubble", + "cliff", "coral reef", "geyser", 
"lakeside", "promontory", "sandbar", "seashore", "valley", "volcano", + "ballplayer", "groom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn", + "hip", "buckeye", "coral fungus", "agaric", "gyromitra", "stinkhorn", "earthstar", "hen-of-the-woods", + "bolete", "ear", "toilet tissue" + }; + + cv::Mat image = bgr.clone(); + + int y_offset = 0; + for (size_t i = 0; i < objects.size(); i++) + { + const Object& obj = objects[i]; + + fprintf(stderr, "%d = %.5f\n", obj.label, obj.prob); + + char text[256]; + sprintf(text, "%4.1f%% %s", obj.prob * 100, class_names[obj.label]); + + int baseLine = 0; + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = 0; + int y = y_offset; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 255), -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + + y_offset += label_size.height; + } + + cv::imshow("image", image); + cv::waitKey(0); +} + +int main(int argc, char** argv) +{ + if (argc != 2) + { + fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); + return -1; + } + + const char* imagepath = argv[1]; + + cv::Mat m = cv::imread(imagepath, 1); + if (m.empty()) + { + fprintf(stderr, "cv::imread %s failed\n", imagepath); + return -1; + } + + std::vector objects; + detect_yolov8_cls(m, objects); + + draw_objects(m, objects); + + return 0; +} diff --git a/examples/yolov8_obb.cpp b/examples/yolov8_obb.cpp new file mode 100644 index 00000000000..c80c6fd99d7 --- /dev/null +++ b/examples/yolov8_obb.cpp @@ -0,0 +1,522 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +// 1. install +// pip3 install -U ultralytics pnnx ncnn +// 2. export yolov8-obb torchscript +// yolo export model=yolov8n-obb.pt format=torchscript +// 3. convert torchscript with static shape +// pnnx yolov8n-obb.torchscript +// 4. modify yolov8n_obb_pnnx.py for dynamic shape inference +// A. modify reshape to support dynamic image sizes +// B. permute tensor before concat and adjust concat axis +// C. drop post-process part +// before: +// v_137 = v_136.view(1, 1, 16384) +// v_143 = v_142.view(1, 1, 4096) +// v_149 = v_148.view(1, 1, 1024) +// v_150 = torch.cat((v_137, v_143, v_149), dim=2) +// ... +// v_186 = v_163.view(1, 79, 16384) +// v_187 = v_174.view(1, 79, 4096) +// v_188 = v_185.view(1, 79, 1024) +// v_189 = torch.cat((v_186, v_187, v_188), dim=2) +// ... +// after: +// v_137 = v_136.view(1, 1, -1).transpose(1, 2) +// v_143 = v_142.view(1, 1, -1).transpose(1, 2) +// v_149 = v_148.view(1, 1, -1).transpose(1, 2) +// v_150 = torch.cat((v_137, v_143, v_149), dim=1) +// ... +// v_186 = v_163.view(1, 79, -1).transpose(1, 2) +// v_187 = v_174.view(1, 79, -1).transpose(1, 2) +// v_188 = v_185.view(1, 79, -1).transpose(1, 2) +// v_189 = torch.cat((v_186, v_187, v_188), dim=1) +// return v_189, v_150 +// 5. re-export yolov8-obb torchscript +// python3 -c 'import yolov8n_obb_pnnx; yolov8n_obb_pnnx.export_torchscript()' +// 6. 
convert new torchscript with dynamic shape
+//      pnnx yolov8n_obb_pnnx.py.pt inputshape=[1,3,1024,1024] inputshape2=[1,3,512,512]
+// 7. now you get ncnn model files
+//      mv yolov8n_obb_pnnx.py.ncnn.param yolov8n_obb.ncnn.param
+//      mv yolov8n_obb_pnnx.py.ncnn.bin yolov8n_obb.ncnn.bin
+//    note: detect_yolov8_obb() below loads yolov8n_obb_pnnx.py.ncnn.param / .bin,
+//    i.e. the names pnnx writes in step 6. Either skip the rename or adjust the
+//    load paths so that the two agree.
+
+// the out0 blob would be a 2-dim tensor with w=79 h=21504
+//
+//        | bbox-reg 16 x 4       |score(15)|
+//        +-----+-----+-----+-----+---------+
+//        | dx0 | dy0 | dx1 | dy1 | 0.1 ... |
+//   all /|     |     |     |     |     ... |
+//  boxes |  .. |  .. |  .. |  .. | 0.0 ... |
+// (21504)|     |     |     |     |   . ... |
+//       \|     |     |     |     |   . ... |
+//        +-----+-----+-----+-----+---------+
+//
+
+// the out1 blob would be a 2-dim tensor with w=1 h=21504
+// (raw value v per box; the code below uses angle = (sigmoid(v) - 0.25) * pi)
+//
+//        | degree(1)|
+//        +----------+
+//        |    0.1   |
+//   all /|          |
+//  boxes |    0.0   |
+// (21504)|     .    |
+//       \|     .    |
+//        +----------+
+//
+
+#include "layer.h"
+#include "net.h"
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <vector>
+
+// one oriented detection: rotated box, class index and confidence
+struct Object
+{
+    cv::RotatedRect rrect;
+    int label;
+    float prob;
+};
+
+// area of the overlap between the rotated boxes of a and b, 0 when they do not intersect
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    std::vector<cv::Point2f> intersection;
+    cv::rotatedRectangleIntersection(a.rrect, b.rrect, intersection);
+    if (intersection.empty())
+        return 0.f;
+
+    return cv::contourArea(intersection);
+}
+
+// in-place quicksort of objects[left..right] (both inclusive) by descending prob
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    int i = left;
+    int j = right;
+    float p = objects[(left + right) / 2].prob;
+
+    while (i <= j)
+    {
+        while (objects[i].prob > p)
+            i++;
+
+        while (objects[j].prob < p)
+            j--;
+
+        if (i <= j)
+        {
+            // swap
+            std::swap(objects[i], objects[j]);
+
+            i++;
+            j--;
+        }
+    }
+
+    // #pragma omp parallel sections
+    {
+        // #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(objects, left, j);
+        }
+        // #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(objects, i, right);
+        }
+    }
+}
+
+// sort all objects by descending prob; an empty vector is left untouched
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (objects.empty())
+        return;
+
+    qsort_descent_inplace(objects, 0, (int)objects.size() - 1);
+}
+
+// greedy non-maximum suppression over rotated boxes
+//   objects        candidates, must already be sorted by descending prob
+//   picked         receives the indices (into objects) of the boxes that survive
+//   nms_threshold  a box is dropped when its IoU with an already picked box exceeds this
+//   agnostic       when false only boxes of the same label suppress each other
+static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = objects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = objects[i].rrect.size.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = objects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = objects[picked[j]];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            // float IoU = inter_area / union_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                // already suppressed, no need to intersect with the remaining picks
+                keep = 0;
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// logistic function
+static inline float sigmoid(float x)
+{
+    return 1.0f / (1.0f + expf(-x));
+}
+
+// decode the predictions of one stride level and append every box whose best
+// class score reaches prob_threshold to objects
+//   pred        one row per grid cell in row-major order (y * num_grid_x + x),
+//               each row = 16 x 4 box distribution values followed by the class scores
+//   pred_angle  one row per grid cell, first value is the raw angle
+//   stride      size of a grid cell in in_pad pixels
+//   in_pad      the padded network input, only its w and h are read
+// boxes are produced in in_pad pixel coordinates
+static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
+{
+    const int w = in_pad.w;
+    const int h = in_pad.h;
+
+    const int num_grid_x = w / stride;
+    const int num_grid_y = h / stride;
+
+    const int reg_max_1 = 16;
+    const int num_class = pred.w - reg_max_1 * 4; // number of classes. 15 for DOTAv1
+
+    // the softmax layer is identical for every candidate, so build it once per call
+    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
+
+    ncnn::ParamDict pd;
+    pd.set(0, 1); // axis
+    pd.set(1, 1);
+    softmax->load_param(pd);
+
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_packing_layout = false;
+
+    softmax->create_pipeline(opt);
+
+    for (int y = 0; y < num_grid_y; y++)
+    {
+        for (int x = 0; x < num_grid_x; x++)
+        {
+            const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1);
+
+            // find label with max score
+            int label = -1;
+            float score = -FLT_MAX;
+            {
+                const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);
+
+                for (int k = 0; k < num_class; k++)
+                {
+                    float s = pred_score[k];
+                    if (s > score)
+                    {
+                        label = k;
+                        score = s;
+                    }
+                }
+
+                score = sigmoid(score);
+            }
+
+            if (score >= prob_threshold)
+            {
+                // 4 rows (left, top, right, bottom) of reg_max_1 bins each
+                ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();
+
+                softmax->forward_inplace(pred_bbox, opt);
+
+                // each side distance is the expectation over the bins, scaled by stride
+                float pred_ltrb[4];
+                for (int k = 0; k < 4; k++)
+                {
+                    float dis = 0.f;
+                    const float* dis_after_sm = pred_bbox.row(k);
+                    for (int l = 0; l < reg_max_1; l++)
+                    {
+                        dis += l * dis_after_sm[l];
+                    }
+
+                    pred_ltrb[k] = dis * stride;
+                }
+
+                // center of the grid cell
+                float pb_cx = (x + 0.5f) * stride;
+                float pb_cy = (y + 0.5f) * stride;
+
+                const float angle = sigmoid(pred_angle.row(y * num_grid_x + x)[0]) - 0.25f;
+
+                const float angle_rad = angle * 3.14159265358979323846f;
+                const float angle_degree = angle * 180.f;
+
+                const float cos_a = cosf(angle_rad);
+                const float sin_a = sinf(angle_rad);
+
+                // offset of the box center from the cell center, rotated by the box angle
+                float xx = (pred_ltrb[2] - pred_ltrb[0]) * 0.5f;
+                float yy = (pred_ltrb[3] - pred_ltrb[1]) * 0.5f;
+                float xr = xx * cos_a - yy * sin_a;
+                float yr = xx * sin_a + yy * cos_a;
+                const float cx = pb_cx + xr;
+                const float cy = pb_cy + yr;
+                const float ww = pred_ltrb[2] + pred_ltrb[0];
+                const float hh = pred_ltrb[3] + pred_ltrb[1];
+
+                Object obj;
+                obj.rrect = cv::RotatedRect(cv::Point2f(cx, cy), cv::Size_<float>(ww, hh), angle_degree);
+                obj.label = label;
+                obj.prob = score;
+
+                objects.push_back(obj);
+            }
+        }
+    }
+
+    softmax->destroy_pipeline(opt);
+
+    delete softmax;
+}
+
+// decode all stride levels: the rows of pred / pred_angle are consumed level by
+// level in the order given by strides, each level owning
+// (in_pad.w / stride) * (in_pad.h / stride) consecutive rows
+static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
+{
+    const int w = in_pad.w;
+    const int h = in_pad.h;
+
+    int pred_row_offset = 0;
+    for (size_t i = 0; i < strides.size(); i++)
+    {
+        const int stride = strides[i];
+
+        const int num_grid_x = w / stride;
+        const int num_grid_y = h / stride;
+        const int num_grid = num_grid_x * num_grid_y;
+
+        generate_proposals(pred.row_range(pred_row_offset, num_grid), pred_angle.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects);
+
+        pred_row_offset += num_grid;
+    }
+}
+
+// run the obb model on a BGR image and store the detections, mapped back to
+// the coordinates of bgr, in objects (cleared first)
+// returns 0 on success (objects may be empty), -1 when the model cannot be
+// loaded or the output blobs cannot be extracted
+static int detect_yolov8_obb(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // never hand back stale results from a previous call
+    objects.clear();
+
+    ncnn::Net yolov8;
+
+    yolov8.opt.use_vulkan_compute = true;
+    // yolov8.opt.use_bf16_storage = true;
+
+    // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets
+    if (yolov8.load_param("yolov8n_obb_pnnx.py.ncnn.param") != 0 || yolov8.load_model("yolov8n_obb_pnnx.py.ncnn.bin") != 0)
+    {
+        fprintf(stderr, "failed to load yolov8n_obb_pnnx.py.ncnn.param / yolov8n_obb_pnnx.py.ncnn.bin\n");
+        return -1;
+    }
+    // yolov8.load_param("yolov8s_obb_pnnx.py.ncnn.param");
+    // yolov8.load_model("yolov8s_obb_pnnx.py.ncnn.bin");
+    // yolov8.load_param("yolov8m_obb_pnnx.py.ncnn.param");
+    // yolov8.load_model("yolov8m_obb_pnnx.py.ncnn.bin");
+
+    const int target_size = 1024;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // ultralytics/cfg/models/v8/yolov8.yaml
+    std::vector<int> strides(3);
+    strides[0] = 8;
+    strides[1] = 16;
+    strides[2] = 32;
+    const int max_stride = 32;
+
+    // resize so that the longer side equals target_size, keeping the aspect ratio
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    // letterbox pad both sides up to the next multiple of max_stride
+    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
+    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
+
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov8.create_extractor();
+
+    ex.input("in0", in_pad);
+
+    ncnn::Mat out;
+    ncnn::Mat out_angle;
+    if (ex.extract("out0", out) != 0 || ex.extract("out1", out_angle) != 0)
+    {
+        fprintf(stderr, "failed to extract out0 / out1\n");
+        return -1;
+    }
+
+    std::vector<Object> proposals;
+    generate_proposals(out, out_angle, strides, in_pad, prob_threshold, proposals);
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+    if (count == 0)
+        return 0;
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        Object obj = proposals[picked[i]];
+
+        // adjust offset to original unpadded
+        obj.rrect.center.x = (obj.rrect.center.x - (wpad / 2)) / scale;
+        obj.rrect.center.y = (obj.rrect.center.y - (hpad / 2)) / scale;
+        obj.rrect.size.width = (obj.rrect.size.width) / scale;
+        obj.rrect.size.height = (obj.rrect.size.height) / scale;
+
+        objects[i] = obj;
+    }
+
+    return 0;
+}
+
+// print every detection to stderr, draw the rotated boxes and their labels on
+// a copy of bgr and show it until a key is pressed
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "plane", "ship", "storage tank", "baseball diamond", "tennis court",
+        "basketball court", "ground track field", "harbor", "bridge", "large vehicle",
+        "small vehicle", "helicopter", "roundabout", "soccer ball field", "swimming pool"
+    };
+
+    static const cv::Scalar colors[] = {
+        cv::Scalar(156, 39, 176),
+        cv::Scalar(103, 58, 183),
+        cv::Scalar(63, 81, 181),
+        cv::Scalar(33, 150, 243),
+        cv::Scalar(3, 169, 244),
+        cv::Scalar(0, 188, 212),
+        cv::Scalar(0, 150, 136),
+        cv::Scalar(76, 175, 80),
+        cv::Scalar(139, 195, 74),
+        cv::Scalar(205, 220, 57),
+        cv::Scalar(255, 235, 59),
+        cv::Scalar(255, 193, 7),
+        cv::Scalar(255, 152, 0),
+        cv::Scalar(255, 87, 34),
+        cv::Scalar(121, 85, 72),
+        cv::Scalar(158, 158, 158),
+        cv::Scalar(96, 125, 139)
+    };
+
+    // the number of classes comes from the loaded model, so a label is not
+    // guaranteed to fit the tables above
+    const int num_class_names = sizeof(class_names) / sizeof(class_names[0]);
+    const int num_colors = sizeof(colors) / sizeof(colors[0]);
+
+    cv::Mat image = bgr.clone();
+
+    // first pass: box outlines
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        const cv::Scalar& color = colors[obj.label >= 0 ? obj.label % num_colors : 0];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f @ %.2f\n", obj.label, obj.prob,
+                obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle);
+
+        cv::Point2f corners[4];
+        obj.rrect.points(corners);
+        cv::line(image, corners[0], corners[1], color);
+        cv::line(image, corners[1], corners[2], color);
+        cv::line(image, corners[2], corners[3], color);
+        cv::line(image, corners[3], corners[0], color);
+    }
+
+    // second pass: labels, drawn afterwards so that no outline covers them
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        char text[256];
+        if (obj.label >= 0 && obj.label < num_class_names)
+            snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+        else
+            snprintf(text, sizeof(text), "%d %.1f%%", obj.label, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // center the label on the box and keep it inside the image
+        int x = obj.rrect.center.x - label_size.width / 2;
+        int y = obj.rrect.center.y - label_size.height / 2 - baseLine;
+        if (y < 0)
+            y = 0;
+        if (y + label_size.height > image.rows)
+            y = image.rows - label_size.height;
+        if (x < 0)
+            x = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// usage: yolov8_obb [imagepath]
+// returns -1 on bad arguments, unreadable image or failed detection
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    if (detect_yolov8_obb(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolov8_obb failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
diff --git a/examples/yolov8_pose.cpp b/examples/yolov8_pose.cpp
new file mode 100644
index 00000000000..e4b6137766c
--- /dev/null
+++ b/examples/yolov8_pose.cpp
@@ -0,0 +1,561 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// 1. install
+//      pip3 install -U ultralytics pnnx ncnn
+// 2. export yolov8-pose torchscript
+//      yolo export model=yolov8n-pose.pt format=torchscript
+// 3. convert torchscript with static shape
+//      pnnx yolov8n-pose.torchscript
+// 4. modify yolov8n_pose_pnnx.py for dynamic shape inference
+//      A. modify reshape to support dynamic image sizes
+//      B. permute tensor before concat and adjust concat axis
+//      C. drop post-process part
+//      before:
+//          v_137 = v_136.view(1, 51, 6400)
+//          v_143 = v_142.view(1, 51, 1600)
+//          v_149 = v_148.view(1, 51, 400)
+//          v_150 = torch.cat((v_137, v_143, v_149), dim=-1)
+//          ...
+// v_184 = v_161.view(1, 65, 6400) +// v_185 = v_172.view(1, 65, 1600) +// v_186 = v_183.view(1, 65, 400) +// v_187 = torch.cat((v_184, v_185, v_186), dim=2) +// ... +// after: +// v_137 = v_136.view(1, 51, -1).transpose(1, 2) +// v_143 = v_142.view(1, 51, -1).transpose(1, 2) +// v_149 = v_148.view(1, 51, -1).transpose(1, 2) +// v_150 = torch.cat((v_137, v_143, v_149), dim=1) +// ... +// v_184 = v_161.view(1, 65, -1).transpose(1, 2) +// v_185 = v_172.view(1, 65, -1).transpose(1, 2) +// v_186 = v_183.view(1, 65, -1).transpose(1, 2) +// v_187 = torch.cat((v_184, v_185, v_186), dim=1) +// return v_187, v_150 +// 5. re-export yolov8-pose torchscript +// python3 -c 'import yolov8n_pose_pnnx; yolov8n_pose_pnnx.export_torchscript()' +// 6. convert new torchscript with dynamic shape +// pnnx yolov8n_pose_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] +// 7. now you get ncnn model files +// mv yolov8n_pose_pnnx.py.ncnn.param yolov8n_pose.ncnn.param +// mv yolov8n_pose_pnnx.py.ncnn.bin yolov8n_pose.ncnn.bin + +// the out blob would be a 2-dim tensor with w=65 h=8400 +// +// | bbox-reg 16 x 4 |score(1)| +// +-----+-----+-----+-----+--------+ +// | dx0 | dy0 | dx1 | dy1 | 0.1 | +// all /| | | | | | +// boxes | .. | .. | .. | .. | 0.0 | +// (8400)| | | | | . | +// \| | | | | . | +// +-----+-----+-----+-----+--------+ +// + +// +// | pose (51) | +// +-----------+ +// |0.1........| +// all /| | +// boxes |0.0........| +// (8400)| . | +// \| . 
| +// +-----------+ +// + +#include "layer.h" +#include "net.h" + +#if defined(USE_NCNN_SIMPLEOCV) +#include "simpleocv.h" +#else +#include +#include +#include +#endif +#include +#include +#include + +struct KeyPoint +{ + cv::Point2f p; + float prob; +}; + +struct Object +{ + cv::Rect_ rect; + int label; + float prob; + std::vector keypoints; +}; + +static inline float intersection_area(const Object& a, const Object& b) +{ + cv::Rect_ inter = a.rect & b.rect; + return inter.area(); +} + +static void qsort_descent_inplace(std::vector& objects, int left, int right) +{ + int i = left; + int j = right; + float p = objects[(left + right) / 2].prob; + + while (i <= j) + { + while (objects[i].prob > p) + i++; + + while (objects[j].prob < p) + j--; + + if (i <= j) + { + // swap + std::swap(objects[i], objects[j]); + + i++; + j--; + } + } + + // #pragma omp parallel sections + { + // #pragma omp section + { + if (left < j) qsort_descent_inplace(objects, left, j); + } + // #pragma omp section + { + if (i < right) qsort_descent_inplace(objects, i, right); + } + } +} + +static void qsort_descent_inplace(std::vector& objects) +{ + if (objects.empty()) + return; + + qsort_descent_inplace(objects, 0, objects.size() - 1); +} + +static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) +{ + picked.clear(); + + const int n = objects.size(); + + std::vector areas(n); + for (int i = 0; i < n; i++) + { + areas[i] = objects[i].rect.area(); + } + + for (int i = 0; i < n; i++) + { + const Object& a = objects[i]; + + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) + { + const Object& b = objects[picked[j]]; + + if (!agnostic && a.label != b.label) + continue; + + // intersection over union + float inter_area = intersection_area(a, b); + float union_area = areas[i] + areas[picked[j]] - inter_area; + // float IoU = inter_area / union_area + if (inter_area / union_area > nms_threshold) + keep = 0; + } + + if (keep) 
+ picked.push_back(i); + } +} + +static inline float sigmoid(float x) +{ + return 1.0f / (1.0f + expf(-x)); +} + +static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) +{ + const int w = in_pad.w; + const int h = in_pad.h; + + const int num_grid_x = w / stride; + const int num_grid_y = h / stride; + + const int reg_max_1 = 16; + const int num_points = pred_points.w / 3; + + for (int y = 0; y < num_grid_y; y++) + { + for (int x = 0; x < num_grid_x; x++) + { + const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); + const ncnn::Mat pred_points_grid = pred_points.row_range(y * num_grid_x + x, 1).reshape(3, num_points); + + // find label with max score + int label = 0; + float score = sigmoid(pred_grid[reg_max_1 * 4]); + + if (score >= prob_threshold) + { + ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone(); + + { + ncnn::Layer* softmax = ncnn::create_layer("Softmax"); + + ncnn::ParamDict pd; + pd.set(0, 1); // axis + pd.set(1, 1); + softmax->load_param(pd); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + + softmax->create_pipeline(opt); + + softmax->forward_inplace(pred_bbox, opt); + + softmax->destroy_pipeline(opt); + + delete softmax; + } + + float pred_ltrb[4]; + for (int k = 0; k < 4; k++) + { + float dis = 0.f; + const float* dis_after_sm = pred_bbox.row(k); + for (int l = 0; l < reg_max_1; l++) + { + dis += l * dis_after_sm[l]; + } + + pred_ltrb[k] = dis * stride; + } + + float pb_cx = (x + 0.5f) * stride; + float pb_cy = (y + 0.5f) * stride; + + float x0 = pb_cx - pred_ltrb[0]; + float y0 = pb_cy - pred_ltrb[1]; + float x1 = pb_cx + pred_ltrb[2]; + float y1 = pb_cy + pred_ltrb[3]; + + std::vector keypoints; + for (int k = 0; k < num_points; k++) + { + KeyPoint keypoint; + keypoint.p.x = (x + pred_points_grid.row(k)[0] * 2) * stride; + keypoint.p.y = (y + 
pred_points_grid.row(k)[1] * 2) * stride; + keypoint.prob = sigmoid(pred_points_grid.row(k)[2]); + keypoints.push_back(keypoint); + } + + Object obj; + obj.rect.x = x0; + obj.rect.y = y0; + obj.rect.width = x1 - x0; + obj.rect.height = y1 - y0; + obj.label = label; + obj.prob = score; + obj.keypoints = keypoints; + + objects.push_back(obj); + } + } + } +} + +static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) +{ + const int w = in_pad.w; + const int h = in_pad.h; + + int pred_row_offset = 0; + for (size_t i = 0; i < strides.size(); i++) + { + const int stride = strides[i]; + + const int num_grid_x = w / stride; + const int num_grid_y = h / stride; + const int num_grid = num_grid_x * num_grid_y; + + generate_proposals(pred.row_range(pred_row_offset, num_grid), pred_points.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects); + + pred_row_offset += num_grid; + } +} + +static int detect_yolov8_pose(const cv::Mat& bgr, std::vector& objects) +{ + ncnn::Net yolov8; + + yolov8.opt.use_vulkan_compute = true; + // yolov8.opt.use_bf16_storage = true; + + // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets + yolov8.load_param("yolov8n_pose_pnnx.py.ncnn.param"); + yolov8.load_model("yolov8n_pose_pnnx.py.ncnn.bin"); + // yolov8.load_param("yolov8s_pose_pnnx.py.ncnn.param"); + // yolov8.load_model("yolov8s_pose_pnnx.py.ncnn.bin"); + // yolov8.load_param("yolov8m_pose_pnnx.py.ncnn.param"); + // yolov8.load_model("yolov8m_pose_pnnx.py.ncnn.bin"); + + const int target_size = 640; + const float prob_threshold = 0.25f; + const float nms_threshold = 0.45f; + const float mask_threshold = 0.5f; + + int img_w = bgr.cols; + int img_h = bgr.rows; + + // ultralytics/cfg/models/v8/yolov8.yaml + std::vector strides(3); + strides[0] = 8; + strides[1] = 16; + strides[2] = 32; + const int max_stride = 32; + + // 
letterbox pad to multiple of max_stride + int w = img_w; + int h = img_h; + float scale = 1.f; + if (w > h) + { + scale = (float)target_size / w; + w = target_size; + h = h * scale; + } + else + { + scale = (float)target_size / h; + h = target_size; + w = w * scale; + } + + ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); + + // letterbox pad to target_size rectangle + int wpad = (w + max_stride - 1) / max_stride * max_stride - w; + int hpad = (h + max_stride - 1) / max_stride * max_stride - h; + ncnn::Mat in_pad; + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); + + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; + in_pad.substract_mean_normalize(0, norm_vals); + + ncnn::Extractor ex = yolov8.create_extractor(); + + ex.input("in0", in_pad); + + ncnn::Mat out; + ex.extract("out0", out); + + ncnn::Mat out_points; + ex.extract("out1", out_points); + + std::vector proposals; + generate_proposals(out, out_points, strides, in_pad, prob_threshold, proposals); + + // sort all proposals by score from highest to lowest + qsort_descent_inplace(proposals); + + // apply nms with nms_threshold + std::vector picked; + nms_sorted_bboxes(proposals, picked, nms_threshold); + + int count = picked.size(); + if (count == 0) + return 0; + + const int num_points = out_points.w / 3; + + objects.resize(count); + for (int i = 0; i < count; i++) + { + objects[i] = proposals[picked[i]]; + + // adjust offset to original unpadded + float x0 = (objects[i].rect.x - (wpad / 2)) / scale; + float y0 = (objects[i].rect.y - (hpad / 2)) / scale; + float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; + float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; + + for (int j = 0; j < num_points; j++) + { + objects[i].keypoints[j].p.x = (objects[i].keypoints[j].p.x - (wpad / 2)) / scale; + objects[i].keypoints[j].p.y = 
(objects[i].keypoints[j].p.y - (hpad / 2)) / scale; + } + + // clip + x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); + y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); + x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); + y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); + + objects[i].rect.x = x0; + objects[i].rect.y = y0; + objects[i].rect.width = x1 - x0; + objects[i].rect.height = y1 - y0; + } + + return 0; +} + +static void draw_objects(const cv::Mat& bgr, const std::vector& objects) +{ + static const char* class_names[] = {"person"}; + + static const cv::Scalar colors[] = { + cv::Scalar(244, 67, 54), + cv::Scalar(233, 30, 99), + cv::Scalar(156, 39, 176), + cv::Scalar(103, 58, 183), + cv::Scalar(63, 81, 181), + cv::Scalar(33, 150, 243), + cv::Scalar(3, 169, 244), + cv::Scalar(0, 188, 212), + cv::Scalar(0, 150, 136), + cv::Scalar(76, 175, 80), + cv::Scalar(139, 195, 74), + cv::Scalar(205, 220, 57), + cv::Scalar(255, 235, 59), + cv::Scalar(255, 193, 7), + cv::Scalar(255, 152, 0), + cv::Scalar(255, 87, 34), + cv::Scalar(121, 85, 72), + cv::Scalar(158, 158, 158), + cv::Scalar(96, 125, 139) + }; + + cv::Mat image = bgr.clone(); + + for (size_t i = 0; i < objects.size(); i++) + { + const Object& obj = objects[i]; + + const cv::Scalar& color = colors[i % 19]; + + fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, + obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); + + // draw bone + static const int joint_pairs[16][2] = { + {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16} + }; + static const cv::Scalar bone_colors[] = { + cv::Scalar(0, 255, 0), + cv::Scalar(0, 255, 0), + cv::Scalar(0, 255, 0), + cv::Scalar(0, 255, 0), + cv::Scalar(255, 128, 0), + cv::Scalar(255, 128, 0), + cv::Scalar(255, 128, 0), + cv::Scalar(255, 128, 0), + cv::Scalar(255, 128, 0), + cv::Scalar(255, 51, 255), + cv::Scalar(255, 51, 255), + cv::Scalar(255, 
51, 255), + cv::Scalar(51, 153, 255), + cv::Scalar(51, 153, 255), + cv::Scalar(51, 153, 255), + cv::Scalar(51, 153, 255), + }; + + for (int j = 0; j < 16; j++) + { + const KeyPoint& p1 = obj.keypoints[joint_pairs[j][0]]; + const KeyPoint& p2 = obj.keypoints[joint_pairs[j][1]]; + + if (p1.prob < 0.2f || p2.prob < 0.2f) + continue; + + cv::line(image, p1.p, p2.p, bone_colors[j], 2); + } + + // draw joint + for (size_t j = 0; j < obj.keypoints.size(); j++) + { + const KeyPoint& keypoint = obj.keypoints[j]; + + fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob); + + if (keypoint.prob < 0.2f) + continue; + + cv::circle(image, keypoint.p, 3, color, -1); + } + + cv::rectangle(image, obj.rect, color); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); + + int baseLine = 0; + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = obj.rect.x; + int y = obj.rect.y - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 255), -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + } + + cv::imshow("image", image); + cv::waitKey(0); +} + +int main(int argc, char** argv) +{ + if (argc != 2) + { + fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); + return -1; + } + + const char* imagepath = argv[1]; + + cv::Mat m = cv::imread(imagepath, 1); + if (m.empty()) + { + fprintf(stderr, "cv::imread %s failed\n", imagepath); + return -1; + } + + std::vector objects; + detect_yolov8_pose(m, objects); + + draw_objects(m, objects); + + return 0; +} diff --git a/examples/yolov8_seg.cpp b/examples/yolov8_seg.cpp new file mode 100644 index 00000000000..6472199bdd6 --- /dev/null +++ 
b/examples/yolov8_seg.cpp @@ -0,0 +1,624 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +// 1. install +// pip3 install -U ultralytics pnnx ncnn +// 2. export yolov8-seg torchscript +// yolo export model=yolov8n-seg.pt format=torchscript +// 3. convert torchscript with static shape +// pnnx yolov8n-seg.torchscript +// 4. modify yolov8n_seg_pnnx.py for dynamic shape inference +// A. modify reshape to support dynamic image sizes +// B. permute tensor before concat and adjust concat axis +// C. drop post-process part +// before: +// v_144 = v_143.view(1, 32, 6400) +// v_150 = v_149.view(1, 32, 1600) +// v_156 = v_155.view(1, 32, 400) +// v_157 = torch.cat((v_144, v_150, v_156), dim=2) +// ... +// v_191 = v_168.view(1, 144, 6400) +// v_192 = v_179.view(1, 144, 1600) +// v_193 = v_190.view(1, 144, 400) +// v_194 = torch.cat((v_191, v_192, v_193), dim=2) +// ... +// v_215 = (v_214, v_138, ) +// return v_215 +// after: +// v_144 = v_143.view(1, 32, -1).transpose(1, 2) +// v_150 = v_149.view(1, 32, -1).transpose(1, 2) +// v_156 = v_155.view(1, 32, -1).transpose(1, 2) +// v_157 = torch.cat((v_144, v_150, v_156), dim=1) +// ... 
+// v_191 = v_168.view(1, 144, -1).transpose(1, 2) +// v_192 = v_179.view(1, 144, -1).transpose(1, 2) +// v_193 = v_190.view(1, 144, -1).transpose(1, 2) +// v_194 = torch.cat((v_191, v_192, v_193), dim=1) +// return v_194, v_157, v_138 +// 5. re-export yolov8-seg torchscript +// python3 -c 'import yolov8n_seg_pnnx; yolov8n_seg_pnnx.export_torchscript()' +// 6. convert new torchscript with dynamic shape +// pnnx yolov8n_seg_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] +// 7. now you get ncnn model files +// mv yolov8n_seg_pnnx.py.ncnn.param yolov8n_seg.ncnn.param +// mv yolov8n_seg_pnnx.py.ncnn.bin yolov8n_seg.ncnn.bin + +// the out blob would be a 2-dim tensor with w=176 h=8400 +// +// | bbox-reg 16 x 4 | per-class scores(80) | +// +-----+-----+-----+-----+----------------------+ +// | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......| +// all /| | | | | . | +// boxes | .. | .. | .. | .. |0.0 0.9 0.0 0.0 ......| +// (8400)| | | | | . | +// \| | | | | . | +// +-----+-----+-----+-----+----------------------+ +// + +// +// | mask (32) | +// +-----------+ +// |0.1........| +// all /| | +// boxes |0.0........| +// (8400)| . | +// \| . 
| +// +-----------+ +// + +#include "layer.h" +#include "net.h" + +#if defined(USE_NCNN_SIMPLEOCV) +#include "simpleocv.h" +#else +#include +#include +#include +#endif +#include +#include +#include + +struct Object +{ + cv::Rect_ rect; + int label; + float prob; + int gindex; + cv::Mat mask; +}; + +static inline float intersection_area(const Object& a, const Object& b) +{ + cv::Rect_ inter = a.rect & b.rect; + return inter.area(); +} + +static void qsort_descent_inplace(std::vector& objects, int left, int right) +{ + int i = left; + int j = right; + float p = objects[(left + right) / 2].prob; + + while (i <= j) + { + while (objects[i].prob > p) + i++; + + while (objects[j].prob < p) + j--; + + if (i <= j) + { + // swap + std::swap(objects[i], objects[j]); + + i++; + j--; + } + } + + // #pragma omp parallel sections + { + // #pragma omp section + { + if (left < j) qsort_descent_inplace(objects, left, j); + } + // #pragma omp section + { + if (i < right) qsort_descent_inplace(objects, i, right); + } + } +} + +static void qsort_descent_inplace(std::vector& objects) +{ + if (objects.empty()) + return; + + qsort_descent_inplace(objects, 0, objects.size() - 1); +} + +static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) +{ + picked.clear(); + + const int n = objects.size(); + + std::vector areas(n); + for (int i = 0; i < n; i++) + { + areas[i] = objects[i].rect.area(); + } + + for (int i = 0; i < n; i++) + { + const Object& a = objects[i]; + + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) + { + const Object& b = objects[picked[j]]; + + if (!agnostic && a.label != b.label) + continue; + + // intersection over union + float inter_area = intersection_area(a, b); + float union_area = areas[i] + areas[picked[j]] - inter_area; + // float IoU = inter_area / union_area + if (inter_area / union_area > nms_threshold) + keep = 0; + } + + if (keep) + picked.push_back(i); + } +} + +static inline float 
sigmoid(float x) +{ + return 1.0f / (1.0f + expf(-x)); +} + +static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) +{ + const int w = in_pad.w; + const int h = in_pad.h; + + const int num_grid_x = w / stride; + const int num_grid_y = h / stride; + + const int reg_max_1 = 16; + const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO + + for (int y = 0; y < num_grid_y; y++) + { + for (int x = 0; x < num_grid_x; x++) + { + const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); + + // find label with max score + int label = -1; + float score = -FLT_MAX; + { + const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class); + + for (int k = 0; k < num_class; k++) + { + float s = pred_score[k]; + if (s > score) + { + label = k; + score = s; + } + } + + score = sigmoid(score); + } + + if (score >= prob_threshold) + { + ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone(); + + { + ncnn::Layer* softmax = ncnn::create_layer("Softmax"); + + ncnn::ParamDict pd; + pd.set(0, 1); // axis + pd.set(1, 1); + softmax->load_param(pd); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + + softmax->create_pipeline(opt); + + softmax->forward_inplace(pred_bbox, opt); + + softmax->destroy_pipeline(opt); + + delete softmax; + } + + float pred_ltrb[4]; + for (int k = 0; k < 4; k++) + { + float dis = 0.f; + const float* dis_after_sm = pred_bbox.row(k); + for (int l = 0; l < reg_max_1; l++) + { + dis += l * dis_after_sm[l]; + } + + pred_ltrb[k] = dis * stride; + } + + float pb_cx = (x + 0.5f) * stride; + float pb_cy = (y + 0.5f) * stride; + + float x0 = pb_cx - pred_ltrb[0]; + float y0 = pb_cy - pred_ltrb[1]; + float x1 = pb_cx + pred_ltrb[2]; + float y1 = pb_cy + pred_ltrb[3]; + + Object obj; + obj.rect.x = x0; + obj.rect.y = y0; + obj.rect.width = x1 - x0; + obj.rect.height = y1 - y0; + obj.label = label; + 
obj.prob = score; + obj.gindex = y * num_grid_x + x; + + objects.push_back(obj); + } + } + } +} + +static void generate_proposals(const ncnn::Mat& pred, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) +{ + const int w = in_pad.w; + const int h = in_pad.h; + + int pred_row_offset = 0; + for (size_t i = 0; i < strides.size(); i++) + { + const int stride = strides[i]; + + const int num_grid_x = w / stride; + const int num_grid_y = h / stride; + const int num_grid = num_grid_x * num_grid_y; + + std::vector objects_stride; + generate_proposals(pred.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects_stride); + + for (size_t j = 0; j < objects_stride.size(); j++) + { + Object obj = objects_stride[j]; + obj.gindex += pred_row_offset; + objects.push_back(obj); + } + + pred_row_offset += num_grid; + } +} + +static int detect_yolov8_seg(const cv::Mat& bgr, std::vector& objects) +{ + ncnn::Net yolov8; + + yolov8.opt.use_vulkan_compute = true; + // yolov8.opt.use_bf16_storage = true; + + // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets + yolov8.load_param("yolov8n_seg_pnnx.py.ncnn.param"); + yolov8.load_model("yolov8n_seg_pnnx.py.ncnn.bin"); + // yolov8.load_param("yolov8s_seg_pnnx.py.ncnn.param"); + // yolov8.load_model("yolov8s_seg_pnnx.py.ncnn.bin"); + // yolov8.load_param("yolov8m_seg_pnnx.py.ncnn.param"); + // yolov8.load_model("yolov8m_seg_pnnx.py.ncnn.bin"); + + const int target_size = 640; + const float prob_threshold = 0.25f; + const float nms_threshold = 0.45f; + const float mask_threshold = 0.5f; + + int img_w = bgr.cols; + int img_h = bgr.rows; + + // ultralytics/cfg/models/v8/yolov8.yaml + std::vector strides(3); + strides[0] = 8; + strides[1] = 16; + strides[2] = 32; + const int max_stride = 32; + + // letterbox pad to multiple of max_stride + int w = img_w; + int h = img_h; + float scale = 1.f; + if (w > h) + { + scale = (float)target_size / w; + w = 
target_size; + h = h * scale; + } + else + { + scale = (float)target_size / h; + h = target_size; + w = w * scale; + } + + ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); + + // letterbox pad to target_size rectangle + int wpad = (w + max_stride - 1) / max_stride * max_stride - w; + int hpad = (h + max_stride - 1) / max_stride * max_stride - h; + ncnn::Mat in_pad; + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); + + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; + in_pad.substract_mean_normalize(0, norm_vals); + + ncnn::Extractor ex = yolov8.create_extractor(); + + ex.input("in0", in_pad); + + ncnn::Mat out; + ex.extract("out0", out); + + std::vector proposals; + generate_proposals(out, strides, in_pad, prob_threshold, proposals); + + // sort all proposals by score from highest to lowest + qsort_descent_inplace(proposals); + + // apply nms with nms_threshold + std::vector picked; + nms_sorted_bboxes(proposals, picked, nms_threshold); + + int count = picked.size(); + if (count == 0) + return 0; + + ncnn::Mat mask_feat; + ex.extract("out1", mask_feat); + + ncnn::Mat mask_protos; + ex.extract("out2", mask_protos); + + ncnn::Mat objects_mask_feat(mask_feat.w, 1, count); + + objects.resize(count); + for (int i = 0; i < count; i++) + { + objects[i] = proposals[picked[i]]; + + // adjust offset to original unpadded + float x0 = (objects[i].rect.x - (wpad / 2)) / scale; + float y0 = (objects[i].rect.y - (hpad / 2)) / scale; + float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; + float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; + + // clip + x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); + y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); + x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); + y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); + + objects[i].rect.x = 
x0; + objects[i].rect.y = y0; + objects[i].rect.width = x1 - x0; + objects[i].rect.height = y1 - y0; + + // pick mask feat + memcpy(objects_mask_feat.channel(i), mask_feat.row(objects[i].gindex), mask_feat.w * sizeof(float)); + } + + // process mask + ncnn::Mat objects_mask; + { + ncnn::Layer* gemm = ncnn::create_layer("Gemm"); + + ncnn::ParamDict pd; + pd.set(6, 1); // constantC + pd.set(7, count); // constantM + pd.set(8, mask_protos.w * mask_protos.h); // constantN + pd.set(9, mask_feat.w); // constantK + pd.set(10, -1); // constant_broadcast_type_C + pd.set(11, 1); // output_N1M + gemm->load_param(pd); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + + gemm->create_pipeline(opt); + + std::vector gemm_inputs(2); + gemm_inputs[0] = objects_mask_feat; + gemm_inputs[1] = mask_protos.reshape(mask_protos.w * mask_protos.h, 1, mask_protos.c); + std::vector gemm_outputs(1); + gemm->forward(gemm_inputs, gemm_outputs, opt); + objects_mask = gemm_outputs[0].reshape(mask_protos.w, mask_protos.h, count); + + gemm->destroy_pipeline(opt); + + delete gemm; + } + { + ncnn::Layer* sigmoid = ncnn::create_layer("Sigmoid"); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + + sigmoid->create_pipeline(opt); + + sigmoid->forward_inplace(objects_mask, opt); + + sigmoid->destroy_pipeline(opt); + + delete sigmoid; + } + + // resize mask map + { + ncnn::Mat objects_mask_resized; + ncnn::resize_bilinear(objects_mask, objects_mask_resized, in_pad.w / scale, in_pad.h / scale); + objects_mask = objects_mask_resized; + } + + // create per-object mask + for (int i = 0; i < count; i++) + { + Object& obj = objects[i]; + + const ncnn::Mat mm = objects_mask.channel(i); + + obj.mask = cv::Mat((int)obj.rect.height, (int)obj.rect.width, CV_8UC1); + + // adjust offset to original unpadded and clip inside object box + for (int y = 0; y < (int)obj.rect.height; y++) + { + const float* pmm = mm.row((int)(hpad / 2 / scale + obj.rect.y + y)) + 
(int)(wpad / 2 / scale + obj.rect.x); + uchar* pmask = obj.mask.ptr(y); + for (int x = 0; x < (int)obj.rect.width; x++) + { + pmask[x] = pmm[x] > mask_threshold ? 1 : 0; + } + } + } + + return 0; +} + +static void draw_objects(const cv::Mat& bgr, const std::vector& objects) +{ + static const char* class_names[] = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush" + }; + + static cv::Scalar colors[] = { + cv::Scalar(244, 67, 54), + cv::Scalar(233, 30, 99), + cv::Scalar(156, 39, 176), + cv::Scalar(103, 58, 183), + cv::Scalar(63, 81, 181), + cv::Scalar(33, 150, 243), + cv::Scalar(3, 169, 244), + cv::Scalar(0, 188, 212), + cv::Scalar(0, 150, 136), + cv::Scalar(76, 175, 80), + cv::Scalar(139, 195, 74), + cv::Scalar(205, 220, 57), + cv::Scalar(255, 235, 59), + cv::Scalar(255, 193, 7), + cv::Scalar(255, 152, 0), + cv::Scalar(255, 87, 34), + cv::Scalar(121, 85, 72), + cv::Scalar(158, 158, 158), + cv::Scalar(96, 125, 139) + }; + + cv::Mat image = bgr.clone(); + + for (size_t i = 0; i < objects.size(); i++) + { + const Object& obj = objects[i]; + + const cv::Scalar& color = colors[i % 19]; + + fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, 
obj.prob, + obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); + + for (int y = 0; y < (int)obj.rect.height; y++) + { + const uchar* maskptr = obj.mask.ptr(y); + uchar* bgrptr = image.ptr((int)obj.rect.y + y) + (int)obj.rect.x * 3; + for (int x = 0; x < (int)obj.rect.width; x++) + { + if (maskptr[x]) + { + bgrptr[0] = bgrptr[0] * 0.5 + color[0] * 0.5; + bgrptr[1] = bgrptr[1] * 0.5 + color[1] * 0.5; + bgrptr[2] = bgrptr[2] * 0.5 + color[2] * 0.5; + } + bgrptr += 3; + } + } + + cv::rectangle(image, obj.rect, color); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); + + int baseLine = 0; + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = obj.rect.x; + int y = obj.rect.y - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 255), -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + } + + cv::imshow("image", image); + cv::waitKey(0); +} + +int main(int argc, char** argv) +{ + if (argc != 2) + { + fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); + return -1; + } + + const char* imagepath = argv[1]; + + cv::Mat m = cv::imread(imagepath, 1); + if (m.empty()) + { + fprintf(stderr, "cv::imread %s failed\n", imagepath); + return -1; + } + + std::vector objects; + detect_yolov8_seg(m, objects); + + draw_objects(m, objects); + + return 0; +}