Visual Servoing Platform version 3.7.0
Loading...
Searching...
No Matches
vpDetectorDNNOpenCV.cpp
1/*
2 * ViSP, open source Visual Servoing Platform software.
3 * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
4 *
5 * This software is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 * See the file LICENSE.txt at the root directory of this source
10 * distribution for additional information about the GNU GPL.
11 *
12 * For using ViSP with software that can not be combined with the GNU
13 * GPL, please contact Inria about acquiring a ViSP Professional
14 * Edition License.
15 *
16 * See https://visp.inria.fr for more information.
17 *
18 * This software was developed at:
19 * Inria Rennes - Bretagne Atlantique
20 * Campus Universitaire de Beaulieu
21 * 35042 Rennes Cedex
22 * France
23 *
24 * If you have questions regarding the use of this file, please contact
25 * Inria at visp@inria.fr
26 *
27 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
28 * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
29 *
30 * Description:
31 * DNN object detection using OpenCV DNN module.
32 */
33
34#include <visp3/core/vpConfig.h>
35
// Check if std=c++17 or higher
37#if defined(VISP_HAVE_OPENCV) && (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
38 ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
39
40#include <visp3/core/vpImageConvert.h>
41#include <visp3/detection/vpDetectorDNNOpenCV.h>
42#include <visp3/core/vpIoTools.h>
43
44#include<algorithm>
45
{
  // Build a bracketed, comma-separated list of every parsing-method name known by the class.
  std::string list = "[";
  // Iterate over all parsing types except the last one (COUNT is the sentinel value).
  // NOTE(review): the loop body appending each name, and the final append of the
  // last name plus the closing "]", are not visible in this extract -- confirm
  // against the original source before relying on this listing.
  for (unsigned int i = 0; i < vpDetectorDNNOpenCV::COUNT - 1; i++) {
  }
  return list;
}
61
72{
73 std::string name;
74 switch (type) {
75 case YOLO_V3:
76 name = "yolov3";
77 break;
78 case YOLO_V4:
79 name = "yolov4";
80 break;
81 case YOLO_V5:
82 name = "yolov5";
83 break;
84 case YOLO_V7:
85 name = "yolov7";
86 break;
87 case YOLO_V8:
88 name = "yolov8";
89 break;
90 case YOLO_V11:
91 name = "yolov11";
92 break;
93 case YOLO_V12:
94 name = "yolov12";
95 break;
96 case FASTER_RCNN:
97 name = "faster-rcnn";
98 break;
99 case SSD_MOBILENET:
100 name = "ssd-mobilenet";
101 break;
102 case RESNET_10:
103 name = "resnet-10";
104 break;
105 case USER_SPECIFIED:
106 name = "user-specified";
107 break;
108 case COUNT:
109 name = "unknown";
110 break;
111 }
112 return name;
113}
114
{
  // NOTE(review): the declarations of `res` (the value returned when no name
  // matches) and of the loop-local enum value `temp` are not visible in this
  // extract -- confirm them against the original source.
  bool hasFoundMatch = false;
  // Comparison is case-insensitive: the input is lower-cased before matching
  // against the lower-case names produced by dnnResultsParsingTypeToString().
  std::string name_lowercase = vpIoTools::toLowerCase(name);
  for (int id = 0; id < COUNT && !hasFoundMatch; id++) {
    if (dnnResultsParsingTypeToString(temp) == name_lowercase) {
      res = temp;
      hasFoundMatch = true;
    }
  }
  return res;
}
137
148std::vector<std::string> vpDetectorDNNOpenCV::parseClassNamesFile(const std::string &filename)
149{
150 return NetConfig::parseClassNamesFile(filename);
151}
152
160
/*!
 * Construct a detector from an already-filled network configuration.
 * NOTE(review): part of the constructor initialization list is not visible in
 * this extract (the leading members before m_net) -- confirm against the
 * original source.
 */
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV(const NetConfig &config, const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
  m_net(), m_netConfig(config), m_outNames(), m_dnnRes()
{
  // Propagate the ratio so the size-filter enable flag is derived consistently.
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(typeParsingMethod, parsingMethod);
  // Only load the network when a model file has actually been provided.
  if (!m_netConfig.m_modelFilename.empty()) {
    readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
  }
}
178
179#ifdef VISP_HAVE_NLOHMANN_JSON
180
181using json = nlohmann::json;
182
/*!
 * Construct a detector from a JSON configuration file.
 * NOTE(review): the constructor initialization list is not visible in this
 * extract -- confirm member initialization against the original source.
 */
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV(const std::string &jsonPath, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
{
  // Fill m_netConfig (and load the network) from the JSON file.
  initFromJSON(jsonPath);
  // Propagate the ratio so the size-filter enable flag is derived consistently.
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(m_netConfig.m_parsingMethodType, parsingMethod);
}
197
203void vpDetectorDNNOpenCV::initFromJSON(const std::string &jsonPath)
204{
205 std::ifstream file(jsonPath);
206 if (!file.good()) {
207 std::stringstream ss;
208 ss << "Problem opening file " << jsonPath << ". Make sure it exists and is readable" << std::endl;
209 throw vpException(vpException::ioError, ss.str());
210 }
211 json j;
212 try {
213 j = json::parse(file);
214 }
215 catch (json::parse_error &e) {
216 std::stringstream msg;
217 msg << "Could not parse JSON file : \n";
218
219 msg << e.what() << std::endl;
220 msg << "Byte position of error: " << e.byte;
221 throw vpException(vpException::ioError, msg.str());
222 }
223 *this = j; // Call from_json(const json& j, vpDetectorDNN& *this) to read json
224 file.close();
225 readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
226}
227
233void vpDetectorDNNOpenCV::saveConfigurationInJSON(const std::string &jsonPath) const
234{
235 std::ofstream file(jsonPath);
236 const json j = *this;
237 file << j.dump(4);
238 file.close();
239}
240#endif
241
246
256bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::vector<DetectedFeatures2D> &output)
257{
259
260 return detect(m_I_color, output);
261}
262
273bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
274{
276
277 return detect(m_I_color, output);
278}
279
289bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
290{
292
293 return detect(m_I_color, output);
294}
295
305bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::vector<DetectedFeatures2D> &output)
306{
308
309 return detect(m_img, output);
310}
311
321bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
322{
324
325 return detect(m_img, output);
326}
327
335bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
336{
338
339 return detect(m_img, output);
340}
341
349bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::vector<DetectedFeatures2D> &output)
350{
351 m_img = I;
352 output.clear();
353
354 cv::Size inputSize(m_netConfig.m_inputSize.width > 0 ? m_netConfig.m_inputSize.width : m_img.cols,
355 m_netConfig.m_inputSize.height > 0 ? m_netConfig.m_inputSize.height : m_img.rows);
356 cv::dnn::blobFromImage(m_img, m_blob, m_netConfig.m_scaleFactor, inputSize, m_netConfig.m_mean, m_netConfig.m_swapRB, false);
357
358 m_net.setInput(m_blob);
359 try {
360 m_net.forward(m_dnnRes, m_outNames);
361 }
362 catch (const cv::Exception &e) {
363 std::cerr << "Caught an exception trying to run inference:" << std::endl << "\t"
364 << e.what()
365 << "\nCuda and/or GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
366 m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
367 m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
368 m_net.forward(m_dnnRes, m_outNames);
369 }
370
371 DetectionCandidates proposals;
372 postProcess(proposals);
373 size_t nbClassNames = m_netConfig.m_classNames.size();
374 for (size_t i = 0; i < m_indices.size(); ++i) {
375 int idx = m_indices[i];
376 cv::Rect box = proposals.m_boxes[idx];
377 std::optional<std::string> classname_opt;
378 if (nbClassNames > 0) {
379 classname_opt = m_netConfig.m_classNames[proposals.m_classIds[idx]];
380 }
381 output.emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
382 , proposals.m_classIds[idx], proposals.m_confidences[idx]
383 , classname_opt
384 );
385 }
386
388 // removing false detections, based on the bbox sizes
389 output = filterDetectionMultiClassInput(output, m_netConfig.m_filterSizeRatio);
390 }
391
392 return !output.empty();
393}
394
402bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
403{
404 m_img = I;
405 output.clear();
406
407 cv::Size inputSize(m_netConfig.m_inputSize.width > 0 ? m_netConfig.m_inputSize.width : m_img.cols,
408 m_netConfig.m_inputSize.height > 0 ? m_netConfig.m_inputSize.height : m_img.rows);
409 cv::dnn::blobFromImage(m_img, m_blob, m_netConfig.m_scaleFactor, inputSize, m_netConfig.m_mean, m_netConfig.m_swapRB, false);
410
411 m_net.setInput(m_blob);
412 try {
413 m_net.forward(m_dnnRes, m_outNames);
414 }
415 catch (const cv::Exception &e) {
416 std::cerr << "Caught an exception trying to run inference:" << std::endl << "\t"
417 << e.what()
418 << "\nCuda and/or GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
419 m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
420 m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
421 m_net.forward(m_dnnRes, m_outNames);
422 }
423
424 DetectionCandidates proposals;
425 postProcess(proposals);
426 size_t nbClassNames = m_netConfig.m_classNames.size();
427 for (size_t i = 0; i < m_indices.size(); ++i) {
428 int idx = m_indices[i];
429 cv::Rect box = proposals.m_boxes[idx];
430 std::string classname;
431 if (nbClassNames > 0) {
432 classname = m_netConfig.m_classNames[proposals.m_classIds[idx]];
433 }
434 else {
435 classname = std::to_string(proposals.m_classIds[idx]);
436 }
437 std::optional<std::string> classname_opt = std::optional<std::string>(classname);
438 output[classname].emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
439 , proposals.m_classIds[idx], proposals.m_confidences[idx]
440 , classname_opt
441 );
442 }
443
445 output = filterDetectionMultiClassInput(output, m_netConfig.m_filterSizeRatio);
446 }
447
448 return !output.empty();
449}
450
458bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
459{
460 std::map< std::string, std::vector<DetectedFeatures2D>> map_output;
461 bool returnStatus = detect(I, map_output);
462 for (auto key_val : map_output) {
463 output.push_back(key_val);
464 }
465 return returnStatus;
466}
467
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
/*!
 * Get the names of the output layers of the DNN (fallback for OpenCV 3.4.3,
 * which has no cv::dnn::Net::getUnconnectedOutLayersNames()).
 *
 * \return The list of unconnected-output layer names.
 */
std::vector<cv::String> vpDetectorDNNOpenCV::getOutputsNames()
{
  // NOTE(review): the cache is a function-local static, computed once per
  // process; if readNet() is called again with a different model the cached
  // names are NOT refreshed -- confirm this is the intended behavior.
  static std::vector<cv::String> names;
  if (names.empty()) {
    std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
    std::vector<cv::String> layersNames = m_net.getLayerNames();
    names.resize(outLayers.size());
    for (size_t i = 0; i < outLayers.size(); ++i)
      names[i] = layersNames[outLayers[i] - 1]; // getUnconnectedOutLayers() ids are 1-based
  }
  return names;
}
#endif
487
497{
498 switch (m_netConfig.m_parsingMethodType) {
499 case YOLO_V3:
500 case YOLO_V4:
502 break;
503 case YOLO_V5:
504 case YOLO_V7:
506 break;
507 case YOLO_V8:
508 case YOLO_V11:
509 case YOLO_V12:
511 break;
512 case FASTER_RCNN:
514 break;
515 case SSD_MOBILENET:
516#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
517 void postProcess_SSD_MobileNet(DetectionCandidates & proposals, std::vector<cv::Mat> &dnnRes, const NetConfig & netConfig);
518#else
519 // NB: the two SSD-MobileNet DNNs that have been tested worked only
520 // using the ResNet-10 parsing method
522#endif
523 break;
524 case RESNET_10:
526 break;
527 case USER_SPECIFIED:
529 break;
530 default:
531 throw(vpException(vpException::badValue, "Type of DNN post-processing method not handled."));
532 }
533
534 m_indices.clear();
535 cv::dnn::NMSBoxes(proposals.m_boxes, proposals.m_confidences, m_netConfig.m_confThreshold, m_netConfig.m_nmsThreshold, m_indices);
536}
537
549std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
550vpDetectorDNNOpenCV::filterDetectionSingleClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk)
551{
552 double meanArea(0.);
553 double originalNumberOfObj = static_cast<double>(detected_features.size());
554 double meanFactor = 1. / originalNumberOfObj;
555
556 // Computing the average area of the class
557 for (DetectedFeatures2D feature : detected_features) {
558 meanArea += feature.m_bbox.getArea();
559 }
560 meanArea *= meanFactor;
561
562 // Keeping only the detections that respect the area criterion
563 std::vector<DetectedFeatures2D> filtered_features;
564 for (DetectedFeatures2D feature : detected_features) {
565 if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
566 filtered_features.push_back(feature);
567 }
568 }
569
570 return filtered_features;
571}
572
585std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
586vpDetectorDNNOpenCV::filterDetectionMultiClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk)
587{
588#ifndef DOXYGEN_SHOULD_SKIP_THIS
593 class MeanAreaComputer
594 {
595 private:
596 std::map<int, std::pair<int, double>> m_map_id_pairOccurrencesAreas;
598
599 std::map<int, double> m_mapMeans;
606 double computeMeanArea(const int &class_id)
607 {
608 return m_map_id_pairOccurrencesAreas[class_id].second / static_cast<double>(m_map_id_pairOccurrencesAreas[class_id].first);
609 }
610
611 public:
615 void computeMeans()
616 {
617 for (const auto &classID_pair : m_map_id_pairOccurrencesAreas) {
618 m_mapMeans[classID_pair.first] = computeMeanArea(classID_pair.first);
619 }
620 }
621
622 double getMean(const int &class_id)
623 {
624 if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
625 throw(vpException(vpException::badValue, "[MeanAreaComputer::getMean] Asking for class_id \"" + std::to_string(class_id) + "\" that is not present in m_mapMeans. Did you call computeMeans ?"));
626 }
627 return m_mapMeans[class_id];
628 }
629
635 void operator()(const DetectedFeatures2D &feature)
636 {
637 int class_id = feature.getClassId();
638 double area = feature.getBoundingBox().getArea();
639 if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
640 m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(1, area);
641 }
642 else {
643 std::pair<int, double> prev_state = m_map_id_pairOccurrencesAreas[class_id];
644 m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(prev_state.first + 1, prev_state.second + area);
645 }
646 }
647 };
648#endif // DOXYGEN_SHOULD_SKIP_THIS
649
650 // Computing the average area of each class
651 MeanAreaComputer meanComputer;
652 std::for_each(detected_features.begin(), detected_features.end(), meanComputer);
653 meanComputer.computeMeans();
654
655 // Keeping only the detections that respect the area criterion
656 std::vector<DetectedFeatures2D> filtered_features;
657 for (DetectedFeatures2D feature : detected_features) {
658 double meanArea = meanComputer.getMean(feature.getClassId());
659 if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea
660 && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
661 filtered_features.push_back(feature);
662 }
663 }
664
665 return filtered_features;
666}
667
677std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>>
678vpDetectorDNNOpenCV::filterDetectionMultiClassInput(const std::map< std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> &detected_features, const double minRatioOfAreaOk)
679{
680 std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> output;
681 for (auto keyval : detected_features) {
682 output[keyval.first] = filterDetectionSingleClassInput(detected_features.at(keyval.first), minRatioOfAreaOk); // removing false detections
683 }
684 return output;
685}
686
/*!
 * Decode YoloV3 / YoloV4 raw outputs into detection candidates.
 * Each output row is [cx, cy, w, h, box_score, class_score_0, ..., class_score_N],
 * with box coordinates normalized to [0; 1] relative to the input image.
 *
 * \param proposals Candidate detections (boxes, confidences, class ids), filled by this method.
 * \param dnnRes Raw outputs of the network, one cv::Mat per output layer.
 * \param netConfig Configuration providing the confidence threshold.
 */
void vpDetectorDNNOpenCV::postProcess_YoloV3_V4(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Slightly modify from here: https://github.com/opencv/opencv/blob/8c25a8eb7b10fb50cda323ee6bec68aa1a9ce43c/samples/dnn/object_detection.cpp#L192-L221
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[0]; // Number of detections
    int nout = dnnRes[i].size[1]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      // 3D output (e.g. [1 x N x nout]): flatten into a 2D [N x nout] matrix.
      num_proposal = dnnRes[i].size[1];
      nout = dnnRes[i].size[2];
      dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
    }

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data; // walks the output row by row (advanced by nout each iteration)

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      float box_score = pdata[4]; // objectness score of the box
      if (box_score > netConfig.m_confThreshold) {
        cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout); // per-class scores of this row
        cv::Point classIdPoint;
        double max_class_score;
        // Get the value and location of the maximum score
        cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);

        // Final confidence = objectness * best class score.
        max_class_score *= box_score;

        // The detection is kept only if the confidence is greater than the threshold
        if (max_class_score > netConfig.m_confThreshold) {
          const int class_idx = classIdPoint.x;
          // Coordinates are normalized: scale them back to image pixels.
          float cx = pdata[0] * m_img.cols;
          float cy = pdata[1] * m_img.rows;
          float w = pdata[2] * m_img.cols;
          float h = pdata[3] * m_img.rows;

          // Convert (center, size) into the top-left corner expected by cv::Rect.
          int left = int(cx - 0.5 * w);
          int top = int(cy - 0.5 * h);

          proposals.m_confidences.push_back(static_cast<float>(max_class_score));
          proposals.m_boxes.push_back(cv::Rect(left, top, static_cast<int>(w), static_cast<int>(h)));
          proposals.m_classIds.push_back(class_idx);
        }
      }
      row_ind++;
      pdata += nout;
    }
  }
}
751
763void vpDetectorDNNOpenCV::postProcess_YoloV5_V7(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
764{
765 // Compute the ratio between the original size of the image and the network size to translate network coordinates into
766 // image coordinates
767 float ratioh = static_cast<float>(m_img.rows) / netConfig.m_inputSize.height, ratiow = static_cast<float>(m_img.cols) / netConfig.m_inputSize.width;
768 size_t nbBatches = dnnRes.size();
769
770 for (size_t i = 0; i < nbBatches; i++) {
771 // Counts the number of proposed detections and the number of data corresponding to 1 detection
772 int num_proposal = dnnRes[i].size[0]; // Number of detections
773 int nout = dnnRes[i].size[1]; // Number of data for each detection
774 if (dnnRes[i].dims > 2) {
775 num_proposal = dnnRes[i].size[1];
776 nout = dnnRes[i].size[2];
777 dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
778 }
779
780 int n = 0, row_ind = 0;
781 float *pdata = (float *)dnnRes[i].data;
782
783 // Iterate on the detections to keep only the meaningful ones
784 for (n = 0; n < num_proposal; n++) {
785 float box_score = pdata[4];
786
787 if (box_score > netConfig.m_confThreshold) {
788 cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
789 cv::Point classIdPoint;
790 double max_class_score;
791 // Get the value and location of the maximum score
792 cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);
793 max_class_score *= box_score;
794
795 // The detection is kept only if the confidence is greater than the threshold
796 if (max_class_score > netConfig.m_confThreshold) {
797 const int class_idx = classIdPoint.x;
798 float cx = pdata[0] * ratiow;
799 float cy = pdata[1] * ratioh;
800 float w = pdata[2] * ratiow;
801 float h = pdata[3] * ratioh;
802
803 int left = int(cx - 0.5 * w);
804 int top = int(cy - 0.5 * h);
805
806 proposals.m_confidences.push_back(static_cast<float>(max_class_score));
807 proposals.m_boxes.push_back(cv::Rect(left, top, static_cast<int>(w), static_cast<int>(h)));
808 proposals.m_classIds.push_back(class_idx);
809 }
810 }
811 row_ind++;
812 pdata += nout;
813 }
814 }
815}
816
/*!
 * Decode YoloV8 / YoloV11 / YoloV12 raw outputs into detection candidates.
 * Contrary to YoloV5/V7, the outputs are stored column-wise ([nout x N]) and
 * there is no separate objectness score: the confidence is the best class score.
 *
 * \param proposals Candidate detections (boxes, confidences, class ids), filled by this method.
 * \param dnnRes Raw outputs of the network, one cv::Mat per output layer.
 * \param netConfig Configuration providing the input size and confidence threshold.
 */
void vpDetectorDNNOpenCV::postProcess_YoloV8_V11_V12(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Code adapted from here: https://github.com/JustasBart/yolov8_CPP_Inference_OpenCV_ONNX/blob/minimalistic/inference.cpp
  // Compute the ratio between the original size of the image and the network size to translate network coordinates into
  // image coordinates
  float ratioh = static_cast<float>(m_img.rows) / netConfig.m_inputSize.height, ratiow = static_cast<float>(m_img.cols) / netConfig.m_inputSize.width;
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[1]; // Number of detections
    int nout = dnnRes[i].size[0]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      // 3D output (e.g. [1 x nout x N]): flatten into a 2D [nout x N] matrix.
      num_proposal = dnnRes[i].size[2];
      nout = dnnRes[i].size[1];
      dnnRes[i] = dnnRes[i].reshape(0, nout);
    }
    cv::transpose(dnnRes[i], dnnRes[i]); // Organise data as YoloV5 i.e. [batchsize][1:num_proposals][1:4+nb_classes]

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data; // walks the (transposed) output row by row

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      cv::Mat scores = dnnRes[i].row(row_ind).colRange(4, nout); // per-class scores (no objectness column)
      cv::Point classIdPoint;
      double max_class_score;
      // Get the value and location of the maximum score
      cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);

      // The detection is kept only if the confidence is greater than the threshold
      if (max_class_score > netConfig.m_confThreshold) {
        const int class_idx = classIdPoint.x;
        // Network-input coordinates -> image coordinates.
        float cx = pdata[0] * ratiow;
        float cy = pdata[1] * ratioh;
        float w = pdata[2] * ratiow;
        float h = pdata[3] * ratioh;

        // Convert (center, size) into the top-left corner expected by cv::Rect.
        int left = int(cx - 0.5 * w);
        int top = int(cy - 0.5 * h);

        proposals.m_confidences.push_back(static_cast<float>(max_class_score));
        proposals.m_boxes.push_back(cv::Rect(left, top, static_cast<int>(w), static_cast<int>(h)));
        proposals.m_classIds.push_back(class_idx);
      }

      row_ind++;
      pdata += nout;
    }
  }
}
879
891void vpDetectorDNNOpenCV::postProcess_FasterRCNN(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
892{
893 // Direct copy from object_detection.cpp OpenCV sample
894 // Faster-RCNN
895
896 // Network produces output blob with a shape 1x1xNx7 where N is a number of
897 // detections and an every detection is a vector of values
898 // [batchId, classId, confidence, left, top, right, bottom]
899 size_t nbBatches = dnnRes.size();
900 for (size_t j = 0; j < nbBatches; j++) {
901 float *data = (float *)dnnRes[j].data;
902 for (size_t i = 0; i < dnnRes[j].total(); i += 7) {
903 float confidence = data[i + 2];
904 if (confidence > netConfig.m_confThreshold) {
905 int left = static_cast<int>(data[i + 3] * m_img.cols);
906 int top = static_cast<int>(data[i + 4] * m_img.rows);
907 int right = static_cast<int>(data[i + 5] * m_img.cols);
908 int bottom = static_cast<int>(data[i + 6] * m_img.rows);
909 int classId = static_cast<int>(data[i + 1]);
910
911 proposals.m_confidences.push_back(static_cast<float>(confidence));
912 proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
913 proposals.m_classIds.push_back(classId);
914 }
915 }
916 }
917
918}
919
#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
/*!
 * Decode SSD-MobileNet raw outputs into detection candidates.
 * The network produces two output blobs: `scores` (1xNxC) and `boxes` (1xNx4),
 * where N is the number of detections and C the number of classes, class id 0
 * being the BACKGROUND class.
 *
 * \param proposals Candidate detections (boxes, confidences, class ids), filled by this method.
 * \param dnnRes Raw outputs of the network.
 * \param netConfig Configuration providing the confidence threshold.
 */
void vpDetectorDNNOpenCV::postProcess_SSD_MobileNet(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Identify which output blob holds the scores and which holds the boxes.
  const int scoresBlob = m_outNames[0] == "scores" ? 0 : 1;
  const int boxesBlob = m_outNames[0] == "boxes" ? 0 : 1;

  const int nbDetections = dnnRes[scoresBlob].size[1];
  const int nbClasses = dnnRes[scoresBlob].size[2];

  const float *scores = (float *)dnnRes[scoresBlob].data;
  const float *boxes = (float *)dnnRes[boxesBlob].data;

  // Loop over every guess produced by the network.
  for (int det = 0; det < nbDetections; det++) {
    // Search the best-scoring non-background class of this detection.
    uint32_t bestClass = 0;
    float bestScore = -1000.0f;

    for (int cls = 1; cls < nbClasses; cls++) // ignore background (classId = 0)
    {
      const float score = scores[det * nbClasses + cls];
      if (score >= netConfig.m_confThreshold && score > bestScore) {
        bestScore = score;
        bestClass = cls;
      }
    }

    if (bestScore > netConfig.m_confThreshold) {
      // Normalized box -> pixel coordinates in the input image.
      const int left = static_cast<int>(boxes[4 * det] * m_img.cols);
      const int top = static_cast<int>(boxes[4 * det + 1] * m_img.rows);
      const int right = static_cast<int>(boxes[4 * det + 2] * m_img.cols);
      const int bottom = static_cast<int>(boxes[4 * det + 3] * m_img.rows);

      proposals.m_confidences.push_back(bestScore);
      proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
      proposals.m_classIds.push_back(static_cast<int>(bestClass));
    }
  }
}
#endif
982
994void vpDetectorDNNOpenCV::postProcess_ResNet_10(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
995{
996 // Direct copy from object_detection.cpp OpenCV sample
997
998 // Network produces output blob with a shape 1x1xNx7 where N is a number of
999 // detections and an every detection is a vector of values
1000 // [batchId, classId, confidence, left, top, right, bottom]
1001 CV_Assert(dnnRes.size() == 1);
1002 float *data = (float *)dnnRes[0].data;
1003 for (size_t i = 0; i < dnnRes[0].total(); i += 7) {
1004 float confidence = data[i + 2];
1005 if (confidence > netConfig.m_confThreshold) {
1006 int left = static_cast<int>(data[i + 3] * m_img.cols);
1007 int top = static_cast<int>(data[i + 4] * m_img.rows);
1008 int right = static_cast<int>(data[i + 5] * m_img.cols);
1009 int bottom = static_cast<int>(data[i + 6] * m_img.rows);
1010 int classId = static_cast<int>(data[i + 1]) - 1;
1011
1012 proposals.m_confidences.push_back(static_cast<float>(confidence));
1013 proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
1014 proposals.m_classIds.push_back(classId);
1015 }
1016 }
1017}
1018
1027void vpDetectorDNNOpenCV::postProcess_unimplemented(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
1028{
1029 (void)proposals;
1030 (void)dnnRes;
1031 (void)netConfig;
1032 throw(vpException(vpException::functionNotImplementedError, "vpDetectorDNNOpenCV::postProcess was called with a USER_SPECIFIED DNN but not post processing method was given."));
1033}
1034
1054void vpDetectorDNNOpenCV::readNet(const std::string &model, const std::string &config, const std::string &framework)
1055{
1056 m_netConfig.m_modelFilename = model;
1057 m_netConfig.m_modelConfigFilename = config;
1058 m_netConfig.m_framework = framework;
1059 m_net = cv::dnn::readNet(model, config, framework);
1060#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
1062#else
1063 m_outNames = m_net.getUnconnectedOutLayersNames();
1064#endif
1065}
1066
1074{
1075 m_netConfig = config;
1076 setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
1077 setParsingMethod(m_netConfig.m_parsingMethodType);
1078 if (!m_netConfig.m_modelFilename.empty()) {
1079 readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
1080 }
1081}
1082
1088void vpDetectorDNNOpenCV::setConfidenceThreshold(const float &confThreshold) { m_netConfig.m_confThreshold = confThreshold; }
1089
1096void vpDetectorDNNOpenCV::setNMSThreshold(const float &nmsThreshold) { m_netConfig.m_nmsThreshold = nmsThreshold; }
1097
1106{
1107 m_netConfig.m_filterSizeRatio = sizeRatio;
1108 if (m_netConfig.m_filterSizeRatio > std::numeric_limits<double>::epsilon()) {
1110 }
1111 else {
1113 }
1114}
1115
1122void vpDetectorDNNOpenCV::setInputSize(const int &width, const int &height)
1123{
1124 m_netConfig.m_inputSize.width = width;
1125 m_netConfig.m_inputSize.height = height;
1126}
1127
1135void vpDetectorDNNOpenCV::setMean(const double &meanR, const double &meanG, const double &meanB) { m_netConfig.m_mean = cv::Scalar(meanR, meanG, meanB); }
1136
1143void vpDetectorDNNOpenCV::setPreferableBackend(const int &backendId) { m_net.setPreferableBackend(backendId); }
1144
1151void vpDetectorDNNOpenCV::setPreferableTarget(const int &targetId) { m_net.setPreferableTarget(targetId); }
1152
1156void vpDetectorDNNOpenCV::setScaleFactor(const double &scaleFactor)
1157{
1158 m_netConfig.m_scaleFactor = scaleFactor;
1159 if ((m_netConfig.m_parsingMethodType == YOLO_V7 || m_netConfig.m_parsingMethodType == YOLO_V8 || m_netConfig.m_parsingMethodType == YOLO_V11 || m_netConfig.m_parsingMethodType == YOLO_V12) && m_netConfig.m_scaleFactor != 1 / 255.) {
1160 std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] WARNING: scale factor should be 1/255. to normalize pixels value." << std::endl;
1161 }
1162}
1163
1169void vpDetectorDNNOpenCV::setSwapRB(const bool &swapRB) { m_netConfig.m_swapRB = swapRB; }
1170
1178void vpDetectorDNNOpenCV::setParsingMethod(const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
1179{
1180 m_netConfig.m_parsingMethodType = typeParsingMethod;
1181 m_parsingMethod = parsingMethod;
1182 if ((m_netConfig.m_parsingMethodType == YOLO_V7 || m_netConfig.m_parsingMethodType == YOLO_V8 || m_netConfig.m_parsingMethodType == YOLO_V11 || m_netConfig.m_parsingMethodType == YOLO_V12) && m_netConfig.m_scaleFactor != 1 / 255.) {
1183 m_netConfig.m_scaleFactor = 1 / 255.;
1184 std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] NB: scale factor changed to 1/255. to normalize pixels value." << std::endl;
1185 }
1186
1187#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
1188 if (m_netConfig.m_parsingMethodType == SSD_MOBILENET) {
1189 std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] WARNING: The chosen type of network is " << dnnResultsParsingTypeToString(m_netConfig.m_parsingMethodType) << " VISP_BUILD_DEPRECATED_FUNCTIONS is set to true." << std::endl;
1190 std::cout << "\tThe parsing method that worked with the networks quoted in the ViSP documentation was postProcess_ResNet_10 instead of postProcess_SSD_MobileNet." << std::endl;
1191 std::cout << "\tIf the SSD-MobileNet network does not seem to work, please try to recompile ViSP setting VISP_BUILD_DEPRECATED_FUNCTIONS as false." << std::endl << std::flush;
1192 }
1193#endif
1194}
1195
1196END_VISP_NAMESPACE
1197#elif !defined(VISP_BUILD_SHARED_LIBS)
1198// Work around to avoid warning: libvisp_core.a(vpDetectorDNNOpenCV.cpp.o) has no symbols
1199void dummy_vpDetectorDNN() { }
1200#endif
Structure containing the bounding box, expressed in pixels, confidence and class information about an...
Structure containing some information required for the configuration of a vpDetectorDNNOpenCV object.
cv::Mat m_blob
Buffer for the blob in input net.
void postProcess_YoloV5_V7(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
void setScaleFactor(const double &scaleFactor)
void initFromJSON(const std::string &jsonPath)
Initialize detector from a json config file.
void readNet(const std::string &model, const std::string &config="", const std::string &framework="")
static void postProcess_unimplemented(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
void setDetectionFilterSizeRatio(const double &sizeRatio)
DNNResultsParsingType
Enumeration listing the types of DNN for which the vpDetectorDNNOpenCV furnishes the methods permitti...
static DNNResultsParsingType dnnResultsParsingTypeFromString(const std::string &name)
void postProcess_SSD_MobileNet(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
std::vector< cv::String > m_outNames
Names of layers with unconnected outputs.
void setMean(const double &meanR, const double &meanG, const double &meanB)
void setSwapRB(const bool &swapRB)
cv::Mat m_img
Buffer for the input image.
static std::vector< std::string > parseClassNamesFile(const std::string &filename)
Parse the designated file that contains the list of the classes the network can detect....
std::vector< int > m_indices
Indices for NMS.
void setParsingMethod(const DNNResultsParsingType &typeParsingMethod, void(*parsingMethod)(DetectionCandidates &, std::vector< cv::Mat > &, const NetConfig &)=postProcess_unimplemented)
NetConfig m_netConfig
Configuration of the DNN.
std::vector< cv::Mat > m_dnnRes
Contains all output blobs for each layer specified in m_outNames.
cv::dnn::Net m_net
DNN network.
bool m_applySizeFilterAfterNMS
If true, filter the detections removing the ones for which the bbox does not respect area(bbox) ∈ [me...
std::vector< cv::String > getOutputsNames()
Get the names of the output layers of the DNN.
void setNetConfig(const NetConfig &config)
void postProcess_YoloV3_V4(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
virtual bool detect(const vpImage< unsigned char > &I, std::vector< DetectedFeatures2D > &output)
Object detection using OpenCV DNN module.
void postProcess_ResNet_10(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
void setPreferableBackend(const int &backendId)
void setNMSThreshold(const float &nmsThreshold)
virtual ~vpDetectorDNNOpenCV()
Destroy the vpDetectorDNNOpenCV object.
void postProcess_FasterRCNN(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
std::vector< DetectedFeatures2D > filterDetectionMultiClassInput(const std::vector< DetectedFeatures2D > &detected_features, const double minRatioOfAreaOk)
Return a new vector, ordered by vpDetectorDNNOpenCV::DetectedFeatures2D::m_cls , where the area of ea...
void postProcess_YoloV8_V11_V12(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
void setPreferableTarget(const int &targetId)
void setInputSize(const int &width, const int &height)
static std::string dnnResultsParsingTypeToString(const DNNResultsParsingType &type)
void postProcess(DetectionCandidates &proposals)
static std::string getAvailableDnnResultsParsingTypes()
Get the list of the parsing methods / types of DNNs supported by the vpDetectorDNNOpenCV class.
void(* m_parsingMethod)(DetectionCandidates &, std::vector< cv::Mat > &, const NetConfig &)
Pointer towards the parsing method, used if m_parsingMethodType is equal to m_parsingMethodType::USER...
std::vector< DetectedFeatures2D > filterDetectionSingleClassInput(const std::vector< DetectedFeatures2D > &detected_features, const double minRatioOfAreaOk)
Return a new vector of detected features whose area is greater or equal to the average area x minRati...
void saveConfigurationInJSON(const std::string &jsonPath) const
Save the network configuration in a JSON file.
void setConfidenceThreshold(const float &confThreshold)
vpImage< vpRGBa > m_I_color
Buffer for gray to RGBa image conversion.
error that can be emitted by ViSP classes.
Definition vpException.h:60
@ ioError
I/O error.
Definition vpException.h:67
@ badValue
Used to indicate that a value is not in the allowed range.
Definition vpException.h:73
@ functionNotImplementedError
Function not implemented.
Definition vpException.h:66
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
Definition of the vpImage class member functions.
Definition vpImage.h:131
static std::string toLowerCase(const std::string &input)
Return a lower-case version of the string input . Numbers and special characters stay the same.
double getArea() const
Definition vpRect.h:91