JeVois  1.22
JeVois Smart Embedded Machine Vision Toolkit
PostProcessorDetect.C
Go to the documentation of this file.
1// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2//
3// JeVois Smart Embedded Machine Vision Toolkit - Copyright (C) 2021 by Laurent Itti, the University of Southern
4// California (USC), and iLab at USC. See http://iLab.usc.edu and http://jevois.org for information about this project.
5//
6// This file is part of the JeVois Smart Embedded Machine Vision Toolkit. This program is free software; you can
7// redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
8// Foundation, version 2. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
9// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
10// License for more details. You should have received a copy of the GNU General Public License along with this program;
11// if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
12//
13// Contact information: Laurent Itti - 3641 Watt Way, HNB-07A - Los Angeles, CA 90089-2520 - USA.
14// Tel: +1 213 740 3527 - itti@pollux.usc.edu - http://iLab.usc.edu - http://jevois.org
15// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
16/*! \file */
17
18#include <jevois/DNN/PostProcessorDetect.H>
19#include <jevois/DNN/PostProcessorDetectYOLO.H>
20#include <jevois/DNN/PreProcessor.H>
21#include <jevois/DNN/Utils.H>
22#include <jevois/Util/Utils.H>
23#include <jevois/Debug/Log.H>
24#include <jevois/Core/Engine.H>
25#include <jevois/Core/Module.H>
26#include <jevois/GPU/GUIhelper.H>
27
28#include <opencv2/dnn.hpp>
29#include <opencv2/imgproc/imgproc.hpp> // for findContours()
30
31// ####################################################################################################
32jevois::dnn::PostProcessorDetect::~PostProcessorDetect()
33{ }
34
35// ####################################################################################################
36void jevois::dnn::PostProcessorDetect::freeze(bool doit)
37{
38 classes::freeze(doit);
39 detecttype::freeze(doit);
40 if (itsYOLO) itsYOLO->freeze(doit);
41 if (detecttype::get() != postprocessor::DetectType::YOLOv8seg &&
42 detecttype::get() != postprocessor::DetectType::YOLOv8segt)
43 masksmooth::freeze(doit);
44}
45
46// ####################################################################################################
47void jevois::dnn::PostProcessorDetect::onParamChange(postprocessor::classes const &, std::string const & val)
48{
49 if (val.empty()) { itsLabels.clear(); return; }
50 itsLabels = jevois::dnn::readLabelsFile(jevois::absolutePath(JEVOIS_SHARE_PATH, val));
51}
52
53// ####################################################################################################
54void jevois::dnn::PostProcessorDetect::onParamChange(postprocessor::detecttype const &,
55 postprocessor::DetectType const & val)
56{
57 if (val == postprocessor::DetectType::RAWYOLO)
58 itsYOLO = addSubComponent<jevois::dnn::PostProcessorDetectYOLO>("yolo");
59 else
60 {
61 itsYOLO.reset();
62 removeSubComponent("yolo", false);
63 }
64}
65
66// ####################################################################################################
67void jevois::dnn::PostProcessorDetect::process(std::vector<cv::Mat> const & outs, jevois::dnn::PreProcessor * preproc)
68{
69 if (outs.empty()) LFATAL("No outputs received, we need at least one.");
70 cv::Mat const & out = outs[0]; cv::MatSize const & msiz = out.size;
71
72 float const confThreshold = cthresh::get() * 0.01F;
73 float const boxThreshold = dthresh::get() * 0.01F;
74 float const nmsThreshold = nms::get() * 0.01F;
75 bool const sigmo = sigmoid::get();
76 bool const clampbox = boxclamp::get();
77 int const fudge = classoffset::get();
78 bool const smoothmsk = masksmooth::get();
79 itsImageSize = preproc->imagesize();
80
81 // To draw boxes, we will need to:
82 // - scale from [0..1]x[0..1] to blobw x blobh
83 // - scale and center from blobw x blobh to input image w x h, provided by PreProcessor::b2i()
84 // - when using the GUI, we further scale and translate to OpenGL display coordinates using GUIhelper::i2d()
85 // Here we assume that the first blob sets the input size.
86 cv::Size const bsiz = preproc->blobsize(0);
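 // As a purely hypothetical example of that chain: with a 640x640 blob letterboxed from a 1024x576 camera
 // frame, a normalized box center of (0.5, 0.5) becomes (320, 320) in blob coordinates, and PreProcessor::b2i()
 // then maps it back to (512, 288) in image coordinates, undoing the letterboxing and scaling.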
87
89 // We keep separate parallel vectors here instead of creating a class to hold all of the data, because OpenCV's
90 // non-maximum suppression functions below require separate vectors of boxes, confidences, and class ids:
90 std::vector<int> classIds;
91 std::vector<float> confidences;
92 std::vector<cv::Rect> boxes;
93 std::vector<cv::Mat> mask_coeffs; // mask coefficients when doing instance segmentation
94 cv::Mat mask_proto; // The output containing the mask prototypes (usually the last one)
95 int mask_proto_h = 1; // number of rows in the mask prototypes tensor, will be updated
96
97 // Here we just scale the coords from [0..1]x[0..1] to blobw x blobh:
98 try
99 {
100 switch(detecttype::get())
101 {
102 // ----------------------------------------------------------------------------------------------------
103 case jevois::dnn::postprocessor::DetectType::FasterRCNN:
104 {
105 if (outs.size() != 1 || msiz.dims() != 4 || msiz[0] != 1 || msiz[1] != 1 || msiz[3] != 7)
106 LTHROW("Expected 1 output blob with shape 1x1xNx7 for N detections with values "
107 "[batchId, classId, confidence, left, top, right, bottom]");
108
109 float const * data = (float const *)out.data;
110 for (size_t i = 0; i < out.total(); i += 7)
111 {
112 float confidence = data[i + 2];
113 if (confidence > confThreshold)
114 {
115 int left = (int)data[i + 3];
116 int top = (int)data[i + 4];
117 int right = (int)data[i + 5];
118 int bottom = (int)data[i + 6];
119 int width = right - left + 1;
120 int height = bottom - top + 1;
121 classIds.push_back((int)(data[i + 1]) + fudge); // Skip 0th background class id.
122 boxes.push_back(cv::Rect(left, top, width, height));
123 confidences.push_back(confidence);
124 }
125 }
126 }
127 break;
128
129 // ----------------------------------------------------------------------------------------------------
130 case jevois::dnn::postprocessor::DetectType::SSD:
131 {
132 if (outs.size() != 1 || msiz.dims() != 4 || msiz[0] != 1 || msiz[1] != 1 || msiz[3] != 7)
133 LTHROW("Expected 1 output blob with shape 1x1xNx7 for N detections with values "
134 "[batchId, classId, confidence, left, top, right, bottom]");
135
136 float const * data = (float const *)out.data;
137 for (size_t i = 0; i < out.total(); i += 7)
138 {
139 float confidence = data[i + 2];
140 if (confidence > confThreshold)
141 {
142 int left = (int)(data[i + 3] * bsiz.width);
143 int top = (int)(data[i + 4] * bsiz.height);
144 int right = (int)(data[i + 5] * bsiz.width);
145 int bottom = (int)(data[i + 6] * bsiz.height);
146 int width = right - left + 1;
147 int height = bottom - top + 1;
148 classIds.push_back((int)(data[i + 1]) + fudge); // Skip 0th background class id.
149 boxes.push_back(cv::Rect(left, top, width, height));
150 confidences.push_back(confidence);
151 }
152 }
153 }
154 break;
155
156 // ----------------------------------------------------------------------------------------------------
157 case jevois::dnn::postprocessor::DetectType::TPUSSD:
158 {
159 if (outs.size() != 4)
160 LTHROW("Expected 4 output blobs with shapes 4xN for boxes, N for IDs, N for scores, and 1x1 for count");
161 cv::Mat const & bboxes = outs[0];
162 cv::Mat const & ids = outs[1];
163 cv::Mat const & scores = outs[2];
164 cv::Mat const & count = outs[3];
165 if (bboxes.total() != 4 * ids.total() || bboxes.total() != 4 * scores.total() || count.total() != 1)
166 LTHROW("Expected 4 output blobs with shapes 4xN for boxes, N for IDs, N for scores, and 1x1 for count");
167
168 size_t num = count.at<float>(0);
169 if (num > ids.total()) LTHROW("Too many detections: " << num << " for only " << ids.total() << " ids");
170 float const * bb = (float const *)bboxes.data;
171
172 for (size_t i = 0; i < num; ++i)
173 {
174 if (scores.at<float>(i) < confThreshold) continue;
175
176 int top = (int)(bb[4 * i] * bsiz.height);
177 int left = (int)(bb[4 * i + 1] * bsiz.width);
178 int bottom = (int)(bb[4 * i + 2] * bsiz.height);
179 int right = (int)(bb[4 * i + 3] * bsiz.width);
180 int width = right - left + 1;
181 int height = bottom - top + 1;
182 classIds.push_back((int)(ids.at<float>(i)) + fudge); // Skip 0th background class id.
183 boxes.push_back(cv::Rect(left, top, width, height));
184 confidences.push_back(scores.at<float>(i));
185 }
186 }
187 break;
188
189 // ----------------------------------------------------------------------------------------------------
190 case jevois::dnn::postprocessor::DetectType::YOLO:
191 {
192 for (size_t i = 0; i < outs.size(); ++i)
193 {
194 // The network produces output blob(s) with shape Nx(5+C), where N is the number of detected objects and C is
195 // the number of classes; the first 5 values of each row are [center_x, center_y, width, height, box score].
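 // For example, with a hypothetical 80-class COCO-style model, each row holds 85 values: indices 0-3 are the
 // box geometry, index 4 is the box (objectness) score, and indices 5-84 are the per-class scores.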
196 cv::Mat const & out = outs[i];
197 cv::MatSize const & ms = out.size; int const nd = ms.dims();
198 int nbox = -1, ndata = -1;
199
200 if (nd >= 2)
201 {
202 nbox = ms[nd-2];
203 ndata = ms[nd-1];
204 for (int i = 0; i < nd-2; ++i) if (ms[i] != 1) nbox = -1; // reject if more than 2 effective dims
205 }
206
207 if (nbox < 0 || ndata < 5)
208 LTHROW("Expected 1 or more output blobs with shape Nx(5+C) where N is the number of "
209 "detected objects, C is the number of classes, and the first 5 columns are "
210 "[center_x, center_y, width, height, box score]. // "
211 "Incorrect size " << jevois::dnn::shapestr(out) << " for output " << i <<
212 ": need Nx(5+C) or 1xNx(5+C)");
213
214 // Some networks, like YOLOv5 or YOLOv7, output 3D 1xNx(5+C), so here we slice off the last 2 dims:
215 int sz2[] = { nbox, ndata };
216 cv::Mat const out2(2, sz2, out.type(), out.data);
217
218 float const * data = (float const *)out2.data;
219 for (int j = 0; j < nbox; ++j, data += ndata)
220 {
221 if (data[4] < boxThreshold) continue; // skip if box score is too low
222
223 cv::Mat scores = out2.row(j).colRange(5, ndata);
224 cv::Point classIdPoint; double confidence;
225 cv::minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
226
227 if (confidence < confThreshold) continue; // skip if class score too low
228
229 // YOLO versions before v5 produce a 2D output blob with boxes in [0..1[x[0..1[:
230 int centerX, centerY, width, height;
231 if (nd == 2)
232 {
233 centerX = (int)(data[0] * bsiz.width);
234 centerY = (int)(data[1] * bsiz.height);
235 width = (int)(data[2] * bsiz.width);
236 height = (int)(data[3] * bsiz.height);
237 }
238 else
239 {
240 // YOLOv5, YOLOv7 produce boxes already scaled by input blob size, and 3D output blob:
241 centerX = (int)(data[0]);
242 centerY = (int)(data[1]);
243 width = (int)(data[2]);
244 height = (int)(data[3]);
245 }
246
247 int left = centerX - width / 2;
248 int top = centerY - height / 2;
249 boxes.push_back(cv::Rect(left, top, width, height));
250 classIds.push_back(classIdPoint.x);
251 confidences.push_back((float)confidence);
252 }
253 }
254 }
255 break;
256
257 // ----------------------------------------------------------------------------------------------------
258 case jevois::dnn::postprocessor::DetectType::YOLOv10:
259 {
260 for (size_t i = 0; i < outs.size(); ++i)
261 {
262 cv::Mat const & out = outs[i];
263 cv::MatSize const & ms = out.size; int const nd = ms.dims();
264
265 if (jevois::dnn::effectiveDims(out) != 2 || ms[nd-1] < 5)
266 LTHROW("Expected 1 or more output blobs with shape Nx(4+C) where N is the number of "
267 "detected objects, C is the number of classes, and the first 4 columns are "
268 "[x1, y1, x2, y2]. // "
269 "Incorrect size " << jevois::dnn::shapestr(out) << " for output " << i <<
270 ": need Nx(4+C)");
271
272 // Some networks may produce 3D, slice off the last 2 dims:
273 int const nbox = ms[nd-2];
274 int const ndata = ms[nd-1];
275 int sz2[] = { nbox, ndata };
276 cv::Mat const out2(2, sz2, out.type(), out.data);
277
278 // Ok, we are ready with Nx(4+C):
279 float const * data = (float const *)out2.data;
280 for (int j = 0; j < nbox; ++j, data += ndata)
281 {
282 cv::Mat scores = out2.row(j).colRange(4, ndata);
283 cv::Point classIdPoint; double confidence;
284 cv::minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
285
286 if (confidence < confThreshold) continue; // skip if class score too low
287
288 // Boxes are already scaled by input blob size, and are x1, y1, x2, y2:
289 boxes.push_back(cv::Rect(data[0], data[1], data[2]-data[0]+1, data[3]-data[1]+1));
290 classIds.push_back(classIdPoint.x);
291 confidences.push_back((float)confidence);
292 }
293 }
294 }
295 break;
296
297 // ----------------------------------------------------------------------------------------------------
298 case jevois::dnn::postprocessor::DetectType::YOLOv10pp:
299 {
300 if (outs.size() != 1 || msiz.dims() != 3 || msiz[0] != 1 || msiz[2] != 6)
301 LTHROW("Expected 1 output blob with shape 1xNx6 for N detections with values "
302 "[left, top, right, bottom, confidence, classId]");
303
304 float const * data = (float const *)out.data;
305 for (size_t i = 0; i < out.total(); i += 6)
306 {
307 float confidence = data[i + 4];
308 if (confidence > confThreshold)
309 {
310 // Boxes are already scaled by input blob size, and are x1, y1, x2, y2:
311 int left = (int)data[i + 0];
312 int top = (int)data[i + 1];
313 int right = (int)data[i + 2];
314 int bottom = (int)data[i + 3];
315 int width = right - left + 1;
316 int height = bottom - top + 1;
317 classIds.push_back((int)(data[i + 5]) + fudge); // Skip 0th background class id.
318 boxes.push_back(cv::Rect(left, top, width, height));
319 confidences.push_back(confidence);
320 }
321 }
322 }
323 break;
324
325 // ----------------------------------------------------------------------------------------------------
326 case jevois::dnn::postprocessor::DetectType::RAWYOLO:
327 {
328 if (itsYOLO) itsYOLO->yolo(outs, classIds, confidences, boxes, itsLabels.size(), boxThreshold, confThreshold,
329 bsiz, fudge, maxnbox::get(), sigmo);
330 else LFATAL("Internal error -- no YOLO subcomponent");
331 }
332 break;
333
334 // ----------------------------------------------------------------------------------------------------
335 case jevois::dnn::postprocessor::DetectType::YOLOX:
336 {
337 if ((outs.size() % 3) != 0 || msiz.dims() != 4 || msiz[0] != 1)
338 LTHROW("Expected several (usually 3, for 3 strides) sets of 3 blobs: 1xHxWxC (class scores), 1xHxWx4 (boxes), "
339 "1xHxWx1 (objectness scores)");
340
341 int stride = 8;
342
343 for (size_t idx = 0; idx < outs.size(); idx += 3)
344 {
345 cv::Mat const & cls = outs[idx]; cv::MatSize const & cls_siz = cls.size;
346 if (cls_siz.dims() != 4) LTHROW("Output " << idx << " is not 4D 1xHxWxC");
347 float const * cls_data = (float const *)cls.data;
348
349 cv::Mat const & bx = outs[idx + 1]; cv::MatSize const & bx_siz = bx.size;
350 if (bx_siz.dims() != 4 || bx_siz[3] != 4) LTHROW("Output " << idx << " is not 1xHxWx4");
351 float const * bx_data = (float const *)bx.data;
352
353 cv::Mat const & obj = outs[idx + 2]; cv::MatSize const & obj_siz = obj.size;
354 if (obj_siz.dims() != 4 || obj_siz[3] != 1) LTHROW("Output " << idx << " is not 1xHxWx1");
355 float const * obj_data = (float const *)obj.data;
356
357 for (int i = 1; i < 3; ++i)
358 if (cls_siz[i] != bx_siz[i] || cls_siz[i] != obj_siz[i])
359 LTHROW("Mismatched HxW sizes for outputs " << idx << " .. " << idx + 2);
360
361 size_t const nclass = cls_siz[3];
362
363 // Loop over all locations:
364 for (int y = 0; y < cls_siz[1]; ++y)
365 for (int x = 0; x < cls_siz[2]; ++x)
366 {
367 // Only consider if objectness score is high enough:
368 float objectness = obj_data[0];
369 if (objectness >= boxThreshold)
370 {
371 // Get the top class score:
372 size_t best_idx = 0; float confidence = cls_data[0];
373 for (size_t i = 1; i < nclass; ++i)
374 if (cls_data[i] > confidence) { confidence = cls_data[i]; best_idx = i; }
375
376 confidence *= objectness;
377
378 if (confidence >= confThreshold)
379 {
380 // Decode the box:
381 float cx = (x /*+ 0.5F*/ + bx_data[0]) * stride;
382 float cy = (y /*+ 0.5F*/ + bx_data[1]) * stride;
383 float width = std::exp(bx_data[2]) * stride;
384 float height = std::exp(bx_data[3]) * stride;
385 float left = cx - 0.5F * width;
386 float top = cy - 0.5F * height;
387
388 // Store this detection:
389 boxes.push_back(cv::Rect(left, top, width, height));
390 classIds.push_back(int(best_idx) + fudge);
391 confidences.push_back(confidence);
392 }
393 }
394
395 // Move to the next location:
396 cls_data += nclass;
397 bx_data += 4;
398 obj_data += 1;
399 }
400
401 // Move to the next scale:
402 stride *= 2;
403 }
404 }
405 break;
406
407 // ----------------------------------------------------------------------------------------------------
408 case jevois::dnn::postprocessor::DetectType::YOLOv8t:
409 {
410 if ((outs.size() % 2) != 0 || msiz.dims() != 4 || msiz[0] != 1)
411 LTHROW("Expected several (usually 3, for 3 strides) sets of 2 blobs: 1xHxWx64 (raw boxes) "
412 "and 1xHxWxC (class scores)");
413
414 int stride = 8;
415 int constexpr reg_max = 16;
416
417 for (size_t idx = 0; idx < outs.size(); idx += 2)
418 {
419 cv::Mat const & bx = outs[idx]; cv::MatSize const & bx_siz = bx.size;
420 if (bx_siz.dims() != 4 || bx_siz[3] != 4 * reg_max) LTHROW("Output " << idx << " is not 4D 1xHxWx64");
421 float const * bx_data = (float const *)bx.data;
422
423 cv::Mat const & cls = outs[idx + 1]; cv::MatSize const & cls_siz = cls.size;
424 if (cls_siz.dims() != 4) LTHROW("Output " << idx << " is not 4D 1xHxWxC");
425 float const * cls_data = (float const *)cls.data;
426 size_t const nclass = cls_siz[3];
427
428 for (int i = 1; i < 3; ++i)
429 if (cls_siz[i] != bx_siz[i]) LTHROW("Mismatched HxW sizes for outputs " << idx << " .. " << idx + 1);
430
431 // Loop over all locations:
432 for (int y = 0; y < cls_siz[1]; ++y)
433 for (int x = 0; x < cls_siz[2]; ++x)
434 {
435 // Get the top class score:
436 size_t best_idx = 0; float confidence = cls_data[0];
437 for (size_t i = 1; i < nclass; ++i)
438 if (cls_data[i] > confidence) { confidence = cls_data[i]; best_idx = i; }
439
440 // Apply sigmoid to it, if needed (output layer did not already have sigmoid activations):
441 if (sigmo) confidence = jevois::dnn::sigmoid(confidence);
442
443 if (confidence >= confThreshold)
444 {
445 // Decode a 4-coord box from 64 received values:
446 // Code here inspired from https://github.com/trinhtuanvubk/yolo-ncnn-cpp/blob/main/yolov8/yolov8.cpp
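 // A sketch of the DFL (distribution focal loss) decoding assumed here: the 64 values per location are
 // 4 box sides x reg_max=16 bins; for each side, softmax_dfl() computes a softmax over its 16 bins and
 // returns the expected value (sum of bin_index * probability), i.e., the predicted distance from the cell
 // center to that side in grid-cell units, which the lines below convert to blob pixels via the stride.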
447 float dst[reg_max];
448
449 float xmin = (x + 0.5f - softmax_dfl(bx_data, dst, reg_max)) * stride;
450 float ymin = (y + 0.5f - softmax_dfl(bx_data + reg_max, dst, reg_max)) * stride;
451 float xmax = (x + 0.5f + softmax_dfl(bx_data + 2 * reg_max, dst, reg_max)) * stride;
452 float ymax = (y + 0.5f + softmax_dfl(bx_data + 3 * reg_max, dst, reg_max)) * stride;
453
454 // Store this detection:
455 boxes.push_back(cv::Rect(xmin, ymin, xmax - xmin, ymax - ymin));
456 classIds.push_back(int(best_idx) + fudge);
457 confidences.push_back(confidence);
458 }
459
460 // Move to the next location:
461 cls_data += nclass;
462 bx_data += 4 * reg_max;
463 }
464
465 // Move to the next scale:
466 stride *= 2;
467 }
468 }
469 break;
470
471 // ----------------------------------------------------------------------------------------------------
472 case jevois::dnn::postprocessor::DetectType::YOLOv8:
473 {
474 if ((outs.size() % 2) != 0 || msiz.dims() != 4 || msiz[0] != 1)
475 LTHROW("Expected several (usually 3, for 3 strides) sets of 2 blobs: 1x64xHxW (raw boxes) "
476 "and 1xCxHxW (class scores)");
477
478 int stride = 8;
479 int constexpr reg_max = 16;
480
481 for (size_t idx = 0; idx < outs.size(); idx += 2)
482 {
483 cv::Mat const & bx = outs[idx]; cv::MatSize const & bx_siz = bx.size;
484 if (bx_siz.dims() != 4 || bx_siz[1] != 4 * reg_max) LTHROW("Output " << idx << " is not 4D 1x64xHxW");
485 float const * bx_data = (float const *)bx.data;
486
487 cv::Mat const & cls = outs[idx + 1]; cv::MatSize const & cls_siz = cls.size;
488 if (cls_siz.dims() != 4) LTHROW("Output " << idx << " is not 4D 1xCxHxW");
489 float const * cls_data = (float const *)cls.data;
490 size_t const nclass = cls_siz[1];
491
492 for (int i = 2; i < 4; ++i)
493 if (cls_siz[i] != bx_siz[i]) LTHROW("Mismatched HxW sizes for outputs " << idx << " .. " << idx + 1);
494
495 size_t const step = cls_siz[2] * cls_siz[3]; // HxW
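 // In this channel-major 1xCxHxW layout, the value for channel i at location (y,x) lives at offset
 // i*step + y*W + x; the pointers below advance by one per location, so indexing them with a multiple
 // of 'step' walks across the C class channels (or the 4*reg_max box channels) at that same location.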
496
497 // Loop over all locations:
498 for (int y = 0; y < cls_siz[2]; ++y)
499 for (int x = 0; x < cls_siz[3]; ++x)
500 {
501 // Get the top class score:
502 size_t best_idx = 0; float confidence = cls_data[0];
503 for (size_t i = 1; i < nclass; ++i)
504 if (cls_data[i * step] > confidence) { confidence = cls_data[i * step]; best_idx = i; }
505
506 // Apply sigmoid to it, if needed (output layer did not already have sigmoid activations):
507 if (sigmo) confidence = jevois::dnn::sigmoid(confidence);
508
509 if (confidence >= confThreshold)
510 {
511 // Decode a 4-coord box from 64 received values:
512 // Code here inspired from https://github.com/trinhtuanvubk/yolo-ncnn-cpp/blob/main/yolov8/yolov8.cpp
513 float dst[reg_max];
514
515 float xmin = (x + 0.5f - softmax_dfl(bx_data, dst, reg_max, step)) * stride;
516 float ymin = (y + 0.5f - softmax_dfl(bx_data + reg_max * step, dst, reg_max, step)) * stride;
517 float xmax = (x + 0.5f + softmax_dfl(bx_data + 2 * reg_max * step, dst, reg_max, step)) * stride;
518 float ymax = (y + 0.5f + softmax_dfl(bx_data + 3 * reg_max * step, dst, reg_max, step)) * stride;
519
520 // Store this detection:
521 boxes.push_back(cv::Rect(xmin, ymin, xmax - xmin, ymax - ymin));
522 classIds.push_back(int(best_idx) + fudge);
523 confidences.push_back(confidence);
524 }
525
526 // Move to the next location:
527 ++cls_data;
528 ++bx_data;
529 }
530
531 // Move to the next scale:
532 stride *= 2;
533 }
534 }
535 break;
536
537 // ----------------------------------------------------------------------------------------------------
538 case jevois::dnn::postprocessor::DetectType::YOLOv8seg:
539 {
540 if (outs.size() % 3 != 1 || msiz.dims() != 4 || msiz[0] != 1)
541 LTHROW("Expected several (usually 3, for 3 strides) sets of 3 tensors: 1x64xHxW (raw boxes), "
542 "1xCxHxW (class scores), and 1xMxHxW (mask coeffs for M masks); then one 1xMxHxW for M mask prototypes");
543
544 int stride = 8;
545 int constexpr reg_max = 16;
546
547 // Get the mask prototypes as 2D 32xHW:
548 cv::MatSize const & mps = outs.back().size;
549 if (mps.dims() != 4) LTHROW("Mask prototypes not 4D 1xMxHxW");
550 mask_proto = cv::Mat(std::vector<int>{ mps[1], mps[2] * mps[3] }, CV_32F, outs.back().data);
551 int const mask_num = mps[1];
552 mask_proto_h = mps[2]; // will be needed later to unpack from HW to HxW
553
554 // Process each scale (aka stride):
555 for (size_t idx = 0; idx < outs.size() - 1; idx += 3)
556 {
557 cv::Mat const & bx = outs[idx]; cv::MatSize const & bx_siz = bx.size;
558 if (bx_siz.dims() != 4 || bx_siz[1] != 4 * reg_max) LTHROW("Output " << idx << " is not 4D 1x64xHxW");
559 float const * bx_data = (float const *)bx.data;
560
561 cv::Mat const & cls = outs[idx + 1]; cv::MatSize const & cls_siz = cls.size;
562 if (cls_siz.dims() != 4) LTHROW("Output " << idx << " is not 4D 1xCxHxW");
563 float const * cls_data = (float const *)cls.data;
564 size_t const nclass = cls_siz[1];
565
566 cv::Mat const & msk = outs[idx + 2]; cv::MatSize const & msk_siz = msk.size;
567 if (msk_siz.dims() != 4 || msk_siz[1] != mask_num) LTHROW("Output " << idx << " is not 4D 1xMxHxW");
568 float const * msk_data = (float const *)msk.data;
569
570 for (int i = 2; i < 4; ++i)
571 if (cls_siz[i] != bx_siz[i] || cls_siz[i] != msk_siz[i])
572 LTHROW("Mismatched HxW sizes for outputs " << idx << " .. " << idx + 1);
573
574 size_t const step = cls_siz[2] * cls_siz[3]; // HxW
575
576 // Loop over all locations:
577 for (int y = 0; y < cls_siz[2]; ++y)
578 for (int x = 0; x < cls_siz[3]; ++x)
579 {
580 // Get the top class score:
581 size_t best_idx = 0; float confidence = cls_data[0];
582 for (size_t i = 1; i < nclass; ++i)
583 if (cls_data[i * step] > confidence) { confidence = cls_data[i * step]; best_idx = i; }
584
585 // Apply sigmoid to it, if needed (output layer did not already have sigmoid activations):
586 if (sigmo) confidence = jevois::dnn::sigmoid(confidence);
587
588 if (confidence >= confThreshold)
589 {
590 // Decode a 4-coord box from 64 received values:
591 float dst[reg_max];
592
593 float xmin = (x + 0.5f - softmax_dfl(bx_data, dst, reg_max, step)) * stride;
594 float ymin = (y + 0.5f - softmax_dfl(bx_data + reg_max * step, dst, reg_max, step)) * stride;
595 float xmax = (x + 0.5f + softmax_dfl(bx_data + 2 * reg_max * step, dst, reg_max, step)) * stride;
596 float ymax = (y + 0.5f + softmax_dfl(bx_data + 3 * reg_max * step, dst, reg_max, step)) * stride;
597
598 // Store this detection:
599 boxes.push_back(cv::Rect(xmin, ymin, xmax - xmin, ymax - ymin));
600 classIds.push_back(int(best_idx) + fudge);
601 confidences.push_back(confidence);
602
603 // Also store raw mask coefficients data, will decode the masks after NMS to save time:
604 cv::Mat coeffs(1, mask_num, CV_32F); float * cptr = (float *)coeffs.data;
605 for (int i = 0; i < mask_num; ++i) *cptr++ = msk_data[i * step];
606 mask_coeffs.emplace_back(coeffs);
607 }
608
609 // Move to the next location:
610 ++cls_data; ++bx_data; ++msk_data;
611 }
612
613 // Move to the next scale:
614 stride *= 2;
615 }
616 }
617 break;
618
619 // ----------------------------------------------------------------------------------------------------
620 case jevois::dnn::postprocessor::DetectType::YOLOv8segt:
621 {
622 if (outs.size() % 3 != 1 || msiz.dims() != 4 || msiz[0] != 1)
623 LTHROW("Expected several (usually 3, for 3 strides) sets of 3 tensors: 1xHxWx64 (raw boxes), "
624 "1xHxWxC (class scores), and 1xHxWxM (mask coeffs for M masks); then one 1xHxWxM for M mask prototypes");
625
626 int stride = 8;
627 int constexpr reg_max = 16;
628
629 // Get the mask prototypes as 2D HWx32:
630 cv::MatSize const & mps = outs.back().size;
631 if (mps.dims() != 4) LTHROW("Mask prototypes not 4D 1xHxWxM");
632 mask_proto = cv::Mat(std::vector<int>{ mps[1] * mps[2], mps[3] }, CV_32F, outs.back().data);
633 int const mask_num = mps[3];
634 mask_proto_h = mps[1]; // will be needed later to unpack from HW to HxW
635
636 // Process each scale (aka stride):
637 for (size_t idx = 0; idx < outs.size() - 1; idx += 3)
638 {
639 cv::Mat const & bx = outs[idx]; cv::MatSize const & bx_siz = bx.size;
640 if (bx_siz.dims() != 4 || bx_siz[3] != 4 * reg_max) LTHROW("Output " << idx << " is not 4D 1xHxWx64");
641 float const * bx_data = (float const *)bx.data;
642
643 cv::Mat const & cls = outs[idx + 1]; cv::MatSize const & cls_siz = cls.size;
644 if (cls_siz.dims() != 4) LTHROW("Output " << idx << " is not 4D 1xHxWxC");
645 float const * cls_data = (float const *)cls.data;
646 size_t const nclass = cls_siz[3];
647
648 cv::Mat const & msk = outs[idx + 2]; cv::MatSize const & msk_siz = msk.size;
649 if (msk_siz.dims() != 4 || msk_siz[3] != mask_num) LTHROW("Output " << idx << " is not 4D 1xHxWxM");
650 float const * msk_data = (float const *)msk.data;
651
652 for (int i = 1; i < 3; ++i)
653 if (cls_siz[i] != bx_siz[i] || cls_siz[i] != msk_siz[i])
654 LTHROW("Mismatched HxW sizes for outputs " << idx << " .. " << idx + 1);
655
656 // Loop over all locations:
657 for (int y = 0; y < cls_siz[1]; ++y)
658 for (int x = 0; x < cls_siz[2]; ++x)
659 {
660 // Get the top class score:
661 size_t best_idx = 0; float confidence = cls_data[0];
662 for (size_t i = 1; i < nclass; ++i)
663 if (cls_data[i] > confidence) { confidence = cls_data[i]; best_idx = i; }
664
665 // Apply sigmoid to it, if needed (output layer did not already have sigmoid activations):
666 if (sigmo) confidence = jevois::dnn::sigmoid(confidence);
667
668 if (confidence >= confThreshold)
669 {
670 // Decode a 4-coord box from 64 received values:
671 float dst[reg_max];
672
673 float xmin = (x + 0.5f - softmax_dfl(bx_data, dst, reg_max)) * stride;
674 float ymin = (y + 0.5f - softmax_dfl(bx_data + reg_max, dst, reg_max)) * stride;
675 float xmax = (x + 0.5f + softmax_dfl(bx_data + 2 * reg_max, dst, reg_max)) * stride;
676 float ymax = (y + 0.5f + softmax_dfl(bx_data + 3 * reg_max, dst, reg_max)) * stride;
677
678 // Store this detection:
679 boxes.push_back(cv::Rect(xmin, ymin, xmax - xmin, ymax - ymin));
680 classIds.push_back(int(best_idx) + fudge);
681 confidences.push_back(confidence);
682
683 // Also store raw mask coefficients data, will decode the masks after NMS to save time:
684 cv::Mat coeffs(mask_num, 1, CV_32F);
685 std::memcpy(coeffs.data, msk_data, mask_num * sizeof(float));
686 mask_coeffs.emplace_back(coeffs);
687 }
688
689 // Move to the next location:
690 cls_data += nclass;
691 bx_data += 4 * reg_max;
692 msk_data += mask_num;
693 }
694
695 // Move to the next scale:
696 stride *= 2;
697 }
698 }
699 break;
700
701 // ----------------------------------------------------------------------------------------------------
702 default:
703 // Do not use strget() here as it will throw!
704 LTHROW("Unsupported Post-processor detecttype " << int(detecttype::get()));
705 }
706 }
707 // Abort here if the received outputs were malformed:
708 catch (std::exception const & e)
709 {
710 std::string err = "Selected detecttype is " + detecttype::strget() + " and network produced:\n\n";
711 for (cv::Mat const & m : outs) err += "- " + jevois::dnn::shapestr(m) + "\n";
712 err += "\nFATAL ERROR(s):\n\n";
713 err += e.what();
714 LFATAL(err);
715 }
716
717 // Cleanup overlapping boxes, either globally or per class, and possibly limit number of reported boxes:
718 std::vector<int> indices;
719 if (nmsperclass::get())
720 cv::dnn::NMSBoxesBatched(boxes, confidences, classIds, confThreshold, nmsThreshold, indices, 1.0F, maxnbox::get());
721 else
722 cv::dnn::NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices, 1.0F, maxnbox::get());
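 // (Both NMS variants keep, within each cluster of boxes whose pairwise IoU exceeds nmsThreshold, only the
 // highest-confidence box, and return the surviving indices into boxes/confidences/classIds; the batched
 // variant does this separately per class id.)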
723
724 // Store results:
725 itsDetections.clear(); bool namonly = namedonly::get();
726 std::vector<cv::Vec4i> contour_hierarchy;
727
728 for (size_t i = 0; i < indices.size(); ++i)
729 {
730 int idx = indices[i];
731 std::string const label = jevois::dnn::getLabel(itsLabels, classIds[idx], namonly);
732 if (namonly == false || label.empty() == false)
733 {
734 cv::Rect & b = boxes[idx];
735
736 // Now clamp box to be within blob:
737 if (clampbox) jevois::dnn::clamp(b, bsiz.width, bsiz.height);
738
739 // Decode the mask if doing instance segmentation:
740 std::vector<cv::Point> poly;
741 if (mask_coeffs.empty() == false)
742 {
743 // Multiply the 1x32 mask coeffs by the 32xHW mask prototypes to get a 1xHW weighted mask (YOLOv8seg), or
744 // multiply the HWx32 mask prototypes by the 32x1 mask coeffs to get a HWx1 weighted mask (YOLOv8segt):
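 // For instance, assuming a typical 640x640 input blob with 32 prototype masks of 160x160 each: mask_proto
 // is 32x25600 (or 25600x32 for the transposed YOLOv8segt layout), the product below is a 25600-element
 // vector, and the reshape that follows turns it back into a 160x160 mask (mask_proto_h = 160).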
745 cv::Mat weighted_mask;
746 if (mask_coeffs[idx].rows == 1) weighted_mask = mask_coeffs[idx] * mask_proto;
747 else weighted_mask = mask_proto * mask_coeffs[idx];
748
749 // Reshape to HxW:
750 weighted_mask = weighted_mask.reshape(0, mask_proto_h);
751
752 // Apply sigmoid to all mask elements:
753 jevois::dnn::sigmoid(weighted_mask);
754
755 // Typically, mask prototypes are 4x smaller than input blob; we want to detect contours inside the obj rect. We
756 // have two approaches here: 1) detect contours on the original masks at low resolution (faster but contours are
757 // not very smooth), 2) scale the mask 4x with bilinear interpolation and then detect the contours (slower but
758 // smoother contours):
759 int mask_scale = bsiz.height / mask_proto_h;
760 if (smoothmsk)
761 {
762 cv::Mat src = weighted_mask;
763 cv::resize(src, weighted_mask, cv::Size(), mask_scale, mask_scale, cv::INTER_LINEAR);
764 mask_scale = 1;
765 }
766
767 cv::Rect scaled_rect(b.tl() / mask_scale, b.br() / mask_scale);
768 scaled_rect &= cv::Rect(cv::Point(0, 0), weighted_mask.size()); // constrain roi to within mask image
769
770 // Binarize the mask roi:
771 cv::Mat roi_mask; cv::threshold(weighted_mask(scaled_rect), roi_mask, 0.5, 255.0, cv::THRESH_BINARY);
772 cv::Mat roi_u8; roi_mask.convertTo(roi_u8, CV_8U);
773
774 // Detect object contours that are inside the scaled rect:
775 std::vector<std::vector<cv::Point>> polys;
776 cv::findContours(roi_u8, polys, contour_hierarchy, cv::RETR_EXTERNAL,
777 cv::CHAIN_APPROX_SIMPLE, scaled_rect.tl()); // or CHAIN_APPROX_NONE
778
779 // Pick the largest poly:
780 size_t polyidx = 0; size_t largest_poly_size = 0; size_t j = 0;
781 for (auto const & p : polys)
782 {
783 if (p.size() > largest_poly_size) { largest_poly_size = p.size(); polyidx = j; }
784 ++j;
785 }
786
787 // Scale from mask to blob to image:
788 if (polys.empty() == false)
789 for (cv::Point & pt : polys[polyidx])
790 {
791 float x = pt.x * mask_scale, y = pt.y * mask_scale;
792 preproc->b2i(x, y);
793 poly.emplace_back(cv::Point(x, y));
794 }
795 }
796
797 // Rescale the box from blob to (processing) image:
798 cv::Point2f tl = b.tl(); preproc->b2i(tl.x, tl.y);
799 cv::Point2f br = b.br(); preproc->b2i(br.x, br.y);
800 b.x = tl.x; b.y = tl.y; b.width = br.x - tl.x; b.height = br.y - tl.y;
801
802 // Store this detection for later report:
803 jevois::ObjReco o { confidences[idx] * 100.0f, label };
804 std::vector<jevois::ObjReco> ov;
805 ov.emplace_back(o);
806 jevois::ObjDetect od { b.x, b.y, b.x + b.width, b.y + b.height, ov, poly };
807 itsDetections.emplace_back(od);
808 }
809 }
810}
811
812// ####################################################################################################
813void jevois::dnn::PostProcessorDetect::report(jevois::StdModule * mod, jevois::RawImage * outimg,
814 jevois::OptGUIhelper * helper, bool overlay,
815 bool /*idle*/)
816{
817 bool const serreport = serialreport::get();
818
819 for (jevois::ObjDetect const & o : itsDetections)
820 {
821 std::string categ, label;
822
823 if (o.reco.empty())
824 {
825 categ = "unknown";
826 label = "unknown";
827 }
828 else
829 {
830 categ = o.reco[0].category;
831 label = jevois::sformat("%s: %.2f", categ.c_str(), o.reco[0].score);
832 }
833
834 // If desired, draw boxes in output image:
835 if (outimg && overlay)
836 {
837 jevois::rawimage::drawRect(*outimg, o.tlx, o.tly, o.brx - o.tlx, o.bry - o.tly, 2, jevois::yuyv::LightGreen);
838 if (o.contour.empty() == false) LERROR("Need to implement drawPoly() for RawImage");
839 jevois::rawimage::writeText(*outimg, label, o.tlx + 6, o.tly + 2, jevois::yuyv::LightGreen,
840 jevois::rawimage::Font10x20);
841 }
842
843#ifdef JEVOIS_PRO
844 // If desired, draw results on GUI:
845 if (helper)
846 {
847 int col = jevois::dnn::stringToRGBA(categ, 0xff);
848 helper->drawRect(o.tlx, o.tly, o.brx, o.bry, col, true);
849 if (o.contour.empty() == false) helper->drawPoly(o.contour, col, false);
850 helper->drawText(o.tlx + 3.0f, o.tly + 3.0f, label.c_str(), col);
851 }
852#else
853 (void)helper; // keep compiler happy
854#endif
855
856 // If desired, send results to serial port:
857 if (mod && serreport) mod->sendSerialObjDetImg2D(itsImageSize.width, itsImageSize.height, o);
858 }
859}
860
861// ####################################################################################################
862std::vector<jevois::ObjDetect> const & jevois::dnn::PostProcessorDetect::latestDetections() const
863{ return itsDetections; }
#define JEVOIS_SHARE_PATH
Base path for shared files (e.g., neural network weights, etc)
Definition Config.H:82
#define LTHROW(msg)
Definition Log.H:251
Helper class to assist modules in creating graphical and GUI elements.
Definition GUIhelper.H:133
void drawText(float x, float y, char const *txt, ImU32 col=IM_COL32(128, 255, 128, 255))
Draw text over an image.
Definition GUIhelper.C:624
void drawRect(float x1, float y1, float x2, float y2, ImU32 col=IM_COL32(128, 255, 128, 255), bool filled=true)
Draw rectangular box over an image.
Definition GUIhelper.C:478
void drawPoly(std::vector< cv::Point > const &pts, ImU32 col=IM_COL32(128, 255, 128, 255), bool filled=true)
Draw polygon over an image.
Definition GUIhelper.C:514
A raw image as coming from a V4L2 Camera and/or being sent out to a USB Gadget.
Definition RawImage.H:111
Base class for a module that supports standardized serial messages.
Definition Module.H:234
void sendSerialObjDetImg2D(unsigned int camw, unsigned int camh, float x, float y, float w, float h, std::vector< ObjReco > const &res)
Send a standardized object detection + recognition message.
Definition Module.C:572
void onParamChange(postprocessor::detecttype const &param, postprocessor::DetectType const &val) override
void report(jevois::StdModule *mod, jevois::RawImage *outimg=nullptr, jevois::OptGUIhelper *helper=nullptr, bool overlay=true, bool idle=false) override
Report what happened in last process() to console/output video/GUI.
void process(std::vector< cv::Mat > const &outs, PreProcessor *preproc) override
Process outputs and draw/send some results.
void freeze(bool doit) override
Freeze/unfreeze parameters that users should not change while running.
virtual ~PostProcessorDetect()
Destructor.
std::vector< ObjDetect > const & latestDetections() const
Get the latest detections, use with caution, not thread-safe.
Pre-Processor for neural network pipeline.
cv::Size const & imagesize() const
Access the last processed image size.
void b2i(float &x, float &y, size_t blobnum=0)
Convert coordinates from blob back to original image.
cv::Size blobsize(size_t num) const
Access the width and height of a given blob, accounting for NCHW or NHWC.
#define LFATAL(msg)
Convenience macro for users to print out console or syslog messages, FATAL level.
Definition Log.H:230
#define LERROR(msg)
Convenience macro for users to print out console or syslog messages, ERROR level.
Definition Log.H:211
std::string getLabel(std::map< int, std::string > const &labels, int id, bool namedonly=false)
Get a label from an id.
Definition Utils.C:68
std::map< int, std::string > readLabelsFile(std::string const &fname)
Read a label file.
Definition Utils.C:25
float sigmoid(float x)
Compute sigmoid using fastexp.
void clamp(cv::Rect &r, int width, int height)
Clamp a rectangle to within given image width and height.
Definition Utils.C:391
float softmax_dfl(float const *src, float *dst, size_t const n, size_t const stride=1)
Compute softmax and return DFL distance.
Definition Utils.C:752
size_t effectiveDims(cv::Mat const &m)
Returns the number of non-unit dims in a cv::Mat.
Definition Utils.C:910
int stringToRGBA(std::string const &label, unsigned char alpha=128)
Compute a color from a label name.
Definition Utils.C:80
std::string shapestr(cv::Mat const &m)
Get a string of the form: "nD AxBxC... TYPE" from an n-dimensional cv::Mat with data type TYPE.
Definition Utils.C:109
void writeText(RawImage &img, std::string const &txt, int x, int y, unsigned int col, Font font=Font6x10)
Write some text in an image.
void drawRect(RawImage &img, int x, int y, unsigned int w, unsigned int h, unsigned int thick, unsigned int col)
Draw a rectangle in a YUYV image.
std::string sformat(char const *fmt, ...)
Create a string using printf style arguments.
Definition Utils.C:440
std::filesystem::path absolutePath(std::filesystem::path const &root, std::filesystem::path const &path)
Compute an absolute path from two paths.
Definition Utils.C:386
unsigned short constexpr LightGreen
YUYV color value.
Definition RawImage.H:63
A trivial struct to store object detection results, for standard (straight up) bounding boxes.
Definition ObjDetect.H:29
A trivial struct to store object recognition results.
Definition ObjReco.H:25