From 115c3d16c9932b2a94231f695e8260c05419c614 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 4 Nov 2025 14:24:22 +0100 Subject: [PATCH 01/24] Move example --- examples/cpp/DetectionNetwork/CMakeLists.txt | 2 +- .../cpp/DetectionNetwork/{RVC4 => }/detection_and_keypoints.cpp | 2 +- .../DetectionNetwork/{RVC4 => }/detection_and_keypoints.py | 2 +- tests/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename examples/cpp/DetectionNetwork/{RVC4 => }/detection_and_keypoints.cpp (97%) rename examples/python/DetectionNetwork/{RVC4 => }/detection_and_keypoints.py (96%) diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt index 9a3adc4e6..8193faeb9 100644 --- a/examples/cpp/DetectionNetwork/CMakeLists.txt +++ b/examples/cpp/DetectionNetwork/CMakeLists.txt @@ -26,5 +26,5 @@ dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci) dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF) dai_set_example_test_labels(detection_and_segmentation rvc4) -dai_add_example(detection_and_keypoints RVC4/detection_and_keypoints.cpp ON OFF) +dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF) dai_set_example_test_labels(detection_and_keypoints rvc4) \ No newline at end of file diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp similarity index 97% rename from examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp rename to examples/cpp/DetectionNetwork/detection_and_keypoints.cpp index bc8dca07c..667151bb0 100644 --- a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp @@ -22,7 +22,7 @@ int main() { auto detectionNetwork = pipeline.create(); dai::NNModelDescription modelDescription; - modelDescription.model = "luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39"; + modelDescription.model = "luxonis/yolov8-nano-pose-estimation:coco-512x288"; detectionNetwork->build(cameraNode, modelDescription); auto labelMap = detectionNetwork->getClasses(); diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py similarity index 96% rename from examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py rename to examples/python/DetectionNetwork/detection_and_keypoints.py index c62987701..431679544 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py +++ b/examples/python/DetectionNetwork/detection_and_keypoints.py @@ -9,7 +9,7 @@ # Create pipeline with dai.Pipeline() as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39")) + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288")) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e88884733..6ab38e604 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -389,7 +389,7 @@ dai_set_test_labels(nndata_test onhost ci) #ImgDetections tests dai_add_test(imgdetections_test src/onhost_tests/pipeline/datatype/imgdetections_test.cpp) 
-dai_set_test_labels(imgdetections_test onhost ci) +dai_set_test_labels(imgdetections_test ondevice rvc2 rvc4 onhost ci) # Model description tests dai_add_test(model_slug_test src/onhost_tests/model_slug_test.cpp) From bb3204ebd1abeaa5a6748e77a1d4cf9ed075c04a Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 4 Nov 2025 16:59:00 +0100 Subject: [PATCH 02/24] Add host parsing option --- CMakeLists.txt | 1 + .../RVC4/detection_and_segmentation.py | 3 +- .../depthai/pipeline/node/DetectionParser.hpp | 29 +- src/pipeline/node/DetectionParser.cpp | 184 ++++ .../DetectionParser/DetectionParserUtils.cpp | 897 ++++++++++++++++++ .../DetectionParser/DetectionParserUtils.hpp | 85 ++ src/pipeline/utilities/NNDataViewer.hpp | 163 ++++ 7 files changed, 1360 insertions(+), 2 deletions(-) create mode 100644 src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp create mode 100644 src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp create mode 100644 src/pipeline/utilities/NNDataViewer.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ae50f4e25..54150150f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,6 +311,7 @@ set(TARGET_CORE_SOURCES src/pipeline/node/ImageAlign.cpp src/pipeline/node/ToF.cpp src/pipeline/node/DetectionParser.cpp + src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp src/pipeline/node/test/MyProducer.cpp src/pipeline/node/test/MyConsumer.cpp src/pipeline/node/UVC.cpp diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py index fcbbbfd2f..650f90f2f 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py +++ b/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py @@ -9,7 +9,8 @@ # Create pipeline with dai.Pipeline() as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-large:coco-640x480")) + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-nano:coco-512x288")) + # detectionNetwork.detectionParser.runOnHost(True) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/include/depthai/pipeline/node/DetectionParser.hpp b/include/depthai/pipeline/node/DetectionParser.hpp index 78bb8ce8e..4b50a75b3 100644 --- a/include/depthai/pipeline/node/DetectionParser.hpp +++ b/include/depthai/pipeline/node/DetectionParser.hpp @@ -15,6 +15,8 @@ #include #include "depthai/common/YoloDecodingFamily.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" namespace dai { namespace node { @@ -23,7 +25,7 @@ namespace node { * @brief DetectionParser node. Parses detection results from different neural networks and is being used internally by MobileNetDetectionNetwork and * YoloDetectionNetwork. 
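 * The parser can optionally be executed on the host instead of the device; see setRunOnHost().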
*/ -class DetectionParser : public DeviceNodeCRTP { +class DetectionParser : public DeviceNodeCRTP, public HostRunnable { public: constexpr static const char* NAME = "DetectionParser"; using DeviceNodeCRTP::DeviceNodeCRTP; @@ -177,7 +179,23 @@ class DetectionParser : public DeviceNodeCRTP decodeMobilenet(std::shared_ptr nnData, float confidenceThr); + private: + bool runOnHostVar = false; void setNNArchiveBlob(const NNArchive& nnArchive); void setNNArchiveSuperblob(const NNArchive& nnArchive, int numShaves); void setNNArchiveOther(const NNArchive& nnArchive); @@ -185,6 +203,15 @@ class DetectionParser : public DeviceNodeCRTP& outputs); + // host runnable requirements + void buildStage1() override; + void decodeYolo(std::shared_ptr nnData, std::shared_ptr outDetections); + std::vector inTensorInfo; + uint32_t imgWidth; + uint32_t imgHeight; + uint32_t imgSizesSet = false; + // + std::optional mArchive; std::optional archiveConfig; diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index a03b64633..2c0e07b9a 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -13,6 +13,8 @@ #include "nn_archive/NNArchive.hpp" #include "nn_archive/v1/Head.hpp" #include "pipeline/ThreadedNodeImpl.hpp" +#include "pipeline/datatype/NNData.hpp" +#include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp" #include "spdlog/fmt/fmt.h" // internal headers @@ -349,5 +351,187 @@ std::vector DetectionParser::getStrides() const { return properties.parser.strides; } +void DetectionParser::setRunOnHost(bool runOnHost) { + if(runOnHost) { + pimpl->logger->warn("Detection parser set to run on host."); + } + runOnHostVar = runOnHost; +} + +/** + * Check if the node is set to run on host + */ +bool DetectionParser::runOnHost() const { + return runOnHostVar; +} + +void DetectionParser::run() { + auto& logger = pimpl->logger; + logger->info("Detection parser running on host."); + + using namespace std::chrono; + while(isRunning()) { + auto tAbsoluteBeginning = steady_clock::now(); + std::shared_ptr inputData; + inputData = input.get(); + if(!inputData) { + logger->error("Error while receiving NN frame."); + continue; + } + auto tAfterMessageBeginning = steady_clock::now(); + + if(!imgSizesSet) { + const bool containsTransformation = inputData->transformation.has_value(); + if(containsTransformation) { + std::tie(imgWidth, imgHeight) = inputData->transformation->getSize(); + } else { + logger->warn("No image size provided for detection parser. Skipping processing and sending empty detections."); + continue; + } + + imgSizesSet = true; + } + + auto outDetections = std::make_shared(); + + switch(properties.parser.nnFamily) { + case DetectionNetworkType::YOLO: { + decodeYolo(inputData, outDetections); + break; + } + case DetectionNetworkType::MOBILENET: { + auto dets = decodeMobilenet(inputData, properties.parser.confidenceThreshold); // TODO (aljaz) update to shared pointer + outDetections->detections = dets; + break; + } + default: { + logger->error("Unknown NN family. 
'YOLO' and 'MOBILENET' are supported."); + break; + } + } + + auto tBeforeSend = steady_clock::now(); + + // Copy over seq and ts + outDetections->setSequenceNum(inputData->getSequenceNum()); + outDetections->setTimestamp(inputData->getTimestamp()); + outDetections->setTimestampDevice(inputData->getTimestampDevice()); + outDetections->transformation = inputData->transformation; + // Send detections + out.send(outDetections); + + auto tAbsoluteEnd = steady_clock::now(); + logger->debug("Detection parser total took {}ms, processing {}ms, getting_frames {}ms, sending_frames {}ms", + duration_cast(tAbsoluteEnd - tAbsoluteBeginning).count() / 1000, + duration_cast(tBeforeSend - tAfterMessageBeginning).count() / 1000, + duration_cast(tAfterMessageBeginning - tAbsoluteBeginning).count() / 1000, + duration_cast(tAbsoluteEnd - tBeforeSend).count() / 1000); + } +} + +void DetectionParser::buildStage1() { + auto& logger = pimpl->logger; + + // Grab dimensions from input tensor info + if(properties.networkInputs.size() > 0) { + if(properties.networkInputs.size() > 1) { + logger->warn("Detection parser supports only single input networks, assuming first input"); + } + for(const auto& kv : properties.networkInputs) { + const dai::TensorInfo& tensorInfo = kv.second; + inTensorInfo.push_back(tensorInfo); + } + } + if(inTensorInfo.size() > 0) { + int numDimensions = inTensorInfo[0].numDimensions; + if(numDimensions < 2) { + logger->error("Number of input dimensions is less than 2"); + } else { + imgSizesSet = true; + imgWidth = inTensorInfo[0].dims[numDimensions - 1]; + imgHeight = inTensorInfo[0].dims[numDimensions - 2]; + } + } else { + logger->info("Unable to read input tensor height and width from static inputs. The node will try to get input sizes at runtime."); + } +} + +std::vector DetectionParser::decodeMobilenet(std::shared_ptr nnData, float confidenceThr) { + auto& logger = pimpl->logger; + + if(!nnData) { + return {}; + } + int maxDetections = 100; + std::vector detections; + std::string tensorName; + for(const auto& tensor : nnData->getAllLayers()) { + if(tensor.offset == 0) { + tensorName = tensor.name; + } + } + + auto tensorData = nnData->getTensor(tensorName); + maxDetections = tensorData.size() / 7; + if(static_cast(tensorData.size()) < maxDetections * 7) { + logger->error("Error while parsing Mobilenet. 
Vector not long enough, expected size: {}, real size {}", maxDetections * 7, tensorData.size()); + return {}; + } + + struct raw_Detection { // need to update it to include more + float header; + float label; + float confidence; + float xmin; + float ymin; + float xmax; + float ymax; + }; + + float* rawPtr = tensorData.data(); + for(int i = 0; i < maxDetections; i++) { + raw_Detection temp; + // TODO This is likely unnecessary optimisation + memcpy(&temp, &rawPtr[i * 7], sizeof(raw_Detection)); + + // if header == -1, stop sooner + if(temp.header == -1.0f) break; + + float currentConfidence = temp.confidence; + if(currentConfidence >= confidenceThr) { + dai::ImgDetection d; + d.label = temp.label; + + d.confidence = currentConfidence; + + d.xmin = temp.xmin; + d.ymin = temp.ymin; + d.xmax = temp.xmax; + d.ymax = temp.ymax; + + detections.push_back(d); + } + } + return detections; +} + +void DetectionParser::decodeYolo(std::shared_ptr nnData, std::shared_ptr outDetections) { + auto& logger = pimpl->logger; + switch(properties.parser.decodingFamily) { + case YoloDecodingFamily::R1AF: // anchor free: yolo v6r1 + utilities::DetectionParserUtils::decodeR1AF(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::v3AB: // anchor based yolo v3 v3-Tiny + utilities::DetectionParserUtils::decodeV3AB(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::v5AB: // anchor based yolo v5, v7, P + utilities::DetectionParserUtils::decodeV5AB(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::TLBR: // top left bottom right anchor free: yolo v6r2, v8 v10 v11 + utilities::DetectionParserUtils::decodeTLBR(nnData, outDetections, properties, logger); + break; + } +} + } // namespace node } // namespace dai diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp new file mode 100644 index 000000000..c1809e847 --- /dev/null +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -0,0 +1,897 @@ +#include "DetectionParserUtils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "depthai/common/KeypointsList.hpp" +#include "depthai/common/RotatedRect.hpp" +#include "depthai/common/TensorInfo.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "depthai/properties/DetectionParserProperties.hpp" +#include "pipeline/utilities/NNDataViewer.hpp" + +namespace dai { +namespace utilities { +namespace DetectionParserUtils { + +// yolo v6 r1 - anchor free +void decodeR1AF(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
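// Illustration (assumed values, not from this patch): a three-head YOLO export typically
// uses strides {8, 16, 32}, i.e. exactly one stride per detected "yolo" output layer.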
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float score = outputData.get(4, row, col); + if(score < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + if(bestConf * score < confidenceThr) { + continue; + } + + float cx = outputData.get(0, row, col); + float cy = outputData.get(1, row, col); + float w = outputData.get(2, row, col); + float h = outputData.get(3, row, col); + + float xmin = cx - w * 0.5f; + float ymin = cy - h * 0.5f; + float xmax = cx + w * 0.5f; + float ymax = cy + h * 0.5f; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + bestConf * score, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, 
logger); + } +} + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + std::string errorMsg = fmt::format("Layer {} channels mismatch. 
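// Worked example of the channel check above (illustrative numbers): with numAnchors = 3
// and numClasses = 80, block = 5 + 80 = 85, so expectedC = 3 * 85 = 255 channels --
// the classic YOLOv3 head layout.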
Expected {}, got {}", layerName, expectedC, layerChannels); + throw std::runtime_error(errorMsg); + } + + auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + const float tx = sigmoid(outputData.get(ch0 + 0, row, col)); + const float ty = sigmoid(outputData.get(ch0 + 1, row, col)); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = sigmoid(outputData.get(ch0 + 4, row, col)); + if(obj < confidenceThr) continue; + + int bestC = 0; + float clsProb = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float prob = outputData.get(ch0 + 5 + c, row, col); + if(prob > clsProb) { + clsProb = prob; + bestC = c; + } + } + const float conf = obj * 1.f / (1.f + std::exp(-clsProb)); + if(conf < confidenceThr) continue; + + // YOLOv3 decode + const float cx = (static_cast(col) + tx) * static_cast(stride); + const float cy = (static_cast(row) + ty) * static_cast(stride); + const float w_exp = std::exp(tw); + const float h_exp = std::exp(th); + const float w = w_exp * anchors[a][0]; + const float h = h_exp * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + conf, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + // +} + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + logger->error("Layer {} channels mismatch. 
Expected {}, got {}", layerName, expectedC, layerChannels); + return; + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + + const float tx = outputData.get(ch0 + 0, row, col); + const float ty = outputData.get(ch0 + 1, row, col); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = outputData.get(ch0 + 4, row, col); + if(obj < confidenceThr) continue; + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float prob = outputData.get(ch0 + 5 + c, row, col); + if(prob > bestConf) { + bestConf = prob; + bestC = c; + } + } + const float conf = obj * bestConf; + if(conf < confidenceThr) continue; + + // YOLOv5 decode + const float cx = ((tx * 2.0f - 0.5f) + static_cast(col)) * static_cast(stride); + const float cy = ((ty * 2.0f - 0.5f) + static_cast(row)) * static_cast(stride); + + const float w = tw * tw * 4.0f * anchors[a][0]; + const float h = th * th * 4.0f * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) continue; + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + conf, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +/* +Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v11 +*/ +void decodeTLBR(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float score = outputData.get(4, row, col); + if(score < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + float xmin = (col - outputData.get(0, row, col) + 0.5f) * stride; + float ymin = (row - outputData.get(1, row, col) + 0.5f) * stride; + float xmax = (col + outputData.get(2, row, col) + 0.5f) * stride; + float ymax = (row + outputData.get(3, row, col) + 0.5f) * stride; + + if(bestConf < confidenceThr) { + continue; + } + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + bestConf, + bestC, + strideIdx, + row, + col, + std::nullopt, + + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, 
properties, logger); + } +} + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger) { + // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent + + int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast(properties.parser.anchorsV2.size()); + int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1); + + auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { + // Check that the dims size is big enough + if(static_cast(tensorInfo.dims.size()) <= channelDimIndex || static_cast(tensorInfo.dims.size()) <= alternativeDimIndex) { + logger->error("Invalid tensor dims size. Skipping."); + return false; + } + + if(tensorInfo.dims[channelDimIndex] != uint32_t(channelSize)) { + // Check if the channel size would match the alternative storage order + if(tensorInfo.dims[alternativeDimIndex] == uint32_t(channelSize)) { + logger->trace("Invalid channel size for the tensor. Expected {}, got {}, switching", channelSize, tensorInfo.dims[channelDimIndex]); + tensorInfo.order = alternativeOrder; + } else { + logger->error("Invalid channel size for the tensor. Expected {}, got {}. Skipping.", channelSize, tensorInfo.dims[channelDimIndex]); + return false; + } + } + return true; + }; + + switch(tensorInfo.order) { + case dai::TensorInfo::StorageOrder::CHW: + if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; + break; + case dai::TensorInfo::StorageOrder::HWC: + if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NCHW: + if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; + break; + case dai::TensorInfo::StorageOrder::NHWC: + if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NHCW: + case dai::TensorInfo::StorageOrder::WHC: + case dai::TensorInfo::StorageOrder::WCH: + case dai::TensorInfo::StorageOrder::HCW: + case dai::TensorInfo::StorageOrder::CWH: + case dai::TensorInfo::StorageOrder::NC: + case dai::TensorInfo::StorageOrder::CN: + case dai::TensorInfo::StorageOrder::C: + case dai::TensorInfo::StorageOrder::H: + case dai::TensorInfo::StorageOrder::W: + default: + logger->error("Invalid storage order for the tensor. 
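// Example of checkAndFixOrder above: with 80 classes, 4 coordinates and no anchors,
// channelSize = 85. A tensor labelled HWC with dims {H, W, 85} passes as-is, while
// dims {85, H, W} under the same label is reinterpreted as CHW.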
Skipping."); + return false; + } + + return true; +} + +std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames) { + if(outputNames.empty()) { + outputNames = nnData->getAllLayerNames(); + } + + std::vector layerNames; + for(const auto& name : outputNames) { + // if yolo in the name, push it to layerNames + if(name.find(searchTerm) != std::string::npos) { + layerNames.push_back(name); + } + } + + std::sort(layerNames.begin(), layerNames.end()); + return layerNames; +} + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2) { + float width_of_overlap_area = fmin(box1.xmax, box2.xmax) - fmax(box1.xmin, box2.xmin); + float height_of_overlap_area = fmin(box1.ymax, box2.ymax) - fmax(box1.ymin, box2.ymin); + float area_of_overlap; + if(width_of_overlap_area < 0 || height_of_overlap_area < 0) + area_of_overlap = 0; + else + area_of_overlap = width_of_overlap_area * height_of_overlap_area; + float box_1_area = (box1.ymax - box1.ymin) * (box1.xmax - box1.xmin); + float box_2_area = (box2.ymax - box2.ymin) * (box2.xmax - box2.xmin); + float area_of_union = box_1_area + box_2_area - area_of_overlap; + return area_of_overlap / area_of_union; +} + +std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr) { + std::sort( + detectionCandidates.begin(), detectionCandidates.end(), [](const DetectionCandidate& a, const DetectionCandidate& b) { return a.score > b.score; }); + + std::vector keep(detectionCandidates.size(), 1); + std::vector keepIndices; + keepIndices.reserve(detectionCandidates.size()); + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { + if(!keep[i]) continue; + keepIndices.push_back(i); + + for(size_t j = i + 1; j < detectionCandidates.size(); ++j) { + if(!keep[j]) continue; + if(YoloIntersectionOverUnion(detectionCandidates[i], detectionCandidates[j]) >= iouThr) { + keep[j] = 0; + } + } + } + + std::vector keepCandidates; + keepCandidates.reserve(keepIndices.size()); + for(size_t idx : keepIndices) keepCandidates.push_back(detectionCandidates[idx]); + + return keepCandidates; +} + +void createImgDetections(const std::vector& detectionCandidates, + std::shared_ptr outDetections, + unsigned int width, + unsigned int height) { + for(const auto& det : detectionCandidates) { + dai::ImgDetection detection; + dai::RotatedRect rotatedRect(dai::Rect(dai::Point2f(det.xmin, det.ymin), dai::Point2f(det.xmax, det.ymax)), 0.0f); + detection.setBoundingBox(rotatedRect.normalize(width, height)); + detection.confidence = det.score; + detection.label = det.label; + if(det.labelName) { + detection.labelName = *det.labelName; + } + outDetections->detections.push_back(std::move(detection)); + } +} + +void segmentationDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto maskFromCoeffs = [](NNDataViewer& protos, const float* coeffs, int width, int height) -> cv::Mat { + cv::Mat maskLow(height, width, CV_32F); + for(int y = 0; y < maskLow.rows; ++y) { + float* row = maskLow.ptr(y); + for(int x = 0; x < maskLow.cols; ++x) { + float sum = 0.f; + for(int c = 0; c < 32; ++c) sum += protos.get(c, y, x) * coeffs[c]; + row[x] = 1.f / (1.f + std::exp(-sum)); // sigmoid + } + } + return maskLow; + }; + + std::pair inputSize = nnData->transformation->getSize(); + int inputWidth = inputSize.first; + int inputHeight = inputSize.second; + + cv::Mat indexMask(inputHeight, 
inputWidth, CV_8U, cv::Scalar(255)); + + cv::Mat maskLow, maskUp; + + auto maskLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "masks", std::vector{}); + if(properties.parser.strides.size() != maskLayerNames.size()) { + logger->error( + "Number of strides does not match number of mask output layers. Strides size: {}, mask output layers size: {}. Skipping segmentation decoding.", + properties.parser.strides.size(), + maskLayerNames.size()); + return; + } + auto protoLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "proto", std::vector{}); + if(protoLayerNames.size() == 0) { + logger->error("Expecting proto output layer, found no layer with proto label. Skipping segmentation decoding."); + return; + } + + NNDataViewer protoValues = NNDataViewer(*nnData->getTensorInfo(protoLayerNames[0]), nnData->data, logger); + if(!protoValues.build()) { + logger->error("Failed to build NNDataViewer for proto layer {}. Skipping segmentation decoding.", protoLayerNames[0]); + return; + } + + std::map maskValues; + for(int strideIdx = 0; strideIdx < static_cast(maskLayerNames.size()); ++strideIdx) { + maskValues.try_emplace(strideIdx, *nnData->getTensorInfo(maskLayerNames[strideIdx]), nnData->data, logger); + if(!maskValues.at(strideIdx).build()) { + logger->error("Failed to build NNDataViewer for mask layer {}. Skipping segmentation decoding.", maskLayerNames[strideIdx]); + return; + } + } + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { // loop over all detections + const auto& c = detectionCandidates[i]; + const int detIdx = static_cast(i); // index in outDetections list + + NNDataViewer mask = maskValues.at(c.headIndex); + std::array coeff; + for(int i = 0; i < 32; ++i) { + coeff[i] = mask.get(i, c.rowIndex, c.columnIndex); + } + + TensorInfo protoInfo = *nnData->getTensorInfo(protoLayerNames[0]); + int protoWidth = protoInfo.getWidth(); + int protoHeight = protoInfo.getHeight(); + maskLow = maskFromCoeffs(protoValues, coeff.data(), protoWidth, protoHeight); + + cv::resize(maskLow, maskUp, cv::Size(inputWidth, inputHeight), 0, 0, cv::INTER_LINEAR); + // ROI clamp + int x0 = std::clamp(static_cast(std::floor(c.xmin)), 0, inputWidth - 1); + int y0 = std::clamp(static_cast(std::floor(c.ymin)), 0, inputHeight - 1); + int x1 = std::clamp(static_cast(std::ceil(c.xmax)), 0, inputWidth); + int y1 = std::clamp(static_cast(std::ceil(c.ymax)), 0, inputHeight); + + if(x1 <= x0 || y1 <= y0) continue; + const cv::Rect roi(x0, y0, x1 - x0, y1 - y0); + + // Threshold & paint only unassigned pixels + cv::Mat roiProb = maskUp(roi); + cv::Mat roiBin; + cv::compare(roiProb, static_cast(0.5f), roiBin, cv::CMP_GT); + cv::Mat roiOut = indexMask(roi); + cv::Mat unassigned; + cv::compare(roiOut, 255, unassigned, cv::CMP_EQ); + cv::Mat paintMask; + cv::bitwise_and(roiBin, unassigned, paintMask); + + const uint8_t value = static_cast(std::min(detIdx, 254)); + roiOut.setTo(value, paintMask); + } + + outDetections->setSegmentationMask(indexMask); +} + +void keypointDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + std::vector featureMapWidths; + for(int i = 0; i < static_cast(yoloLayerNames.size()); ++i) { + auto tensorInfo = 
nnData->getTensorInfo(yoloLayerNames[i]); + if(!tensorInfo) { + logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]); + return; + } + featureMapWidths.push_back(tensorInfo->getWidth()); + } + + auto kptsLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "kpt_output", std::vector{}); + if(properties.parser.strides.size() != kptsLayerNames.size()) { + logger->error( + "Number of strides does not match number of keypoints output layers. Strides size: {}, keypoints output layers size: {}. Skipping keypoints " + "decoding.", + properties.parser.strides.size(), + kptsLayerNames.size()); + return; + } + + // TODO (aljaz) move to a function + std::map keypointValues; + for(int strideIdx = 0; strideIdx < static_cast(kptsLayerNames.size()); ++strideIdx) { + keypointValues.try_emplace(strideIdx, *nnData->getTensorInfo(kptsLayerNames[strideIdx]), nnData->data, logger); + if(!keypointValues.at(strideIdx).build()) { + logger->error("Failed to build NNDataViewer for keypoints layer {}. Skipping keypoints decoding.", kptsLayerNames[strideIdx]); + return; + } + } + + if(outDetections->detections.size() != detectionCandidates.size()) { + logger->error( + "Number of detections in ImgDetections does not match number of detection candidates. ImgDetections size: {}, detection candidates size: {}. " + "Skipping keypoints decoding.", + outDetections->detections.size(), + detectionCandidates.size()); + return; + } + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { // loop over all detections + const auto& c = detectionCandidates[i]; + int flattenedIndex = c.rowIndex * featureMapWidths[c.headIndex] + c.columnIndex; + + std::vector keypoints; + keypoints.reserve(*properties.parser.nKeypoints); + NNDataViewer keypointMask = keypointValues.at(c.headIndex); + + for(int k = 0; k < properties.parser.nKeypoints; ++k) { + int base = 3 * k; + + // keypointValues tensor storage order HWC + // H == 0 + // W == 51 == 17 * 3 (x, y, conf for each keypoint) + // C == flattened spatial dimensions of row x col of the feature map + float x = std::clamp(keypointMask.get(flattenedIndex, 0, base + 0) / inputWidth, 0.0f, 1.0f); + float y = std::clamp(keypointMask.get(flattenedIndex, 0, base + 1) / inputHeight, 0.0f, 1.0f); + float conf = 1.f / (1.f + std::exp(-(keypointMask.get(flattenedIndex, 0, base + 2)))); + + keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf}); + } + + outDetections->detections[i].keypoints = KeypointsList(keypoints); + } +} + +} // namespace DetectionParserUtils +} // namespace utilities +} // namespace dai \ No newline at end of file diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp new file mode 100644 index 000000000..85b5a234f --- /dev/null +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp @@ -0,0 +1,85 @@ +#pragma once + +#include + +#include + +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "depthai/properties/DetectionParserProperties.hpp" + +namespace dai { +namespace utilities { +namespace DetectionParserUtils { + +struct DetectionCandidate { + float xmin, ymin, xmax, ymax, score; + int label, headIndex, rowIndex, columnIndex; + std::optional labelName; +}; + +/* +Decode anchor free yolo v6r1 with sigmoid assisted center detection +*/ +void decodeR1AF(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties 
properties, + std::shared_ptr logger); + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +/* +Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, v8, v10, v11 +*/ +void decodeTLBR(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames); + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2); + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger); + +void createImgDetections(std::vector& detectionCandidates, + std::vector keepIndices, + std::shared_ptr outDetections, + std::shared_ptr logger); + +std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr); + +void createImgDetections(const std::vector& detectionCandidates, + std::shared_ptr outDetections, + unsigned int width, + unsigned int height); + +void segmentationDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +void keypointDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +} // namespace DetectionParserUtils +} // namespace utilities +} // namespace dai \ No newline at end of file diff --git a/src/pipeline/utilities/NNDataViewer.hpp b/src/pipeline/utilities/NNDataViewer.hpp new file mode 100644 index 000000000..94ab12cda --- /dev/null +++ b/src/pipeline/utilities/NNDataViewer.hpp @@ -0,0 +1,163 @@ +#pragma once +#include + +#include "depthai/common/TensorInfo.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "fp16/fp16.h" +namespace dai { +class NNDataViewer { + public: + std::shared_ptr data; + dai::TensorInfo tensor; + std::shared_ptr logger; + + // Factors to multiply with before the vectors + struct FactorsBefore { + int32_t h; + int32_t w; + int32_t c; + }; + + FactorsBefore factorsBefore; + + NNDataViewer(dai::TensorInfo tensor, std::shared_ptr data, std::shared_ptr logger) + : data{data}, tensor{tensor}, logger{logger} {}; + bool build() { + if(tensor.strides.size() < 2) { + logger->error("Tensor doesn't have enough strides. Number of strides: {}, expected: {}", tensor.strides.size(), 2); + return false; + } + if(tensor.strides[0] == 0 || tensor.strides[1] == 0) { + logger->error("Tensor strides should not be set to zero. Strides are {} {}", tensor.strides[0], tensor.strides[1]); + return false; + } + switch(tensor.order) { + case TensorInfo::StorageOrder::NCHW: + if(tensor.dims[0] != 1) { + logger->error("NCHW is only supported in Detection Parser if N is 1. 
It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + } + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.strides[2]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHWC: + if(tensor.dims[0] != 1) { + logger->error("NHWC is only supported in Detection Parser if N is 1. It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + } + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.strides[2]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::HCW: + factorsBefore.h = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::HWC: + factorsBefore.h = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::CHW: + factorsBefore.c = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::CWH: + factorsBefore.c = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WCH: + factorsBefore.w = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WHC: + factorsBefore.w = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHCW: + case TensorInfo::StorageOrder::NC: + case TensorInfo::StorageOrder::CN: + case TensorInfo::StorageOrder::H: + case TensorInfo::StorageOrder::W: + case TensorInfo::StorageOrder::C: + default: + logger->error("Storage order not supported in NNDataViewer"); + return false; + } + return sanity_check(); + } + + bool sanity_check() { + if(data->getSize() < (tensor.offset + (tensor.dims[0] * tensor.strides[0]))) { + logger->error( + "Underlying data does not hold enough data for the tensor to be contained.\ + Tensor size: {}, Tensor offset: {}, Data type size: {}, Data size: {} ", + tensor.dims[0] * tensor.strides[0], + tensor.offset, + tensor.getDataTypeSize(), + data->getSize()); + return false; + } + if(tensor.dims.size() < 2) { + logger->error("Number of dimensions for the input tensor is expected to be at least 2. It is {}", tensor.dims.size()); + return false; + } + return true; + }; + + inline float get(int c, int h, int w) { + // If this turns out to be slow, use a function pointer instead and point to the right getter at build time + int32_t index = tensor.offset + factorsBefore.h * h + factorsBefore.w * w + factorsBefore.c * c; +#ifdef DEPTHAI_SAFE_NN_DATA_ACCESS + logger->trace("Offset {}, fbH {}, fbW {}, fbC {}, h {}, w {}, c{}", tensor.offset, factorsBefore.h, factorsBefore.w, factorsBefore.c, h, w, c); + if(index > data->getSize()) { + logger->error("Out of bound access. 
Size is {}, index is {}", data->getSize(), index); + return 0.0; + } +#endif + + switch(tensor.dataType) { + case TensorInfo::DataType::U8F: { + uint8_t dataOut = data->getData()[index]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::I8: { + int8_t dataOut = static_cast(data->getData()[index]); + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::INT: { + int32_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int32_t)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP16: { + int16_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int16_t)]; + return (fp16_ieee_to_fp32_value(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP32: { + float dataOut = reinterpret_cast(data->getData().data())[index / sizeof(float)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP64: + default: { + return 0.0f; + } + } + } +}; +} // namespace dai From 96a92f58cac13c433300945107e82fb1dcf03ab0 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Wed, 5 Nov 2025 10:47:23 +0100 Subject: [PATCH 03/24] Add host side implementation --- .../pipeline/node/DetectionParserBindings.cpp | 2 ++ examples/cpp/DetectionNetwork/CMakeLists.txt | 2 +- .../detection_and_keypoints.cpp | 1 - .../{RVC4 => }/detection_and_segmentation.cpp | 35 ++++++++++++------- .../{RVC4 => }/detection_and_segmentation.py | 14 ++++++-- tests/CMakeLists.txt | 2 +- 6 files changed, 38 insertions(+), 18 deletions(-) rename examples/cpp/DetectionNetwork/{RVC4 => }/detection_and_segmentation.cpp (84%) rename examples/python/DetectionNetwork/{RVC4 => }/detection_and_segmentation.py (92%) diff --git a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp index 7e5a50c4f..eab544ed4 100644 --- a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp +++ b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp @@ -65,11 +65,13 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) { DOC(dai, node, DetectionParser, setAnchors, 2)) .def("setAnchorMasks", &DetectionParser::setAnchorMasks, py::arg("anchorMasks"), DOC(dai, node, DetectionParser, setAnchorMasks)) .def("setIouThreshold", &DetectionParser::setIouThreshold, py::arg("thresh"), DOC(dai, node, DetectionParser, setIouThreshold)) + .def("setRunOnHost", &DetectionParser::setRunOnHost, py::arg("runOnHost"), DOC(dai, node, DetectionParser, setRunOnHost)) .def("getNumClasses", &DetectionParser::getNumClasses, DOC(dai, node, DetectionParser, getNumClasses)) .def("getCoordinateSize", &DetectionParser::getCoordinateSize, DOC(dai, node, DetectionParser, getCoordinateSize)) .def("getAnchors", &DetectionParser::getAnchors, DOC(dai, node, DetectionParser, getAnchors)) .def("getAnchorMasks", &DetectionParser::getAnchorMasks, DOC(dai, node, DetectionParser, getAnchorMasks)) .def("getIouThreshold", &DetectionParser::getIouThreshold, DOC(dai, node, DetectionParser, getIouThreshold)) + .def("runOnHost", &DetectionParser::runOnHost, DOC(dai, node, DetectionParser, runOnHost)) .def("build", &DetectionParser::build, DOC(dai, node, DetectionParser, build)); daiNodeModule.attr("DetectionParser").attr("Properties") = detectionParserProperties; } diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt index 8193faeb9..8c3ba6ecf 100644 
--- a/examples/cpp/DetectionNetwork/CMakeLists.txt +++ b/examples/cpp/DetectionNetwork/CMakeLists.txt @@ -23,7 +23,7 @@ dai_set_example_test_labels(detection_network ondevice rvc2_all rvc4 rvc4rgb ci) dai_add_example(detection_network_remap detection_network_remap.cpp ON OFF) dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci) -dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF) +dai_add_example(detection_and_segmentation detection_and_segmentation.cpp ON OFF) dai_set_example_test_labels(detection_and_segmentation rvc4) dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF) diff --git a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp index 667151bb0..f374bdca1 100644 --- a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp @@ -86,7 +86,6 @@ int main() { auto currentTime = std::chrono::steady_clock::now(); float fps = counter / std::chrono::duration(currentTime - startTime).count(); - std::cout << "FPS: " << fps << std::endl; } if(cv::waitKey(1) == 'q') { diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp similarity index 84% rename from examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp rename to examples/cpp/DetectionNetwork/detection_and_segmentation.cpp index 4912d04c6..e3e81dcbf 100644 --- a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -16,8 +17,16 @@ cv::Rect frameNorm(const cv::Mat& frame, const dai::Point2f& topLeft, const dai: } int main() { + std::string modelName = "luxonis/yolov8-instance-segmentation-large:coco-640x352"; + bool setRunOnHost = false; + auto device = std::make_shared(); + + if(device->getPlatformAsString() == "RVC2") { + modelName = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"; + setRunOnHost = true; + } // Create pipeline - dai::Pipeline pipeline; + dai::Pipeline pipeline{device}; // Create and configure camera node auto cameraNode = pipeline.create(); @@ -27,8 +36,10 @@ int main() { auto detectionNetwork = pipeline.create(); dai::NNModelDescription modelDescription; - modelDescription.model = "luxonis/yolov8-instance-segmentation-large:coco-640x480"; + + modelDescription.model = modelName; detectionNetwork->build(cameraNode, modelDescription); + detectionNetwork->detectionParser->setRunOnHost(setRunOnHost); auto labelMap = detectionNetwork->getClasses(); // Create output queues @@ -121,16 +132,18 @@ int main() { detections.begin(), detections.end(), [filteredLabel](const dai::ImgDetection& det) { return det.label != filteredLabel; }), detections.end()); } + if(!segmentationMask.empty()) { + cv::Mat lut(1, 256, CV_8U); + for(int i = 0; i < 256; ++i) lut.at(i) = (i >= 255) ? 255 : cv::saturate_cast(i * 25); - cv::Mat lut(1, 256, CV_8U); - for(int i = 0; i < 256; ++i) lut.at(i) = (i == 255) ? 
255 : cv::saturate_cast(i * 25); - cv::Mat scaledMask; - cv::LUT(segmentationMask, lut, scaledMask); + cv::Mat scaledMask; + cv::LUT(segmentationMask, lut, scaledMask); - cv::Mat coloredMask; - cv::applyColorMap(scaledMask, coloredMask, cv::COLORMAP_JET); - frame.copyTo(coloredMask, (scaledMask == 255)); - cv::addWeighted(frame, 0.7, coloredMask, 0.3, 0, frame); + cv::Mat coloredMask; + cv::applyColorMap(scaledMask, coloredMask, cv::COLORMAP_JET); + frame.copyTo(coloredMask, (scaledMask == 255)); + cv::addWeighted(frame, 0.7, coloredMask, 0.3, 0, frame); + } // Display detections for(const auto& detection : detections) { @@ -157,8 +170,6 @@ int main() { cv::imshow("rgb", frame); auto currentTime = std::chrono::steady_clock::now(); - float fps = counter / std::chrono::duration(currentTime - startTime).count(); - std::cout << "FPS: " << fps << std::endl; } } } diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py similarity index 92% rename from examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py rename to examples/python/DetectionNetwork/detection_and_segmentation.py index 650f90f2f..a8ecc74a6 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py +++ b/examples/python/DetectionNetwork/detection_and_segmentation.py @@ -6,11 +6,19 @@ import numpy as np import time +model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480" +setRunOnHost = False +device = dai.Device() +if device.getPlatformAsString() == "RVC2": + model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288" + setRunOnHost = True + # Create pipeline -with dai.Pipeline() as pipeline: +with dai.Pipeline(device) as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-nano:coco-512x288")) - # detectionNetwork.detectionParser.runOnHost(True) + + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription(model_name)) + detectionNetwork.detectionParser.setRunOnHost(setRunOnHost) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d3823f1a8..f4b0776b2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -524,7 +524,7 @@ FIRE_VIDEO="${fire_video}" KITCHEN_IMAGE_PATH="${kitchen_image}" YOLO_V8_INSTANCE_SEGMENTATION_LARGE_COCO_640x352_KITCHEN_SEGMENTATION_GROUND_TRUTH="${yolo_v8_instance_segmentation_large_coco_640x352_kitchen_segmentation_gt}" ) -dai_set_test_labels(detection_parser_test ondevice rvc4 ci) +dai_set_test_labels(detection_parser_test ondevice rvc4 ci onhost) # Spatial detection network test dai_add_test(spatial_detection_network_test src/ondevice_tests/pipeline/node/spatial_detection_network_test.cpp) From a0dd29a87dcf460a720332847689570829a99707 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Wed, 5 Nov 2025 10:52:36 +0100 Subject: [PATCH 04/24] bump device --- cmake/Depthai/DepthaiDeviceSideConfig.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake index b0a270a1d..7c6bb3df4 100644 --- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake +++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake @@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot") # "full 
commit hash of device side binary" -set(DEPTHAI_DEVICE_SIDE_COMMIT "e658b28655820c649b3bbed9f44865d00139094d") +set(DEPTHAI_DEVICE_SIDE_COMMIT "8741ce89206d2a5299acc3382c7496e1ee205fcb") # "version if applicable" set(DEPTHAI_DEVICE_SIDE_VERSION "") From 33752f1f44726f7e977dac9e1f6bd1ff79228da1 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Wed, 5 Nov 2025 12:54:19 +0100 Subject: [PATCH 05/24] bump rvc4 --- cmake/Depthai/DepthaiDeviceRVC4Config.cmake | 2 +- .../pipeline/node/spatial_location_calculator_test.cpp | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake index 43d640f5e..f6ae0d22b 100644 --- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake +++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake @@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot") # "version if applicable" -set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+53bd364bc4c519e9aa6230b3de4d78a78d073373") +set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+62ce59c3c4a4a53a9b0773fe83dabbecdc4553e9") diff --git a/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp b/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp index 1316e8566..0a8ca09b7 100644 --- a/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp +++ b/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp @@ -1,6 +1,5 @@ -#include - #include +#include #include #include #include @@ -106,7 +105,6 @@ TEST_CASE("SpatialLocationCalculator synthetic depth data test") { auto outputQueue = spatial->out.createOutputQueue(); auto passthroughQueue = spatial->passthroughDepth.createOutputQueue(); - std::vector depthPixels(width * height, 1000); auto setRegionDepth = [&](const RoiSpec& spec) { const int x0 = static_cast(spec.roi.x); @@ -120,7 +118,7 @@ TEST_CASE("SpatialLocationCalculator synthetic depth data test") { for(const auto& spec : roiSpecs) { setRegionDepth(spec); } - + // Prepare synthetic depth frame auto depthFrame = std::make_shared(); depthFrame->setType(dai::ImgFrame::Type::RAW16); From 255a8824da8078e10cf8b8d19623a8cdc649daa7 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 11 Nov 2025 16:59:50 +0100 Subject: [PATCH 06/24] update parser --- .../utilities/DetectionParser/DetectionParserUtils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index c1809e847..a9455e551 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -10,7 +10,7 @@ #include #include -#include "depthai/common/KeypointsList.hpp" +#include "depthai/common/KeypointsListT.hpp" #include "depthai/common/RotatedRect.hpp" #include "depthai/common/TensorInfo.hpp" #include "depthai/pipeline/datatype/ImgDetections.hpp" @@ -888,7 +888,7 @@ void keypointDecode(std::shared_ptr nnData, keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf}); } - outDetections->detections[i].keypoints = KeypointsList(keypoints); + outDetections->detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges); } } From 831232c1ac7faf4cb2e181bc74d5067f9aba220f Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 11 Nov 2025 17:57:55 +0100 Subject: [PATCH 07/24] update example --- examples/python/DetectionNetwork/detection_and_keypoints.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/examples/python/DetectionNetwork/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py index b61c41fc9..4459be138 100644 --- a/examples/python/DetectionNetwork/detection_and_keypoints.py +++ b/examples/python/DetectionNetwork/detection_and_keypoints.py @@ -7,7 +7,7 @@ # Create pipeline with dai.Pipeline() as pipeline: - cameraNode = pipeline.create(dai.node.Camera).build() + cameraNode = pipeline.create(dai.node.Camera).build(sensorFps=12) detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288")) labelMap = detectionNetwork.getClasses() From 1203e29ff34f107e491841cc50f656506eca9e2c Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Mon, 17 Nov 2025 08:28:27 +0100 Subject: [PATCH 08/24] Move parsing logic to host --- .../DetectionParser/DetectionParserUtils.cpp | 295 +++++++++++------- .../DetectionParser/DetectionParserUtils.hpp | 41 ++- 2 files changed, 197 insertions(+), 139 deletions(-) diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index a9455e551..33d38cea9 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -2,14 +2,22 @@ #include +#include #include #include #include #include #include +#include +#include +#include +#include #include #include +#include +#include "DetectionParserUtils.hpp" +#include "depthai/common/Keypoint.hpp" #include "depthai/common/KeypointsListT.hpp" #include "depthai/common/RotatedRect.hpp" #include "depthai/common/TensorInfo.hpp" @@ -23,11 +31,11 @@ namespace utilities { namespace DetectionParserUtils { // yolo v6 r1 - anchor free -void decodeR1AF(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeR1AF(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger) { - auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto layerNames = utilities::DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; if(strides.size() != layerNames.size()) { @@ -40,17 +48,18 @@ void decodeR1AF(std::shared_ptr nnData, const int numClasses = properties.parser.classes; int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); if(inputWidth <= 0 || inputHeight <= 0) { throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); } std::vector detectionCandidates; - detectionCandidates.reserve(250); + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { std::string layerName = layerNames[strideIdx]; - auto tensorInfo = nnData->getTensorInfo(layerName); + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); if(!tensorInfo) { std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); throw std::runtime_error(errorMsg); @@ -63,7 +72,7 @@ void decodeR1AF(std::shared_ptr nnData, int layerHeight = tensorInfo->getHeight(); int layerWidth = 
tensorInfo->getWidth(); - NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); if(!outputData.build()) { std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); throw std::runtime_error(errorMsg); @@ -71,8 +80,8 @@ void decodeR1AF(std::shared_ptr nnData, for(int row = 0; row < layerHeight; ++row) { for(int col = 0; col < layerWidth; ++col) { - const float score = outputData.get(4, row, col); - if(score < confidenceThr) { + const float objectnessScore = outputData.get(4, row, col); + if(objectnessScore < confidenceThr) { continue; } @@ -85,7 +94,7 @@ void decodeR1AF(std::shared_ptr nnData, bestC = c; } } - if(bestConf * score < confidenceThr) { + if(bestConf * objectnessScore < confidenceThr) { continue; } @@ -105,7 +114,20 @@ void decodeR1AF(std::shared_ptr nnData, ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); if(xmax <= xmin || ymax <= ymin) { - logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + logger->info("Invalid bbox parameters. Either xmax <= xmin or ymax <= ymin. Skipping detection."); + logger->debug( + "Skipping invalid bbox: layer='{}', " + "raw(cx,cy,w,h)=({:.2f},{:.2f},{:.2f},{:.2f}) " + "clamped(xmin,ymin,xmax,ymax)=({:.2f},{:.2f},{:.2f},{:.2f}).", + layerName, + cx, + cy, + w, + h, + xmin, + ymin, + xmax, + ymax); continue; } DetectionCandidate candidate = DetectionCandidate{ @@ -113,17 +135,13 @@ void decodeR1AF(std::shared_ptr nnData, ymin, xmax, ymax, - bestConf * score, + bestConf * objectnessScore, bestC, strideIdx, row, col, - std::nullopt, }; - if(!properties.parser.classNames->empty()) { - candidate.labelName = (*properties.parser.classNames)[bestC]; - } detectionCandidates.emplace_back(std::move(candidate)); } } @@ -134,6 +152,11 @@ void decodeR1AF(std::shared_ptr nnData, logger->trace("No detections after NMS, skipping overlay."); return; } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); @@ -151,11 +174,12 @@ void decodeR1AF(std::shared_ptr nnData, /* Decode anchor based yolo v3 and v3-Tiny */ -void decodeV3AB(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeV3AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger) { - auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; const std::vector strides = properties.parser.strides; if(strides.size() != layerNames.size()) { @@ -169,7 +193,7 @@ void decodeV3AB(std::shared_ptr nnData, const int numClasses = properties.parser.classes; int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); if(inputWidth <= 0 || inputHeight <= 0) { throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); } @@ -182,12 +206,12 @@ void decodeV3AB(std::shared_ptr nnData, } std::vector detectionCandidates; - detectionCandidates.reserve(250); + 
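Each decoder in this file applies the same candidate gate before NMS: the objectness score admits a cell, the best class score refines it, and the box is clamped to the network input. A standalone sketch of that gate (types and names are illustrative only, not the library's internals):

    #include <algorithm>

    struct Box { float xmin, ymin, xmax, ymax; };

    // Returns true if the cell survives the confidence gate and yields a valid box.
    bool keepCandidate(float objectness, float bestClassScore, float confidenceThr,
                       Box& box, float inputW, float inputH) {
        if(objectness < confidenceThr || objectness * bestClassScore < confidenceThr) return false;
        box.xmin = std::clamp(box.xmin, 0.0f, inputW);
        box.ymin = std::clamp(box.ymin, 0.0f, inputH);
        box.xmax = std::clamp(box.xmax, 0.0f, inputW);
        box.ymax = std::clamp(box.ymax, 0.0f, inputH);
        return box.xmax > box.xmin && box.ymax > box.ymin;
    }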
detectionCandidates.reserve(defaultMaxDetectionsPerFrame); for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { std::string layerName = layerNames[strideIdx]; int stride = strides[strideIdx]; - auto tensorInfo = nnData->getTensorInfo(layerName); + auto tensorInfo = nnData.getTensorInfo(layerName); if(!tensorInfo) { std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); throw std::runtime_error(errorMsg); @@ -202,7 +226,7 @@ void decodeV3AB(std::shared_ptr nnData, int layerWidth = tensorInfo->getWidth(); int layerChannels = tensorInfo->getChannels(); - NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); if(!outputData.build()) { std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); throw std::runtime_error(errorMsg); @@ -217,8 +241,6 @@ void decodeV3AB(std::shared_ptr nnData, throw std::runtime_error(errorMsg); } - auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; - for(int row = 0; row < layerHeight; ++row) { for(int col = 0; col < layerWidth; ++col) { for(int a = 0; a < numAnchors; ++a) { @@ -231,15 +253,15 @@ void decodeV3AB(std::shared_ptr nnData, if(obj < confidenceThr) continue; int bestC = 0; - float clsProb = 0.0f; + float clsLogit = 0.0f; for(int c = 0; c < numClasses; ++c) { - const float prob = outputData.get(ch0 + 5 + c, row, col); - if(prob > clsProb) { - clsProb = prob; + const float candidateLogit = outputData.get(ch0 + 5 + c, row, col); + if(candidateLogit > clsLogit) { + clsLogit = candidateLogit; bestC = c; } } - const float conf = obj * 1.f / (1.f + std::exp(-clsProb)); + const float conf = obj * sigmoid(clsLogit); if(conf < confidenceThr) continue; // YOLOv3 decode @@ -275,12 +297,8 @@ void decodeV3AB(std::shared_ptr nnData, strideIdx, row, col, - std::nullopt, }; - if(!properties.parser.classNames->empty()) { - candidate.labelName = (*properties.parser.classNames)[bestC]; - } detectionCandidates.emplace_back(std::move(candidate)); } } @@ -293,6 +311,12 @@ void decodeV3AB(std::shared_ptr nnData, return; } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); if(properties.parser.decodeSegmentation) { @@ -311,11 +335,11 @@ void decodeV3AB(std::shared_ptr nnData, /* Decode anchor based networks, e.g., yolo v5, v7, P */ -void decodeV5AB(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeV5AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger) { - auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; if(strides.size() != layerNames.size()) { @@ -329,7 +353,7 @@ void decodeV5AB(std::shared_ptr nnData, const int numClasses = properties.parser.classes; int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); if(inputWidth <= 0 || inputHeight <= 0) { throw std::runtime_error("Invalid input dimensions 
retrieved from NNData transformation."); @@ -343,12 +367,12 @@ void decodeV5AB(std::shared_ptr nnData, } std::vector detectionCandidates; - detectionCandidates.reserve(250); + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { std::string layerName = layerNames[strideIdx]; int stride = strides[strideIdx]; - auto tensorInfo = nnData->getTensorInfo(layerName); + auto tensorInfo = nnData.getTensorInfo(layerName); if(!tensorInfo) { std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); throw std::runtime_error(errorMsg); @@ -363,7 +387,7 @@ void decodeV5AB(std::shared_ptr nnData, int layerWidth = tensorInfo->getWidth(); int layerChannels = tensorInfo->getChannels(); - NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); if(!outputData.build()) { std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); throw std::runtime_error(errorMsg); @@ -393,9 +417,9 @@ void decodeV5AB(std::shared_ptr nnData, int bestC = 0; float bestConf = 0.0f; for(int c = 0; c < numClasses; ++c) { - const float prob = outputData.get(ch0 + 5 + c, row, col); - if(prob > bestConf) { - bestConf = prob; + const float candidateProb = outputData.get(ch0 + 5 + c, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; bestC = c; } } @@ -430,12 +454,8 @@ void decodeV5AB(std::shared_ptr nnData, strideIdx, row, col, - std::nullopt, }; - if(!properties.parser.classNames->empty()) { - candidate.labelName = (*properties.parser.classNames)[bestC]; - } detectionCandidates.emplace_back(std::move(candidate)); } } @@ -448,6 +468,12 @@ void decodeV5AB(std::shared_ptr nnData, return; } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); if(properties.parser.decodeSegmentation) { @@ -464,11 +490,11 @@ void decodeV5AB(std::shared_ptr nnData, /* Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v11 */ -void decodeTLBR(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeTLBR(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger) { - auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; if(strides.size() != layerNames.size()) { @@ -481,19 +507,19 @@ void decodeTLBR(std::shared_ptr nnData, const int numClasses = properties.parser.classes; int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); if(inputWidth <= 0 || inputHeight <= 0) { throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); } std::vector detectionCandidates; - detectionCandidates.reserve(250); + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { std::string layerName = 
layerNames[strideIdx]; int stride = strides[strideIdx]; - auto tensorInfo = nnData->getTensorInfo(layerName); + auto tensorInfo = nnData.getTensorInfo(layerName); if(!tensorInfo) { std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); throw std::runtime_error(errorMsg); @@ -506,7 +532,7 @@ void decodeTLBR(std::shared_ptr nnData, int layerHeight = tensorInfo->getHeight(); int layerWidth = tensorInfo->getWidth(); - NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); if(!outputData.build()) { std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); throw std::runtime_error(errorMsg); @@ -557,13 +583,8 @@ void decodeTLBR(std::shared_ptr nnData, strideIdx, row, col, - std::nullopt, - }; - if(!properties.parser.classNames->empty()) { - candidate.labelName = (*properties.parser.classNames)[bestC]; - } detectionCandidates.emplace_back(std::move(candidate)); } } @@ -575,6 +596,12 @@ void decodeTLBR(std::shared_ptr nnData, return; } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); if(properties.parser.decodeSegmentation) { @@ -590,11 +617,18 @@ void decodeTLBR(std::shared_ptr nnData, bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger) { // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent + auto getYoloChannelSize = [&](int classes, int coordinates, int anchors) -> int { + if(anchors == 0) { + anchors = 1; + } + return anchors * (classes + coordinates + 1); + }; int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast(properties.parser.anchorsV2.size()); int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1); - auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { + auto checkAndFixOrder = + [&](dai::TensorInfo::StorageOrder currentOrder, int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { // Check that the dims size is big enough if(static_cast(tensorInfo.dims.size()) <= channelDimIndex || static_cast(tensorInfo.dims.size()) <= alternativeDimIndex) { logger->error("Invalid tensor dims size. 
Skipping."); @@ -616,16 +650,16 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p switch(tensorInfo.order) { case dai::TensorInfo::StorageOrder::CHW: - if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; + if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::CHW, 0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; break; case dai::TensorInfo::StorageOrder::HWC: - if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; + if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::HWC, 2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; break; case dai::TensorInfo::StorageOrder::NCHW: - if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; + if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::NCHW, 1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; break; case dai::TensorInfo::StorageOrder::NHWC: - if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; + if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::NHWC, 3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; break; case dai::TensorInfo::StorageOrder::NHCW: case dai::TensorInfo::StorageOrder::WHC: @@ -645,9 +679,9 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p return true; } -std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames) { +std::vector getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector outputNames) { if(outputNames.empty()) { - outputNames = nnData->getAllLayerNames(); + outputNames = nnData.getAllLayerNames(); } std::vector layerNames; @@ -704,7 +738,7 @@ std::vector nonMaximumSuppression(std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, unsigned int width, unsigned int height) { for(const auto& det : detectionCandidates) { @@ -716,36 +750,21 @@ void createImgDetections(const std::vector& detectionCandida if(det.labelName) { detection.labelName = *det.labelName; } - outDetections->detections.push_back(std::move(detection)); + outDetections.detections.push_back(std::move(detection)); } } -void segmentationDecode(std::shared_ptr nnData, +void segmentationDecode(const dai::NNData& nnData, std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, DetectionParserProperties properties, std::shared_ptr logger) { - auto maskFromCoeffs = [](NNDataViewer& protos, const float* coeffs, int width, int height) -> cv::Mat { - cv::Mat maskLow(height, width, CV_32F); - for(int y = 0; y < maskLow.rows; ++y) { - float* row = maskLow.ptr(y); - for(int x = 0; x < maskLow.cols; ++x) { - float sum = 0.f; - for(int c = 0; c < 32; ++c) sum += protos.get(c, y, x) * coeffs[c]; - row[x] = 1.f / (1.f + std::exp(-sum)); // sigmoid - } - } - return maskLow; - }; - - std::pair inputSize = nnData->transformation->getSize(); + std::pair inputSize = nnData.transformation->getSize(); int inputWidth = inputSize.first; int inputHeight = inputSize.second; cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255)); - cv::Mat maskLow, maskUp; - auto maskLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "masks", std::vector{}); if(properties.parser.strides.size() != maskLayerNames.size()) { logger->error( @@ -760,15 +779,47 @@ void segmentationDecode(std::shared_ptr nnData, return; } - NNDataViewer protoValues = 
NNDataViewer(*nnData->getTensorInfo(protoLayerNames[0]), nnData->data, logger); + NNDataViewer protoValues = NNDataViewer(*nnData.getTensorInfo(protoLayerNames[0]), nnData.data, logger); if(!protoValues.build()) { logger->error("Failed to build NNDataViewer for proto layer {}. Skipping segmentation decoding.", protoLayerNames[0]); return; } + TensorInfo protoInfo = *nnData.getTensorInfo(protoLayerNames[0]); + int protoWidth = protoInfo.getWidth(); + int protoHeight = protoInfo.getHeight(); + int protoChannels = protoInfo.getChannels(); + if(protoWidth <= 0 || protoHeight <= 0 || protoChannels <= 0) { + logger->error("Invalid proto tensor dimensions: channels {}, height {}, width {}.", protoChannels, protoHeight, protoWidth); + return; + } + int protoWidthScaleFactor = inputWidth / protoWidth; + int protoHeightScaleFactor = inputHeight / protoHeight; + + cv::Mat maskUp; + cv::Mat maskLow(protoHeight, protoWidth, CV_32F); + + dai::NNData& nnDataNonConst = const_cast(nnData); + xt::xarray protoData = nnDataNonConst.getTensor(protoLayerNames[0], true); + Eigen::MatrixXf protoMatrix = Eigen::Map(protoData.data(), protoChannels, protoHeight * protoWidth); + + Eigen::RowVectorXf coeffs(protoChannels); + + auto maskFromCoeffs = [logger, protoHeight, protoWidth, &maskLow](const Eigen::MatrixXf& protos2d, const Eigen::RowVectorXf& coeffs) -> void { + if(protos2d.rows() != coeffs.size()) { + throw std::runtime_error("Mask coefficients size does not match proto channels."); + } + + Eigen::Map logits(maskLow.ptr(), protoHeight * protoWidth); + logits.noalias() = coeffs * protos2d; + + // no need to do sigmoid + // logits = (1.0f / (1.0f + (-logits.array()).exp())).matrix(); + }; + std::map maskValues; for(int strideIdx = 0; strideIdx < static_cast(maskLayerNames.size()); ++strideIdx) { - maskValues.try_emplace(strideIdx, *nnData->getTensorInfo(maskLayerNames[strideIdx]), nnData->data, logger); + maskValues.try_emplace(strideIdx, *nnData.getTensorInfo(maskLayerNames[strideIdx]), nnData.data, logger); if(!maskValues.at(strideIdx).build()) { logger->error("Failed to build NNDataViewer for mask layer {}. 
Skipping segmentation decoding.", maskLayerNames[strideIdx]); return; @@ -779,19 +830,15 @@ void segmentationDecode(std::shared_ptr nnData, const auto& c = detectionCandidates[i]; const int detIdx = static_cast(i); // index in outDetections list - NNDataViewer mask = maskValues.at(c.headIndex); - std::array coeff; - for(int i = 0; i < 32; ++i) { - coeff[i] = mask.get(i, c.rowIndex, c.columnIndex); + NNDataViewer& mask = maskValues.at(c.headIndex); + for(int ch = 0; ch < protoChannels; ++ch) { + coeffs(ch) = mask.get(ch, c.rowIndex, c.columnIndex); } + // TODO (aljaz) perform operations on ROI only instead of the full resolution + // Eigen::MatrixXf roiMatrix = protoMatrix.block(0, y0 * protoWidth + x0, protoChannels, (y1 - y0) * (x1 - x0)); - TensorInfo protoInfo = *nnData->getTensorInfo(protoLayerNames[0]); - int protoWidth = protoInfo.getWidth(); - int protoHeight = protoInfo.getHeight(); - maskLow = maskFromCoeffs(protoValues, coeff.data(), protoWidth, protoHeight); + maskFromCoeffs(protoMatrix, coeffs); - cv::resize(maskLow, maskUp, cv::Size(inputWidth, inputHeight), 0, 0, cv::INTER_LINEAR); - // ROI clamp int x0 = std::clamp(static_cast(std::floor(c.xmin)), 0, inputWidth - 1); int y0 = std::clamp(static_cast(std::floor(c.ymin)), 0, inputHeight - 1); int x1 = std::clamp(static_cast(std::ceil(c.xmax)), 0, inputWidth); @@ -800,10 +847,18 @@ void segmentationDecode(std::shared_ptr nnData, if(x1 <= x0 || y1 <= y0) continue; const cv::Rect roi(x0, y0, x1 - x0, y1 - y0); + int protoX0 = x0 / protoWidthScaleFactor; + int protoY0 = y0 / protoHeightScaleFactor; + int protoX1 = x1 / protoWidthScaleFactor; + int protoY1 = y1 / protoHeightScaleFactor; + const cv::Rect protoROI(protoX0, protoY0, protoX1 - protoX0, protoY1 - protoY0); + + cv::Mat roiProb; + cv::resize(maskLow(protoROI), roiProb, roi.size(), 0, 0, cv::INTER_LINEAR); + // Threshold & paint only unassigned pixels - cv::Mat roiProb = maskUp(roi); cv::Mat roiBin; - cv::compare(roiProb, static_cast(0.5f), roiBin, cv::CMP_GT); + cv::compare(roiProb, 0.0f, roiBin, cv::CMP_GT); cv::Mat roiOut = indexMask(roi); cv::Mat unassigned; cv::compare(roiOut, 255, unassigned, cv::CMP_EQ); @@ -814,22 +869,27 @@ void segmentationDecode(std::shared_ptr nnData, roiOut.setTo(value, paintMask); } - outDetections->setSegmentationMask(indexMask); + outDetections.setCvSegmentationMask(indexMask); } -void keypointDecode(std::shared_ptr nnData, +void keypointDecode(const dai::NNData& nnData, std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, DetectionParserProperties properties, std::shared_ptr logger) { + if(!properties.parser.nKeypoints) { + logger->warn("Number of keypoints not set in properties.parser.nKeypoints. 
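A note on the mask thresholding in segmentationDecode above: because sigmoid is monotonic and sigmoid(0) = 0.5, comparing the raw logits against 0 (the cv::CMP_GT against 0.0f) selects exactly the same pixels as applying the sigmoid and thresholding at 0.5, which is why the lambda skips the activation. A sketch of the per-detection logit computation, mirroring the Eigen mapping above (illustrative, not the exact library code):

    #include <Eigen/Dense>

    // protos2d: C x (H*W) prototype matrix; coeffs: 1 x C per-detection weights.
    // One matrix-vector product yields the mask logits; logit > 0 <=> sigmoid(logit) > 0.5.
    Eigen::RowVectorXf maskLogits(const Eigen::MatrixXf& protos2d, const Eigen::RowVectorXf& coeffs) {
        return coeffs * protos2d;  // 1 x (H*W)
    }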
Skipping keypoints decoding."); + return; + } + int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); - auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); std::vector featureMapWidths; - for(int i = 0; i < static_cast(yoloLayerNames.size()); ++i) { - auto tensorInfo = nnData->getTensorInfo(yoloLayerNames[i]); + for(int i = 0; i < yoloLayerNames.size(); ++i) { + auto tensorInfo = nnData.getTensorInfo(yoloLayerNames[i]); if(!tensorInfo) { logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]); return; @@ -850,18 +910,18 @@ void keypointDecode(std::shared_ptr nnData, // TODO (aljaz) move to a function std::map keypointValues; for(int strideIdx = 0; strideIdx < static_cast(kptsLayerNames.size()); ++strideIdx) { - keypointValues.try_emplace(strideIdx, *nnData->getTensorInfo(kptsLayerNames[strideIdx]), nnData->data, logger); + keypointValues.try_emplace(strideIdx, *nnData.getTensorInfo(kptsLayerNames[strideIdx]), nnData.data, logger); if(!keypointValues.at(strideIdx).build()) { logger->error("Failed to build NNDataViewer for keypoints layer {}. Skipping keypoints decoding.", kptsLayerNames[strideIdx]); return; } } - if(outDetections->detections.size() != detectionCandidates.size()) { + if(outDetections.detections.size() != detectionCandidates.size()) { logger->error( "Number of detections in ImgDetections does not match number of detection candidates. ImgDetections size: {}, detection candidates size: {}. 
" "Skipping keypoints decoding.", - outDetections->detections.size(), + outDetections.detections.size(), detectionCandidates.size()); return; } @@ -887,11 +947,10 @@ void keypointDecode(std::shared_ptr nnData, keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf}); } - - outDetections->detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges); + outDetections.detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges); } } } // namespace DetectionParserUtils } // namespace utilities -} // namespace dai \ No newline at end of file +} // namespace dai diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp index 85b5a234f..bb61eaa57 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp @@ -1,5 +1,4 @@ #pragma once - #include #include @@ -12,45 +11,45 @@ namespace dai { namespace utilities { namespace DetectionParserUtils { +constexpr std::size_t defaultMaxDetectionsPerFrame = 250; struct DetectionCandidate { float xmin, ymin, xmax, ymax, score; int label, headIndex, rowIndex, columnIndex; std::optional labelName; }; - /* Decode anchor free yolo v6r1 with sigmoid assisted center detection */ -void decodeR1AF(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeR1AF(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger); /* Decode anchor based yolo v3 and v3-Tiny */ -void decodeV3AB(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeV3AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger); /* Decode anchor based networks, e.g., yolo v5, v7, P */ -void decodeV5AB(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeV5AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger); /* Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, v8, v10, v11 */ -void decodeTLBR(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeTLBR(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger); -std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames); +std::vector getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector outputNames); float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2); @@ -58,25 +57,25 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p void createImgDetections(std::vector& detectionCandidates, std::vector keepIndices, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, std::shared_ptr logger); std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr); void createImgDetections(const std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, unsigned int width, unsigned int height); -void segmentationDecode(std::shared_ptr nnData, +void segmentationDecode(const dai::NNData& 
nnData, std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, DetectionParserProperties properties, std::shared_ptr logger); -void keypointDecode(std::shared_ptr nnData, +void keypointDecode(const dai::NNData& nnData, std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, DetectionParserProperties properties, std::shared_ptr logger); From 36770bc777d759d38dd5d5312090e4bdc2fd31b6 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Mon, 17 Nov 2025 09:18:07 +0100 Subject: [PATCH 09/24] update utils and node params --- .../depthai/pipeline/node/DetectionParser.hpp | 4 +- src/pipeline/node/DetectionParser.cpp | 64 +++++++++++-------- .../DetectionParser/DetectionParserUtils.cpp | 51 ++------------- 3 files changed, 46 insertions(+), 73 deletions(-) diff --git a/include/depthai/pipeline/node/DetectionParser.hpp b/include/depthai/pipeline/node/DetectionParser.hpp index 7a547a936..c04c6206a 100644 --- a/include/depthai/pipeline/node/DetectionParser.hpp +++ b/include/depthai/pipeline/node/DetectionParser.hpp @@ -283,7 +283,7 @@ class DetectionParser : public DeviceNodeCRTP decodeMobilenet(std::shared_ptr nnData, float confidenceThr); + void decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr); private: bool runOnHostVar = false; @@ -296,7 +296,7 @@ class DetectionParser : public DeviceNodeCRTP nnData, std::shared_ptr outDetections); + void decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections); std::vector inTensorInfo; uint32_t imgWidth; uint32_t imgHeight; diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index 2040bf8b6..1ffbbf18d 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -392,36 +393,36 @@ void DetectionParser::run() { using namespace std::chrono; while(isRunning()) { auto tAbsoluteBeginning = steady_clock::now(); - std::shared_ptr inputData; - inputData = input.get(); - if(!inputData) { - logger->error("Error while receiving NN frame."); + std::shared_ptr sharedInputData = input.get(); + auto outDetections = std::make_shared(); + + if(!sharedInputData) { + logger->error("NN Data is empty. Skipping processing."); continue; } auto tAfterMessageBeginning = steady_clock::now(); + dai::NNData& inputData = *sharedInputData; if(!imgSizesSet) { - const bool containsTransformation = inputData->transformation.has_value(); + const bool containsTransformation = inputData.transformation.has_value(); if(containsTransformation) { - std::tie(imgWidth, imgHeight) = inputData->transformation->getSize(); + std::tie(imgWidth, imgHeight) = inputData.transformation->getSize(); } else { logger->warn("No image size provided for detection parser. 
Skipping processing and sending empty detections."); continue; } - + // We have determined the image size, no need to try again in the future imgSizesSet = true; } - auto outDetections = std::make_shared(); - + // Parse detections switch(properties.parser.nnFamily) { case DetectionNetworkType::YOLO: { - decodeYolo(inputData, outDetections); + decodeYolo(inputData, *outDetections); break; } case DetectionNetworkType::MOBILENET: { - auto dets = decodeMobilenet(inputData, properties.parser.confidenceThreshold); // TODO (aljaz) update to shared pointer - outDetections->detections = dets; + decodeMobilenet(inputData, *outDetections, properties.parser.confidenceThreshold); break; } default: { @@ -433,10 +434,11 @@ void DetectionParser::run() { auto tBeforeSend = steady_clock::now(); // Copy over seq and ts - outDetections->setSequenceNum(inputData->getSequenceNum()); - outDetections->setTimestamp(inputData->getTimestamp()); - outDetections->setTimestampDevice(inputData->getTimestampDevice()); - outDetections->transformation = inputData->transformation; + outDetections->setSequenceNum(inputData.getSequenceNum()); + outDetections->setTimestamp(inputData.getTimestamp()); + outDetections->setTimestampDevice(inputData.getTimestampDevice()); + outDetections->transformation = inputData.transformation; + // Send detections out.send(outDetections); @@ -476,26 +478,34 @@ void DetectionParser::buildStage1() { } } -std::vector DetectionParser::decodeMobilenet(std::shared_ptr nnData, float confidenceThr) { +void DetectionParser::decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr) { auto& logger = pimpl->logger; - if(!nnData) { - return {}; - } int maxDetections = 100; std::vector detections; std::string tensorName; - for(const auto& tensor : nnData->getAllLayers()) { + for(const auto& tensor : nnData.getAllLayers()) { if(tensor.offset == 0) { + // // The tensor we want to checkout + // if(tensor.numDimensions != 4) { + // std::cout << "ERROR while decoding Mobilenet. Output tensor has incorrect dimensions. Number of dimensions: " << tensor.numDimensions + // << std::endl; + // } + // // Get tensor output size in Bytes + // // Expected dimensions are [1, 1, N, 7] where N is number of detections + // if(tensor.dims[3] != 7) { + // std::cout << "ERROR while decoding Mobilenet. Expecting 7 fields for every detection but: " << tensor.dims[3] << " found.\n"; + // } + // maxDetections = tensor.dims[tensor.numDimensions - 2]; tensorName = tensor.name; } } - auto tensorData = nnData->getTensor(tensorName); + auto tensorData = nnData.getTensor(tensorName); maxDetections = tensorData.size() / 7; if(static_cast(tensorData.size()) < maxDetections * 7) { logger->error("Error while parsing Mobilenet. 
Vector not long enough, expected size: {}, real size {}", maxDetections * 7, tensorData.size()); - return {}; + return; } struct raw_Detection { // need to update it to include more @@ -529,13 +539,12 @@ std::vector DetectionParser::decodeMobilenet(std::shared_ptr< d.xmax = temp.xmax; d.ymax = temp.ymax; - detections.push_back(d); + outDetections.detections.push_back(d); } } - return detections; } -void DetectionParser::decodeYolo(std::shared_ptr nnData, std::shared_ptr outDetections) { +void DetectionParser::decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections) { auto& logger = pimpl->logger; switch(properties.parser.decodingFamily) { case YoloDecodingFamily::R1AF: // anchor free: yolo v6r1 @@ -550,6 +559,9 @@ void DetectionParser::decodeYolo(std::shared_ptr nnData, std::share case YoloDecodingFamily::TLBR: // top left bottom right anchor free: yolo v6r2, v8 v10 v11 utilities::DetectionParserUtils::decodeTLBR(nnData, outDetections, properties, logger); break; + default: + logger->error("Unknown Yolo decoding family. 'R1AF', 'v3AB', 'v5AB' and 'TLBR' are supported."); + throw std::runtime_error("Unknown Yolo decoding family"); } } diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index 33d38cea9..1534ac36b 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -130,17 +131,7 @@ void decodeR1AF(const dai::NNData& nnData, ymax); continue; } - DetectionCandidate candidate = DetectionCandidate{ - xmin, - ymin, - xmax, - ymax, - bestConf * objectnessScore, - bestC, - strideIdx, - row, - col, - }; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, bestConf * objectnessScore, bestC, strideIdx, row, col, std::nullopt}; detectionCandidates.emplace_back(std::move(candidate)); } @@ -287,17 +278,7 @@ void decodeV3AB(const dai::NNData& nnData, continue; } - DetectionCandidate candidate = DetectionCandidate{ - xmin, - ymin, - xmax, - ymax, - conf, - bestC, - strideIdx, - row, - col, - }; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, conf, bestC, strideIdx, row, col, std::nullopt}; detectionCandidates.emplace_back(std::move(candidate)); } @@ -444,17 +425,7 @@ void decodeV5AB(const dai::NNData& nnData, ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); if(xmax <= xmin || ymax <= ymin) continue; - DetectionCandidate candidate = DetectionCandidate{ - xmin, - ymin, - xmax, - ymax, - conf, - bestC, - strideIdx, - row, - col, - }; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, conf, bestC, strideIdx, row, col, std::nullopt}; detectionCandidates.emplace_back(std::move(candidate)); } @@ -573,17 +544,7 @@ void decodeTLBR(const dai::NNData& nnData, continue; } - DetectionCandidate candidate = DetectionCandidate{ - xmin, - ymin, - xmax, - ymax, - bestConf, - bestC, - strideIdx, - row, - col, - }; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, bestConf, bestC, strideIdx, row, col, std::nullopt}; detectionCandidates.emplace_back(std::move(candidate)); } @@ -888,7 +849,7 @@ void keypointDecode(const dai::NNData& nnData, auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); std::vector featureMapWidths; - for(int i = 0; i < yoloLayerNames.size(); ++i) 
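decodeMobilenet above sizes the detection count from the flat output tensor (tensorData.size() / 7): Mobilenet/SSD-style outputs pack seven floats per detection, conventionally [imageId, label, confidence, xmin, ymin, xmax, ymax]. A standalone sketch of walking such a tensor (the field order is assumed from that common convention, not verified against this codebase):

    #include <cstddef>
    #include <vector>

    struct Det { int label; float conf, xmin, ymin, xmax, ymax; };

    // Walks a flat [N x 7] SSD output, keeping detections above the threshold.
    std::vector<Det> parseSsd(const std::vector<float>& t, float thr) {
        std::vector<Det> out;
        for(std::size_t i = 0; i + 7 <= t.size(); i += 7) {
            if(t[i + 2] < thr) continue;
            out.push_back({static_cast<int>(t[i + 1]), t[i + 2], t[i + 3], t[i + 4], t[i + 5], t[i + 6]});
        }
        return out;
    }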
{
+    for(int i = 0; i < static_cast<int>(yoloLayerNames.size()); ++i) {
         auto tensorInfo = nnData.getTensorInfo(yoloLayerNames[i]);
         if(!tensorInfo) {
             logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]);

From 9dbda0ba11e119e28bf956fc56893ff55b75f523 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 13:45:15 +0100
Subject: [PATCH 10/24] add logger

---
 .../pipeline/datatype/ImgDetectionsT.hpp      |  1 +
 src/pipeline/datatype/ImgDetectionsT.cpp      |  5 +++++
 src/pipeline/node/DetectionParser.cpp         |  8 ++++----
 .../DetectionParser/DetectionParserUtils.cpp  | 18 +++++++++++-------
 .../DetectionParser/DetectionParserUtils.hpp  | 16 ++++++++--------
 5 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/include/depthai/pipeline/datatype/ImgDetectionsT.hpp b/include/depthai/pipeline/datatype/ImgDetectionsT.hpp
index 61b4d4bf0..99eb12cf7 100644
--- a/include/depthai/pipeline/datatype/ImgDetectionsT.hpp
+++ b/include/depthai/pipeline/datatype/ImgDetectionsT.hpp
@@ -76,6 +76,7 @@ class ImgDetectionsT : public Buffer {
      * Copies cv::Mat data to Segmentation Mask buffer
      *
      * @param frame Input cv::Mat frame from which to copy the data
+     * @note Throws if mask is not a single-channel 8-bit unsigned (CV_8U) type.
      */
     void setCvSegmentationMask(cv::Mat mask);
diff --git a/src/pipeline/datatype/ImgDetectionsT.cpp b/src/pipeline/datatype/ImgDetectionsT.cpp
index 875628fb8..bbf6545e2 100644
--- a/src/pipeline/datatype/ImgDetectionsT.cpp
+++ b/src/pipeline/datatype/ImgDetectionsT.cpp
@@ -1,5 +1,7 @@
 #include "depthai/pipeline/datatype/ImgDetectionsT.hpp"

+#include
+
 #include
 #include
 #include
@@ -75,6 +77,9 @@ std::optional ImgDetectionsT::getSegmentationMask() const {
 template
 void ImgDetectionsT::setCvSegmentationMask(cv::Mat mask) {
+    if(mask.type() != CV_8U) {
+        throw std::runtime_error("SetCvSegmentationMask: Mask must be a single-channel 8-bit unsigned (CV_8U) matrix.");
+    }
     std::vector dataVec;
     if(!mask.isContinuous()) {
         for(int i = 0; i < mask.rows; i++) {
diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp
index 1ffbbf18d..e1aff71a6 100644
--- a/src/pipeline/node/DetectionParser.cpp
+++ b/src/pipeline/node/DetectionParser.cpp
@@ -387,7 +387,7 @@ bool DetectionParser::runOnHost() const {
 }

 void DetectionParser::run() {
-    auto& logger = pimpl->logger;
+    auto& logger = ThreadedNode::pimpl->logger;

     logger->info("Detection parser running on host.");
     using namespace std::chrono;
@@ -452,7 +452,7 @@ void DetectionParser::run() {
 }

 void DetectionParser::buildStage1() {
-    auto& logger = pimpl->logger;
+    auto& logger = ThreadedNode::pimpl->logger;

     // Grab dimensions from input tensor info
     if(properties.networkInputs.size() > 0) {
@@ -479,7 +479,7 @@ void DetectionParser::buildStage1() {
 }

 void DetectionParser::decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr) {
-    auto& logger = pimpl->logger;
+    auto& logger = ThreadedNode::pimpl->logger;

     int maxDetections = 100;
     std::vector<ImgDetection> detections;
@@ -545,7 +545,7 @@ void DetectionParser::decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr) {
 }

 void DetectionParser::decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections) {
-    auto& logger = pimpl->logger;
+    std::shared_ptr<spdlog::logger>& logger = ThreadedNode::pimpl->logger;
     switch(properties.parser.decodingFamily) {
         case YoloDecodingFamily::R1AF:  // anchor free: yolo v6r1
             utilities::DetectionParserUtils::decodeR1AF(nnData, outDetections, properties, logger);
             break;
diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index 1534ac36b..7269175a8 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,7 @@ namespace DetectionParserUtils { void decodeR1AF(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { auto layerNames = utilities::DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; @@ -168,7 +169,7 @@ Decode anchor based yolo v3 and v3-Tiny void decodeV3AB(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; @@ -319,7 +320,7 @@ Decode anchor based networks, e.g., yolo v5, v7, P void decodeV5AB(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; @@ -464,7 +465,7 @@ Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v1 void decodeTLBR(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; @@ -576,7 +577,7 @@ void decodeTLBR(const dai::NNData& nnData, } } -bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger) { +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr& logger) { // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent auto getYoloChannelSize = [&](int classes, int coordinates, int anchors) -> int { if(anchors == 0) { @@ -719,10 +720,11 @@ void segmentationDecode(const dai::NNData& nnData, std::vector& detectionCandidates, dai::ImgDetections& outDetections, DetectionParserProperties properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { std::pair inputSize = nnData.transformation->getSize(); int inputWidth = inputSize.first; int inputHeight = inputSize.second; + auto tStart = std::chrono::steady_clock::now(); cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255)); @@ -829,6 +831,8 @@ void segmentationDecode(const dai::NNData& nnData, const uint8_t value = static_cast(std::min(detIdx, 254)); roiOut.setTo(value, paintMask); } + auto tEnd = std::chrono::steady_clock::now(); + logger->warn("Time to transform: {} ns", std::chrono::duration_cast(tEnd - tStart).count()); outDetections.setCvSegmentationMask(indexMask); } @@ -837,7 +841,7 @@ void keypointDecode(const dai::NNData& nnData, std::vector& detectionCandidates, dai::ImgDetections& outDetections, DetectionParserProperties properties, - std::shared_ptr logger) { 
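The nonMaximumSuppression declared in this header prunes candidates with YoloIntersectionOverUnion. For reference, a minimal implementation of the standard intersection-over-union on axis-aligned boxes (a sketch, not the library's own function):

    #include <algorithm>

    struct BoxTLBR { float xmin, ymin, xmax, ymax; };

    // IoU = intersection area / union area, in [0, 1]; 0 when the boxes are disjoint.
    float iou(const BoxTLBR& a, const BoxTLBR& b) {
        const float iw = std::max(0.0f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin));
        const float ih = std::max(0.0f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin));
        const float inter = iw * ih;
        const float uni = (a.xmax - a.xmin) * (a.ymax - a.ymin)
                        + (b.xmax - b.xmin) * (b.ymax - b.ymin) - inter;
        return uni > 0.0f ? inter / uni : 0.0f;
    }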
+ std::shared_ptr& logger) { if(!properties.parser.nKeypoints) { logger->warn("Number of keypoints not set in properties.parser.nKeypoints. Skipping keypoints decoding."); return; diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp index bb61eaa57..593007c14 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp @@ -23,7 +23,7 @@ Decode anchor free yolo v6r1 with sigmoid assisted center detection void decodeR1AF(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger); + std::shared_ptr& logger); /* Decode anchor based yolo v3 and v3-Tiny @@ -31,7 +31,7 @@ Decode anchor based yolo v3 and v3-Tiny void decodeV3AB(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger); + std::shared_ptr& logger); /* Decode anchor based networks, e.g., yolo v5, v7, P @@ -39,7 +39,7 @@ Decode anchor based networks, e.g., yolo v5, v7, P void decodeV5AB(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger); + std::shared_ptr& logger); /* Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, v8, v10, v11 @@ -47,18 +47,18 @@ Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, void decodeTLBR(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger); + std::shared_ptr& logger); std::vector getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector outputNames); float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2); -bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger); +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr& logger); void createImgDetections(std::vector& detectionCandidates, std::vector keepIndices, dai::ImgDetections& outDetections, - std::shared_ptr logger); + std::shared_ptr& logger); std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr); @@ -71,13 +71,13 @@ void segmentationDecode(const dai::NNData& nnData, std::vector& detectionCandidates, dai::ImgDetections& outDetections, DetectionParserProperties properties, - std::shared_ptr logger); + std::shared_ptr& logger); void keypointDecode(const dai::NNData& nnData, std::vector& detectionCandidates, dai::ImgDetections& outDetections, DetectionParserProperties properties, - std::shared_ptr logger); + std::shared_ptr& logger); } // namespace DetectionParserUtils } // namespace utilities From 8f30141b734373936b32612b03434a313d582f37 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Mon, 17 Nov 2025 14:01:24 +0100 Subject: [PATCH 11/24] Remove unused functions --- .../DetectionParser/DetectionParserUtils.cpp | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index 7269175a8..a0391ccb4 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -578,19 +578,11 @@ void 
From 8f30141b734373936b32612b03434a313d582f37 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 14:01:24 +0100
Subject: [PATCH 11/24] Remove unused functions

---
 .../DetectionParser/DetectionParserUtils.cpp  | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
index 7269175a8..a0391ccb4 100644
--- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
@@ -578,19 +578,11 @@ void decodeTLBR(const dai::NNData& nnData,
 }
 
 bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr<spdlog::logger>& logger) {
-    // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent
-    auto getYoloChannelSize = [&](int classes, int coordinates, int anchors) -> int {
-        if(anchors == 0) {
-            anchors = 1;
-        }
-        return anchors * (classes + coordinates + 1);
-    };
-
     int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast<int>(properties.parser.anchorsV2.size());
     int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1);
 
     auto checkAndFixOrder =
-        [&](dai::TensorInfo::StorageOrder currentOrder, int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool {
+        [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool {
         // Check that the dims size is big enough
         if(static_cast<int>(tensorInfo.dims.size()) <= channelDimIndex || static_cast<int>(tensorInfo.dims.size()) <= alternativeDimIndex) {
             logger->error("Invalid tensor dims size. Skipping.");
@@ -612,16 +604,16 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p
 
     switch(tensorInfo.order) {
         case dai::TensorInfo::StorageOrder::CHW:
-            if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::CHW, 0, 2, dai::TensorInfo::StorageOrder::HWC)) return false;
+            if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false;
             break;
         case dai::TensorInfo::StorageOrder::HWC:
-            if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::HWC, 2, 0, dai::TensorInfo::StorageOrder::CHW)) return false;
+            if(!checkAndFixOrder( 2, 0, dai::TensorInfo::StorageOrder::CHW)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NCHW:
-            if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::NCHW, 1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false;
+            if(!checkAndFixOrder( 1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NHWC:
-            if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::NHWC, 3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false;
+            if(!checkAndFixOrder( 3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NHCW:
         case dai::TensorInfo::StorageOrder::WHC:
@@ -724,7 +716,6 @@ void segmentationDecode(const dai::NNData& nnData,
     std::pair inputSize = nnData.transformation->getSize();
     int inputWidth = inputSize.first;
     int inputHeight = inputSize.second;
-    auto tStart = std::chrono::steady_clock::now();
 
     cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255));
@@ -831,8 +822,6 @@ void segmentationDecode(const dai::NNData& nnData,
         const uint8_t value = static_cast<uint8_t>(std::min(detIdx, 254));
         roiOut.setTo(value, paintMask);
     }
-    auto tEnd = std::chrono::steady_clock::now();
-    logger->warn("Time to transform: {} ns", std::chrono::duration_cast<std::chrono::nanoseconds>(tEnd - tStart).count());
 
     outDetections.setCvSegmentationMask(indexMask);
 }
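
After this cleanup the expected channel count of a detection head is computed in one place: anchorsPerCell * (classes + coordinates + 1 objectness score), with the anchor count defaulting to 1 for anchor-free heads. A standalone sketch with illustrative COCO-style numbers:

#include <cstdio>

// Expected channel count of a YOLO detection head.
int yoloChannelSize(int classes, int coordinates, int anchorsPerCell) {
    if(anchorsPerCell == 0) anchorsPerCell = 1;  // anchor-free heads
    return anchorsPerCell * (classes + coordinates + 1);
}

int main() {
    std::printf("%d\n", yoloChannelSize(80, 4, 3));  // 3 * (80 + 4 + 1) = 255
    std::printf("%d\n", yoloChannelSize(80, 4, 0));  // anchor-free: 85
    return 0;
}

255 is the familiar channel count of a classic 80-class, 3-anchor YOLO head, and it is the value isTensorOrderValid compares against the candidate channel dimension.
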
 #include "depthai/pipeline/datatype/ImgDetectionsT.hpp"
 
-#include
-
 #include
 #include
 #include

From 85fdbd1943b89f24bcbb7aa714847342707235bc Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 15:05:04 +0100
Subject: [PATCH 13/24] Add storage order checker

---
 .../DetectionParser/DetectionParserUtils.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
index a0391ccb4..6f187a8b0 100644
--- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
@@ -581,8 +581,7 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p
     int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast<int>(properties.parser.anchorsV2.size());
     int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1);
 
-    auto checkAndFixOrder =
-        [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool {
+    auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool {
         // Check that the dims size is big enough
         if(static_cast<int>(tensorInfo.dims.size()) <= channelDimIndex || static_cast<int>(tensorInfo.dims.size()) <= alternativeDimIndex) {
             logger->error("Invalid tensor dims size. Skipping.");
@@ -607,13 +606,13 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p
             if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false;
             break;
         case dai::TensorInfo::StorageOrder::HWC:
-            if(!checkAndFixOrder( 2, 0, dai::TensorInfo::StorageOrder::CHW)) return false;
+            if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NCHW:
-            if(!checkAndFixOrder( 1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false;
+            if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NHWC:
-            if(!checkAndFixOrder( 3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false;
+            if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NHCW:
         case dai::TensorInfo::StorageOrder::WHC:
@@ -755,6 +754,10 @@ void segmentationDecode(const dai::NNData& nnData,
 
     dai::NNData& nnDataNonConst = const_cast<dai::NNData&>(nnData);
     xt::xarray<float> protoData = nnDataNonConst.getTensor<float>(protoLayerNames[0], true);
+    if(protoInfo.order != dai::TensorInfo::StorageOrder::NHWC) {
+        logger->trace("Proto storage is not NHWC, changing order.");
+        nnDataNonConst.changeStorageOrder(protoData, protoInfo.order, dai::TensorInfo::StorageOrder::NHWC);
+    }
 
     Eigen::MatrixXf protoMatrix = Eigen::Map(protoData.data(), protoChannels, protoHeight * protoWidth);
     Eigen::RowVectorXf coeffs(protoChannels);
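
The proto tensor normalized to NHWC above feeds the standard YOLO instance-segmentation step: each detection's mask coefficients are multiplied against the flattened prototype matrix, and a sigmoid plus 0.5 threshold yields a binary mask. A toy-sized Eigen sketch (dimensions and random data are illustrative; real models use e.g. 32 prototype channels at reduced resolution):

#include <Eigen/Dense>
#include <iostream>

int main() {
    const int protoChannels = 4;
    const int protoHeight = 2;
    const int protoWidth = 3;

    // Prototype tensor flattened to (channels x pixels).
    Eigen::MatrixXf protoMatrix = Eigen::MatrixXf::Random(protoChannels, protoHeight * protoWidth);

    // Per-detection mask coefficients predicted by the head.
    Eigen::RowVectorXf coeffs = Eigen::RowVectorXf::Random(protoChannels);

    // Linear combination of prototypes gives one mask logit per pixel.
    Eigen::RowVectorXf logits = coeffs * protoMatrix;

    // Sigmoid followed by a 0.5 threshold produces the binary mask.
    Eigen::Array<bool, 1, Eigen::Dynamic> mask = (1.f + (-logits.array()).exp()).inverse() > 0.5f;

    std::cout << mask.cast<int>() << std::endl;
    return 0;
}
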
From 2e51cca60218b1735491f1c3779480bd038dd022 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 15:35:59 +0100
Subject: [PATCH 14/24] Fix formatting

---
 examples/cpp/DetectionNetwork/detection_and_keypoints.cpp    | 1 +
 examples/cpp/DetectionNetwork/detection_and_segmentation.cpp | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
index 3d94f5764..f4c80837d 100644
--- a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
@@ -93,6 +93,7 @@ int main() {
 
             auto currentTime = std::chrono::steady_clock::now();
             float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
+            std::cout << "FPS: " << fps << std::endl;
         }
 
         if(cv::waitKey(1) == 'q') {
diff --git a/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
index f3ad63f2c..d9551117a 100644
--- a/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
@@ -4,8 +4,8 @@
 #include
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 
@@ -41,7 +41,6 @@ int main() {
     modelDescription.model = modelName;
     detectionNetwork->build(cameraNode, modelDescription);
     detectionNetwork->detectionParser->setRunOnHost(setRunOnHost);
-    auto labelMap = detectionNetwork->getClasses();
 
     // Create output queues
     auto qRgb = detectionNetwork->passthrough.createOutputQueue();
@@ -137,10 +136,10 @@ int main() {
                                             detections.begin(), detections.end(), [filteredLabel](const dai::ImgDetection& det) { return det.label != filteredLabel; }),
                                         detections.end());
         }
+
         if(segmentationMask) {
             cv::Mat lut(1, 256, CV_8U);
             for(int i = 0; i < 256; ++i) lut.at<uint8_t>(i) = (i >= 255) ? 255 : cv::saturate_cast<uint8_t>(i * 25);
-
             cv::Mat scaledMask;
             cv::LUT(*segmentationMask, lut, scaledMask);

From 6c2214ae3e1fea51e75b5fe0fd9fd73ab38f71b0 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 16:29:18 +0100
Subject: [PATCH 15/24] bump fw

---
 cmake/Depthai/DepthaiDeviceRVC4Config.cmake                | 2 +-
 cmake/Depthai/DepthaiDeviceSideConfig.cmake                | 2 +-
 .../python/DetectionNetwork/detection_and_segmentation.py  | 2 +-
 src/pipeline/datatype/ImgDetectionsT.cpp                   | 5 +++--
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
index 9f108699e..be5d08dc7 100644
--- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
+++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
@@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot")
 
 # "version if applicable"
-set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+1e78c47a2c81d8de6f10d888de2a14de5557c6c3")
+set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+29d1575da0655630004fc1733d1acefa3b66499c")
diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
index 351618039..98b588226 100644
--- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake
+++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
@@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot")
 
 # "full commit hash of device side binary"
-set(DEPTHAI_DEVICE_SIDE_COMMIT "913e44e627a6e24f794bce4c4eed2a94691072a4")
+set(DEPTHAI_DEVICE_SIDE_COMMIT "6d07abc50b03c9ea164f2e5664c3f155741998b5")
 
 # "version if applicable"
 set(DEPTHAI_DEVICE_SIDE_VERSION "")
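
The FPS readout added in [PATCH 14/24] above is simply a frame counter divided by elapsed wall-clock time from std::chrono. Stripped of the camera pipeline, the pattern is (the sleep stands in for per-frame work):

#include <chrono>
#include <iostream>
#include <thread>

int main() {
    auto startTime = std::chrono::steady_clock::now();
    int counter = 0;

    for(int frame = 0; frame < 30; ++frame) {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));  // per-frame work
        counter++;
        auto currentTime = std::chrono::steady_clock::now();
        float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
        std::cout << "FPS: " << fps << std::endl;
    }
    return 0;
}

Using steady_clock rather than system_clock keeps the measurement monotonic even if the wall clock is adjusted while the loop runs.
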
diff --git a/examples/python/DetectionNetwork/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py
index 4f74ce29e..4445d6f04 100644
--- a/examples/python/DetectionNetwork/detection_and_segmentation.py
+++ b/examples/python/DetectionNetwork/detection_and_segmentation.py
@@ -15,7 +15,7 @@
 # Create pipeline
 with dai.Pipeline(device) as pipeline:
     cameraNode = pipeline.create(dai.node.Camera).build()
-
+    
     detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription(model_name))
     detectionNetwork.detectionParser.setRunOnHost(setRunOnHost)
     labelMap = detectionNetwork.getClasses()
diff --git a/src/pipeline/datatype/ImgDetectionsT.cpp b/src/pipeline/datatype/ImgDetectionsT.cpp
index 299d96c27..ce689fb03 100644
--- a/src/pipeline/datatype/ImgDetectionsT.cpp
+++ b/src/pipeline/datatype/ImgDetectionsT.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -75,8 +76,8 @@ std::optional<cv::Mat> ImgDetectionsT<T>::getSegmentationMask() const
 
 template <typename T>
 void ImgDetectionsT<T>::setCvSegmentationMask(cv::Mat mask) {
-    if(mask.type() != CV_8U) {
-        throw("SetCvSegmentationMask: Mask must be of INT8 type.");
+    if(mask.type() != CV_8UC1) {
+        throw std::runtime_error("SetCvSegmentationMask: Mask must be of CV_8UC1 type, got opencv type " + cv::typeToString(mask.type()) + ".");
     }
     std::vector<uint8_t> dataVec;
     if(!mask.isContinuous()) {

From 109020c5a387f49a79d2b65852dbaa5af98c81d1 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Tue, 18 Nov 2025 08:52:15 +0100
Subject: [PATCH 16/24] remove rvc2 test label

---
 tests/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 70ec76f07..69e42ba58 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -546,7 +546,7 @@ FIRE_VIDEO="${fire_video}"
 KITCHEN_IMAGE_PATH="${kitchen_image}"
 YOLO_V8_INSTANCE_SEGMENTATION_LARGE_COCO_640x352_KITCHEN_SEGMENTATION_GROUND_TRUTH="${yolo_v8_instance_segmentation_large_coco_640x352_kitchen_segmentation_gt_v2}"
 )
-dai_set_test_labels(detection_parser_test ondevice rvc4 ci onhost)
+dai_set_test_labels(detection_parser_test ondevice rvc4 ci)
 
 # Spatial detection network test
 dai_add_test(spatial_detection_network_test src/ondevice_tests/pipeline/node/spatial_detection_network_test.cpp)
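
The stricter check in setCvSegmentationMask accepts only single-channel 8-bit masks and clones non-contiguous ones before flattening. A self-contained sketch of that validation idea (the helper name and sizes are illustrative, not the depthai API):

#include <opencv2/core.hpp>
#include <opencv2/core/check.hpp>

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Accept only CV_8UC1 masks and flatten them into a contiguous byte vector.
std::vector<std::uint8_t> maskToBytes(cv::Mat mask) {
    if(mask.type() != CV_8UC1) {
        throw std::runtime_error("Mask must be CV_8UC1, got " + cv::typeToString(mask.type()));
    }
    if(!mask.isContinuous()) {
        mask = mask.clone();  // clone() guarantees a dense buffer without row gaps
    }
    return std::vector<std::uint8_t>(mask.data, mask.data + mask.total());
}

int main() {
    cv::Mat mask(288, 512, CV_8UC1, cv::Scalar(255));  // 255 marks "no instance"
    std::printf("%zu bytes\n", maskToBytes(mask).size());
    return 0;
}

The clone() matters because a cv::Mat produced by an ROI view shares its parent's stride, so copying data through data + total() directly would read the wrong pixels.
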
From ab5f693d47327bc517385c5a8dc23ec19ec7e711 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 19 Nov 2025 08:20:26 +0100
Subject: [PATCH 17/24] Implement suggestions

---
 examples/cpp/DetectionNetwork/CMakeLists.txt                 | 4 ++--
 examples/cpp/DetectionNetwork/detection_and_segmentation.cpp | 4 ++--
 .../python/DetectionNetwork/detection_and_segmentation.py    | 2 +-
 src/pipeline/utilities/NNDataViewer.hpp                      | 2 ++
 tests/CMakeLists.txt                                         | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt
index 2af09182f..fef48e7bb 100644
--- a/examples/cpp/DetectionNetwork/CMakeLists.txt
+++ b/examples/cpp/DetectionNetwork/CMakeLists.txt
@@ -24,7 +24,7 @@ dai_add_example(detection_network_remap detection_network_remap.cpp ON OFF)
 dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci)
 
 dai_add_example(detection_and_segmentation detection_and_segmentation.cpp ON OFF)
-dai_set_example_test_labels(detection_and_segmentation rvc4)
+dai_set_example_test_labels(detection_and_segmentation rvc2_all rvc4 ci)
 
 dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF)
-dai_set_example_test_labels(detection_and_keypoints rvc4)
+dai_set_example_test_labels(detection_and_keypoints rvc2_all rvc4 ci)
diff --git a/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
index d9551117a..fa312d382 100644
--- a/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
@@ -22,7 +22,7 @@ int main() {
     bool setRunOnHost = false;
 
     auto device = std::make_shared<dai::Device>();
-    if(device->getPlatformAsString() == "RVC2") {
+    if(device->getPlatform() == dai::Platform::RVC2) {
         modelName = "luxonis/yolov8-instance-segmentation-nano:coco-512x288";
         setRunOnHost = true;
     }
@@ -139,7 +139,7 @@ int main() {
 
         if(segmentationMask) {
             cv::Mat lut(1, 256, CV_8U);
-            for(int i = 0; i < 256; ++i) lut.at<uint8_t>(i) = (i >= 255) ? 255 : cv::saturate_cast<uint8_t>(i * 25);
+            for(int i = 0; i < 256; ++i) lut.at<uint8_t>(i) = (i == 255) ? 255 : cv::saturate_cast<uint8_t>(i * 25);
 
             cv::Mat scaledMask;
             cv::LUT(*segmentationMask, lut, scaledMask);
diff --git a/examples/python/DetectionNetwork/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py
index 4445d6f04..81d703106 100644
--- a/examples/python/DetectionNetwork/detection_and_segmentation.py
+++ b/examples/python/DetectionNetwork/detection_and_segmentation.py
@@ -8,7 +8,7 @@
 model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480"
 setRunOnHost = False
 device = dai.Device()
-if device.getPlatformAsString() == "RVC2":
+if device.getPlatform() == dai.Platform.RVC2:
     model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"
     setRunOnHost = True
 
diff --git a/src/pipeline/utilities/NNDataViewer.hpp b/src/pipeline/utilities/NNDataViewer.hpp
index 94ab12cda..f00d23a6a 100644
--- a/src/pipeline/utilities/NNDataViewer.hpp
+++ b/src/pipeline/utilities/NNDataViewer.hpp
@@ -39,6 +39,7 @@ class NNDataViewer {
         }
         if(tensor.strides.size() != 4) {
             logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4);
+            return false;
         }
         factorsBefore.c = tensor.strides[1];
         factorsBefore.h = tensor.strides[2];
@@ -51,6 +52,7 @@ class NNDataViewer {
         }
         if(tensor.strides.size() != 4) {
             logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4);
+            return false;
         }
         factorsBefore.h = tensor.strides[1];
         factorsBefore.w = tensor.strides[2];
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 75d546f48..80705e181 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -407,7 +407,7 @@ dai_set_test_labels(nndata_test onhost ci)
 
 #ImgDetections tests
 dai_add_test(imgdetections_test src/onhost_tests/pipeline/datatype/imgdetections_test.cpp)
-dai_set_test_labels(imgdetections_test ondevice rvc2 rvc4 onhost ci)
+dai_set_test_labels(imgdetections_test onhost ci)
 
 # Model description tests
 dai_add_test(model_slug_test src/onhost_tests/model_slug_test.cpp)

From eeecbf928089b2bfbb7e4e9f214f2f6386c9bbea Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 19 Nov 2025 08:23:06 +0100
Subject: [PATCH 18/24] merge develop

---
 3rdparty/foxglove/ws-protocol | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/foxglove/ws-protocol b/3rdparty/foxglove/ws-protocol
index 234fa7936..45d3e08ff 160000
--- a/3rdparty/foxglove/ws-protocol
+++ b/3rdparty/foxglove/ws-protocol
@@ -1 +1 @@
-Subproject commit 234fa7936bfedc2824068aecd04b5ee6390e98c9
+Subproject commit 45d3e08ff168611ab8347ba194fd54b9425c99f8

From 3943a06884d3a29901d4f8914544b8030066fa54 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 19 Nov 2025 08:51:48 +0100
Subject: [PATCH 19/24] fix rvc2 build failure

---
 src/pipeline/datatype/ImgDetectionsT.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pipeline/datatype/ImgDetectionsT.cpp b/src/pipeline/datatype/ImgDetectionsT.cpp
index ce689fb03..c9db32022 100644
--- a/src/pipeline/datatype/ImgDetectionsT.cpp
+++ b/src/pipeline/datatype/ImgDetectionsT.cpp
@@ -3,7 +3,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
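
The LUT change in [PATCH 17/24] keeps 255 reserved for "no instance" while spreading the low instance indices across clearly distinguishable gray levels. A runnable sketch (the toy mask contents are illustrative):

#include <opencv2/core.hpp>

#include <iostream>

int main() {
    // Build the lookup table: index i maps to i * 25, saturated to 8 bits; 255 stays 255.
    cv::Mat lut(1, 256, CV_8U);
    for(int i = 0; i < 256; ++i) lut.at<uchar>(i) = (i == 255) ? 255 : cv::saturate_cast<uchar>(i * 25);

    cv::Mat mask(2, 2, CV_8U, cv::Scalar(255));  // background everywhere...
    mask.at<uchar>(0, 0) = 0;                    // ...except instance 0
    mask.at<uchar>(1, 1) = 3;                    // ...and instance 3

    cv::Mat scaled;
    cv::LUT(mask, lut, scaled);                  // 0 -> 0, 3 -> 75, 255 -> 255
    std::cout << scaled << std::endl;
    return 0;
}

Since the 8-bit index never exceeds 255, (i == 255) and (i >= 255) behave identically here; the rewritten condition just states the intent. Indices of 11 and above saturate to 255 and become indistinguishable from background, which is acceptable for a debug overlay.
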
From 07ae4c64aa84ac5008a5e4ace5bdc70f22631756 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 19 Nov 2025 08:56:28 +0100
Subject: [PATCH 20/24] bump fw

---
 cmake/Depthai/DepthaiDeviceRVC4Config.cmake | 2 +-
 cmake/Depthai/DepthaiDeviceSideConfig.cmake | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
index 4fe07a881..587547e25 100644
--- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
+++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
@@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot")
 
 # "version if applicable"
-set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+3f910a125fde9174915207ba3a01372ad562e0f2")
+set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+2f8298ea040cf4eb4524c9ca84776b9f60fd106d")
diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
index 98b588226..e84ae7861 100644
--- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake
+++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
@@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot")
 
 # "full commit hash of device side binary"
-set(DEPTHAI_DEVICE_SIDE_COMMIT "6d07abc50b03c9ea164f2e5664c3f155741998b5")
+set(DEPTHAI_DEVICE_SIDE_COMMIT "966c3f9094a4fb73c663fe11a57c3cec7c0deeee")
 
 # "version if applicable"
 set(DEPTHAI_DEVICE_SIDE_VERSION "")

From 93a3f27a7d66f281b5751899727d75190baf4c64 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Fri, 21 Nov 2025 11:02:10 +0100
Subject: [PATCH 21/24] Throw if anchor dimension is != 2

---
 src/pipeline/node/DetectionParser.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp
index e1aff71a6..34299c740 100644
--- a/src/pipeline/node/DetectionParser.cpp
+++ b/src/pipeline/node/DetectionParser.cpp
@@ -159,6 +159,9 @@ void DetectionParser::setConfig(const dai::NNArchiveVersionedConfig& config) {
         std::vector<std::vector<float>> layerOut(anchorsIn[layer].size());
         for(size_t anchor = 0; anchor < layerOut.size(); ++anchor) {
             std::vector<float> anchorOut(anchorsIn[layer][anchor].size());
+            if (anchorOut.size() != 2) {
+                throw std::runtime_error("Each anchor should have exactly 2 dimensions (width and height).");
+            }
             for(size_t dim = 0; dim < anchorOut.size(); ++dim) {
                 anchorOut[dim] = static_cast<float>(anchorsIn[layer][anchor][dim]);
             }
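
The new guard in [PATCH 21/24] enforces that every anchor read from the NN archive is a (width, height) pair. The same check in isolation, with the container nesting mirroring the layer/anchor/dimension structure used above:

#include <iostream>
#include <stdexcept>
#include <vector>

// Every anchor must have exactly two dimensions: width and height.
void validateAnchors(const std::vector<std::vector<std::vector<float>>>& anchors) {
    for(const auto& layer : anchors) {
        for(const auto& anchor : layer) {
            if(anchor.size() != 2) {
                throw std::runtime_error("Each anchor should have exactly 2 dimensions (width and height).");
            }
        }
    }
}

int main() {
    std::vector<std::vector<std::vector<float>>> good = {{{10.f, 13.f}, {16.f, 30.f}}};
    std::vector<std::vector<std::vector<float>>> bad = {{{10.f, 13.f, 7.f}}};

    validateAnchors(good);  // passes silently
    try {
        validateAnchors(bad);
    } catch(const std::exception& e) {
        std::cout << "rejected: " << e.what() << std::endl;
    }
    return 0;
}

A small refinement would be to test anchorsIn[layer][anchor].size() before constructing anchorOut, which avoids allocating a vector that is immediately thrown away on failure.
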
"1a92fd182936f0ec83eb7986c14d02625f1cffdb") # "version if applicable" set(DEPTHAI_DEVICE_SIDE_VERSION "") From cf0b4c802341cf3ca089c3f12112c41a0cdd955d Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Fri, 21 Nov 2025 12:49:57 +0100 Subject: [PATCH 23/24] bump fw --- cmake/Depthai/DepthaiDeviceRVC4Config.cmake | 2 +- cmake/Depthai/DepthaiDeviceSideConfig.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake index e698e7d26..78b734802 100644 --- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake +++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake @@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot") # "version if applicable" -set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+40a7690ba1a1b35753040def6389bd773c61c67a") +set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+098f664d24dc72bce0589a9f81f18ceb285b0f8f") diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake index ed9354ab8..1b16f0d4a 100644 --- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake +++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake @@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot") # "full commit hash of device side binary" -set(DEPTHAI_DEVICE_SIDE_COMMIT "1a92fd182936f0ec83eb7986c14d02625f1cffdb") +set(DEPTHAI_DEVICE_SIDE_COMMIT "621e48a2a0375f4594f7f8875661c50d3d5950c9") # "version if applicable" set(DEPTHAI_DEVICE_SIDE_VERSION "") From f5a149f5886d3b5ca248529c7067de98178cbde0 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Fri, 21 Nov 2025 13:20:35 +0100 Subject: [PATCH 24/24] fix formatting --- src/pipeline/node/DetectionParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index 34299c740..f06447353 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -159,7 +159,7 @@ void DetectionParser::setConfig(const dai::NNArchiveVersionedConfig& config) { std::vector> layerOut(anchorsIn[layer].size()); for(size_t anchor = 0; anchor < layerOut.size(); ++anchor) { std::vector anchorOut(anchorsIn[layer][anchor].size()); - if (anchorOut.size() != 2) { + if(anchorOut.size() != 2) { throw std::runtime_error("Each anchor should have exactly 2 dimensions (width and height)."); } for(size_t dim = 0; dim < anchorOut.size(); ++dim) {