diff --git a/3rdparty/foxglove/ws-protocol b/3rdparty/foxglove/ws-protocol index 234fa7936..45d3e08ff 160000 --- a/3rdparty/foxglove/ws-protocol +++ b/3rdparty/foxglove/ws-protocol @@ -1 +1 @@ -Subproject commit 234fa7936bfedc2824068aecd04b5ee6390e98c9 +Subproject commit 45d3e08ff168611ab8347ba194fd54b9425c99f8 diff --git a/CMakeLists.txt b/CMakeLists.txt index bb49d6f72..cbe164a09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,6 +311,7 @@ set(TARGET_CORE_SOURCES src/pipeline/node/ImageAlign.cpp src/pipeline/node/ToF.cpp src/pipeline/node/DetectionParser.cpp + src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp src/pipeline/node/test/MyProducer.cpp src/pipeline/node/test/MyConsumer.cpp src/pipeline/node/UVC.cpp diff --git a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp index ff8ec4670..05fb9d2f6 100644 --- a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp +++ b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp @@ -65,6 +65,7 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) { DOC(dai, node, DetectionParser, setAnchors, 2)) .def("setAnchorMasks", &DetectionParser::setAnchorMasks, py::arg("anchorMasks"), DOC(dai, node, DetectionParser, setAnchorMasks)) .def("setIouThreshold", &DetectionParser::setIouThreshold, py::arg("thresh"), DOC(dai, node, DetectionParser, setIouThreshold)) + .def("setRunOnHost", &DetectionParser::setRunOnHost, py::arg("runOnHost"), DOC(dai, node, DetectionParser, setRunOnHost)) .def("setSubtype", &DetectionParser::setSubtype, py::arg("subtype"), DOC(dai, node, DetectionParser, setSubtype)) .def("setDecodeKeypoints", &DetectionParser::setDecodeKeypoints, py::arg("decode"), DOC(dai, node, DetectionParser, setDecodeKeypoints)) .def("setDecodeSegmentation", &DetectionParser::setDecodeSegmentation, py::arg("decode"), DOC(dai, node, DetectionParser, setDecodeSegmentation)) @@ -78,6 +79,7 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) { .def("getAnchors", &DetectionParser::getAnchors, DOC(dai, node, DetectionParser, getAnchors)) .def("getAnchorMasks", &DetectionParser::getAnchorMasks, DOC(dai, node, DetectionParser, getAnchorMasks)) .def("getIouThreshold", &DetectionParser::getIouThreshold, DOC(dai, node, DetectionParser, getIouThreshold)) + .def("runOnHost", &DetectionParser::runOnHost, DOC(dai, node, DetectionParser, runOnHost)) .def("getSubtype", &DetectionParser::getSubtype, DOC(dai, node, DetectionParser, getSubtype)) .def("getNkeypoints", &DetectionParser::getNKeypoints, DOC(dai, node, DetectionParser, getNKeypoints)) .def("getDecodeKeypoints", &DetectionParser::getDecodeKeypoints, DOC(dai, node, DetectionParser, getDecodeKeypoints)) diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake index 6114eff63..78b734802 100644 --- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake +++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake @@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot") # "version if applicable" -set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+ccb59dd506392110b0c85abee0d82e28c7d91f9e") +set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+098f664d24dc72bce0589a9f81f18ceb285b0f8f") diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake index 8fd10aaca..1b16f0d4a 100644 --- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake +++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake @@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot") 
# "full commit hash of device side binary" -set(DEPTHAI_DEVICE_SIDE_COMMIT "0f9a7793654b8f2fbed64759c162425eab2c8541") +set(DEPTHAI_DEVICE_SIDE_COMMIT "621e48a2a0375f4594f7f8875661c50d3d5950c9") # "version if applicable" set(DEPTHAI_DEVICE_SIDE_VERSION "") diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt index b1fffaca5..fef48e7bb 100644 --- a/examples/cpp/DetectionNetwork/CMakeLists.txt +++ b/examples/cpp/DetectionNetwork/CMakeLists.txt @@ -23,8 +23,8 @@ dai_set_example_test_labels(detection_network ondevice rvc2_all rvc4 rvc4rgb ci) dai_add_example(detection_network_remap detection_network_remap.cpp ON OFF) dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci) -dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF) -dai_set_example_test_labels(detection_and_segmentation rvc4) +dai_add_example(detection_and_segmentation detection_and_segmentation.cpp ON OFF) +dai_set_example_test_labels(detection_and_segmentation rvc2_all rvc4 ci) -dai_add_example(detection_and_keypoints RVC4/detection_and_keypoints.cpp ON OFF) -dai_set_example_test_labels(detection_and_keypoints rvc4) +dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF) +dai_set_example_test_labels(detection_and_keypoints rvc2_all rvc4 ci) diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp similarity index 97% rename from examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp rename to examples/cpp/DetectionNetwork/detection_and_keypoints.cpp index e653f5592..f4c80837d 100644 --- a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp @@ -22,7 +22,7 @@ int main() { auto detectionNetwork = pipeline.create(); dai::NNModelDescription modelDescription; - modelDescription.model = "luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39"; + modelDescription.model = "luxonis/yolov8-nano-pose-estimation:coco-512x288"; detectionNetwork->build(cameraNode, modelDescription); // Create output queues diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp similarity index 93% rename from examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp rename to examples/cpp/DetectionNetwork/detection_and_segmentation.cpp index 960f2a21d..fa312d382 100644 --- a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -17,8 +18,16 @@ cv::Rect frameNorm(const cv::Mat& frame, const dai::Point2f& topLeft, const dai: } int main() { + std::string modelName = "luxonis/yolov8-instance-segmentation-large:coco-640x352"; + bool setRunOnHost = false; + auto device = std::make_shared(); + + if(device->getPlatform() == dai::Platform::RVC2) { + modelName = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"; + setRunOnHost = true; + } // Create pipeline - dai::Pipeline pipeline; + dai::Pipeline pipeline{device}; // Create and configure camera node auto cameraNode = pipeline.create(); @@ -28,8 +37,10 @@ int main() { auto detectionNetwork = pipeline.create(); dai::NNModelDescription modelDescription; - modelDescription.model = "luxonis/yolov8-instance-segmentation-large:coco-640x480"; + + modelDescription.model = modelName; 
detectionNetwork->build(cameraNode, modelDescription); + detectionNetwork->detectionParser->setRunOnHost(setRunOnHost); // Create output queues auto qRgb = detectionNetwork->passthrough.createOutputQueue(); diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py similarity index 95% rename from examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py rename to examples/python/DetectionNetwork/detection_and_keypoints.py index 561f8f2ea..24eca43bc 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py +++ b/examples/python/DetectionNetwork/detection_and_keypoints.py @@ -7,8 +7,8 @@ # Create pipeline with dai.Pipeline() as pipeline: - cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39")) + cameraNode = pipeline.create(dai.node.Camera).build(sensorFps=12) + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288")) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py similarity index 92% rename from examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py rename to examples/python/DetectionNetwork/detection_and_segmentation.py index 5ab883db3..81d703106 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py +++ b/examples/python/DetectionNetwork/detection_and_segmentation.py @@ -5,10 +5,19 @@ import numpy as np import time +model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480" +setRunOnHost = False +device = dai.Device() +if device.getPlatform() == dai.Platform.RVC2: + model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288" + setRunOnHost = True + # Create pipeline -with dai.Pipeline() as pipeline: +with dai.Pipeline(device) as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-large:coco-640x480")) + + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription(model_name)) + detectionNetwork.detectionParser.setRunOnHost(setRunOnHost) labelMap = detectionNetwork.getClasses() assert labelMap is not None qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/include/depthai/pipeline/datatype/ImgDetectionsT.hpp b/include/depthai/pipeline/datatype/ImgDetectionsT.hpp index 61b4d4bf0..99eb12cf7 100644 --- a/include/depthai/pipeline/datatype/ImgDetectionsT.hpp +++ b/include/depthai/pipeline/datatype/ImgDetectionsT.hpp @@ -76,6 +76,7 @@ class ImgDetectionsT : public Buffer { * Copies cv::Mat data to Segmentation Mask buffer * * @param frame Input cv::Mat frame from which to copy the data + * @note Throws if the mask is not a single-channel 8-bit (CV_8UC1) image. */ void setCvSegmentationMask(cv::Mat mask);
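For reference, the new precondition in action (a minimal sketch; the ImgDetections instance and the mask dimensions are illustrative, not part of this diff):

    auto dets = std::make_shared<dai::ImgDetections>();
    cv::Mat ok(288, 512, CV_8UC1, cv::Scalar(255));  // 255 marks background, matching the host decoder below
    dets->setCvSegmentationMask(ok);                 // accepted: single-channel 8-bit mask
    cv::Mat bad(288, 512, CV_32FC1);
    // dets->setCvSegmentationMask(bad);             // would throw std::runtime_error (wrong cv::Mat type)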
diff --git a/include/depthai/pipeline/node/DetectionParser.hpp b/include/depthai/pipeline/node/DetectionParser.hpp index 4479df803..c04c6206a 100644 --- a/include/depthai/pipeline/node/DetectionParser.hpp +++ b/include/depthai/pipeline/node/DetectionParser.hpp @@ -12,6 +12,8 @@ #include #include "depthai/common/YoloDecodingFamily.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" namespace dai { namespace node { @@ -20,7 +22,7 @@ namespace node { /** * @brief DetectionParser node. Parses detection results from different neural networks and is being used internally by MobileNetDetectionNetwork and * YoloDetectionNetwork. */ -class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, DetectionParserProperties> { +class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, DetectionParserProperties>, public HostRunnable { public: constexpr static const char* NAME = "DetectionParser"; using DeviceNodeCRTP::DeviceNodeCRTP; @@ -268,7 +270,23 @@ class DetectionParser : public DeviceNodeCRTP& outputs); + // host runnable requirements + void buildStage1() override; + void decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections); + std::vector<TensorInfo> inTensorInfo; + uint32_t imgWidth; + uint32_t imgHeight; + bool imgSizesSet = false; + // + std::optional<NNArchive> mArchive; std::optional<NNArchiveVersionedConfig> archiveConfig; diff --git a/src/pipeline/datatype/ImgDetectionsT.cpp b/src/pipeline/datatype/ImgDetectionsT.cpp index 875628fb8..c9db32022 100644 --- a/src/pipeline/datatype/ImgDetectionsT.cpp +++ b/src/pipeline/datatype/ImgDetectionsT.cpp @@ -75,6 +75,9 @@ std::optional ImgDetectionsT::getSegmentationMask() c template void ImgDetectionsT::setCvSegmentationMask(cv::Mat mask) { + if(mask.type() != CV_8UC1) { + throw std::runtime_error("setCvSegmentationMask: mask must be a single-channel 8-bit (CV_8UC1) image, got OpenCV type " + cv::typeToString(mask.type()) + "."); + } std::vector<uint8_t> dataVec; if(!mask.isContinuous()) { for(int i = 0; i < mask.rows; i++) { diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index 7ab17b2ac..f06447353 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -13,6 +14,8 @@ #include "nn_archive/NNArchive.hpp" #include "nn_archive/v1/Head.hpp" #include "pipeline/ThreadedNodeImpl.hpp" +#include "pipeline/datatype/NNData.hpp" +#include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp" // internal headers #include "utility/ErrorMacros.hpp" @@ -156,6 +159,9 @@ void DetectionParser::setConfig(const dai::NNArchiveVersionedConfig& config) { std::vector<std::vector<float>> layerOut(anchorsIn[layer].size()); for(size_t anchor = 0; anchor < layerOut.size(); ++anchor) { std::vector<float> anchorOut(anchorsIn[layer][anchor].size()); + if(anchorOut.size() != 2) { + throw std::runtime_error("Each anchor should have exactly 2 dimensions (width and height)."); + } for(size_t dim = 0; dim < anchorOut.size(); ++dim) { anchorOut[dim] = static_cast<float>(anchorsIn[layer][anchor][dim]); } @@ -369,5 +375,198 @@ std::vector<int> DetectionParser::getStrides() const { return properties.parser.strides; } +void DetectionParser::setRunOnHost(bool runOnHost) { + if(runOnHost) { + pimpl->logger->warn("Detection parser set to run on host."); + } + runOnHostVar = runOnHost; +} + +/** + * Check if the node is set to run on host + */ +bool DetectionParser::runOnHost() const { + return runOnHostVar; +}
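The Python and C++ examples above choose the model and the parser location per platform; condensed to the essential calls, the same gating looks like this (a sketch reusing only API touched in this diff; modelDescription as in the examples):

    auto device = std::make_shared<dai::Device>();
    // The examples fall back to host decoding on RVC2 for these segmentation/keypoint heads
    bool decodeOnHost = device->getPlatform() == dai::Platform::RVC2;
    dai::Pipeline pipeline{device};
    auto cameraNode = pipeline.create<dai::node::Camera>();
    cameraNode->build();
    auto detectionNetwork = pipeline.create<dai::node::DetectionNetwork>();
    detectionNetwork->build(cameraNode, modelDescription);
    detectionNetwork->detectionParser->setRunOnHost(decodeOnHost);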
+void DetectionParser::run() { + auto& logger = ThreadedNode::pimpl->logger; + logger->info("Detection parser running on host."); + + using namespace std::chrono; + while(isRunning()) { + auto tAbsoluteBeginning = steady_clock::now(); + std::shared_ptr<dai::NNData> sharedInputData = input.get<dai::NNData>(); + auto outDetections = std::make_shared<dai::ImgDetections>(); + + if(!sharedInputData) { + logger->error("NN Data is empty. Skipping processing."); + continue; + } + auto tAfterMessageBeginning = steady_clock::now(); + dai::NNData& inputData = *sharedInputData; + + if(!imgSizesSet) { + const bool containsTransformation = inputData.transformation.has_value(); + if(containsTransformation) { + std::tie(imgWidth, imgHeight) = inputData.transformation->getSize(); + } else { + logger->warn("No image size provided for detection parser. Skipping processing."); + continue; + } + // We have determined the image size, no need to try again in the future + imgSizesSet = true; + } + + // Parse detections + switch(properties.parser.nnFamily) { + case DetectionNetworkType::YOLO: { + decodeYolo(inputData, *outDetections); + break; + } + case DetectionNetworkType::MOBILENET: { + decodeMobilenet(inputData, *outDetections, properties.parser.confidenceThreshold); + break; + } + default: { + logger->error("Unknown NN family. 'YOLO' and 'MOBILENET' are supported."); + break; + } + } + + auto tBeforeSend = steady_clock::now(); + + // Copy over seq and ts + outDetections->setSequenceNum(inputData.getSequenceNum()); + outDetections->setTimestamp(inputData.getTimestamp()); + outDetections->setTimestampDevice(inputData.getTimestampDevice()); + outDetections->transformation = inputData.transformation; + + // Send detections + out.send(outDetections); + + auto tAbsoluteEnd = steady_clock::now(); + logger->debug("Detection parser total took {}ms, processing {}ms, getting_frames {}ms, sending_frames {}ms", + duration_cast<microseconds>(tAbsoluteEnd - tAbsoluteBeginning).count() / 1000, + duration_cast<microseconds>(tBeforeSend - tAfterMessageBeginning).count() / 1000, + duration_cast<microseconds>(tAfterMessageBeginning - tAbsoluteBeginning).count() / 1000, + duration_cast<microseconds>(tAbsoluteEnd - tBeforeSend).count() / 1000); + } +} + +void DetectionParser::buildStage1() { + auto& logger = ThreadedNode::pimpl->logger; + + // Grab dimensions from input tensor info + if(properties.networkInputs.size() > 0) { + if(properties.networkInputs.size() > 1) { + logger->warn("Detection parser supports only single input networks, assuming first input"); + } + for(const auto& kv : properties.networkInputs) { + const dai::TensorInfo& tensorInfo = kv.second; + inTensorInfo.push_back(tensorInfo); + } + } + if(inTensorInfo.size() > 0) { + int numDimensions = inTensorInfo[0].numDimensions; + if(numDimensions < 2) { + logger->error("Number of input dimensions is less than 2"); + } else { + imgSizesSet = true; + imgWidth = inTensorInfo[0].dims[numDimensions - 1]; + imgHeight = inTensorInfo[0].dims[numDimensions - 2]; + } + } else { + logger->info("Unable to read input tensor height and width from static inputs. The node will try to get input sizes at runtime."); + } +}
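decodeMobilenet below walks a flat tensor of seven floats per candidate and stops at a record whose first field is -1; a hand-built buffer of that shape for reference (values purely illustrative):

    // [header/image_id, label, confidence, xmin, ymin, xmax, ymax] per record; header == -1 terminates the list
    std::vector<float> raw = {
        0.f, 15.f, 0.91f, 0.10f, 0.20f, 0.45f, 0.80f,  // one detection, normalized corner coordinates
        -1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,            // terminator: parsing stops here
    };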
+void DetectionParser::decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr) { + auto& logger = ThreadedNode::pimpl->logger; + + int maxDetections = 100; + std::string tensorName; + for(const auto& tensor : nnData.getAllLayers()) { + if(tensor.offset == 0) { + // // The tensor we want to checkout + // if(tensor.numDimensions != 4) { + // std::cout << "ERROR while decoding Mobilenet. Output tensor has incorrect dimensions. Number of dimensions: " << tensor.numDimensions + // << std::endl; + // } + // // Get tensor output size in Bytes + // // Expected dimensions are [1, 1, N, 7] where N is number of detections + // if(tensor.dims[3] != 7) { + // std::cout << "ERROR while decoding Mobilenet. Expecting 7 fields for every detection but: " << tensor.dims[3] << " found.\n"; + // } + // maxDetections = tensor.dims[tensor.numDimensions - 2]; + tensorName = tensor.name; + } + } + + auto tensorData = nnData.getTensor<float>(tensorName); + maxDetections = tensorData.size() / 7; + if(static_cast<int>(tensorData.size()) < maxDetections * 7) { + logger->error("Error while parsing Mobilenet. Vector not long enough, expected size: {}, real size {}", maxDetections * 7, tensorData.size()); + return; + } + + struct raw_Detection { // need to update it to include more + float header; + float label; + float confidence; + float xmin; + float ymin; + float xmax; + float ymax; + }; + + float* rawPtr = tensorData.data(); + for(int i = 0; i < maxDetections; i++) { + raw_Detection temp; + // TODO This is likely unnecessary optimisation + memcpy(&temp, &rawPtr[i * 7], sizeof(raw_Detection)); + + // if header == -1, stop sooner + if(temp.header == -1.0f) break; + + float currentConfidence = temp.confidence; + if(currentConfidence >= confidenceThr) { + dai::ImgDetection d; + d.label = temp.label; + + d.confidence = currentConfidence; + + d.xmin = temp.xmin; + d.ymin = temp.ymin; + d.xmax = temp.xmax; + d.ymax = temp.ymax; + + outDetections.detections.push_back(d); + } + } +} + +void DetectionParser::decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections) { + auto& logger = ThreadedNode::pimpl->logger; + switch(properties.parser.decodingFamily) { + case YoloDecodingFamily::R1AF: // anchor free: yolo v6r1 + utilities::DetectionParserUtils::decodeR1AF(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::v3AB: // anchor based yolo v3 v3-Tiny + utilities::DetectionParserUtils::decodeV3AB(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::v5AB: // anchor based yolo v5, v7, P + utilities::DetectionParserUtils::decodeV5AB(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::TLBR: // top left bottom right anchor free: yolo v6r2, v8 v10 v11 + utilities::DetectionParserUtils::decodeTLBR(nnData, outDetections, properties, logger); + break; + default: + logger->error("Unknown Yolo decoding family. 'R1AF', 'v3AB', 'v5AB' and 'TLBR' are supported."); + throw std::runtime_error("Unknown Yolo decoding family"); + } +} + +} // namespace node +} // namespace dai
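A worked instance of the TLBR decode implemented in the new utilities file below, assuming stride 8, grid cell (row, col) = (10, 20) and predicted distances (l, t, r, b) = (2.0, 1.5, 3.0, 2.5) (all values made up for the arithmetic):

    const int stride = 8, row = 10, col = 20;
    const float l = 2.0f, t = 1.5f, r = 3.0f, b = 2.5f;
    float xmin = (col - l + 0.5f) * stride;  // (20 - 2.0 + 0.5) * 8 = 148
    float ymin = (row - t + 0.5f) * stride;  // (10 - 1.5 + 0.5) * 8 = 72
    float xmax = (col + r + 0.5f) * stride;  // (20 + 3.0 + 0.5) * 8 = 188
    float ymax = (row + b + 0.5f) * stride;  // (10 + 2.5 + 0.5) * 8 = 104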
diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp new file mode 100644 index 000000000..6f187a8b0 --- /dev/null +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -0,0 +1,913 @@ +#include "DetectionParserUtils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "depthai/common/Keypoint.hpp" +#include "depthai/common/KeypointsListT.hpp" +#include "depthai/common/RotatedRect.hpp" +#include "depthai/common/TensorInfo.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "depthai/properties/DetectionParserProperties.hpp" +#include "pipeline/utilities/NNDataViewer.hpp" + +namespace dai { +namespace utilities { +namespace DetectionParserUtils { + +// yolo v6 r1 - anchor free +void decodeR1AF(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger) { + auto layerNames = utilities::DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + + const std::vector<int> strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + std::vector<DetectionCandidate> detectionCandidates; + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); + + for(int strideIdx = 0; strideIdx < static_cast<int>(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float objectnessScore = outputData.get(4, row, col); + if(objectnessScore < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); +
if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + if(bestConf * objectnessScore < confidenceThr) { + continue; + } + + float cx = outputData.get(0, row, col); + float cy = outputData.get(1, row, col); + float w = outputData.get(2, row, col); + float h = outputData.get(3, row, col); + + float xmin = cx - w * 0.5f; + float ymin = cy - h * 0.5f; + float xmax = cx + w * 0.5f; + float ymax = cy + h * 0.5f; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid bbox parameters. Either xmax <= xmin or ymax <= ymin. Skipping detection."); + logger->debug( + "Skipping invalid bbox: layer='{}', " + "raw(cx,cy,w,h)=({:.2f},{:.2f},{:.2f},{:.2f}) " + "clamped(xmin,ymin,xmax,ymax)=({:.2f},{:.2f},{:.2f},{:.2f}).", + layerName, + cx, + cy, + w, + h, + xmin, + ymin, + xmax, + ymax); + continue; + } + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, bestConf * objectnessScore, bestC, strideIdx, row, col, std::nullopt}; + + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger) { + auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. 
Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + std::string errorMsg = fmt::format("Layer {} channels mismatch. Expected {}, got {}", layerName, expectedC, layerChannels); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + const float tx = sigmoid(outputData.get(ch0 + 0, row, col)); + const float ty = sigmoid(outputData.get(ch0 + 1, row, col)); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = sigmoid(outputData.get(ch0 + 4, row, col)); + if(obj < confidenceThr) continue; + + int bestC = 0; + float clsLogit = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float candidateLogit = outputData.get(ch0 + 5 + c, row, col); + if(candidateLogit > clsLogit) { + clsLogit = candidateLogit; + bestC = c; + } + } + const float conf = obj * sigmoid(clsLogit); + if(conf < confidenceThr) continue; + + // YOLOv3 decode + const float cx = (static_cast(col) + tx) * static_cast(stride); + const float cy = (static_cast(row) + ty) * static_cast(stride); + const float w_exp = std::exp(tw); + const float h_exp = std::exp(th); + const float w = w_exp * anchors[a][0]; + const float h = h_exp * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, conf, bestC, strideIdx, row, col, std::nullopt}; + + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + 
if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + // +} + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger) { + auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + logger->error("Layer {} channels mismatch. 
Expected {}, got {}", layerName, expectedC, layerChannels); + return; + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + + const float tx = outputData.get(ch0 + 0, row, col); + const float ty = outputData.get(ch0 + 1, row, col); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = outputData.get(ch0 + 4, row, col); + if(obj < confidenceThr) continue; + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float candidateProb = outputData.get(ch0 + 5 + c, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + const float conf = obj * bestConf; + if(conf < confidenceThr) continue; + + // YOLOv5 decode + const float cx = ((tx * 2.0f - 0.5f) + static_cast(col)) * static_cast(stride); + const float cy = ((ty * 2.0f - 0.5f) + static_cast(row)) * static_cast(stride); + + const float w = tw * tw * 4.0f * anchors[a][0]; + const float h = th * th * 4.0f * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) continue; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, conf, bestC, strideIdx, row, col, std::nullopt}; + + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +/* +Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v11 +*/ +void decodeTLBR(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + std::vector detectionCandidates; + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float score = outputData.get(4, row, col); + if(score < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + float xmin = (col - outputData.get(0, row, col) + 0.5f) * stride; + float ymin = (row - outputData.get(1, row, col) + 0.5f) * stride; + float xmax = (col + outputData.get(2, row, col) + 0.5f) * stride; + float ymax = (row + outputData.get(3, row, col) + 0.5f) * stride; + + if(bestConf < confidenceThr) { + continue; + } + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, bestConf, bestC, strideIdx, row, col, std::nullopt}; + + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + 
keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr& logger) { + int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast(properties.parser.anchorsV2.size()); + int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1); + + auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { + // Check that the dims size is big enough + if(static_cast(tensorInfo.dims.size()) <= channelDimIndex || static_cast(tensorInfo.dims.size()) <= alternativeDimIndex) { + logger->error("Invalid tensor dims size. Skipping."); + return false; + } + + if(tensorInfo.dims[channelDimIndex] != uint32_t(channelSize)) { + // Check if the channel size would match the alternative storage order + if(tensorInfo.dims[alternativeDimIndex] == uint32_t(channelSize)) { + logger->trace("Invalid channel size for the tensor. Expected {}, got {}, switching", channelSize, tensorInfo.dims[channelDimIndex]); + tensorInfo.order = alternativeOrder; + } else { + logger->error("Invalid channel size for the tensor. Expected {}, got {}. Skipping.", channelSize, tensorInfo.dims[channelDimIndex]); + return false; + } + } + return true; + }; + + switch(tensorInfo.order) { + case dai::TensorInfo::StorageOrder::CHW: + if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; + break; + case dai::TensorInfo::StorageOrder::HWC: + if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NCHW: + if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; + break; + case dai::TensorInfo::StorageOrder::NHWC: + if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NHCW: + case dai::TensorInfo::StorageOrder::WHC: + case dai::TensorInfo::StorageOrder::WCH: + case dai::TensorInfo::StorageOrder::HCW: + case dai::TensorInfo::StorageOrder::CWH: + case dai::TensorInfo::StorageOrder::NC: + case dai::TensorInfo::StorageOrder::CN: + case dai::TensorInfo::StorageOrder::C: + case dai::TensorInfo::StorageOrder::H: + case dai::TensorInfo::StorageOrder::W: + default: + logger->error("Invalid storage order for the tensor. 
Skipping."); + return false; + } + + return true; +} + +std::vector<std::string> getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector<std::string> outputNames) { + if(outputNames.empty()) { + outputNames = nnData.getAllLayerNames(); + } + + std::vector<std::string> layerNames; + for(const auto& name : outputNames) { + // keep the layers whose name contains the search term + if(name.find(searchTerm) != std::string::npos) { + layerNames.push_back(name); + } + } + + std::sort(layerNames.begin(), layerNames.end()); + return layerNames; +} + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2) { + float width_of_overlap_area = std::fmin(box1.xmax, box2.xmax) - std::fmax(box1.xmin, box2.xmin); + float height_of_overlap_area = std::fmin(box1.ymax, box2.ymax) - std::fmax(box1.ymin, box2.ymin); + float area_of_overlap; + if(width_of_overlap_area < 0 || height_of_overlap_area < 0) + area_of_overlap = 0; + else + area_of_overlap = width_of_overlap_area * height_of_overlap_area; + float box_1_area = (box1.ymax - box1.ymin) * (box1.xmax - box1.xmin); + float box_2_area = (box2.ymax - box2.ymin) * (box2.xmax - box2.xmin); + float area_of_union = box_1_area + box_2_area - area_of_overlap; + return area_of_overlap / area_of_union; +} + +std::vector<DetectionCandidate> nonMaximumSuppression(std::vector<DetectionCandidate>& detectionCandidates, float iouThr) { + std::sort( + detectionCandidates.begin(), detectionCandidates.end(), [](const DetectionCandidate& a, const DetectionCandidate& b) { return a.score > b.score; }); + + std::vector<int> keep(detectionCandidates.size(), 1); + std::vector<size_t> keepIndices; + keepIndices.reserve(detectionCandidates.size()); + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { + if(!keep[i]) continue; + keepIndices.push_back(i); + + for(size_t j = i + 1; j < detectionCandidates.size(); ++j) { + if(!keep[j]) continue; + if(YoloIntersectionOverUnion(detectionCandidates[i], detectionCandidates[j]) >= iouThr) { + keep[j] = 0; + } + } + } + + std::vector<DetectionCandidate> keepCandidates; + keepCandidates.reserve(keepIndices.size()); + for(size_t idx : keepIndices) keepCandidates.push_back(detectionCandidates[idx]); + + return keepCandidates; +} + +void createImgDetections(const std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + unsigned int width, + unsigned int height) { + for(const auto& det : detectionCandidates) { + dai::ImgDetection detection; + dai::RotatedRect rotatedRect(dai::Rect(dai::Point2f(det.xmin, det.ymin), dai::Point2f(det.xmax, det.ymax)), 0.0f); + detection.setBoundingBox(rotatedRect.normalize(width, height)); + detection.confidence = det.score; + detection.label = det.label; + if(det.labelName) { + detection.labelName = *det.labelName; + } + outDetections.detections.push_back(std::move(detection)); + } +} + +void segmentationDecode(const dai::NNData& nnData, + std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + DetectionParserProperties properties, + std::shared_ptr& logger) { + auto inputSize = nnData.transformation->getSize(); + int inputWidth = inputSize.first; + int inputHeight = inputSize.second; + + cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255)); + + auto maskLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "masks", std::vector<std::string>{}); + if(properties.parser.strides.size() != maskLayerNames.size()) { + logger->error( + "Number of strides does not match number of mask output layers. Strides size: {}, mask output layers size: {}. 
Skipping segmentation decoding.", + properties.parser.strides.size(), + maskLayerNames.size()); + return; + } + auto protoLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "proto", std::vector<std::string>{}); + if(protoLayerNames.size() == 0) { + logger->error("Expecting proto output layer, found no layer with proto label. Skipping segmentation decoding."); + return; + } + + NNDataViewer protoValues = NNDataViewer(*nnData.getTensorInfo(protoLayerNames[0]), nnData.data, logger); + if(!protoValues.build()) { + logger->error("Failed to build NNDataViewer for proto layer {}. Skipping segmentation decoding.", protoLayerNames[0]); + return; + } + + TensorInfo protoInfo = *nnData.getTensorInfo(protoLayerNames[0]); + int protoWidth = protoInfo.getWidth(); + int protoHeight = protoInfo.getHeight(); + int protoChannels = protoInfo.getChannels(); + if(protoWidth <= 0 || protoHeight <= 0 || protoChannels <= 0) { + logger->error("Invalid proto tensor dimensions: channels {}, height {}, width {}.", protoChannels, protoHeight, protoWidth); + return; + } + int protoWidthScaleFactor = inputWidth / protoWidth; + int protoHeightScaleFactor = inputHeight / protoHeight; + + cv::Mat maskLow(protoHeight, protoWidth, CV_32F); + + dai::NNData& nnDataNonConst = const_cast<dai::NNData&>(nnData); + xt::xarray<float> protoData = nnDataNonConst.getTensor<float>(protoLayerNames[0], true); + if(protoInfo.order != dai::TensorInfo::StorageOrder::NHWC) { + logger->trace("Proto storage is not NHWC, changing order."); + nnDataNonConst.changeStorageOrder(protoData, protoInfo.order, dai::TensorInfo::StorageOrder::NHWC); + } + Eigen::MatrixXf protoMatrix = Eigen::Map<Eigen::MatrixXf>(protoData.data(), protoChannels, protoHeight * protoWidth); + + Eigen::RowVectorXf coeffs(protoChannels); + + auto maskFromCoeffs = [logger, protoHeight, protoWidth, &maskLow](const Eigen::MatrixXf& protos2d, const Eigen::RowVectorXf& coeffs) -> void { + if(protos2d.rows() != coeffs.size()) { + throw std::runtime_error("Mask coefficients size does not match proto channels."); + } + + Eigen::Map<Eigen::RowVectorXf> logits(maskLow.ptr<float>(), protoHeight * protoWidth); + logits.noalias() = coeffs * protos2d; + + // no need to do sigmoid + // logits = (1.0f / (1.0f + (-logits.array()).exp())).matrix(); + }; + + std::map<int, NNDataViewer> maskValues; + for(int strideIdx = 0; strideIdx < static_cast<int>(maskLayerNames.size()); ++strideIdx) { + maskValues.try_emplace(strideIdx, *nnData.getTensorInfo(maskLayerNames[strideIdx]), nnData.data, logger); + if(!maskValues.at(strideIdx).build()) { + logger->error("Failed to build NNDataViewer for mask layer {}. 
Skipping segmentation decoding.", maskLayerNames[strideIdx]); + return; + } + } + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { // loop over all detections + const auto& c = detectionCandidates[i]; + const int detIdx = static_cast(i); // index in outDetections list + + NNDataViewer& mask = maskValues.at(c.headIndex); + for(int ch = 0; ch < protoChannels; ++ch) { + coeffs(ch) = mask.get(ch, c.rowIndex, c.columnIndex); + } + // TODO (aljaz) perform operations on ROI only instead of the full resolution + // Eigen::MatrixXf roiMatrix = protoMatrix.block(0, y0 * protoWidth + x0, protoChannels, (y1 - y0) * (x1 - x0)); + + maskFromCoeffs(protoMatrix, coeffs); + + int x0 = std::clamp(static_cast(std::floor(c.xmin)), 0, inputWidth - 1); + int y0 = std::clamp(static_cast(std::floor(c.ymin)), 0, inputHeight - 1); + int x1 = std::clamp(static_cast(std::ceil(c.xmax)), 0, inputWidth); + int y1 = std::clamp(static_cast(std::ceil(c.ymax)), 0, inputHeight); + + if(x1 <= x0 || y1 <= y0) continue; + const cv::Rect roi(x0, y0, x1 - x0, y1 - y0); + + int protoX0 = x0 / protoWidthScaleFactor; + int protoY0 = y0 / protoHeightScaleFactor; + int protoX1 = x1 / protoWidthScaleFactor; + int protoY1 = y1 / protoHeightScaleFactor; + const cv::Rect protoROI(protoX0, protoY0, protoX1 - protoX0, protoY1 - protoY0); + + cv::Mat roiProb; + cv::resize(maskLow(protoROI), roiProb, roi.size(), 0, 0, cv::INTER_LINEAR); + + // Threshold & paint only unassigned pixels + cv::Mat roiBin; + cv::compare(roiProb, 0.0f, roiBin, cv::CMP_GT); + cv::Mat roiOut = indexMask(roi); + cv::Mat unassigned; + cv::compare(roiOut, 255, unassigned, cv::CMP_EQ); + cv::Mat paintMask; + cv::bitwise_and(roiBin, unassigned, paintMask); + + const uint8_t value = static_cast(std::min(detIdx, 254)); + roiOut.setTo(value, paintMask); + } + + outDetections.setCvSegmentationMask(indexMask); +} + +void keypointDecode(const dai::NNData& nnData, + std::vector& detectionCandidates, + dai::ImgDetections& outDetections, + DetectionParserProperties properties, + std::shared_ptr& logger) { + if(!properties.parser.nKeypoints) { + logger->warn("Number of keypoints not set in properties.parser.nKeypoints. Skipping keypoints decoding."); + return; + } + + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + + auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + std::vector featureMapWidths; + for(int i = 0; i < static_cast(yoloLayerNames.size()); ++i) { + auto tensorInfo = nnData.getTensorInfo(yoloLayerNames[i]); + if(!tensorInfo) { + logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]); + return; + } + featureMapWidths.push_back(tensorInfo->getWidth()); + } + + auto kptsLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "kpt_output", std::vector{}); + if(properties.parser.strides.size() != kptsLayerNames.size()) { + logger->error( + "Number of strides does not match number of keypoints output layers. Strides size: {}, keypoints output layers size: {}. 
Skipping keypoints " + "decoding.", + properties.parser.strides.size(), + kptsLayerNames.size()); + return; + } + + // TODO (aljaz) move to a function + std::map keypointValues; + for(int strideIdx = 0; strideIdx < static_cast(kptsLayerNames.size()); ++strideIdx) { + keypointValues.try_emplace(strideIdx, *nnData.getTensorInfo(kptsLayerNames[strideIdx]), nnData.data, logger); + if(!keypointValues.at(strideIdx).build()) { + logger->error("Failed to build NNDataViewer for keypoints layer {}. Skipping keypoints decoding.", kptsLayerNames[strideIdx]); + return; + } + } + + if(outDetections.detections.size() != detectionCandidates.size()) { + logger->error( + "Number of detections in ImgDetections does not match number of detection candidates. ImgDetections size: {}, detection candidates size: {}. " + "Skipping keypoints decoding.", + outDetections.detections.size(), + detectionCandidates.size()); + return; + } + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { // loop over all detections + const auto& c = detectionCandidates[i]; + int flattenedIndex = c.rowIndex * featureMapWidths[c.headIndex] + c.columnIndex; + + std::vector keypoints; + keypoints.reserve(*properties.parser.nKeypoints); + NNDataViewer keypointMask = keypointValues.at(c.headIndex); + + for(int k = 0; k < properties.parser.nKeypoints; ++k) { + int base = 3 * k; + + // keypointValues tensor storage order HWC + // H == 0 + // W == 51 == 17 * 3 (x, y, conf for each keypoint) + // C == flattened spatial dimensions of row x col of the feature map + float x = std::clamp(keypointMask.get(flattenedIndex, 0, base + 0) / inputWidth, 0.0f, 1.0f); + float y = std::clamp(keypointMask.get(flattenedIndex, 0, base + 1) / inputHeight, 0.0f, 1.0f); + float conf = 1.f / (1.f + std::exp(-(keypointMask.get(flattenedIndex, 0, base + 2)))); + + keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf}); + } + outDetections.detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges); + } +} + +} // namespace DetectionParserUtils +} // namespace utilities +} // namespace dai diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp new file mode 100644 index 000000000..593007c14 --- /dev/null +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp @@ -0,0 +1,84 @@ +#pragma once +#include + +#include + +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "depthai/properties/DetectionParserProperties.hpp" + +namespace dai { +namespace utilities { +namespace DetectionParserUtils { + +constexpr std::size_t defaultMaxDetectionsPerFrame = 250; +struct DetectionCandidate { + float xmin, ymin, xmax, ymax, score; + int label, headIndex, rowIndex, columnIndex; + std::optional labelName; +}; +/* +Decode anchor free yolo v6r1 with sigmoid assisted center detection +*/ +void decodeR1AF(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger); + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger); + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger); + +/* +Decode anchor free top-left-bottom-right 
(TLBR) style networks, e.g., yolo v6r2, v8, v10, v11 */ +void decodeTLBR(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger); + +std::vector<std::string> getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector<std::string> outputNames); + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2); + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr& logger); + +std::vector<DetectionCandidate> nonMaximumSuppression(std::vector<DetectionCandidate>& detectionCandidates, float iouThr); + +void createImgDetections(const std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + unsigned int width, + unsigned int height); + +void segmentationDecode(const dai::NNData& nnData, + std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + DetectionParserProperties properties, + std::shared_ptr& logger); + +void keypointDecode(const dai::NNData& nnData, + std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + DetectionParserProperties properties, + std::shared_ptr& logger); + +} // namespace DetectionParserUtils +} // namespace utilities +} // namespace dai \ No newline at end of file diff --git a/src/pipeline/utilities/NNDataViewer.hpp b/src/pipeline/utilities/NNDataViewer.hpp new file mode 100644 index 000000000..f00d23a6a --- /dev/null +++ b/src/pipeline/utilities/NNDataViewer.hpp @@ -0,0 +1,165 @@ +#pragma once +#include + +#include "depthai/common/TensorInfo.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "fp16/fp16.h" +namespace dai { +class NNDataViewer { + public: + std::shared_ptr data; + dai::TensorInfo tensor; + std::shared_ptr logger; + + // Per-axis byte strides: each (c, h, w) index is multiplied with its factor before the offsets are summed + struct FactorsBefore { + int32_t h; + int32_t w; + int32_t c; + }; + + FactorsBefore factorsBefore; + + NNDataViewer(dai::TensorInfo tensor, std::shared_ptr data, std::shared_ptr logger) + : data{data}, tensor{tensor}, logger{logger} {} + bool build() { + if(tensor.strides.size() < 2) { + logger->error("Tensor doesn't have enough strides. Number of strides: {}, expected: {}", tensor.strides.size(), 2); + return false; + } + if(tensor.strides[0] == 0 || tensor.strides[1] == 0) { + logger->error("Tensor strides should not be set to zero. Strides are {} {}", tensor.strides[0], tensor.strides[1]); + return false; + } + switch(tensor.order) { + case TensorInfo::StorageOrder::NCHW: + if(tensor.dims[0] != 1) { + logger->error("NCHW is only supported in Detection Parser if N is 1. It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + return false; + } + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.strides[2]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHWC: + if(tensor.dims[0] != 1) { + logger->error("NHWC is only supported in Detection Parser if N is 1. 
It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + return false; + } + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.strides[2]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::HCW: + factorsBefore.h = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::HWC: + factorsBefore.h = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::CHW: + factorsBefore.c = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::CWH: + factorsBefore.c = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WCH: + factorsBefore.w = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WHC: + factorsBefore.w = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHCW: + case TensorInfo::StorageOrder::NC: + case TensorInfo::StorageOrder::CN: + case TensorInfo::StorageOrder::H: + case TensorInfo::StorageOrder::W: + case TensorInfo::StorageOrder::C: + default: + logger->error("Storage order not supported in NNDataViewer"); + return false; + } + return sanity_check(); + } + + bool sanity_check() { + if(data->getSize() < (tensor.offset + (tensor.dims[0] * tensor.strides[0]))) { + logger->error( + "Underlying data does not hold enough data for the tensor to be contained.\ + Tensor size: {}, Tensor offset: {}, Data type size: {}, Data size: {} ", + tensor.dims[0] * tensor.strides[0], + tensor.offset, + tensor.getDataTypeSize(), + data->getSize()); + return false; + } + if(tensor.dims.size() < 2) { + logger->error("Number of dimensions for the input tensor is expected to be at least 2. It is {}", tensor.dims.size()); + return false; + } + return true; + }; + + inline float get(int c, int h, int w) { + // If this turns out to be slow, use a function pointer instead and point to the right getter at build time + int32_t index = tensor.offset + factorsBefore.h * h + factorsBefore.w * w + factorsBefore.c * c; +#ifdef DEPTHAI_SAFE_NN_DATA_ACCESS + logger->trace("Offset {}, fbH {}, fbW {}, fbC {}, h {}, w {}, c{}", tensor.offset, factorsBefore.h, factorsBefore.w, factorsBefore.c, h, w, c); + if(index > data->getSize()) { + logger->error("Out of bound access. 
Size is {}, index is {}", data->getSize(), index); + return 0.0; + } +#endif + + switch(tensor.dataType) { + case TensorInfo::DataType::U8F: { + uint8_t dataOut = data->getData()[index]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::I8: { + int8_t dataOut = static_cast(data->getData()[index]); + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::INT: { + int32_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int32_t)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP16: { + int16_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int16_t)]; + return (fp16_ieee_to_fp32_value(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP32: { + float dataOut = reinterpret_cast(data->getData().data())[index / sizeof(float)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP64: + default: { + return 0.0f; + } + } + } +}; +} // namespace dai
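For completeness, a self-contained exercise of the IoU and NMS helpers introduced above (standalone usage sketch; assumes the new DetectionParserUtils header is on the include path and that DetectionCandidate remains an aggregate):

    #include <optional>
    #include <vector>
    #include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp"

    int main() {
        using namespace dai::utilities::DetectionParserUtils;
        std::vector<DetectionCandidate> cands = {
            {0.f, 0.f, 10.f, 10.f, 0.9f, 0, 0, 0, 0, std::nullopt},  // higher-scoring box
            {1.f, 1.f, 11.f, 11.f, 0.8f, 0, 0, 0, 1, std::nullopt},  // overlaps the first heavily
        };
        float iou = YoloIntersectionOverUnion(cands[0], cands[1]);   // overlap 81 / union 119, about 0.68
        auto kept = nonMaximumSuppression(cands, 0.45f);             // second box suppressed (IoU >= threshold)
        return (kept.size() == 1 && iou > 0.45f) ? 0 : 1;
    }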