From 115c3d16c9932b2a94231f695e8260c05419c614 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 4 Nov 2025 14:24:22 +0100 Subject: [PATCH 01/24] Move example --- examples/cpp/DetectionNetwork/CMakeLists.txt | 2 +- .../cpp/DetectionNetwork/{RVC4 => }/detection_and_keypoints.cpp | 2 +- .../DetectionNetwork/{RVC4 => }/detection_and_keypoints.py | 2 +- tests/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename examples/cpp/DetectionNetwork/{RVC4 => }/detection_and_keypoints.cpp (97%) rename examples/python/DetectionNetwork/{RVC4 => }/detection_and_keypoints.py (96%) diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt index 9a3adc4e6..8193faeb9 100644 --- a/examples/cpp/DetectionNetwork/CMakeLists.txt +++ b/examples/cpp/DetectionNetwork/CMakeLists.txt @@ -26,5 +26,5 @@ dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci) dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF) dai_set_example_test_labels(detection_and_segmentation rvc4) -dai_add_example(detection_and_keypoints RVC4/detection_and_keypoints.cpp ON OFF) +dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF) dai_set_example_test_labels(detection_and_keypoints rvc4) \ No newline at end of file diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp similarity index 97% rename from examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp rename to examples/cpp/DetectionNetwork/detection_and_keypoints.cpp index bc8dca07c..667151bb0 100644 --- a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp @@ -22,7 +22,7 @@ int main() { auto detectionNetwork = pipeline.create(); dai::NNModelDescription modelDescription; - modelDescription.model = "luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39"; + modelDescription.model = "luxonis/yolov8-nano-pose-estimation:coco-512x288"; detectionNetwork->build(cameraNode, modelDescription); auto labelMap = detectionNetwork->getClasses(); diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py similarity index 96% rename from examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py rename to examples/python/DetectionNetwork/detection_and_keypoints.py index c62987701..431679544 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py +++ b/examples/python/DetectionNetwork/detection_and_keypoints.py @@ -9,7 +9,7 @@ # Create pipeline with dai.Pipeline() as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39")) + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288")) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e88884733..6ab38e604 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -389,7 +389,7 @@ dai_set_test_labels(nndata_test onhost ci) #ImgDetections tests dai_add_test(imgdetections_test src/onhost_tests/pipeline/datatype/imgdetections_test.cpp) 
-dai_set_test_labels(imgdetections_test onhost ci) +dai_set_test_labels(imgdetections_test ondevice rvc2 rvc4 onhost ci) # Model description tests dai_add_test(model_slug_test src/onhost_tests/model_slug_test.cpp) From bb3204ebd1abeaa5a6748e77a1d4cf9ed075c04a Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 4 Nov 2025 16:59:00 +0100 Subject: [PATCH 02/24] Add host parsing option --- CMakeLists.txt | 1 + .../RVC4/detection_and_segmentation.py | 3 +- .../depthai/pipeline/node/DetectionParser.hpp | 29 +- src/pipeline/node/DetectionParser.cpp | 184 ++++ .../DetectionParser/DetectionParserUtils.cpp | 897 ++++++++++++++++++ .../DetectionParser/DetectionParserUtils.hpp | 85 ++ src/pipeline/utilities/NNDataViewer.hpp | 163 ++++ 7 files changed, 1360 insertions(+), 2 deletions(-) create mode 100644 src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp create mode 100644 src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp create mode 100644 src/pipeline/utilities/NNDataViewer.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ae50f4e25..54150150f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,6 +311,7 @@ set(TARGET_CORE_SOURCES src/pipeline/node/ImageAlign.cpp src/pipeline/node/ToF.cpp src/pipeline/node/DetectionParser.cpp + src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp src/pipeline/node/test/MyProducer.cpp src/pipeline/node/test/MyConsumer.cpp src/pipeline/node/UVC.cpp diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py index fcbbbfd2f..650f90f2f 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py +++ b/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py @@ -9,7 +9,8 @@ # Create pipeline with dai.Pipeline() as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-large:coco-640x480")) + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-nano:coco-512x288")) + # detectionNetwork.detectionParser.runOnHost(True) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/include/depthai/pipeline/node/DetectionParser.hpp b/include/depthai/pipeline/node/DetectionParser.hpp index 78bb8ce8e..4b50a75b3 100644 --- a/include/depthai/pipeline/node/DetectionParser.hpp +++ b/include/depthai/pipeline/node/DetectionParser.hpp @@ -15,6 +15,8 @@ #include #include "depthai/common/YoloDecodingFamily.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" namespace dai { namespace node { @@ -23,7 +25,7 @@ namespace node { * @brief DetectionParser node. Parses detection results from different neural networks and is being used internally by MobileNetDetectionNetwork and * YoloDetectionNetwork. 
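 * The parser can optionally be executed on the host instead of the device; see setRunOnHost().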
*/ -class DetectionParser : public DeviceNodeCRTP { +class DetectionParser : public DeviceNodeCRTP, public HostRunnable { public: constexpr static const char* NAME = "DetectionParser"; using DeviceNodeCRTP::DeviceNodeCRTP; @@ -177,7 +179,23 @@ class DetectionParser : public DeviceNodeCRTP decodeMobilenet(std::shared_ptr nnData, float confidenceThr); + private: + bool runOnHostVar = false; void setNNArchiveBlob(const NNArchive& nnArchive); void setNNArchiveSuperblob(const NNArchive& nnArchive, int numShaves); void setNNArchiveOther(const NNArchive& nnArchive); @@ -185,6 +203,15 @@ class DetectionParser : public DeviceNodeCRTP& outputs); + // host runnable requirements + void buildStage1() override; + void decodeYolo(std::shared_ptr nnData, std::shared_ptr outDetections); + std::vector inTensorInfo; + uint32_t imgWidth; + uint32_t imgHeight; + uint32_t imgSizesSet = false; + // + std::optional mArchive; std::optional archiveConfig; diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index a03b64633..2c0e07b9a 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -13,6 +13,8 @@ #include "nn_archive/NNArchive.hpp" #include "nn_archive/v1/Head.hpp" #include "pipeline/ThreadedNodeImpl.hpp" +#include "pipeline/datatype/NNData.hpp" +#include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp" #include "spdlog/fmt/fmt.h" // internal headers @@ -349,5 +351,187 @@ std::vector DetectionParser::getStrides() const { return properties.parser.strides; } +void DetectionParser::setRunOnHost(bool runOnHost) { + if(runOnHost) { + pimpl->logger->warn("Detection parser set to run on host."); + } + runOnHostVar = runOnHost; +} + +/** + * Check if the node is set to run on host + */ +bool DetectionParser::runOnHost() const { + return runOnHostVar; +} + +void DetectionParser::run() { + auto& logger = pimpl->logger; + logger->info("Detection parser running on host."); + + using namespace std::chrono; + while(isRunning()) { + auto tAbsoluteBeginning = steady_clock::now(); + std::shared_ptr inputData; + inputData = input.get(); + if(!inputData) { + logger->error("Error while receiving NN frame."); + continue; + } + auto tAfterMessageBeginning = steady_clock::now(); + + if(!imgSizesSet) { + const bool containsTransformation = inputData->transformation.has_value(); + if(containsTransformation) { + std::tie(imgWidth, imgHeight) = inputData->transformation->getSize(); + } else { + logger->warn("No image size provided for detection parser. Skipping processing and sending empty detections."); + continue; + } + + imgSizesSet = true; + } + + auto outDetections = std::make_shared(); + + switch(properties.parser.nnFamily) { + case DetectionNetworkType::YOLO: { + decodeYolo(inputData, outDetections); + break; + } + case DetectionNetworkType::MOBILENET: { + auto dets = decodeMobilenet(inputData, properties.parser.confidenceThreshold); // TODO (aljaz) update to shared pointer + outDetections->detections = dets; + break; + } + default: { + logger->error("Unknown NN family. 
'YOLO' and 'MOBILENET' are supported."); + break; + } + } + + auto tBeforeSend = steady_clock::now(); + + // Copy over seq and ts + outDetections->setSequenceNum(inputData->getSequenceNum()); + outDetections->setTimestamp(inputData->getTimestamp()); + outDetections->setTimestampDevice(inputData->getTimestampDevice()); + outDetections->transformation = inputData->transformation; + // Send detections + out.send(outDetections); + + auto tAbsoluteEnd = steady_clock::now(); + logger->debug("Detection parser total took {}ms, processing {}ms, getting_frames {}ms, sending_frames {}ms", + duration_cast(tAbsoluteEnd - tAbsoluteBeginning).count() / 1000, + duration_cast(tBeforeSend - tAfterMessageBeginning).count() / 1000, + duration_cast(tAfterMessageBeginning - tAbsoluteBeginning).count() / 1000, + duration_cast(tAbsoluteEnd - tBeforeSend).count() / 1000); + } +} + +void DetectionParser::buildStage1() { + auto& logger = pimpl->logger; + + // Grab dimensions from input tensor info + if(properties.networkInputs.size() > 0) { + if(properties.networkInputs.size() > 1) { + logger->warn("Detection parser supports only single input networks, assuming first input"); + } + for(const auto& kv : properties.networkInputs) { + const dai::TensorInfo& tensorInfo = kv.second; + inTensorInfo.push_back(tensorInfo); + } + } + if(inTensorInfo.size() > 0) { + int numDimensions = inTensorInfo[0].numDimensions; + if(numDimensions < 2) { + logger->error("Number of input dimensions is less than 2"); + } else { + imgSizesSet = true; + imgWidth = inTensorInfo[0].dims[numDimensions - 1]; + imgHeight = inTensorInfo[0].dims[numDimensions - 2]; + } + } else { + logger->info("Unable to read input tensor height and width from static inputs. The node will try to get input sizes at runtime."); + } +} + +std::vector DetectionParser::decodeMobilenet(std::shared_ptr nnData, float confidenceThr) { + auto& logger = pimpl->logger; + + if(!nnData) { + return {}; + } + int maxDetections = 100; + std::vector detections; + std::string tensorName; + for(const auto& tensor : nnData->getAllLayers()) { + if(tensor.offset == 0) { + tensorName = tensor.name; + } + } + + auto tensorData = nnData->getTensor(tensorName); + maxDetections = tensorData.size() / 7; + if(static_cast(tensorData.size()) < maxDetections * 7) { + logger->error("Error while parsing Mobilenet. 
Vector not long enough, expected size: {}, real size {}", maxDetections * 7, tensorData.size()); + return {}; + } + + struct raw_Detection { // need to update it to include more + float header; + float label; + float confidence; + float xmin; + float ymin; + float xmax; + float ymax; + }; + + float* rawPtr = tensorData.data(); + for(int i = 0; i < maxDetections; i++) { + raw_Detection temp; + // TODO This is likely unnecessary optimisation + memcpy(&temp, &rawPtr[i * 7], sizeof(raw_Detection)); + + // if header == -1, stop sooner + if(temp.header == -1.0f) break; + + float currentConfidence = temp.confidence; + if(currentConfidence >= confidenceThr) { + dai::ImgDetection d; + d.label = temp.label; + + d.confidence = currentConfidence; + + d.xmin = temp.xmin; + d.ymin = temp.ymin; + d.xmax = temp.xmax; + d.ymax = temp.ymax; + + detections.push_back(d); + } + } + return detections; +} + +void DetectionParser::decodeYolo(std::shared_ptr nnData, std::shared_ptr outDetections) { + auto& logger = pimpl->logger; + switch(properties.parser.decodingFamily) { + case YoloDecodingFamily::R1AF: // anchor free: yolo v6r1 + utilities::DetectionParserUtils::decodeR1AF(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::v3AB: // anchor based yolo v3 v3-Tiny + utilities::DetectionParserUtils::decodeV3AB(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::v5AB: // anchor based yolo v5, v7, P + utilities::DetectionParserUtils::decodeV5AB(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::TLBR: // top left bottom right anchor free: yolo v6r2, v8 v10 v11 + utilities::DetectionParserUtils::decodeTLBR(nnData, outDetections, properties, logger); + break; + } +} + } // namespace node } // namespace dai diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp new file mode 100644 index 000000000..c1809e847 --- /dev/null +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -0,0 +1,897 @@ +#include "DetectionParserUtils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "depthai/common/KeypointsList.hpp" +#include "depthai/common/RotatedRect.hpp" +#include "depthai/common/TensorInfo.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "depthai/properties/DetectionParserProperties.hpp" +#include "pipeline/utilities/NNDataViewer.hpp" + +namespace dai { +namespace utilities { +namespace DetectionParserUtils { + +// yolo v6 r1 - anchor free +void decodeR1AF(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
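// Illustration (assumed values, not from this patch): a three-head YOLO export typically
// uses strides {8, 16, 32}, i.e. exactly one stride per detected "yolo" output layer.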
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float score = outputData.get(4, row, col); + if(score < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + if(bestConf * score < confidenceThr) { + continue; + } + + float cx = outputData.get(0, row, col); + float cy = outputData.get(1, row, col); + float w = outputData.get(2, row, col); + float h = outputData.get(3, row, col); + + float xmin = cx - w * 0.5f; + float ymin = cy - h * 0.5f; + float xmax = cx + w * 0.5f; + float ymax = cy + h * 0.5f; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + bestConf * score, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, 
logger); + } +} + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + std::string errorMsg = fmt::format("Layer {} channels mismatch. 
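// Worked example of the channel check above (illustrative numbers): with numAnchors = 3
// and numClasses = 80, block = 5 + 80 = 85, so expectedC = 3 * 85 = 255 channels --
// the classic YOLOv3 head layout.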
Expected {}, got {}", layerName, expectedC, layerChannels); + throw std::runtime_error(errorMsg); + } + + auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + const float tx = sigmoid(outputData.get(ch0 + 0, row, col)); + const float ty = sigmoid(outputData.get(ch0 + 1, row, col)); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = sigmoid(outputData.get(ch0 + 4, row, col)); + if(obj < confidenceThr) continue; + + int bestC = 0; + float clsProb = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float prob = outputData.get(ch0 + 5 + c, row, col); + if(prob > clsProb) { + clsProb = prob; + bestC = c; + } + } + const float conf = obj * 1.f / (1.f + std::exp(-clsProb)); + if(conf < confidenceThr) continue; + + // YOLOv3 decode + const float cx = (static_cast(col) + tx) * static_cast(stride); + const float cy = (static_cast(row) + ty) * static_cast(stride); + const float w_exp = std::exp(tw); + const float h_exp = std::exp(th); + const float w = w_exp * anchors[a][0]; + const float h = h_exp * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + conf, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + // +} + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + logger->error("Layer {} channels mismatch. 
Expected {}, got {}", layerName, expectedC, layerChannels); + return; + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + + const float tx = outputData.get(ch0 + 0, row, col); + const float ty = outputData.get(ch0 + 1, row, col); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = outputData.get(ch0 + 4, row, col); + if(obj < confidenceThr) continue; + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float prob = outputData.get(ch0 + 5 + c, row, col); + if(prob > bestConf) { + bestConf = prob; + bestC = c; + } + } + const float conf = obj * bestConf; + if(conf < confidenceThr) continue; + + // YOLOv5 decode + const float cx = ((tx * 2.0f - 0.5f) + static_cast(col)) * static_cast(stride); + const float cy = ((ty * 2.0f - 0.5f) + static_cast(row)) * static_cast(stride); + + const float w = tw * tw * 4.0f * anchors[a][0]; + const float h = th * th * 4.0f * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) continue; + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + conf, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +/* +Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v11 +*/ +void decodeTLBR(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float score = outputData.get(4, row, col); + if(score < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + float xmin = (col - outputData.get(0, row, col) + 0.5f) * stride; + float ymin = (row - outputData.get(1, row, col) + 0.5f) * stride; + float xmax = (col + outputData.get(2, row, col) + 0.5f) * stride; + float ymax = (row + outputData.get(3, row, col) + 0.5f) * stride; + + if(bestConf < confidenceThr) { + continue; + } + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + bestConf, + bestC, + strideIdx, + row, + col, + std::nullopt, + + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, 
properties, logger); + } +} + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger) { + // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent + + int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast(properties.parser.anchorsV2.size()); + int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1); + + auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { + // Check that the dims size is big enough + if(static_cast(tensorInfo.dims.size()) <= channelDimIndex || static_cast(tensorInfo.dims.size()) <= alternativeDimIndex) { + logger->error("Invalid tensor dims size. Skipping."); + return false; + } + + if(tensorInfo.dims[channelDimIndex] != uint32_t(channelSize)) { + // Check if the channel size would match the alternative storage order + if(tensorInfo.dims[alternativeDimIndex] == uint32_t(channelSize)) { + logger->trace("Invalid channel size for the tensor. Expected {}, got {}, switching", channelSize, tensorInfo.dims[channelDimIndex]); + tensorInfo.order = alternativeOrder; + } else { + logger->error("Invalid channel size for the tensor. Expected {}, got {}. Skipping.", channelSize, tensorInfo.dims[channelDimIndex]); + return false; + } + } + return true; + }; + + switch(tensorInfo.order) { + case dai::TensorInfo::StorageOrder::CHW: + if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; + break; + case dai::TensorInfo::StorageOrder::HWC: + if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NCHW: + if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; + break; + case dai::TensorInfo::StorageOrder::NHWC: + if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NHCW: + case dai::TensorInfo::StorageOrder::WHC: + case dai::TensorInfo::StorageOrder::WCH: + case dai::TensorInfo::StorageOrder::HCW: + case dai::TensorInfo::StorageOrder::CWH: + case dai::TensorInfo::StorageOrder::NC: + case dai::TensorInfo::StorageOrder::CN: + case dai::TensorInfo::StorageOrder::C: + case dai::TensorInfo::StorageOrder::H: + case dai::TensorInfo::StorageOrder::W: + default: + logger->error("Invalid storage order for the tensor. 
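// Example of checkAndFixOrder above: with 80 classes, 4 coordinates and no anchors,
// channelSize = 85. A tensor labelled HWC with dims {H, W, 85} passes as-is, while
// dims {85, H, W} under the same label is reinterpreted as CHW.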
Skipping."); + return false; + } + + return true; +} + +std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames) { + if(outputNames.empty()) { + outputNames = nnData->getAllLayerNames(); + } + + std::vector layerNames; + for(const auto& name : outputNames) { + // if yolo in the name, push it to layerNames + if(name.find(searchTerm) != std::string::npos) { + layerNames.push_back(name); + } + } + + std::sort(layerNames.begin(), layerNames.end()); + return layerNames; +} + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2) { + float width_of_overlap_area = fmin(box1.xmax, box2.xmax) - fmax(box1.xmin, box2.xmin); + float height_of_overlap_area = fmin(box1.ymax, box2.ymax) - fmax(box1.ymin, box2.ymin); + float area_of_overlap; + if(width_of_overlap_area < 0 || height_of_overlap_area < 0) + area_of_overlap = 0; + else + area_of_overlap = width_of_overlap_area * height_of_overlap_area; + float box_1_area = (box1.ymax - box1.ymin) * (box1.xmax - box1.xmin); + float box_2_area = (box2.ymax - box2.ymin) * (box2.xmax - box2.xmin); + float area_of_union = box_1_area + box_2_area - area_of_overlap; + return area_of_overlap / area_of_union; +} + +std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr) { + std::sort( + detectionCandidates.begin(), detectionCandidates.end(), [](const DetectionCandidate& a, const DetectionCandidate& b) { return a.score > b.score; }); + + std::vector keep(detectionCandidates.size(), 1); + std::vector keepIndices; + keepIndices.reserve(detectionCandidates.size()); + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { + if(!keep[i]) continue; + keepIndices.push_back(i); + + for(size_t j = i + 1; j < detectionCandidates.size(); ++j) { + if(!keep[j]) continue; + if(YoloIntersectionOverUnion(detectionCandidates[i], detectionCandidates[j]) >= iouThr) { + keep[j] = 0; + } + } + } + + std::vector keepCandidates; + keepCandidates.reserve(keepIndices.size()); + for(size_t idx : keepIndices) keepCandidates.push_back(detectionCandidates[idx]); + + return keepCandidates; +} + +void createImgDetections(const std::vector& detectionCandidates, + std::shared_ptr outDetections, + unsigned int width, + unsigned int height) { + for(const auto& det : detectionCandidates) { + dai::ImgDetection detection; + dai::RotatedRect rotatedRect(dai::Rect(dai::Point2f(det.xmin, det.ymin), dai::Point2f(det.xmax, det.ymax)), 0.0f); + detection.setBoundingBox(rotatedRect.normalize(width, height)); + detection.confidence = det.score; + detection.label = det.label; + if(det.labelName) { + detection.labelName = *det.labelName; + } + outDetections->detections.push_back(std::move(detection)); + } +} + +void segmentationDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto maskFromCoeffs = [](NNDataViewer& protos, const float* coeffs, int width, int height) -> cv::Mat { + cv::Mat maskLow(height, width, CV_32F); + for(int y = 0; y < maskLow.rows; ++y) { + float* row = maskLow.ptr(y); + for(int x = 0; x < maskLow.cols; ++x) { + float sum = 0.f; + for(int c = 0; c < 32; ++c) sum += protos.get(c, y, x) * coeffs[c]; + row[x] = 1.f / (1.f + std::exp(-sum)); // sigmoid + } + } + return maskLow; + }; + + std::pair inputSize = nnData->transformation->getSize(); + int inputWidth = inputSize.first; + int inputHeight = inputSize.second; + + cv::Mat indexMask(inputHeight, 
inputWidth, CV_8U, cv::Scalar(255)); + + cv::Mat maskLow, maskUp; + + auto maskLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "masks", std::vector{}); + if(properties.parser.strides.size() != maskLayerNames.size()) { + logger->error( + "Number of strides does not match number of mask output layers. Strides size: {}, mask output layers size: {}. Skipping segmentation decoding.", + properties.parser.strides.size(), + maskLayerNames.size()); + return; + } + auto protoLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "proto", std::vector{}); + if(protoLayerNames.size() == 0) { + logger->error("Expecting proto output layer, found no layer with proto label. Skipping segmentation decoding."); + return; + } + + NNDataViewer protoValues = NNDataViewer(*nnData->getTensorInfo(protoLayerNames[0]), nnData->data, logger); + if(!protoValues.build()) { + logger->error("Failed to build NNDataViewer for proto layer {}. Skipping segmentation decoding.", protoLayerNames[0]); + return; + } + + std::map maskValues; + for(int strideIdx = 0; strideIdx < static_cast(maskLayerNames.size()); ++strideIdx) { + maskValues.try_emplace(strideIdx, *nnData->getTensorInfo(maskLayerNames[strideIdx]), nnData->data, logger); + if(!maskValues.at(strideIdx).build()) { + logger->error("Failed to build NNDataViewer for mask layer {}. Skipping segmentation decoding.", maskLayerNames[strideIdx]); + return; + } + } + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { // loop over all detections + const auto& c = detectionCandidates[i]; + const int detIdx = static_cast(i); // index in outDetections list + + NNDataViewer mask = maskValues.at(c.headIndex); + std::array coeff; + for(int i = 0; i < 32; ++i) { + coeff[i] = mask.get(i, c.rowIndex, c.columnIndex); + } + + TensorInfo protoInfo = *nnData->getTensorInfo(protoLayerNames[0]); + int protoWidth = protoInfo.getWidth(); + int protoHeight = protoInfo.getHeight(); + maskLow = maskFromCoeffs(protoValues, coeff.data(), protoWidth, protoHeight); + + cv::resize(maskLow, maskUp, cv::Size(inputWidth, inputHeight), 0, 0, cv::INTER_LINEAR); + // ROI clamp + int x0 = std::clamp(static_cast(std::floor(c.xmin)), 0, inputWidth - 1); + int y0 = std::clamp(static_cast(std::floor(c.ymin)), 0, inputHeight - 1); + int x1 = std::clamp(static_cast(std::ceil(c.xmax)), 0, inputWidth); + int y1 = std::clamp(static_cast(std::ceil(c.ymax)), 0, inputHeight); + + if(x1 <= x0 || y1 <= y0) continue; + const cv::Rect roi(x0, y0, x1 - x0, y1 - y0); + + // Threshold & paint only unassigned pixels + cv::Mat roiProb = maskUp(roi); + cv::Mat roiBin; + cv::compare(roiProb, static_cast(0.5f), roiBin, cv::CMP_GT); + cv::Mat roiOut = indexMask(roi); + cv::Mat unassigned; + cv::compare(roiOut, 255, unassigned, cv::CMP_EQ); + cv::Mat paintMask; + cv::bitwise_and(roiBin, unassigned, paintMask); + + const uint8_t value = static_cast(std::min(detIdx, 254)); + roiOut.setTo(value, paintMask); + } + + outDetections->setSegmentationMask(indexMask); +} + +void keypointDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + std::vector featureMapWidths; + for(int i = 0; i < static_cast(yoloLayerNames.size()); ++i) { + auto tensorInfo = 
nnData->getTensorInfo(yoloLayerNames[i]); + if(!tensorInfo) { + logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]); + return; + } + featureMapWidths.push_back(tensorInfo->getWidth()); + } + + auto kptsLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "kpt_output", std::vector{}); + if(properties.parser.strides.size() != kptsLayerNames.size()) { + logger->error( + "Number of strides does not match number of keypoints output layers. Strides size: {}, keypoints output layers size: {}. Skipping keypoints " + "decoding.", + properties.parser.strides.size(), + kptsLayerNames.size()); + return; + } + + // TODO (aljaz) move to a function + std::map keypointValues; + for(int strideIdx = 0; strideIdx < static_cast(kptsLayerNames.size()); ++strideIdx) { + keypointValues.try_emplace(strideIdx, *nnData->getTensorInfo(kptsLayerNames[strideIdx]), nnData->data, logger); + if(!keypointValues.at(strideIdx).build()) { + logger->error("Failed to build NNDataViewer for keypoints layer {}. Skipping keypoints decoding.", kptsLayerNames[strideIdx]); + return; + } + } + + if(outDetections->detections.size() != detectionCandidates.size()) { + logger->error( + "Number of detections in ImgDetections does not match number of detection candidates. ImgDetections size: {}, detection candidates size: {}. " + "Skipping keypoints decoding.", + outDetections->detections.size(), + detectionCandidates.size()); + return; + } + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { // loop over all detections + const auto& c = detectionCandidates[i]; + int flattenedIndex = c.rowIndex * featureMapWidths[c.headIndex] + c.columnIndex; + + std::vector keypoints; + keypoints.reserve(*properties.parser.nKeypoints); + NNDataViewer keypointMask = keypointValues.at(c.headIndex); + + for(int k = 0; k < properties.parser.nKeypoints; ++k) { + int base = 3 * k; + + // keypointValues tensor storage order HWC + // H == 0 + // W == 51 == 17 * 3 (x, y, conf for each keypoint) + // C == flattened spatial dimensions of row x col of the feature map + float x = std::clamp(keypointMask.get(flattenedIndex, 0, base + 0) / inputWidth, 0.0f, 1.0f); + float y = std::clamp(keypointMask.get(flattenedIndex, 0, base + 1) / inputHeight, 0.0f, 1.0f); + float conf = 1.f / (1.f + std::exp(-(keypointMask.get(flattenedIndex, 0, base + 2)))); + + keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf}); + } + + outDetections->detections[i].keypoints = KeypointsList(keypoints); + } +} + +} // namespace DetectionParserUtils +} // namespace utilities +} // namespace dai \ No newline at end of file diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp new file mode 100644 index 000000000..85b5a234f --- /dev/null +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp @@ -0,0 +1,85 @@ +#pragma once + +#include + +#include + +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "depthai/properties/DetectionParserProperties.hpp" + +namespace dai { +namespace utilities { +namespace DetectionParserUtils { + +struct DetectionCandidate { + float xmin, ymin, xmax, ymax, score; + int label, headIndex, rowIndex, columnIndex; + std::optional labelName; +}; + +/* +Decode anchor free yolo v6r1 with sigmoid assisted center detection +*/ +void decodeR1AF(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties 
properties, + std::shared_ptr logger); + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +/* +Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, v8, v10, v11 +*/ +void decodeTLBR(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames); + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2); + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger); + +void createImgDetections(std::vector& detectionCandidates, + std::vector keepIndices, + std::shared_ptr outDetections, + std::shared_ptr logger); + +std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr); + +void createImgDetections(const std::vector& detectionCandidates, + std::shared_ptr outDetections, + unsigned int width, + unsigned int height); + +void segmentationDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +void keypointDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger); + +} // namespace DetectionParserUtils +} // namespace utilities +} // namespace dai \ No newline at end of file diff --git a/src/pipeline/utilities/NNDataViewer.hpp b/src/pipeline/utilities/NNDataViewer.hpp new file mode 100644 index 000000000..94ab12cda --- /dev/null +++ b/src/pipeline/utilities/NNDataViewer.hpp @@ -0,0 +1,163 @@ +#pragma once +#include + +#include "depthai/common/TensorInfo.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "fp16/fp16.h" +namespace dai { +class NNDataViewer { + public: + std::shared_ptr data; + dai::TensorInfo tensor; + std::shared_ptr logger; + + // Factors to multiply with before the vectors + struct FactorsBefore { + int32_t h; + int32_t w; + int32_t c; + }; + + FactorsBefore factorsBefore; + + NNDataViewer(dai::TensorInfo tensor, std::shared_ptr data, std::shared_ptr logger) + : data{data}, tensor{tensor}, logger{logger} {}; + bool build() { + if(tensor.strides.size() < 2) { + logger->error("Tensor doesn't have enough strides. Number of strides: {}, expected: {}", tensor.strides.size(), 2); + return false; + } + if(tensor.strides[0] == 0 || tensor.strides[1] == 0) { + logger->error("Tensor strides should not be set to zero. Strides are {} {}", tensor.strides[0], tensor.strides[1]); + return false; + } + switch(tensor.order) { + case TensorInfo::StorageOrder::NCHW: + if(tensor.dims[0] != 1) { + logger->error("NCHW is only supported in Detection Parser if N is 1. 
It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + } + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.strides[2]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHWC: + if(tensor.dims[0] != 1) { + logger->error("NHWC is only supported in Detection Parser if N is 1. It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + } + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.strides[2]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::HCW: + factorsBefore.h = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::HWC: + factorsBefore.h = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::CHW: + factorsBefore.c = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::CWH: + factorsBefore.c = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WCH: + factorsBefore.w = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WHC: + factorsBefore.w = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHCW: + case TensorInfo::StorageOrder::NC: + case TensorInfo::StorageOrder::CN: + case TensorInfo::StorageOrder::H: + case TensorInfo::StorageOrder::W: + case TensorInfo::StorageOrder::C: + default: + logger->error("Storage order not supported in NNDataViewer"); + return false; + } + return sanity_check(); + } + + bool sanity_check() { + if(data->getSize() < (tensor.offset + (tensor.dims[0] * tensor.strides[0]))) { + logger->error( + "Underlying data does not hold enough data for the tensor to be contained.\ + Tensor size: {}, Tensor offset: {}, Data type size: {}, Data size: {} ", + tensor.dims[0] * tensor.strides[0], + tensor.offset, + tensor.getDataTypeSize(), + data->getSize()); + return false; + } + if(tensor.dims.size() < 2) { + logger->error("Number of dimensions for the input tensor is expected to be at least 2. It is {}", tensor.dims.size()); + return false; + } + return true; + }; + + inline float get(int c, int h, int w) { + // If this turns out to be slow, use a function pointer instead and point to the right getter at build time + int32_t index = tensor.offset + factorsBefore.h * h + factorsBefore.w * w + factorsBefore.c * c; +#ifdef DEPTHAI_SAFE_NN_DATA_ACCESS + logger->trace("Offset {}, fbH {}, fbW {}, fbC {}, h {}, w {}, c{}", tensor.offset, factorsBefore.h, factorsBefore.w, factorsBefore.c, h, w, c); + if(index > data->getSize()) { + logger->error("Out of bound access. 
Size is {}, index is {}", data->getSize(), index); + return 0.0; + } +#endif + + switch(tensor.dataType) { + case TensorInfo::DataType::U8F: { + uint8_t dataOut = data->getData()[index]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::I8: { + int8_t dataOut = static_cast(data->getData()[index]); + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::INT: { + int32_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int32_t)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP16: { + int16_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int16_t)]; + return (fp16_ieee_to_fp32_value(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP32: { + float dataOut = reinterpret_cast(data->getData().data())[index / sizeof(float)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP64: + default: { + return 0.0f; + } + } + } +}; +} // namespace dai From 96a92f58cac13c433300945107e82fb1dcf03ab0 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Wed, 5 Nov 2025 10:47:23 +0100 Subject: [PATCH 03/24] Add host side implementation --- .../pipeline/node/DetectionParserBindings.cpp | 2 ++ examples/cpp/DetectionNetwork/CMakeLists.txt | 2 +- .../detection_and_keypoints.cpp | 1 - .../{RVC4 => }/detection_and_segmentation.cpp | 35 ++++++++++++------- .../{RVC4 => }/detection_and_segmentation.py | 14 ++++++-- tests/CMakeLists.txt | 2 +- 6 files changed, 38 insertions(+), 18 deletions(-) rename examples/cpp/DetectionNetwork/{RVC4 => }/detection_and_segmentation.cpp (84%) rename examples/python/DetectionNetwork/{RVC4 => }/detection_and_segmentation.py (92%) diff --git a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp index 7e5a50c4f..eab544ed4 100644 --- a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp +++ b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp @@ -65,11 +65,13 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) { DOC(dai, node, DetectionParser, setAnchors, 2)) .def("setAnchorMasks", &DetectionParser::setAnchorMasks, py::arg("anchorMasks"), DOC(dai, node, DetectionParser, setAnchorMasks)) .def("setIouThreshold", &DetectionParser::setIouThreshold, py::arg("thresh"), DOC(dai, node, DetectionParser, setIouThreshold)) + .def("setRunOnHost", &DetectionParser::setRunOnHost, py::arg("runOnHost"), DOC(dai, node, DetectionParser, setRunOnHost)) .def("getNumClasses", &DetectionParser::getNumClasses, DOC(dai, node, DetectionParser, getNumClasses)) .def("getCoordinateSize", &DetectionParser::getCoordinateSize, DOC(dai, node, DetectionParser, getCoordinateSize)) .def("getAnchors", &DetectionParser::getAnchors, DOC(dai, node, DetectionParser, getAnchors)) .def("getAnchorMasks", &DetectionParser::getAnchorMasks, DOC(dai, node, DetectionParser, getAnchorMasks)) .def("getIouThreshold", &DetectionParser::getIouThreshold, DOC(dai, node, DetectionParser, getIouThreshold)) + .def("runOnHost", &DetectionParser::runOnHost, DOC(dai, node, DetectionParser, runOnHost)) .def("build", &DetectionParser::build, DOC(dai, node, DetectionParser, build)); daiNodeModule.attr("DetectionParser").attr("Properties") = detectionParserProperties; } diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt index 8193faeb9..8c3ba6ecf 100644 
--- a/examples/cpp/DetectionNetwork/CMakeLists.txt +++ b/examples/cpp/DetectionNetwork/CMakeLists.txt @@ -23,7 +23,7 @@ dai_set_example_test_labels(detection_network ondevice rvc2_all rvc4 rvc4rgb ci) dai_add_example(detection_network_remap detection_network_remap.cpp ON OFF) dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci) -dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF) +dai_add_example(detection_and_segmentation detection_and_segmentation.cpp ON OFF) dai_set_example_test_labels(detection_and_segmentation rvc4) dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF) diff --git a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp index 667151bb0..f374bdca1 100644 --- a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp @@ -86,7 +86,6 @@ int main() { auto currentTime = std::chrono::steady_clock::now(); float fps = counter / std::chrono::duration(currentTime - startTime).count(); - std::cout << "FPS: " << fps << std::endl; } if(cv::waitKey(1) == 'q') { diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp similarity index 84% rename from examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp rename to examples/cpp/DetectionNetwork/detection_and_segmentation.cpp index 4912d04c6..e3e81dcbf 100644 --- a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -16,8 +17,16 @@ cv::Rect frameNorm(const cv::Mat& frame, const dai::Point2f& topLeft, const dai: } int main() { + std::string modelName = "luxonis/yolov8-instance-segmentation-large:coco-640x352"; + bool setRunOnHost = false; + auto device = std::make_shared(); + + if(device->getPlatformAsString() == "RVC2") { + modelName = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"; + setRunOnHost = true; + } // Create pipeline - dai::Pipeline pipeline; + dai::Pipeline pipeline{device}; // Create and configure camera node auto cameraNode = pipeline.create(); @@ -27,8 +36,10 @@ int main() { auto detectionNetwork = pipeline.create(); dai::NNModelDescription modelDescription; - modelDescription.model = "luxonis/yolov8-instance-segmentation-large:coco-640x480"; + + modelDescription.model = modelName; detectionNetwork->build(cameraNode, modelDescription); + detectionNetwork->detectionParser->setRunOnHost(setRunOnHost); auto labelMap = detectionNetwork->getClasses(); // Create output queues @@ -121,16 +132,18 @@ int main() { detections.begin(), detections.end(), [filteredLabel](const dai::ImgDetection& det) { return det.label != filteredLabel; }), detections.end()); } + if(!segmentationMask.empty()) { + cv::Mat lut(1, 256, CV_8U); + for(int i = 0; i < 256; ++i) lut.at(i) = (i >= 255) ? 255 : cv::saturate_cast(i * 25); - cv::Mat lut(1, 256, CV_8U); - for(int i = 0; i < 256; ++i) lut.at(i) = (i == 255) ? 
255 : cv::saturate_cast(i * 25); - cv::Mat scaledMask; - cv::LUT(segmentationMask, lut, scaledMask); + cv::Mat scaledMask; + cv::LUT(segmentationMask, lut, scaledMask); - cv::Mat coloredMask; - cv::applyColorMap(scaledMask, coloredMask, cv::COLORMAP_JET); - frame.copyTo(coloredMask, (scaledMask == 255)); - cv::addWeighted(frame, 0.7, coloredMask, 0.3, 0, frame); + cv::Mat coloredMask; + cv::applyColorMap(scaledMask, coloredMask, cv::COLORMAP_JET); + frame.copyTo(coloredMask, (scaledMask == 255)); + cv::addWeighted(frame, 0.7, coloredMask, 0.3, 0, frame); + } // Display detections for(const auto& detection : detections) { @@ -157,8 +170,6 @@ int main() { cv::imshow("rgb", frame); auto currentTime = std::chrono::steady_clock::now(); - float fps = counter / std::chrono::duration(currentTime - startTime).count(); - std::cout << "FPS: " << fps << std::endl; } } } diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py similarity index 92% rename from examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py rename to examples/python/DetectionNetwork/detection_and_segmentation.py index 650f90f2f..a8ecc74a6 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py +++ b/examples/python/DetectionNetwork/detection_and_segmentation.py @@ -6,11 +6,19 @@ import numpy as np import time +model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480" +setRunOnHost = False +device = dai.Device() +if device.getPlatformAsString() == "RVC2": + model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288" + setRunOnHost = True + # Create pipeline -with dai.Pipeline() as pipeline: +with dai.Pipeline(device) as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-nano:coco-512x288")) - # detectionNetwork.detectionParser.runOnHost(True) + + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription(model_name)) + detectionNetwork.detectionParser.setRunOnHost(setRunOnHost) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d3823f1a8..f4b0776b2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -524,7 +524,7 @@ FIRE_VIDEO="${fire_video}" KITCHEN_IMAGE_PATH="${kitchen_image}" YOLO_V8_INSTANCE_SEGMENTATION_LARGE_COCO_640x352_KITCHEN_SEGMENTATION_GROUND_TRUTH="${yolo_v8_instance_segmentation_large_coco_640x352_kitchen_segmentation_gt}" ) -dai_set_test_labels(detection_parser_test ondevice rvc4 ci) +dai_set_test_labels(detection_parser_test ondevice rvc4 ci onhost) # Spatial detection network test dai_add_test(spatial_detection_network_test src/ondevice_tests/pipeline/node/spatial_detection_network_test.cpp) From a0dd29a87dcf460a720332847689570829a99707 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Wed, 5 Nov 2025 10:52:36 +0100 Subject: [PATCH 04/24] bump device --- cmake/Depthai/DepthaiDeviceSideConfig.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake index b0a270a1d..7c6bb3df4 100644 --- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake +++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake @@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot") # "full 
commit hash of device side binary" -set(DEPTHAI_DEVICE_SIDE_COMMIT "e658b28655820c649b3bbed9f44865d00139094d") +set(DEPTHAI_DEVICE_SIDE_COMMIT "8741ce89206d2a5299acc3382c7496e1ee205fcb") # "version if applicable" set(DEPTHAI_DEVICE_SIDE_VERSION "") From 33752f1f44726f7e977dac9e1f6bd1ff79228da1 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Wed, 5 Nov 2025 12:54:19 +0100 Subject: [PATCH 05/24] bump rvc4 --- cmake/Depthai/DepthaiDeviceRVC4Config.cmake | 2 +- .../pipeline/node/spatial_location_calculator_test.cpp | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake index 43d640f5e..f6ae0d22b 100644 --- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake +++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake @@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot") # "version if applicable" -set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+53bd364bc4c519e9aa6230b3de4d78a78d073373") +set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+62ce59c3c4a4a53a9b0773fe83dabbecdc4553e9") diff --git a/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp b/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp index 1316e8566..0a8ca09b7 100644 --- a/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp +++ b/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp @@ -1,6 +1,5 @@ -#include - #include +#include #include #include #include @@ -106,7 +105,6 @@ TEST_CASE("SpatialLocationCalculator synthetic depth data test") { auto outputQueue = spatial->out.createOutputQueue(); auto passthroughQueue = spatial->passthroughDepth.createOutputQueue(); - std::vector depthPixels(width * height, 1000); auto setRegionDepth = [&](const RoiSpec& spec) { const int x0 = static_cast(spec.roi.x); @@ -120,7 +118,7 @@ TEST_CASE("SpatialLocationCalculator synthetic depth data test") { for(const auto& spec : roiSpecs) { setRegionDepth(spec); } - + // Prepare synthetic depth frame auto depthFrame = std::make_shared(); depthFrame->setType(dai::ImgFrame::Type::RAW16); From 255a8824da8078e10cf8b8d19623a8cdc649daa7 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 11 Nov 2025 16:59:50 +0100 Subject: [PATCH 06/24] update parser --- .../utilities/DetectionParser/DetectionParserUtils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index c1809e847..a9455e551 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -10,7 +10,7 @@ #include #include -#include "depthai/common/KeypointsList.hpp" +#include "depthai/common/KeypointsListT.hpp" #include "depthai/common/RotatedRect.hpp" #include "depthai/common/TensorInfo.hpp" #include "depthai/pipeline/datatype/ImgDetections.hpp" @@ -888,7 +888,7 @@ void keypointDecode(std::shared_ptr nnData, keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf}); } - outDetections->detections[i].keypoints = KeypointsList(keypoints); + outDetections->detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges); } } From 831232c1ac7faf4cb2e181bc74d5067f9aba220f Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 11 Nov 2025 17:57:55 +0100 Subject: [PATCH 07/24] update example --- examples/python/DetectionNetwork/detection_and_keypoints.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/examples/python/DetectionNetwork/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py index b61c41fc9..4459be138 100644 --- a/examples/python/DetectionNetwork/detection_and_keypoints.py +++ b/examples/python/DetectionNetwork/detection_and_keypoints.py @@ -7,7 +7,7 @@ # Create pipeline with dai.Pipeline() as pipeline: - cameraNode = pipeline.create(dai.node.Camera).build() + cameraNode = pipeline.create(dai.node.Camera).build(sensorFps=12) detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288")) labelMap = detectionNetwork.getClasses() From 1203e29ff34f107e491841cc50f656506eca9e2c Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Mon, 17 Nov 2025 08:28:27 +0100 Subject: [PATCH 08/24] Move parsing logic to host --- .../DetectionParser/DetectionParserUtils.cpp | 295 +++++++++++------- .../DetectionParser/DetectionParserUtils.hpp | 41 ++- 2 files changed, 197 insertions(+), 139 deletions(-) diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index a9455e551..33d38cea9 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -2,14 +2,22 @@ #include +#include #include #include #include #include #include +#include +#include +#include +#include #include #include +#include +#include "DetectionParserUtils.hpp" +#include "depthai/common/Keypoint.hpp" #include "depthai/common/KeypointsListT.hpp" #include "depthai/common/RotatedRect.hpp" #include "depthai/common/TensorInfo.hpp" @@ -23,11 +31,11 @@ namespace utilities { namespace DetectionParserUtils { // yolo v6 r1 - anchor free -void decodeR1AF(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeR1AF(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger) { - auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto layerNames = utilities::DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; if(strides.size() != layerNames.size()) { @@ -40,17 +48,18 @@ void decodeR1AF(std::shared_ptr nnData, const int numClasses = properties.parser.classes; int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); if(inputWidth <= 0 || inputHeight <= 0) { throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); } std::vector detectionCandidates; - detectionCandidates.reserve(250); + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { std::string layerName = layerNames[strideIdx]; - auto tensorInfo = nnData->getTensorInfo(layerName); + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); if(!tensorInfo) { std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); throw std::runtime_error(errorMsg); @@ -63,7 +72,7 @@ void decodeR1AF(std::shared_ptr nnData, int layerHeight = tensorInfo->getHeight(); int layerWidth = 
tensorInfo->getWidth(); - NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); if(!outputData.build()) { std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); throw std::runtime_error(errorMsg); @@ -71,8 +80,8 @@ void decodeR1AF(std::shared_ptr nnData, for(int row = 0; row < layerHeight; ++row) { for(int col = 0; col < layerWidth; ++col) { - const float score = outputData.get(4, row, col); - if(score < confidenceThr) { + const float objectnessScore = outputData.get(4, row, col); + if(objectnessScore < confidenceThr) { continue; } @@ -85,7 +94,7 @@ void decodeR1AF(std::shared_ptr nnData, bestC = c; } } - if(bestConf * score < confidenceThr) { + if(bestConf * objectnessScore < confidenceThr) { continue; } @@ -105,7 +114,20 @@ void decodeR1AF(std::shared_ptr nnData, ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); if(xmax <= xmin || ymax <= ymin) { - logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + logger->info("Invalid bbox parameters. Either xmax <= xmin or ymax <= ymin. Skipping detection."); + logger->debug( + "Skipping invalid bbox: layer='{}', " + "raw(cx,cy,w,h)=({:.2f},{:.2f},{:.2f},{:.2f}) " + "clamped(xmin,ymin,xmax,ymax)=({:.2f},{:.2f},{:.2f},{:.2f}).", + layerName, + cx, + cy, + w, + h, + xmin, + ymin, + xmax, + ymax); continue; } DetectionCandidate candidate = DetectionCandidate{ @@ -113,17 +135,13 @@ void decodeR1AF(std::shared_ptr nnData, ymin, xmax, ymax, - bestConf * score, + bestConf * objectnessScore, bestC, strideIdx, row, col, - std::nullopt, }; - if(!properties.parser.classNames->empty()) { - candidate.labelName = (*properties.parser.classNames)[bestC]; - } detectionCandidates.emplace_back(std::move(candidate)); } } @@ -134,6 +152,11 @@ void decodeR1AF(std::shared_ptr nnData, logger->trace("No detections after NMS, skipping overlay."); return; } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); @@ -151,11 +174,12 @@ void decodeR1AF(std::shared_ptr nnData, /* Decode anchor based yolo v3 and v3-Tiny */ -void decodeV3AB(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeV3AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger) { - auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; const std::vector strides = properties.parser.strides; if(strides.size() != layerNames.size()) { @@ -169,7 +193,7 @@ void decodeV3AB(std::shared_ptr nnData, const int numClasses = properties.parser.classes; int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); if(inputWidth <= 0 || inputHeight <= 0) { throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); } @@ -182,12 +206,12 @@ void decodeV3AB(std::shared_ptr nnData, } std::vector detectionCandidates; - detectionCandidates.reserve(250); + 
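Each decoder in this file applies the same candidate gate before NMS: the objectness score admits a cell, the best class score refines it, and the box is clamped to the network input. A standalone sketch of that gate (types and names are illustrative only, not the library's internals):

    #include <algorithm>

    struct Box { float xmin, ymin, xmax, ymax; };

    // Returns true if the cell survives the confidence gate and yields a valid box.
    bool keepCandidate(float objectness, float bestClassScore, float confidenceThr,
                       Box& box, float inputW, float inputH) {
        if(objectness < confidenceThr || objectness * bestClassScore < confidenceThr) return false;
        box.xmin = std::clamp(box.xmin, 0.0f, inputW);
        box.ymin = std::clamp(box.ymin, 0.0f, inputH);
        box.xmax = std::clamp(box.xmax, 0.0f, inputW);
        box.ymax = std::clamp(box.ymax, 0.0f, inputH);
        return box.xmax > box.xmin && box.ymax > box.ymin;
    }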
detectionCandidates.reserve(defaultMaxDetectionsPerFrame); for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { std::string layerName = layerNames[strideIdx]; int stride = strides[strideIdx]; - auto tensorInfo = nnData->getTensorInfo(layerName); + auto tensorInfo = nnData.getTensorInfo(layerName); if(!tensorInfo) { std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); throw std::runtime_error(errorMsg); @@ -202,7 +226,7 @@ void decodeV3AB(std::shared_ptr nnData, int layerWidth = tensorInfo->getWidth(); int layerChannels = tensorInfo->getChannels(); - NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); if(!outputData.build()) { std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); throw std::runtime_error(errorMsg); @@ -217,8 +241,6 @@ void decodeV3AB(std::shared_ptr nnData, throw std::runtime_error(errorMsg); } - auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; - for(int row = 0; row < layerHeight; ++row) { for(int col = 0; col < layerWidth; ++col) { for(int a = 0; a < numAnchors; ++a) { @@ -231,15 +253,15 @@ void decodeV3AB(std::shared_ptr nnData, if(obj < confidenceThr) continue; int bestC = 0; - float clsProb = 0.0f; + float clsLogit = 0.0f; for(int c = 0; c < numClasses; ++c) { - const float prob = outputData.get(ch0 + 5 + c, row, col); - if(prob > clsProb) { - clsProb = prob; + const float candidateLogit = outputData.get(ch0 + 5 + c, row, col); + if(candidateLogit > clsLogit) { + clsLogit = candidateLogit; bestC = c; } } - const float conf = obj * 1.f / (1.f + std::exp(-clsProb)); + const float conf = obj * sigmoid(clsLogit); if(conf < confidenceThr) continue; // YOLOv3 decode @@ -275,12 +297,8 @@ void decodeV3AB(std::shared_ptr nnData, strideIdx, row, col, - std::nullopt, }; - if(!properties.parser.classNames->empty()) { - candidate.labelName = (*properties.parser.classNames)[bestC]; - } detectionCandidates.emplace_back(std::move(candidate)); } } @@ -293,6 +311,12 @@ void decodeV3AB(std::shared_ptr nnData, return; } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); if(properties.parser.decodeSegmentation) { @@ -311,11 +335,11 @@ void decodeV3AB(std::shared_ptr nnData, /* Decode anchor based networks, e.g., yolo v5, v7, P */ -void decodeV5AB(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeV5AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger) { - auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; if(strides.size() != layerNames.size()) { @@ -329,7 +353,7 @@ void decodeV5AB(std::shared_ptr nnData, const int numClasses = properties.parser.classes; int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); if(inputWidth <= 0 || inputHeight <= 0) { throw std::runtime_error("Invalid input dimensions 
retrieved from NNData transformation."); @@ -343,12 +367,12 @@ void decodeV5AB(std::shared_ptr nnData, } std::vector detectionCandidates; - detectionCandidates.reserve(250); + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { std::string layerName = layerNames[strideIdx]; int stride = strides[strideIdx]; - auto tensorInfo = nnData->getTensorInfo(layerName); + auto tensorInfo = nnData.getTensorInfo(layerName); if(!tensorInfo) { std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); throw std::runtime_error(errorMsg); @@ -363,7 +387,7 @@ void decodeV5AB(std::shared_ptr nnData, int layerWidth = tensorInfo->getWidth(); int layerChannels = tensorInfo->getChannels(); - NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); if(!outputData.build()) { std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); throw std::runtime_error(errorMsg); @@ -393,9 +417,9 @@ void decodeV5AB(std::shared_ptr nnData, int bestC = 0; float bestConf = 0.0f; for(int c = 0; c < numClasses; ++c) { - const float prob = outputData.get(ch0 + 5 + c, row, col); - if(prob > bestConf) { - bestConf = prob; + const float candidateProb = outputData.get(ch0 + 5 + c, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; bestC = c; } } @@ -430,12 +454,8 @@ void decodeV5AB(std::shared_ptr nnData, strideIdx, row, col, - std::nullopt, }; - if(!properties.parser.classNames->empty()) { - candidate.labelName = (*properties.parser.classNames)[bestC]; - } detectionCandidates.emplace_back(std::move(candidate)); } } @@ -448,6 +468,12 @@ void decodeV5AB(std::shared_ptr nnData, return; } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); if(properties.parser.decodeSegmentation) { @@ -464,11 +490,11 @@ void decodeV5AB(std::shared_ptr nnData, /* Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v11 */ -void decodeTLBR(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeTLBR(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger) { - auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; if(strides.size() != layerNames.size()) { @@ -481,19 +507,19 @@ void decodeTLBR(std::shared_ptr nnData, const int numClasses = properties.parser.classes; int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); if(inputWidth <= 0 || inputHeight <= 0) { throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); } std::vector detectionCandidates; - detectionCandidates.reserve(250); + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { std::string layerName = 
layerNames[strideIdx]; int stride = strides[strideIdx]; - auto tensorInfo = nnData->getTensorInfo(layerName); + auto tensorInfo = nnData.getTensorInfo(layerName); if(!tensorInfo) { std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); throw std::runtime_error(errorMsg); @@ -506,7 +532,7 @@ void decodeTLBR(std::shared_ptr nnData, int layerHeight = tensorInfo->getHeight(); int layerWidth = tensorInfo->getWidth(); - NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); if(!outputData.build()) { std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); throw std::runtime_error(errorMsg); @@ -557,13 +583,8 @@ void decodeTLBR(std::shared_ptr nnData, strideIdx, row, col, - std::nullopt, - }; - if(!properties.parser.classNames->empty()) { - candidate.labelName = (*properties.parser.classNames)[bestC]; - } detectionCandidates.emplace_back(std::move(candidate)); } } @@ -575,6 +596,12 @@ void decodeTLBR(std::shared_ptr nnData, return; } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); if(properties.parser.decodeSegmentation) { @@ -590,11 +617,18 @@ void decodeTLBR(std::shared_ptr nnData, bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger) { // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent + auto getYoloChannelSize = [&](int classes, int coordinates, int anchors) -> int { + if(anchors == 0) { + anchors = 1; + } + return anchors * (classes + coordinates + 1); + }; int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast(properties.parser.anchorsV2.size()); int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1); - auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { + auto checkAndFixOrder = + [&](dai::TensorInfo::StorageOrder currentOrder, int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { // Check that the dims size is big enough if(static_cast(tensorInfo.dims.size()) <= channelDimIndex || static_cast(tensorInfo.dims.size()) <= alternativeDimIndex) { logger->error("Invalid tensor dims size. 
Skipping."); @@ -616,16 +650,16 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p switch(tensorInfo.order) { case dai::TensorInfo::StorageOrder::CHW: - if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; + if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::CHW, 0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; break; case dai::TensorInfo::StorageOrder::HWC: - if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; + if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::HWC, 2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; break; case dai::TensorInfo::StorageOrder::NCHW: - if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; + if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::NCHW, 1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; break; case dai::TensorInfo::StorageOrder::NHWC: - if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; + if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::NHWC, 3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; break; case dai::TensorInfo::StorageOrder::NHCW: case dai::TensorInfo::StorageOrder::WHC: @@ -645,9 +679,9 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p return true; } -std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames) { +std::vector getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector outputNames) { if(outputNames.empty()) { - outputNames = nnData->getAllLayerNames(); + outputNames = nnData.getAllLayerNames(); } std::vector layerNames; @@ -704,7 +738,7 @@ std::vector nonMaximumSuppression(std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, unsigned int width, unsigned int height) { for(const auto& det : detectionCandidates) { @@ -716,36 +750,21 @@ void createImgDetections(const std::vector& detectionCandida if(det.labelName) { detection.labelName = *det.labelName; } - outDetections->detections.push_back(std::move(detection)); + outDetections.detections.push_back(std::move(detection)); } } -void segmentationDecode(std::shared_ptr nnData, +void segmentationDecode(const dai::NNData& nnData, std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, DetectionParserProperties properties, std::shared_ptr logger) { - auto maskFromCoeffs = [](NNDataViewer& protos, const float* coeffs, int width, int height) -> cv::Mat { - cv::Mat maskLow(height, width, CV_32F); - for(int y = 0; y < maskLow.rows; ++y) { - float* row = maskLow.ptr(y); - for(int x = 0; x < maskLow.cols; ++x) { - float sum = 0.f; - for(int c = 0; c < 32; ++c) sum += protos.get(c, y, x) * coeffs[c]; - row[x] = 1.f / (1.f + std::exp(-sum)); // sigmoid - } - } - return maskLow; - }; - - std::pair inputSize = nnData->transformation->getSize(); + std::pair inputSize = nnData.transformation->getSize(); int inputWidth = inputSize.first; int inputHeight = inputSize.second; cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255)); - cv::Mat maskLow, maskUp; - auto maskLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "masks", std::vector{}); if(properties.parser.strides.size() != maskLayerNames.size()) { logger->error( @@ -760,15 +779,47 @@ void segmentationDecode(std::shared_ptr nnData, return; } - NNDataViewer protoValues = 
NNDataViewer(*nnData->getTensorInfo(protoLayerNames[0]), nnData->data, logger); + NNDataViewer protoValues = NNDataViewer(*nnData.getTensorInfo(protoLayerNames[0]), nnData.data, logger); if(!protoValues.build()) { logger->error("Failed to build NNDataViewer for proto layer {}. Skipping segmentation decoding.", protoLayerNames[0]); return; } + TensorInfo protoInfo = *nnData.getTensorInfo(protoLayerNames[0]); + int protoWidth = protoInfo.getWidth(); + int protoHeight = protoInfo.getHeight(); + int protoChannels = protoInfo.getChannels(); + if(protoWidth <= 0 || protoHeight <= 0 || protoChannels <= 0) { + logger->error("Invalid proto tensor dimensions: channels {}, height {}, width {}.", protoChannels, protoHeight, protoWidth); + return; + } + int protoWidthScaleFactor = inputWidth / protoWidth; + int protoHeightScaleFactor = inputHeight / protoHeight; + + cv::Mat maskUp; + cv::Mat maskLow(protoHeight, protoWidth, CV_32F); + + dai::NNData& nnDataNonConst = const_cast(nnData); + xt::xarray protoData = nnDataNonConst.getTensor(protoLayerNames[0], true); + Eigen::MatrixXf protoMatrix = Eigen::Map(protoData.data(), protoChannels, protoHeight * protoWidth); + + Eigen::RowVectorXf coeffs(protoChannels); + + auto maskFromCoeffs = [logger, protoHeight, protoWidth, &maskLow](const Eigen::MatrixXf& protos2d, const Eigen::RowVectorXf& coeffs) -> void { + if(protos2d.rows() != coeffs.size()) { + throw std::runtime_error("Mask coefficients size does not match proto channels."); + } + + Eigen::Map logits(maskLow.ptr(), protoHeight * protoWidth); + logits.noalias() = coeffs * protos2d; + + // no need to do sigmoid + // logits = (1.0f / (1.0f + (-logits.array()).exp())).matrix(); + }; + std::map maskValues; for(int strideIdx = 0; strideIdx < static_cast(maskLayerNames.size()); ++strideIdx) { - maskValues.try_emplace(strideIdx, *nnData->getTensorInfo(maskLayerNames[strideIdx]), nnData->data, logger); + maskValues.try_emplace(strideIdx, *nnData.getTensorInfo(maskLayerNames[strideIdx]), nnData.data, logger); if(!maskValues.at(strideIdx).build()) { logger->error("Failed to build NNDataViewer for mask layer {}. 
Skipping segmentation decoding.", maskLayerNames[strideIdx]); return; @@ -779,19 +830,15 @@ void segmentationDecode(std::shared_ptr nnData, const auto& c = detectionCandidates[i]; const int detIdx = static_cast(i); // index in outDetections list - NNDataViewer mask = maskValues.at(c.headIndex); - std::array coeff; - for(int i = 0; i < 32; ++i) { - coeff[i] = mask.get(i, c.rowIndex, c.columnIndex); + NNDataViewer& mask = maskValues.at(c.headIndex); + for(int ch = 0; ch < protoChannels; ++ch) { + coeffs(ch) = mask.get(ch, c.rowIndex, c.columnIndex); } + // TODO (aljaz) perform operations on ROI only instead of the full resolution + // Eigen::MatrixXf roiMatrix = protoMatrix.block(0, y0 * protoWidth + x0, protoChannels, (y1 - y0) * (x1 - x0)); - TensorInfo protoInfo = *nnData->getTensorInfo(protoLayerNames[0]); - int protoWidth = protoInfo.getWidth(); - int protoHeight = protoInfo.getHeight(); - maskLow = maskFromCoeffs(protoValues, coeff.data(), protoWidth, protoHeight); + maskFromCoeffs(protoMatrix, coeffs); - cv::resize(maskLow, maskUp, cv::Size(inputWidth, inputHeight), 0, 0, cv::INTER_LINEAR); - // ROI clamp int x0 = std::clamp(static_cast(std::floor(c.xmin)), 0, inputWidth - 1); int y0 = std::clamp(static_cast(std::floor(c.ymin)), 0, inputHeight - 1); int x1 = std::clamp(static_cast(std::ceil(c.xmax)), 0, inputWidth); @@ -800,10 +847,18 @@ void segmentationDecode(std::shared_ptr nnData, if(x1 <= x0 || y1 <= y0) continue; const cv::Rect roi(x0, y0, x1 - x0, y1 - y0); + int protoX0 = x0 / protoWidthScaleFactor; + int protoY0 = y0 / protoHeightScaleFactor; + int protoX1 = x1 / protoWidthScaleFactor; + int protoY1 = y1 / protoHeightScaleFactor; + const cv::Rect protoROI(protoX0, protoY0, protoX1 - protoX0, protoY1 - protoY0); + + cv::Mat roiProb; + cv::resize(maskLow(protoROI), roiProb, roi.size(), 0, 0, cv::INTER_LINEAR); + // Threshold & paint only unassigned pixels - cv::Mat roiProb = maskUp(roi); cv::Mat roiBin; - cv::compare(roiProb, static_cast(0.5f), roiBin, cv::CMP_GT); + cv::compare(roiProb, 0.0f, roiBin, cv::CMP_GT); cv::Mat roiOut = indexMask(roi); cv::Mat unassigned; cv::compare(roiOut, 255, unassigned, cv::CMP_EQ); @@ -814,22 +869,27 @@ void segmentationDecode(std::shared_ptr nnData, roiOut.setTo(value, paintMask); } - outDetections->setSegmentationMask(indexMask); + outDetections.setCvSegmentationMask(indexMask); } -void keypointDecode(std::shared_ptr nnData, +void keypointDecode(const dai::NNData& nnData, std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, DetectionParserProperties properties, std::shared_ptr logger) { + if(!properties.parser.nKeypoints) { + logger->warn("Number of keypoints not set in properties.parser.nKeypoints. 
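A note on the mask thresholding in segmentationDecode above: because sigmoid is monotonic and sigmoid(0) = 0.5, comparing the raw logits against 0 (the cv::CMP_GT against 0.0f) selects exactly the same pixels as applying the sigmoid and thresholding at 0.5, which is why the lambda skips the activation. A sketch of the per-detection logit computation, mirroring the Eigen mapping above (illustrative, not the exact library code):

    #include <Eigen/Dense>

    // protos2d: C x (H*W) prototype matrix; coeffs: 1 x C per-detection weights.
    // One matrix-vector product yields the mask logits; logit > 0 <=> sigmoid(logit) > 0.5.
    Eigen::RowVectorXf maskLogits(const Eigen::MatrixXf& protos2d, const Eigen::RowVectorXf& coeffs) {
        return coeffs * protos2d;  // 1 x (H*W)
    }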
Skipping keypoints decoding."); + return; + } + int inputWidth; int inputHeight; - std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); - auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); std::vector featureMapWidths; - for(int i = 0; i < static_cast(yoloLayerNames.size()); ++i) { - auto tensorInfo = nnData->getTensorInfo(yoloLayerNames[i]); + for(int i = 0; i < yoloLayerNames.size(); ++i) { + auto tensorInfo = nnData.getTensorInfo(yoloLayerNames[i]); if(!tensorInfo) { logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]); return; @@ -850,18 +910,18 @@ void keypointDecode(std::shared_ptr nnData, // TODO (aljaz) move to a function std::map keypointValues; for(int strideIdx = 0; strideIdx < static_cast(kptsLayerNames.size()); ++strideIdx) { - keypointValues.try_emplace(strideIdx, *nnData->getTensorInfo(kptsLayerNames[strideIdx]), nnData->data, logger); + keypointValues.try_emplace(strideIdx, *nnData.getTensorInfo(kptsLayerNames[strideIdx]), nnData.data, logger); if(!keypointValues.at(strideIdx).build()) { logger->error("Failed to build NNDataViewer for keypoints layer {}. Skipping keypoints decoding.", kptsLayerNames[strideIdx]); return; } } - if(outDetections->detections.size() != detectionCandidates.size()) { + if(outDetections.detections.size() != detectionCandidates.size()) { logger->error( "Number of detections in ImgDetections does not match number of detection candidates. ImgDetections size: {}, detection candidates size: {}. 
" "Skipping keypoints decoding.", - outDetections->detections.size(), + outDetections.detections.size(), detectionCandidates.size()); return; } @@ -887,11 +947,10 @@ void keypointDecode(std::shared_ptr nnData, keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf}); } - - outDetections->detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges); + outDetections.detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges); } } } // namespace DetectionParserUtils } // namespace utilities -} // namespace dai \ No newline at end of file +} // namespace dai diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp index 85b5a234f..bb61eaa57 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp @@ -1,5 +1,4 @@ #pragma once - #include #include @@ -12,45 +11,45 @@ namespace dai { namespace utilities { namespace DetectionParserUtils { +constexpr std::size_t defaultMaxDetectionsPerFrame = 250; struct DetectionCandidate { float xmin, ymin, xmax, ymax, score; int label, headIndex, rowIndex, columnIndex; std::optional labelName; }; - /* Decode anchor free yolo v6r1 with sigmoid assisted center detection */ -void decodeR1AF(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeR1AF(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger); /* Decode anchor based yolo v3 and v3-Tiny */ -void decodeV3AB(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeV3AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger); /* Decode anchor based networks, e.g., yolo v5, v7, P */ -void decodeV5AB(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeV5AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger); /* Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, v8, v10, v11 */ -void decodeTLBR(std::shared_ptr nnData, - std::shared_ptr outDetections, - DetectionParserProperties properties, +void decodeTLBR(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, std::shared_ptr logger); -std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames); +std::vector getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector outputNames); float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2); @@ -58,25 +57,25 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p void createImgDetections(std::vector& detectionCandidates, std::vector keepIndices, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, std::shared_ptr logger); std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr); void createImgDetections(const std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, unsigned int width, unsigned int height); -void segmentationDecode(std::shared_ptr nnData, +void segmentationDecode(const dai::NNData& 
nnData, std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, DetectionParserProperties properties, std::shared_ptr logger); -void keypointDecode(std::shared_ptr nnData, +void keypointDecode(const dai::NNData& nnData, std::vector& detectionCandidates, - std::shared_ptr outDetections, + dai::ImgDetections& outDetections, DetectionParserProperties properties, std::shared_ptr logger); From 36770bc777d759d38dd5d5312090e4bdc2fd31b6 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Mon, 17 Nov 2025 09:18:07 +0100 Subject: [PATCH 09/24] update utils and node params --- .../depthai/pipeline/node/DetectionParser.hpp | 4 +- src/pipeline/node/DetectionParser.cpp | 64 +++++++++++-------- .../DetectionParser/DetectionParserUtils.cpp | 51 ++------------- 3 files changed, 46 insertions(+), 73 deletions(-) diff --git a/include/depthai/pipeline/node/DetectionParser.hpp b/include/depthai/pipeline/node/DetectionParser.hpp index 7a547a936..c04c6206a 100644 --- a/include/depthai/pipeline/node/DetectionParser.hpp +++ b/include/depthai/pipeline/node/DetectionParser.hpp @@ -283,7 +283,7 @@ class DetectionParser : public DeviceNodeCRTP decodeMobilenet(std::shared_ptr nnData, float confidenceThr); + void decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr); private: bool runOnHostVar = false; @@ -296,7 +296,7 @@ class DetectionParser : public DeviceNodeCRTP nnData, std::shared_ptr outDetections); + void decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections); std::vector inTensorInfo; uint32_t imgWidth; uint32_t imgHeight; diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index 2040bf8b6..1ffbbf18d 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -392,36 +393,36 @@ void DetectionParser::run() { using namespace std::chrono; while(isRunning()) { auto tAbsoluteBeginning = steady_clock::now(); - std::shared_ptr inputData; - inputData = input.get(); - if(!inputData) { - logger->error("Error while receiving NN frame."); + std::shared_ptr sharedInputData = input.get(); + auto outDetections = std::make_shared(); + + if(!sharedInputData) { + logger->error("NN Data is empty. Skipping processing."); continue; } auto tAfterMessageBeginning = steady_clock::now(); + dai::NNData& inputData = *sharedInputData; if(!imgSizesSet) { - const bool containsTransformation = inputData->transformation.has_value(); + const bool containsTransformation = inputData.transformation.has_value(); if(containsTransformation) { - std::tie(imgWidth, imgHeight) = inputData->transformation->getSize(); + std::tie(imgWidth, imgHeight) = inputData.transformation->getSize(); } else { logger->warn("No image size provided for detection parser. 
Skipping processing and sending empty detections."); continue; } - + // We have determined the image size, no need to try again in the future imgSizesSet = true; } - auto outDetections = std::make_shared(); - + // Parse detections switch(properties.parser.nnFamily) { case DetectionNetworkType::YOLO: { - decodeYolo(inputData, outDetections); + decodeYolo(inputData, *outDetections); break; } case DetectionNetworkType::MOBILENET: { - auto dets = decodeMobilenet(inputData, properties.parser.confidenceThreshold); // TODO (aljaz) update to shared pointer - outDetections->detections = dets; + decodeMobilenet(inputData, *outDetections, properties.parser.confidenceThreshold); break; } default: { @@ -433,10 +434,11 @@ void DetectionParser::run() { auto tBeforeSend = steady_clock::now(); // Copy over seq and ts - outDetections->setSequenceNum(inputData->getSequenceNum()); - outDetections->setTimestamp(inputData->getTimestamp()); - outDetections->setTimestampDevice(inputData->getTimestampDevice()); - outDetections->transformation = inputData->transformation; + outDetections->setSequenceNum(inputData.getSequenceNum()); + outDetections->setTimestamp(inputData.getTimestamp()); + outDetections->setTimestampDevice(inputData.getTimestampDevice()); + outDetections->transformation = inputData.transformation; + // Send detections out.send(outDetections); @@ -476,26 +478,34 @@ void DetectionParser::buildStage1() { } } -std::vector DetectionParser::decodeMobilenet(std::shared_ptr nnData, float confidenceThr) { +void DetectionParser::decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr) { auto& logger = pimpl->logger; - if(!nnData) { - return {}; - } int maxDetections = 100; std::vector detections; std::string tensorName; - for(const auto& tensor : nnData->getAllLayers()) { + for(const auto& tensor : nnData.getAllLayers()) { if(tensor.offset == 0) { + // // The tensor we want to checkout + // if(tensor.numDimensions != 4) { + // std::cout << "ERROR while decoding Mobilenet. Output tensor has incorrect dimensions. Number of dimensions: " << tensor.numDimensions + // << std::endl; + // } + // // Get tensor output size in Bytes + // // Expected dimensions are [1, 1, N, 7] where N is number of detections + // if(tensor.dims[3] != 7) { + // std::cout << "ERROR while decoding Mobilenet. Expecting 7 fields for every detection but: " << tensor.dims[3] << " found.\n"; + // } + // maxDetections = tensor.dims[tensor.numDimensions - 2]; tensorName = tensor.name; } } - auto tensorData = nnData->getTensor(tensorName); + auto tensorData = nnData.getTensor(tensorName); maxDetections = tensorData.size() / 7; if(static_cast(tensorData.size()) < maxDetections * 7) { logger->error("Error while parsing Mobilenet. 
Vector not long enough, expected size: {}, real size {}", maxDetections * 7, tensorData.size()); - return {}; + return; } struct raw_Detection { // need to update it to include more @@ -529,13 +539,12 @@ std::vector DetectionParser::decodeMobilenet(std::shared_ptr< d.xmax = temp.xmax; d.ymax = temp.ymax; - detections.push_back(d); + outDetections.detections.push_back(d); } } - return detections; } -void DetectionParser::decodeYolo(std::shared_ptr nnData, std::shared_ptr outDetections) { +void DetectionParser::decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections) { auto& logger = pimpl->logger; switch(properties.parser.decodingFamily) { case YoloDecodingFamily::R1AF: // anchor free: yolo v6r1 @@ -550,6 +559,9 @@ void DetectionParser::decodeYolo(std::shared_ptr nnData, std::share case YoloDecodingFamily::TLBR: // top left bottom right anchor free: yolo v6r2, v8 v10 v11 utilities::DetectionParserUtils::decodeTLBR(nnData, outDetections, properties, logger); break; + default: + logger->error("Unknown Yolo decoding family. 'R1AF', 'v3AB', 'v5AB' and 'TLBR' are supported."); + throw std::runtime_error("Unknown Yolo decoding family"); } } diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index 33d38cea9..1534ac36b 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -130,17 +131,7 @@ void decodeR1AF(const dai::NNData& nnData, ymax); continue; } - DetectionCandidate candidate = DetectionCandidate{ - xmin, - ymin, - xmax, - ymax, - bestConf * objectnessScore, - bestC, - strideIdx, - row, - col, - }; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, bestConf * objectnessScore, bestC, strideIdx, row, col, std::nullopt}; detectionCandidates.emplace_back(std::move(candidate)); } @@ -287,17 +278,7 @@ void decodeV3AB(const dai::NNData& nnData, continue; } - DetectionCandidate candidate = DetectionCandidate{ - xmin, - ymin, - xmax, - ymax, - conf, - bestC, - strideIdx, - row, - col, - }; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, conf, bestC, strideIdx, row, col, std::nullopt}; detectionCandidates.emplace_back(std::move(candidate)); } @@ -444,17 +425,7 @@ void decodeV5AB(const dai::NNData& nnData, ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); if(xmax <= xmin || ymax <= ymin) continue; - DetectionCandidate candidate = DetectionCandidate{ - xmin, - ymin, - xmax, - ymax, - conf, - bestC, - strideIdx, - row, - col, - }; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, conf, bestC, strideIdx, row, col, std::nullopt}; detectionCandidates.emplace_back(std::move(candidate)); } @@ -573,17 +544,7 @@ void decodeTLBR(const dai::NNData& nnData, continue; } - DetectionCandidate candidate = DetectionCandidate{ - xmin, - ymin, - xmax, - ymax, - bestConf, - bestC, - strideIdx, - row, - col, - }; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, bestConf, bestC, strideIdx, row, col, std::nullopt}; detectionCandidates.emplace_back(std::move(candidate)); } @@ -888,7 +849,7 @@ void keypointDecode(const dai::NNData& nnData, auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); std::vector featureMapWidths; - for(int i = 0; i < yoloLayerNames.size(); ++i) 
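decodeMobilenet above sizes the detection count from the flat output tensor (tensorData.size() / 7): Mobilenet/SSD-style outputs pack seven floats per detection, conventionally [imageId, label, confidence, xmin, ymin, xmax, ymax]. A standalone sketch of walking such a tensor (the field order is assumed from that common convention, not verified against this codebase):

    #include <cstddef>
    #include <vector>

    struct Det { int label; float conf, xmin, ymin, xmax, ymax; };

    // Walks a flat [N x 7] SSD output, keeping detections above the threshold.
    std::vector<Det> parseSsd(const std::vector<float>& t, float thr) {
        std::vector<Det> out;
        for(std::size_t i = 0; i + 7 <= t.size(); i += 7) {
            if(t[i + 2] < thr) continue;
            out.push_back({static_cast<int>(t[i + 1]), t[i + 2], t[i + 3], t[i + 4], t[i + 5], t[i + 6]});
        }
        return out;
    }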
{
+    for(int i = 0; i < static_cast<int>(yoloLayerNames.size()); ++i) {
         auto tensorInfo = nnData.getTensorInfo(yoloLayerNames[i]);
         if(!tensorInfo) {
             logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]);

From 9dbda0ba11e119e28bf956fc56893ff55b75f523 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 13:45:15 +0100
Subject: [PATCH 10/24] add logger

---
 .../pipeline/datatype/ImgDetectionsT.hpp      |  1 +
 src/pipeline/datatype/ImgDetectionsT.cpp      |  5 +++++
 src/pipeline/node/DetectionParser.cpp         |  8 ++++----
 .../DetectionParser/DetectionParserUtils.cpp  | 18 +++++++++++-------
 .../DetectionParser/DetectionParserUtils.hpp  | 16 ++++++++--------
 5 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/include/depthai/pipeline/datatype/ImgDetectionsT.hpp b/include/depthai/pipeline/datatype/ImgDetectionsT.hpp
index 61b4d4bf0..99eb12cf7 100644
--- a/include/depthai/pipeline/datatype/ImgDetectionsT.hpp
+++ b/include/depthai/pipeline/datatype/ImgDetectionsT.hpp
@@ -76,6 +76,7 @@ class ImgDetectionsT : public Buffer {
      * Copies cv::Mat data to Segmentation Mask buffer
      *
      * @param frame Input cv::Mat frame from which to copy the data
+     * @note Throws if mask is not a single-channel 8-bit unsigned (CV_8U) type.
      */
     void setCvSegmentationMask(cv::Mat mask);
diff --git a/src/pipeline/datatype/ImgDetectionsT.cpp b/src/pipeline/datatype/ImgDetectionsT.cpp
index 875628fb8..bbf6545e2 100644
--- a/src/pipeline/datatype/ImgDetectionsT.cpp
+++ b/src/pipeline/datatype/ImgDetectionsT.cpp
@@ -1,5 +1,7 @@
 #include "depthai/pipeline/datatype/ImgDetectionsT.hpp"

+#include
+
 #include
 #include
 #include
@@ -75,6 +77,9 @@ std::optional ImgDetectionsT::getSegmentationMask() const {
 template
 void ImgDetectionsT::setCvSegmentationMask(cv::Mat mask) {
+    if(mask.type() != CV_8U) {
+        throw std::runtime_error("SetCvSegmentationMask: Mask must be a single-channel 8-bit unsigned (CV_8U) matrix.");
+    }
     std::vector dataVec;
     if(!mask.isContinuous()) {
         for(int i = 0; i < mask.rows; i++) {
diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp
index 1ffbbf18d..e1aff71a6 100644
--- a/src/pipeline/node/DetectionParser.cpp
+++ b/src/pipeline/node/DetectionParser.cpp
@@ -387,7 +387,7 @@ bool DetectionParser::runOnHost() const {
 }

 void DetectionParser::run() {
-    auto& logger = pimpl->logger;
+    auto& logger = ThreadedNode::pimpl->logger;

     logger->info("Detection parser running on host.");
     using namespace std::chrono;
@@ -452,7 +452,7 @@ void DetectionParser::run() {
 }

 void DetectionParser::buildStage1() {
-    auto& logger = pimpl->logger;
+    auto& logger = ThreadedNode::pimpl->logger;

     // Grab dimensions from input tensor info
     if(properties.networkInputs.size() > 0) {
@@ -479,7 +479,7 @@ void DetectionParser::buildStage1() {
 }

 void DetectionParser::decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr) {
-    auto& logger = pimpl->logger;
+    auto& logger = ThreadedNode::pimpl->logger;

     int maxDetections = 100;
     std::vector<ImgDetection> detections;
@@ -545,7 +545,7 @@ void DetectionParser::decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr) {
 }

 void DetectionParser::decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections) {
-    auto& logger = pimpl->logger;
+    std::shared_ptr<spdlog::logger>& logger = ThreadedNode::pimpl->logger;
     switch(properties.parser.decodingFamily) {
         case YoloDecodingFamily::R1AF:  // anchor free: yolo v6r1
             utilities::DetectionParserUtils::decodeR1AF(nnData, outDetections, properties, logger);
             break;
diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index 1534ac36b..7269175a8 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,7 @@ namespace DetectionParserUtils { void decodeR1AF(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { auto layerNames = utilities::DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; @@ -168,7 +169,7 @@ Decode anchor based yolo v3 and v3-Tiny void decodeV3AB(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; @@ -319,7 +320,7 @@ Decode anchor based networks, e.g., yolo v5, v7, P void decodeV5AB(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; @@ -464,7 +465,7 @@ Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v1 void decodeTLBR(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); const std::vector strides = properties.parser.strides; @@ -576,7 +577,7 @@ void decodeTLBR(const dai::NNData& nnData, } } -bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger) { +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr& logger) { // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent auto getYoloChannelSize = [&](int classes, int coordinates, int anchors) -> int { if(anchors == 0) { @@ -719,10 +720,11 @@ void segmentationDecode(const dai::NNData& nnData, std::vector& detectionCandidates, dai::ImgDetections& outDetections, DetectionParserProperties properties, - std::shared_ptr logger) { + std::shared_ptr& logger) { std::pair inputSize = nnData.transformation->getSize(); int inputWidth = inputSize.first; int inputHeight = inputSize.second; + auto tStart = std::chrono::steady_clock::now(); cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255)); @@ -829,6 +831,8 @@ void segmentationDecode(const dai::NNData& nnData, const uint8_t value = static_cast(std::min(detIdx, 254)); roiOut.setTo(value, paintMask); } + auto tEnd = std::chrono::steady_clock::now(); + logger->warn("Time to transform: {} ns", std::chrono::duration_cast(tEnd - tStart).count()); outDetections.setCvSegmentationMask(indexMask); } @@ -837,7 +841,7 @@ void keypointDecode(const dai::NNData& nnData, std::vector& detectionCandidates, dai::ImgDetections& outDetections, DetectionParserProperties properties, - std::shared_ptr logger) { 
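The nonMaximumSuppression declared in this header prunes candidates with YoloIntersectionOverUnion. For reference, a minimal implementation of the standard intersection-over-union on axis-aligned boxes (a sketch, not the library's own function):

    #include <algorithm>

    struct BoxTLBR { float xmin, ymin, xmax, ymax; };

    // IoU = intersection area / union area, in [0, 1]; 0 when the boxes are disjoint.
    float iou(const BoxTLBR& a, const BoxTLBR& b) {
        const float iw = std::max(0.0f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin));
        const float ih = std::max(0.0f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin));
        const float inter = iw * ih;
        const float uni = (a.xmax - a.xmin) * (a.ymax - a.ymin)
                        + (b.xmax - b.xmin) * (b.ymax - b.ymin) - inter;
        return uni > 0.0f ? inter / uni : 0.0f;
    }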
+ std::shared_ptr& logger) { if(!properties.parser.nKeypoints) { logger->warn("Number of keypoints not set in properties.parser.nKeypoints. Skipping keypoints decoding."); return; diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp index bb61eaa57..593007c14 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp @@ -23,7 +23,7 @@ Decode anchor free yolo v6r1 with sigmoid assisted center detection void decodeR1AF(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger); + std::shared_ptr& logger); /* Decode anchor based yolo v3 and v3-Tiny @@ -31,7 +31,7 @@ Decode anchor based yolo v3 and v3-Tiny void decodeV3AB(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger); + std::shared_ptr& logger); /* Decode anchor based networks, e.g., yolo v5, v7, P @@ -39,7 +39,7 @@ Decode anchor based networks, e.g., yolo v5, v7, P void decodeV5AB(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger); + std::shared_ptr& logger); /* Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, v8, v10, v11 @@ -47,18 +47,18 @@ Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, void decodeTLBR(const dai::NNData& nnData, dai::ImgDetections& outDetections, DetectionParserProperties& properties, - std::shared_ptr logger); + std::shared_ptr& logger); std::vector getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector outputNames); float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2); -bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger); +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr& logger); void createImgDetections(std::vector& detectionCandidates, std::vector keepIndices, dai::ImgDetections& outDetections, - std::shared_ptr logger); + std::shared_ptr& logger); std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr); @@ -71,13 +71,13 @@ void segmentationDecode(const dai::NNData& nnData, std::vector& detectionCandidates, dai::ImgDetections& outDetections, DetectionParserProperties properties, - std::shared_ptr logger); + std::shared_ptr& logger); void keypointDecode(const dai::NNData& nnData, std::vector& detectionCandidates, dai::ImgDetections& outDetections, DetectionParserProperties properties, - std::shared_ptr logger); + std::shared_ptr& logger); } // namespace DetectionParserUtils } // namespace utilities From 8f30141b734373936b32612b03434a313d582f37 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Mon, 17 Nov 2025 14:01:24 +0100 Subject: [PATCH 11/24] Remove unused functions --- .../DetectionParser/DetectionParserUtils.cpp | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp index 7269175a8..a0391ccb4 100644 --- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -578,19 +578,11 @@ void 
From 8f30141b734373936b32612b03434a313d582f37 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 14:01:24 +0100
Subject: [PATCH 11/24] Remove unused functions

---
 .../DetectionParser/DetectionParserUtils.cpp  | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
index 7269175a8..a0391ccb4 100644
--- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
@@ -578,19 +578,11 @@ void decodeTLBR(const dai::NNData& nnData,
 }
 
 bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr<spdlog::logger>& logger) {
-    // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent
-    auto getYoloChannelSize = [&](int classes, int coordinates, int anchors) -> int {
-        if(anchors == 0) {
-            anchors = 1;
-        }
-        return anchors * (classes + coordinates + 1);
-    };
-
     int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast<int>(properties.parser.anchorsV2.size());
     int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1);
 
     auto checkAndFixOrder =
-        [&](dai::TensorInfo::StorageOrder currentOrder, int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool {
+        [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool {
         // Check that the dims size is big enough
         if(static_cast<int>(tensorInfo.dims.size()) <= channelDimIndex || static_cast<int>(tensorInfo.dims.size()) <= alternativeDimIndex) {
             logger->error("Invalid tensor dims size. Skipping.");
@@ -612,16 +604,16 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p
 
     switch(tensorInfo.order) {
         case dai::TensorInfo::StorageOrder::CHW:
-            if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::CHW, 0, 2, dai::TensorInfo::StorageOrder::HWC)) return false;
+            if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false;
             break;
         case dai::TensorInfo::StorageOrder::HWC:
-            if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::HWC, 2, 0, dai::TensorInfo::StorageOrder::CHW)) return false;
+            if(!checkAndFixOrder( 2, 0, dai::TensorInfo::StorageOrder::CHW)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NCHW:
-            if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::NCHW, 1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false;
+            if(!checkAndFixOrder( 1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NHWC:
-            if(!checkAndFixOrder(dai::TensorInfo::StorageOrder::NHWC, 3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false;
+            if(!checkAndFixOrder( 3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NHCW:
         case dai::TensorInfo::StorageOrder::WHC:
@@ -724,7 +716,6 @@ void segmentationDecode(const dai::NNData& nnData,
     std::pair inputSize = nnData.transformation->getSize();
     int inputWidth = inputSize.first;
     int inputHeight = inputSize.second;
-    auto tStart = std::chrono::steady_clock::now();
 
     cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255));
@@ -831,8 +822,6 @@ void segmentationDecode(const dai::NNData& nnData,
         const uint8_t value = static_cast<uint8_t>(std::min(detIdx, 254));
         roiOut.setTo(value, paintMask);
     }
-    auto tEnd = std::chrono::steady_clock::now();
-    logger->warn("Time to transform: {} ns", std::chrono::duration_cast<std::chrono::nanoseconds>(tEnd - tStart).count());
 
     outDetections.setCvSegmentationMask(indexMask);
 }
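
After this cleanup the expected channel count of a detection head is computed in one place: anchorsPerCell * (classes + coordinates + 1 objectness score), with the anchor count defaulting to 1 for anchor-free heads. A standalone sketch with illustrative COCO-style numbers:

#include <cstdio>

// Expected channel count of a YOLO detection head.
int yoloChannelSize(int classes, int coordinates, int anchorsPerCell) {
    if(anchorsPerCell == 0) anchorsPerCell = 1;  // anchor-free heads
    return anchorsPerCell * (classes + coordinates + 1);
}

int main() {
    std::printf("%d\n", yoloChannelSize(80, 4, 3));  // 3 * (80 + 4 + 1) = 255
    std::printf("%d\n", yoloChannelSize(80, 4, 0));  // anchor-free: 85
    return 0;
}

255 is the familiar channel count of a classic 80-class, 3-anchor YOLO head, and it is the value isTensorOrderValid compares against the candidate channel dimension.
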
 #include "depthai/pipeline/datatype/ImgDetectionsT.hpp"
 
-#include
-
 #include
 #include
 #include

From 85fdbd1943b89f24bcbb7aa714847342707235bc Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 15:05:04 +0100
Subject: [PATCH 13/24] Add storage order checker

---
 .../DetectionParser/DetectionParserUtils.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
index a0391ccb4..6f187a8b0 100644
--- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
@@ -581,8 +581,7 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p
     int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast<int>(properties.parser.anchorsV2.size());
     int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1);
 
-    auto checkAndFixOrder =
-        [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool {
+    auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool {
         // Check that the dims size is big enough
         if(static_cast<int>(tensorInfo.dims.size()) <= channelDimIndex || static_cast<int>(tensorInfo.dims.size()) <= alternativeDimIndex) {
             logger->error("Invalid tensor dims size. Skipping.");
@@ -607,13 +606,13 @@ bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties p
             if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false;
             break;
         case dai::TensorInfo::StorageOrder::HWC:
-            if(!checkAndFixOrder( 2, 0, dai::TensorInfo::StorageOrder::CHW)) return false;
+            if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NCHW:
-            if(!checkAndFixOrder( 1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false;
+            if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NHWC:
-            if(!checkAndFixOrder( 3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false;
+            if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false;
             break;
         case dai::TensorInfo::StorageOrder::NHCW:
         case dai::TensorInfo::StorageOrder::WHC:
@@ -755,6 +754,10 @@ void segmentationDecode(const dai::NNData& nnData,
 
     dai::NNData& nnDataNonConst = const_cast<dai::NNData&>(nnData);
     xt::xarray<float> protoData = nnDataNonConst.getTensor<float>(protoLayerNames[0], true);
+    if(protoInfo.order != dai::TensorInfo::StorageOrder::NHWC) {
+        logger->trace("Proto storage is not NHWC, changing order.");
+        nnDataNonConst.changeStorageOrder(protoData, protoInfo.order, dai::TensorInfo::StorageOrder::NHWC);
+    }
 
     Eigen::MatrixXf protoMatrix = Eigen::Map(protoData.data(), protoChannels, protoHeight * protoWidth);
     Eigen::RowVectorXf coeffs(protoChannels);
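
The proto tensor normalized to NHWC above feeds the standard YOLO instance-segmentation step: each detection's mask coefficients are multiplied against the flattened prototype matrix, and a sigmoid plus 0.5 threshold yields a binary mask. A toy-sized Eigen sketch (dimensions and random data are illustrative; real models use e.g. 32 prototype channels at reduced resolution):

#include <Eigen/Dense>
#include <iostream>

int main() {
    const int protoChannels = 4;
    const int protoHeight = 2;
    const int protoWidth = 3;

    // Prototype tensor flattened to (channels x pixels).
    Eigen::MatrixXf protoMatrix = Eigen::MatrixXf::Random(protoChannels, protoHeight * protoWidth);

    // Per-detection mask coefficients predicted by the head.
    Eigen::RowVectorXf coeffs = Eigen::RowVectorXf::Random(protoChannels);

    // Linear combination of prototypes gives one mask logit per pixel.
    Eigen::RowVectorXf logits = coeffs * protoMatrix;

    // Sigmoid followed by a 0.5 threshold produces the binary mask.
    Eigen::Array<bool, 1, Eigen::Dynamic> mask = (1.f + (-logits.array()).exp()).inverse() > 0.5f;

    std::cout << mask.cast<int>() << std::endl;
    return 0;
}
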
From 2e51cca60218b1735491f1c3779480bd038dd022 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 15:35:59 +0100
Subject: [PATCH 14/24] Fix formatting

---
 examples/cpp/DetectionNetwork/detection_and_keypoints.cpp    | 1 +
 examples/cpp/DetectionNetwork/detection_and_segmentation.cpp | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
index 3d94f5764..f4c80837d 100644
--- a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
@@ -93,6 +93,7 @@ int main() {
 
             auto currentTime = std::chrono::steady_clock::now();
             float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
+            std::cout << "FPS: " << fps << std::endl;
         }
 
         if(cv::waitKey(1) == 'q') {
diff --git a/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
index f3ad63f2c..d9551117a 100644
--- a/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
@@ -4,8 +4,8 @@
 #include
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 
@@ -41,7 +41,6 @@ int main() {
     modelDescription.model = modelName;
     detectionNetwork->build(cameraNode, modelDescription);
     detectionNetwork->detectionParser->setRunOnHost(setRunOnHost);
-    auto labelMap = detectionNetwork->getClasses();
 
     // Create output queues
     auto qRgb = detectionNetwork->passthrough.createOutputQueue();
@@ -137,10 +136,10 @@ int main() {
                                             detections.begin(), detections.end(), [filteredLabel](const dai::ImgDetection& det) { return det.label != filteredLabel; }),
                                         detections.end());
         }
+
         if(segmentationMask) {
             cv::Mat lut(1, 256, CV_8U);
             for(int i = 0; i < 256; ++i) lut.at<uint8_t>(i) = (i >= 255) ? 255 : cv::saturate_cast<uint8_t>(i * 25);
-
             cv::Mat scaledMask;
             cv::LUT(*segmentationMask, lut, scaledMask);

From 6c2214ae3e1fea51e75b5fe0fd9fd73ab38f71b0 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Mon, 17 Nov 2025 16:29:18 +0100
Subject: [PATCH 15/24] bump fw

---
 cmake/Depthai/DepthaiDeviceRVC4Config.cmake                | 2 +-
 cmake/Depthai/DepthaiDeviceSideConfig.cmake                | 2 +-
 .../python/DetectionNetwork/detection_and_segmentation.py  | 2 +-
 src/pipeline/datatype/ImgDetectionsT.cpp                   | 5 +++--
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
index 9f108699e..be5d08dc7 100644
--- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
+++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
@@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot")
 
 # "version if applicable"
-set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+1e78c47a2c81d8de6f10d888de2a14de5557c6c3")
+set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+29d1575da0655630004fc1733d1acefa3b66499c")
diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
index 351618039..98b588226 100644
--- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake
+++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
@@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot")
 
 # "full commit hash of device side binary"
-set(DEPTHAI_DEVICE_SIDE_COMMIT "913e44e627a6e24f794bce4c4eed2a94691072a4")
+set(DEPTHAI_DEVICE_SIDE_COMMIT "6d07abc50b03c9ea164f2e5664c3f155741998b5")
 
 # "version if applicable"
 set(DEPTHAI_DEVICE_SIDE_VERSION "")
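
The FPS readout added in [PATCH 14/24] above is simply a frame counter divided by elapsed wall-clock time from std::chrono. Stripped of the camera pipeline, the pattern is (the sleep stands in for per-frame work):

#include <chrono>
#include <iostream>
#include <thread>

int main() {
    auto startTime = std::chrono::steady_clock::now();
    int counter = 0;

    for(int frame = 0; frame < 30; ++frame) {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));  // per-frame work
        counter++;
        auto currentTime = std::chrono::steady_clock::now();
        float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
        std::cout << "FPS: " << fps << std::endl;
    }
    return 0;
}

Using steady_clock rather than system_clock keeps the measurement monotonic even if the wall clock is adjusted while the loop runs.
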
diff --git a/examples/python/DetectionNetwork/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py
index 4f74ce29e..4445d6f04 100644
--- a/examples/python/DetectionNetwork/detection_and_segmentation.py
+++ b/examples/python/DetectionNetwork/detection_and_segmentation.py
@@ -15,7 +15,7 @@
 # Create pipeline
 with dai.Pipeline(device) as pipeline:
     cameraNode = pipeline.create(dai.node.Camera).build()
-
+    
     detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription(model_name))
     detectionNetwork.detectionParser.setRunOnHost(setRunOnHost)
     labelMap = detectionNetwork.getClasses()
diff --git a/src/pipeline/datatype/ImgDetectionsT.cpp b/src/pipeline/datatype/ImgDetectionsT.cpp
index 299d96c27..ce689fb03 100644
--- a/src/pipeline/datatype/ImgDetectionsT.cpp
+++ b/src/pipeline/datatype/ImgDetectionsT.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -75,8 +76,8 @@ std::optional<cv::Mat> ImgDetectionsT<T>::getSegmentationMask() const
 
 template <typename T>
 void ImgDetectionsT<T>::setCvSegmentationMask(cv::Mat mask) {
-    if(mask.type() != CV_8U) {
-        throw("SetCvSegmentationMask: Mask must be of INT8 type.");
+    if(mask.type() != CV_8UC1) {
+        throw std::runtime_error("SetCvSegmentationMask: Mask must be of CV_8UC1 type, got opencv type " + cv::typeToString(mask.type()) + ".");
     }
     std::vector<uint8_t> dataVec;
     if(!mask.isContinuous()) {

From 109020c5a387f49a79d2b65852dbaa5af98c81d1 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Tue, 18 Nov 2025 08:52:15 +0100
Subject: [PATCH 16/24] remove rvc2 test label

---
 tests/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 70ec76f07..69e42ba58 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -546,7 +546,7 @@ FIRE_VIDEO="${fire_video}"
 KITCHEN_IMAGE_PATH="${kitchen_image}"
 YOLO_V8_INSTANCE_SEGMENTATION_LARGE_COCO_640x352_KITCHEN_SEGMENTATION_GROUND_TRUTH="${yolo_v8_instance_segmentation_large_coco_640x352_kitchen_segmentation_gt_v2}"
 )
-dai_set_test_labels(detection_parser_test ondevice rvc4 ci onhost)
+dai_set_test_labels(detection_parser_test ondevice rvc4 ci)
 
 # Spatial detection network test
 dai_add_test(spatial_detection_network_test src/ondevice_tests/pipeline/node/spatial_detection_network_test.cpp)
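
The stricter check in setCvSegmentationMask accepts only single-channel 8-bit masks and clones non-contiguous ones before flattening. A self-contained sketch of that validation idea (the helper name and sizes are illustrative, not the depthai API):

#include <opencv2/core.hpp>
#include <opencv2/core/check.hpp>

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Accept only CV_8UC1 masks and flatten them into a contiguous byte vector.
std::vector<std::uint8_t> maskToBytes(cv::Mat mask) {
    if(mask.type() != CV_8UC1) {
        throw std::runtime_error("Mask must be CV_8UC1, got " + cv::typeToString(mask.type()));
    }
    if(!mask.isContinuous()) {
        mask = mask.clone();  // clone() guarantees a dense buffer without row gaps
    }
    return std::vector<std::uint8_t>(mask.data, mask.data + mask.total());
}

int main() {
    cv::Mat mask(288, 512, CV_8UC1, cv::Scalar(255));  // 255 marks "no instance"
    std::printf("%zu bytes\n", maskToBytes(mask).size());
    return 0;
}

The clone() matters because a cv::Mat produced by an ROI view shares its parent's stride, so copying data through data + total() directly would read the wrong pixels.
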
From ab5f693d47327bc517385c5a8dc23ec19ec7e711 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 19 Nov 2025 08:20:26 +0100
Subject: [PATCH 17/24] Implement suggestions

---
 examples/cpp/DetectionNetwork/CMakeLists.txt                 | 4 ++--
 examples/cpp/DetectionNetwork/detection_and_segmentation.cpp | 4 ++--
 .../python/DetectionNetwork/detection_and_segmentation.py    | 2 +-
 src/pipeline/utilities/NNDataViewer.hpp                      | 2 ++
 tests/CMakeLists.txt                                         | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt
index 2af09182f..fef48e7bb 100644
--- a/examples/cpp/DetectionNetwork/CMakeLists.txt
+++ b/examples/cpp/DetectionNetwork/CMakeLists.txt
@@ -24,7 +24,7 @@ dai_add_example(detection_network_remap detection_network_remap.cpp ON OFF)
 dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci)
 
 dai_add_example(detection_and_segmentation detection_and_segmentation.cpp ON OFF)
-dai_set_example_test_labels(detection_and_segmentation rvc4)
+dai_set_example_test_labels(detection_and_segmentation rvc2_all rvc4 ci)
 
 dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF)
-dai_set_example_test_labels(detection_and_keypoints rvc4)
+dai_set_example_test_labels(detection_and_keypoints rvc2_all rvc4 ci)
diff --git a/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
index d9551117a..fa312d382 100644
--- a/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
@@ -22,7 +22,7 @@ int main() {
     bool setRunOnHost = false;
 
     auto device = std::make_shared<dai::Device>();
-    if(device->getPlatformAsString() == "RVC2") {
+    if(device->getPlatform() == dai::Platform::RVC2) {
         modelName = "luxonis/yolov8-instance-segmentation-nano:coco-512x288";
         setRunOnHost = true;
     }
@@ -139,7 +139,7 @@ int main() {
 
         if(segmentationMask) {
             cv::Mat lut(1, 256, CV_8U);
-            for(int i = 0; i < 256; ++i) lut.at<uint8_t>(i) = (i >= 255) ? 255 : cv::saturate_cast<uint8_t>(i * 25);
+            for(int i = 0; i < 256; ++i) lut.at<uint8_t>(i) = (i == 255) ? 255 : cv::saturate_cast<uint8_t>(i * 25);
 
             cv::Mat scaledMask;
             cv::LUT(*segmentationMask, lut, scaledMask);
diff --git a/examples/python/DetectionNetwork/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py
index 4445d6f04..81d703106 100644
--- a/examples/python/DetectionNetwork/detection_and_segmentation.py
+++ b/examples/python/DetectionNetwork/detection_and_segmentation.py
@@ -8,7 +8,7 @@
 model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480"
 setRunOnHost = False
 device = dai.Device()
-if device.getPlatformAsString() == "RVC2":
+if device.getPlatform() == dai.Platform.RVC2:
     model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"
     setRunOnHost = True
 
diff --git a/src/pipeline/utilities/NNDataViewer.hpp b/src/pipeline/utilities/NNDataViewer.hpp
index 94ab12cda..f00d23a6a 100644
--- a/src/pipeline/utilities/NNDataViewer.hpp
+++ b/src/pipeline/utilities/NNDataViewer.hpp
@@ -39,6 +39,7 @@ class NNDataViewer {
         }
         if(tensor.strides.size() != 4) {
             logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4);
+            return false;
         }
         factorsBefore.c = tensor.strides[1];
         factorsBefore.h = tensor.strides[2];
@@ -51,6 +52,7 @@ class NNDataViewer {
         }
         if(tensor.strides.size() != 4) {
             logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4);
+            return false;
         }
         factorsBefore.h = tensor.strides[1];
         factorsBefore.w = tensor.strides[2];
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 75d546f48..80705e181 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -407,7 +407,7 @@ dai_set_test_labels(nndata_test onhost ci)
 
 #ImgDetections tests
 dai_add_test(imgdetections_test src/onhost_tests/pipeline/datatype/imgdetections_test.cpp)
-dai_set_test_labels(imgdetections_test ondevice rvc2 rvc4 onhost ci)
+dai_set_test_labels(imgdetections_test onhost ci)
 
 # Model description tests
 dai_add_test(model_slug_test src/onhost_tests/model_slug_test.cpp)

From eeecbf928089b2bfbb7e4e9f214f2f6386c9bbea Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 19 Nov 2025 08:23:06 +0100
Subject: [PATCH 18/24] merge develop

---
 3rdparty/foxglove/ws-protocol | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/foxglove/ws-protocol b/3rdparty/foxglove/ws-protocol
index 234fa7936..45d3e08ff 160000
--- a/3rdparty/foxglove/ws-protocol
+++ b/3rdparty/foxglove/ws-protocol
@@ -1 +1 @@
-Subproject commit 234fa7936bfedc2824068aecd04b5ee6390e98c9
+Subproject commit 45d3e08ff168611ab8347ba194fd54b9425c99f8

From 3943a06884d3a29901d4f8914544b8030066fa54 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 19 Nov 2025 08:51:48 +0100
Subject: [PATCH 19/24] fix rvc2 build failure

---
 src/pipeline/datatype/ImgDetectionsT.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pipeline/datatype/ImgDetectionsT.cpp b/src/pipeline/datatype/ImgDetectionsT.cpp
index ce689fb03..c9db32022 100644
--- a/src/pipeline/datatype/ImgDetectionsT.cpp
+++ b/src/pipeline/datatype/ImgDetectionsT.cpp
@@ -3,7 +3,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
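
The LUT change in [PATCH 17/24] keeps 255 reserved for "no instance" while spreading the low instance indices across clearly distinguishable gray levels. A runnable sketch (the toy mask contents are illustrative):

#include <opencv2/core.hpp>

#include <iostream>

int main() {
    // Build the lookup table: index i maps to i * 25, saturated to 8 bits; 255 stays 255.
    cv::Mat lut(1, 256, CV_8U);
    for(int i = 0; i < 256; ++i) lut.at<uchar>(i) = (i == 255) ? 255 : cv::saturate_cast<uchar>(i * 25);

    cv::Mat mask(2, 2, CV_8U, cv::Scalar(255));  // background everywhere...
    mask.at<uchar>(0, 0) = 0;                    // ...except instance 0
    mask.at<uchar>(1, 1) = 3;                    // ...and instance 3

    cv::Mat scaled;
    cv::LUT(mask, lut, scaled);                  // 0 -> 0, 3 -> 75, 255 -> 255
    std::cout << scaled << std::endl;
    return 0;
}

Since the 8-bit index never exceeds 255, (i == 255) and (i >= 255) behave identically here; the rewritten condition just states the intent. Indices of 11 and above saturate to 255 and become indistinguishable from background, which is acceptable for a debug overlay.
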
From 07ae4c64aa84ac5008a5e4ace5bdc70f22631756 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 19 Nov 2025 08:56:28 +0100
Subject: [PATCH 20/24] bump fw

---
 cmake/Depthai/DepthaiDeviceRVC4Config.cmake | 2 +-
 cmake/Depthai/DepthaiDeviceSideConfig.cmake | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
index 4fe07a881..587547e25 100644
--- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
+++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
@@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot")
 
 # "version if applicable"
-set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+3f910a125fde9174915207ba3a01372ad562e0f2")
+set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+2f8298ea040cf4eb4524c9ca84776b9f60fd106d")
diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
index 98b588226..e84ae7861 100644
--- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake
+++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
@@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot")
 
 # "full commit hash of device side binary"
-set(DEPTHAI_DEVICE_SIDE_COMMIT "6d07abc50b03c9ea164f2e5664c3f155741998b5")
+set(DEPTHAI_DEVICE_SIDE_COMMIT "966c3f9094a4fb73c663fe11a57c3cec7c0deeee")
 
 # "version if applicable"
 set(DEPTHAI_DEVICE_SIDE_VERSION "")

From 93a3f27a7d66f281b5751899727d75190baf4c64 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Fri, 21 Nov 2025 11:02:10 +0100
Subject: [PATCH 21/24] Throw if anchor dimension is != 2

---
 src/pipeline/node/DetectionParser.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp
index e1aff71a6..34299c740 100644
--- a/src/pipeline/node/DetectionParser.cpp
+++ b/src/pipeline/node/DetectionParser.cpp
@@ -159,6 +159,9 @@ void DetectionParser::setConfig(const dai::NNArchiveVersionedConfig& config) {
         std::vector<std::vector<float>> layerOut(anchorsIn[layer].size());
         for(size_t anchor = 0; anchor < layerOut.size(); ++anchor) {
             std::vector<float> anchorOut(anchorsIn[layer][anchor].size());
+            if (anchorOut.size() != 2) {
+                throw std::runtime_error("Each anchor should have exactly 2 dimensions (width and height).");
+            }
             for(size_t dim = 0; dim < anchorOut.size(); ++dim) {
                 anchorOut[dim] = static_cast<float>(anchorsIn[layer][anchor][dim]);
             }
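
The new guard in [PATCH 21/24] enforces that every anchor read from the NN archive is a (width, height) pair. The same check in isolation, with the container nesting mirroring the layer/anchor/dimension structure used above:

#include <iostream>
#include <stdexcept>
#include <vector>

// Every anchor must have exactly two dimensions: width and height.
void validateAnchors(const std::vector<std::vector<std::vector<float>>>& anchors) {
    for(const auto& layer : anchors) {
        for(const auto& anchor : layer) {
            if(anchor.size() != 2) {
                throw std::runtime_error("Each anchor should have exactly 2 dimensions (width and height).");
            }
        }
    }
}

int main() {
    std::vector<std::vector<std::vector<float>>> good = {{{10.f, 13.f}, {16.f, 30.f}}};
    std::vector<std::vector<std::vector<float>>> bad = {{{10.f, 13.f, 7.f}}};

    validateAnchors(good);  // passes silently
    try {
        validateAnchors(bad);
    } catch(const std::exception& e) {
        std::cout << "rejected: " << e.what() << std::endl;
    }
    return 0;
}

A small refinement would be to test anchorsIn[layer][anchor].size() before constructing anchorOut, which avoids allocating a vector that is immediately thrown away on failure.
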
"1a92fd182936f0ec83eb7986c14d02625f1cffdb") # "version if applicable" set(DEPTHAI_DEVICE_SIDE_VERSION "") From cf0b4c802341cf3ca089c3f12112c41a0cdd955d Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Fri, 21 Nov 2025 12:49:57 +0100 Subject: [PATCH 23/24] bump fw --- cmake/Depthai/DepthaiDeviceRVC4Config.cmake | 2 +- cmake/Depthai/DepthaiDeviceSideConfig.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake index e698e7d26..78b734802 100644 --- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake +++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake @@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot") # "version if applicable" -set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+40a7690ba1a1b35753040def6389bd773c61c67a") +set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+098f664d24dc72bce0589a9f81f18ceb285b0f8f") diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake index ed9354ab8..1b16f0d4a 100644 --- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake +++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake @@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot") # "full commit hash of device side binary" -set(DEPTHAI_DEVICE_SIDE_COMMIT "1a92fd182936f0ec83eb7986c14d02625f1cffdb") +set(DEPTHAI_DEVICE_SIDE_COMMIT "621e48a2a0375f4594f7f8875661c50d3d5950c9") # "version if applicable" set(DEPTHAI_DEVICE_SIDE_VERSION "") From f5a149f5886d3b5ca248529c7067de98178cbde0 Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Fri, 21 Nov 2025 13:20:35 +0100 Subject: [PATCH 24/24] fix formatting --- src/pipeline/node/DetectionParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index 34299c740..f06447353 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -159,7 +159,7 @@ void DetectionParser::setConfig(const dai::NNArchiveVersionedConfig& config) { std::vector> layerOut(anchorsIn[layer].size()); for(size_t anchor = 0; anchor < layerOut.size(); ++anchor) { std::vector anchorOut(anchorsIn[layer][anchor].size()); - if (anchorOut.size() != 2) { + if(anchorOut.size() != 2) { throw std::runtime_error("Each anchor should have exactly 2 dimensions (width and height)."); } for(size_t dim = 0; dim < anchorOut.size(); ++dim) {