diff --git a/3rdparty/foxglove/ws-protocol b/3rdparty/foxglove/ws-protocol index 234fa7936..45d3e08ff 160000 --- a/3rdparty/foxglove/ws-protocol +++ b/3rdparty/foxglove/ws-protocol @@ -1 +1 @@ -Subproject commit 234fa7936bfedc2824068aecd04b5ee6390e98c9 +Subproject commit 45d3e08ff168611ab8347ba194fd54b9425c99f8 diff --git a/CMakeLists.txt b/CMakeLists.txt index bb49d6f72..cbe164a09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,6 +311,7 @@ set(TARGET_CORE_SOURCES src/pipeline/node/ImageAlign.cpp src/pipeline/node/ToF.cpp src/pipeline/node/DetectionParser.cpp + src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp src/pipeline/node/test/MyProducer.cpp src/pipeline/node/test/MyConsumer.cpp src/pipeline/node/UVC.cpp diff --git a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp index ff8ec4670..05fb9d2f6 100644 --- a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp +++ b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp @@ -65,6 +65,7 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) { DOC(dai, node, DetectionParser, setAnchors, 2)) .def("setAnchorMasks", &DetectionParser::setAnchorMasks, py::arg("anchorMasks"), DOC(dai, node, DetectionParser, setAnchorMasks)) .def("setIouThreshold", &DetectionParser::setIouThreshold, py::arg("thresh"), DOC(dai, node, DetectionParser, setIouThreshold)) + .def("setRunOnHost", &DetectionParser::setRunOnHost, py::arg("runOnHost"), DOC(dai, node, DetectionParser, setRunOnHost)) .def("setSubtype", &DetectionParser::setSubtype, py::arg("subtype"), DOC(dai, node, DetectionParser, setSubtype)) .def("setDecodeKeypoints", &DetectionParser::setDecodeKeypoints, py::arg("decode"), DOC(dai, node, DetectionParser, setDecodeKeypoints)) .def("setDecodeSegmentation", &DetectionParser::setDecodeSegmentation, py::arg("decode"), DOC(dai, node, DetectionParser, setDecodeSegmentation)) @@ -78,6 +79,7 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) { .def("getAnchors", &DetectionParser::getAnchors, DOC(dai, node, DetectionParser, getAnchors)) .def("getAnchorMasks", &DetectionParser::getAnchorMasks, DOC(dai, node, DetectionParser, getAnchorMasks)) .def("getIouThreshold", &DetectionParser::getIouThreshold, DOC(dai, node, DetectionParser, getIouThreshold)) + .def("runOnHost", &DetectionParser::runOnHost, DOC(dai, node, DetectionParser, runOnHost)) .def("getSubtype", &DetectionParser::getSubtype, DOC(dai, node, DetectionParser, getSubtype)) .def("getNkeypoints", &DetectionParser::getNKeypoints, DOC(dai, node, DetectionParser, getNKeypoints)) .def("getDecodeKeypoints", &DetectionParser::getDecodeKeypoints, DOC(dai, node, DetectionParser, getDecodeKeypoints)) diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake index 6114eff63..78b734802 100644 --- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake +++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake @@ -3,4 +3,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot") # "version if applicable" -set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+ccb59dd506392110b0c85abee0d82e28c7d91f9e") +set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+098f664d24dc72bce0589a9f81f18ceb285b0f8f") diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake index 8fd10aaca..1b16f0d4a 100644 --- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake +++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake @@ -2,7 +2,7 @@ set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot") 
# "full commit hash of device side binary" -set(DEPTHAI_DEVICE_SIDE_COMMIT "0f9a7793654b8f2fbed64759c162425eab2c8541") +set(DEPTHAI_DEVICE_SIDE_COMMIT "621e48a2a0375f4594f7f8875661c50d3d5950c9") # "version if applicable" set(DEPTHAI_DEVICE_SIDE_VERSION "") diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt index b1fffaca5..fef48e7bb 100644 --- a/examples/cpp/DetectionNetwork/CMakeLists.txt +++ b/examples/cpp/DetectionNetwork/CMakeLists.txt @@ -23,8 +23,8 @@ dai_set_example_test_labels(detection_network ondevice rvc2_all rvc4 rvc4rgb ci) dai_add_example(detection_network_remap detection_network_remap.cpp ON OFF) dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci) -dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF) -dai_set_example_test_labels(detection_and_segmentation rvc4) +dai_add_example(detection_and_segmentation detection_and_segmentation.cpp ON OFF) +dai_set_example_test_labels(detection_and_segmentation rvc2_all rvc4 ci) -dai_add_example(detection_and_keypoints RVC4/detection_and_keypoints.cpp ON OFF) -dai_set_example_test_labels(detection_and_keypoints rvc4) +dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF) +dai_set_example_test_labels(detection_and_keypoints rvc2_all rvc4 ci) diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp similarity index 97% rename from examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp rename to examples/cpp/DetectionNetwork/detection_and_keypoints.cpp index e653f5592..f4c80837d 100644 --- a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp @@ -22,7 +22,7 @@ int main() { auto detectionNetwork = pipeline.create(); dai::NNModelDescription modelDescription; - modelDescription.model = "luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39"; + modelDescription.model = "luxonis/yolov8-nano-pose-estimation:coco-512x288"; detectionNetwork->build(cameraNode, modelDescription); // Create output queues diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp similarity index 93% rename from examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp rename to examples/cpp/DetectionNetwork/detection_and_segmentation.cpp index 960f2a21d..fa312d382 100644 --- a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp +++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -17,8 +18,16 @@ cv::Rect frameNorm(const cv::Mat& frame, const dai::Point2f& topLeft, const dai: } int main() { + std::string modelName = "luxonis/yolov8-instance-segmentation-large:coco-640x352"; + bool setRunOnHost = false; + auto device = std::make_shared(); + + if(device->getPlatform() == dai::Platform::RVC2) { + modelName = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"; + setRunOnHost = true; + } // Create pipeline - dai::Pipeline pipeline; + dai::Pipeline pipeline{device}; // Create and configure camera node auto cameraNode = pipeline.create(); @@ -28,8 +37,10 @@ int main() { auto detectionNetwork = pipeline.create(); dai::NNModelDescription modelDescription; - modelDescription.model = "luxonis/yolov8-instance-segmentation-large:coco-640x480"; + + modelDescription.model = modelName; 
detectionNetwork->build(cameraNode, modelDescription); + detectionNetwork->detectionParser->setRunOnHost(setRunOnHost); // Create output queues auto qRgb = detectionNetwork->passthrough.createOutputQueue(); diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py similarity index 95% rename from examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py rename to examples/python/DetectionNetwork/detection_and_keypoints.py index 561f8f2ea..24eca43bc 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py +++ b/examples/python/DetectionNetwork/detection_and_keypoints.py @@ -7,8 +7,8 @@ # Create pipeline with dai.Pipeline() as pipeline: - cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39")) + cameraNode = pipeline.create(dai.node.Camera).build(sensorFps=12) + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288")) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py similarity index 92% rename from examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py rename to examples/python/DetectionNetwork/detection_and_segmentation.py index 5ab883db3..81d703106 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py +++ b/examples/python/DetectionNetwork/detection_and_segmentation.py @@ -5,10 +5,19 @@ import numpy as np import time +model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480" +setRunOnHost = False +device = dai.Device() +if device.getPlatform() == dai.Platform.RVC2: + model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288" + setRunOnHost = True + # Create pipeline -with dai.Pipeline() as pipeline: +with dai.Pipeline(device) as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-large:coco-640x480")) + + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription(model_name)) + detectionNetwork.detectionParser.setRunOnHost(setRunOnHost) labelMap = detectionNetwork.getClasses() assert labelMap is not None qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/include/depthai/pipeline/datatype/ImgDetectionsT.hpp b/include/depthai/pipeline/datatype/ImgDetectionsT.hpp index 61b4d4bf0..99eb12cf7 100644 --- a/include/depthai/pipeline/datatype/ImgDetectionsT.hpp +++ b/include/depthai/pipeline/datatype/ImgDetectionsT.hpp @@ -76,6 +76,7 @@ class ImgDetectionsT : public Buffer { * Copies cv::Mat data to Segmentation Mask buffer * * @param frame Input cv::Mat frame from which to copy the data + * @note Throws if the mask is not a single-channel 8-bit (CV_8UC1) image. */ void setCvSegmentationMask(cv::Mat mask);
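For reference, the new precondition in action (a minimal sketch; the ImgDetections instance and the mask dimensions are illustrative, not part of this diff):

    auto dets = std::make_shared<dai::ImgDetections>();
    cv::Mat ok(288, 512, CV_8UC1, cv::Scalar(255));  // 255 marks background, matching the host decoder below
    dets->setCvSegmentationMask(ok);                 // accepted: single-channel 8-bit mask
    cv::Mat bad(288, 512, CV_32FC1);
    // dets->setCvSegmentationMask(bad);             // would throw std::runtime_error (wrong cv::Mat type)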
diff --git a/include/depthai/pipeline/node/DetectionParser.hpp b/include/depthai/pipeline/node/DetectionParser.hpp index 4479df803..c04c6206a 100644 --- a/include/depthai/pipeline/node/DetectionParser.hpp +++ b/include/depthai/pipeline/node/DetectionParser.hpp @@ -12,6 +12,8 @@ #include #include "depthai/common/YoloDecodingFamily.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" namespace dai { namespace node { @@ -20,7 +22,7 @@ namespace node { /** * @brief DetectionParser node. Parses detection results from different neural networks and is being used internally by MobileNetDetectionNetwork and * YoloDetectionNetwork. */ -class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, DetectionParserProperties> { +class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, DetectionParserProperties>, public HostRunnable { public: constexpr static const char* NAME = "DetectionParser"; using DeviceNodeCRTP::DeviceNodeCRTP; @@ -268,7 +270,23 @@ class DetectionParser : public DeviceNodeCRTP& outputs); + // host runnable requirements + void buildStage1() override; + void decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections); + std::vector<TensorInfo> inTensorInfo; + uint32_t imgWidth; + uint32_t imgHeight; + bool imgSizesSet = false; + // + std::optional<NNArchive> mArchive; std::optional<NNArchiveVersionedConfig> archiveConfig; diff --git a/src/pipeline/datatype/ImgDetectionsT.cpp b/src/pipeline/datatype/ImgDetectionsT.cpp index 875628fb8..c9db32022 100644 --- a/src/pipeline/datatype/ImgDetectionsT.cpp +++ b/src/pipeline/datatype/ImgDetectionsT.cpp @@ -75,6 +75,9 @@ std::optional ImgDetectionsT::getSegmentationMask() c template void ImgDetectionsT::setCvSegmentationMask(cv::Mat mask) { + if(mask.type() != CV_8UC1) { + throw std::runtime_error("setCvSegmentationMask: mask must be a single-channel 8-bit (CV_8UC1) image, got OpenCV type " + cv::typeToString(mask.type()) + "."); + } std::vector<uint8_t> dataVec; if(!mask.isContinuous()) { for(int i = 0; i < mask.rows; i++) { diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index 7ab17b2ac..f06447353 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -13,6 +14,8 @@ #include "nn_archive/NNArchive.hpp" #include "nn_archive/v1/Head.hpp" #include "pipeline/ThreadedNodeImpl.hpp" +#include "pipeline/datatype/NNData.hpp" +#include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp" // internal headers #include "utility/ErrorMacros.hpp" @@ -156,6 +159,9 @@ void DetectionParser::setConfig(const dai::NNArchiveVersionedConfig& config) { std::vector<std::vector<float>> layerOut(anchorsIn[layer].size()); for(size_t anchor = 0; anchor < layerOut.size(); ++anchor) { std::vector<float> anchorOut(anchorsIn[layer][anchor].size()); + if(anchorOut.size() != 2) { + throw std::runtime_error("Each anchor should have exactly 2 dimensions (width and height)."); + } for(size_t dim = 0; dim < anchorOut.size(); ++dim) { anchorOut[dim] = static_cast<float>(anchorsIn[layer][anchor][dim]); } @@ -369,5 +375,198 @@ std::vector<int> DetectionParser::getStrides() const { return properties.parser.strides; } +void DetectionParser::setRunOnHost(bool runOnHost) { + if(runOnHost) { + pimpl->logger->warn("Detection parser set to run on host."); + } + runOnHostVar = runOnHost; +} + +/** + * Check if the node is set to run on host + */ +bool DetectionParser::runOnHost() const { + return runOnHostVar; +}
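The Python and C++ examples above choose the model and the parser location per platform; condensed to the essential calls, the same gating looks like this (a sketch reusing only API touched in this diff; modelDescription as in the examples):

    auto device = std::make_shared<dai::Device>();
    // The examples fall back to host decoding on RVC2 for these segmentation/keypoint heads
    bool decodeOnHost = device->getPlatform() == dai::Platform::RVC2;
    dai::Pipeline pipeline{device};
    auto cameraNode = pipeline.create<dai::node::Camera>();
    cameraNode->build();
    auto detectionNetwork = pipeline.create<dai::node::DetectionNetwork>();
    detectionNetwork->build(cameraNode, modelDescription);
    detectionNetwork->detectionParser->setRunOnHost(decodeOnHost);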
+void DetectionParser::run() { + auto& logger = ThreadedNode::pimpl->logger; + logger->info("Detection parser running on host."); + + using namespace std::chrono; + while(isRunning()) { + auto tAbsoluteBeginning = steady_clock::now(); + std::shared_ptr<dai::NNData> sharedInputData = input.get<dai::NNData>(); + auto outDetections = std::make_shared<dai::ImgDetections>(); + + if(!sharedInputData) { + logger->error("NN Data is empty. Skipping processing."); + continue; + } + auto tAfterMessageBeginning = steady_clock::now(); + dai::NNData& inputData = *sharedInputData; + + if(!imgSizesSet) { + const bool containsTransformation = inputData.transformation.has_value(); + if(containsTransformation) { + std::tie(imgWidth, imgHeight) = inputData.transformation->getSize(); + } else { + logger->warn("No image size provided for detection parser. Skipping processing."); + continue; + } + // We have determined the image size, no need to try again in the future + imgSizesSet = true; + } + + // Parse detections + switch(properties.parser.nnFamily) { + case DetectionNetworkType::YOLO: { + decodeYolo(inputData, *outDetections); + break; + } + case DetectionNetworkType::MOBILENET: { + decodeMobilenet(inputData, *outDetections, properties.parser.confidenceThreshold); + break; + } + default: { + logger->error("Unknown NN family. 'YOLO' and 'MOBILENET' are supported."); + break; + } + } + + auto tBeforeSend = steady_clock::now(); + + // Copy over seq and ts + outDetections->setSequenceNum(inputData.getSequenceNum()); + outDetections->setTimestamp(inputData.getTimestamp()); + outDetections->setTimestampDevice(inputData.getTimestampDevice()); + outDetections->transformation = inputData.transformation; + + // Send detections + out.send(outDetections); + + auto tAbsoluteEnd = steady_clock::now(); + logger->debug("Detection parser total took {}ms, processing {}ms, getting_frames {}ms, sending_frames {}ms", + duration_cast<microseconds>(tAbsoluteEnd - tAbsoluteBeginning).count() / 1000, + duration_cast<microseconds>(tBeforeSend - tAfterMessageBeginning).count() / 1000, + duration_cast<microseconds>(tAfterMessageBeginning - tAbsoluteBeginning).count() / 1000, + duration_cast<microseconds>(tAbsoluteEnd - tBeforeSend).count() / 1000); + } +} + +void DetectionParser::buildStage1() { + auto& logger = ThreadedNode::pimpl->logger; + + // Grab dimensions from input tensor info + if(properties.networkInputs.size() > 0) { + if(properties.networkInputs.size() > 1) { + logger->warn("Detection parser supports only single input networks, assuming first input"); + } + for(const auto& kv : properties.networkInputs) { + const dai::TensorInfo& tensorInfo = kv.second; + inTensorInfo.push_back(tensorInfo); + } + } + if(inTensorInfo.size() > 0) { + int numDimensions = inTensorInfo[0].numDimensions; + if(numDimensions < 2) { + logger->error("Number of input dimensions is less than 2"); + } else { + imgSizesSet = true; + imgWidth = inTensorInfo[0].dims[numDimensions - 1]; + imgHeight = inTensorInfo[0].dims[numDimensions - 2]; + } + } else { + logger->info("Unable to read input tensor height and width from static inputs. The node will try to get input sizes at runtime."); + } +}
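decodeMobilenet below walks a flat tensor of seven floats per candidate and stops at a record whose first field is -1; a hand-built buffer of that shape for reference (values purely illustrative):

    // [header/image_id, label, confidence, xmin, ymin, xmax, ymax] per record; header == -1 terminates the list
    std::vector<float> raw = {
        0.f, 15.f, 0.91f, 0.10f, 0.20f, 0.45f, 0.80f,  // one detection, normalized corner coordinates
        -1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,            // terminator: parsing stops here
    };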
+void DetectionParser::decodeMobilenet(dai::NNData& nnData, dai::ImgDetections& outDetections, float confidenceThr) { + auto& logger = ThreadedNode::pimpl->logger; + + int maxDetections = 100; + std::string tensorName; + for(const auto& tensor : nnData.getAllLayers()) { + if(tensor.offset == 0) { + // // The tensor we want to checkout + // if(tensor.numDimensions != 4) { + // std::cout << "ERROR while decoding Mobilenet. Output tensor has incorrect dimensions. Number of dimensions: " << tensor.numDimensions + // << std::endl; + // } + // // Get tensor output size in Bytes + // // Expected dimensions are [1, 1, N, 7] where N is number of detections + // if(tensor.dims[3] != 7) { + // std::cout << "ERROR while decoding Mobilenet. Expecting 7 fields for every detection but: " << tensor.dims[3] << " found.\n"; + // } + // maxDetections = tensor.dims[tensor.numDimensions - 2]; + tensorName = tensor.name; + } + } + + auto tensorData = nnData.getTensor<float>(tensorName); + maxDetections = tensorData.size() / 7; + if(static_cast<int>(tensorData.size()) < maxDetections * 7) { + logger->error("Error while parsing Mobilenet. Vector not long enough, expected size: {}, real size {}", maxDetections * 7, tensorData.size()); + return; + } + + struct raw_Detection { // need to update it to include more + float header; + float label; + float confidence; + float xmin; + float ymin; + float xmax; + float ymax; + }; + + float* rawPtr = tensorData.data(); + for(int i = 0; i < maxDetections; i++) { + raw_Detection temp; + // TODO This is likely unnecessary optimisation + memcpy(&temp, &rawPtr[i * 7], sizeof(raw_Detection)); + + // if header == -1, stop sooner + if(temp.header == -1.0f) break; + + float currentConfidence = temp.confidence; + if(currentConfidence >= confidenceThr) { + dai::ImgDetection d; + d.label = temp.label; + + d.confidence = currentConfidence; + + d.xmin = temp.xmin; + d.ymin = temp.ymin; + d.xmax = temp.xmax; + d.ymax = temp.ymax; + + outDetections.detections.push_back(d); + } + } +} + +void DetectionParser::decodeYolo(dai::NNData& nnData, dai::ImgDetections& outDetections) { + auto& logger = ThreadedNode::pimpl->logger; + switch(properties.parser.decodingFamily) { + case YoloDecodingFamily::R1AF: // anchor free: yolo v6r1 + utilities::DetectionParserUtils::decodeR1AF(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::v3AB: // anchor based yolo v3 v3-Tiny + utilities::DetectionParserUtils::decodeV3AB(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::v5AB: // anchor based yolo v5, v7, P + utilities::DetectionParserUtils::decodeV5AB(nnData, outDetections, properties, logger); + break; + case YoloDecodingFamily::TLBR: // top left bottom right anchor free: yolo v6r2, v8 v10 v11 + utilities::DetectionParserUtils::decodeTLBR(nnData, outDetections, properties, logger); + break; + default: + logger->error("Unknown Yolo decoding family. 'R1AF', 'v3AB', 'v5AB' and 'TLBR' are supported."); + throw std::runtime_error("Unknown Yolo decoding family"); + } +} + +} // namespace node +} // namespace dai
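A worked instance of the TLBR decode implemented in the new utilities file below, assuming stride 8, grid cell (row, col) = (10, 20) and predicted distances (l, t, r, b) = (2.0, 1.5, 3.0, 2.5) (all values made up for the arithmetic):

    const int stride = 8, row = 10, col = 20;
    const float l = 2.0f, t = 1.5f, r = 3.0f, b = 2.5f;
    float xmin = (col - l + 0.5f) * stride;  // (20 - 2.0 + 0.5) * 8 = 148
    float ymin = (row - t + 0.5f) * stride;  // (10 - 1.5 + 0.5) * 8 = 72
    float xmax = (col + r + 0.5f) * stride;  // (20 + 3.0 + 0.5) * 8 = 188
    float ymax = (row + b + 0.5f) * stride;  // (10 + 2.5 + 0.5) * 8 = 104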
diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp new file mode 100644 index 000000000..6f187a8b0 --- /dev/null +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp @@ -0,0 +1,913 @@ +#include "DetectionParserUtils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "depthai/common/Keypoint.hpp" +#include "depthai/common/KeypointsListT.hpp" +#include "depthai/common/RotatedRect.hpp" +#include "depthai/common/TensorInfo.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "depthai/properties/DetectionParserProperties.hpp" +#include "pipeline/utilities/NNDataViewer.hpp" + +namespace dai { +namespace utilities { +namespace DetectionParserUtils { + +// yolo v6 r1 - anchor free +void decodeR1AF(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger) { + auto layerNames = utilities::DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + + const std::vector<int> strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + std::vector<DetectionCandidate> detectionCandidates; + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); + + for(int strideIdx = 0; strideIdx < static_cast<int>(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float objectnessScore = outputData.get(4, row, col); + if(objectnessScore < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); +
if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + if(bestConf * objectnessScore < confidenceThr) { + continue; + } + + float cx = outputData.get(0, row, col); + float cy = outputData.get(1, row, col); + float w = outputData.get(2, row, col); + float h = outputData.get(3, row, col); + + float xmin = cx - w * 0.5f; + float ymin = cy - h * 0.5f; + float xmax = cx + w * 0.5f; + float ymax = cy + h * 0.5f; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid bbox parameters. Either xmax <= xmin or ymax <= ymin. Skipping detection."); + logger->debug( + "Skipping invalid bbox: layer='{}', " + "raw(cx,cy,w,h)=({:.2f},{:.2f},{:.2f},{:.2f}) " + "clamped(xmin,ymin,xmax,ymax)=({:.2f},{:.2f},{:.2f},{:.2f}).", + layerName, + cx, + cy, + w, + h, + xmin, + ymin, + xmax, + ymax); + continue; + } + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, bestConf * objectnessScore, bestC, strideIdx, row, col, std::nullopt}; + + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger) { + auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. 
Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + std::string errorMsg = fmt::format("Layer {} channels mismatch. Expected {}, got {}", layerName, expectedC, layerChannels); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + const float tx = sigmoid(outputData.get(ch0 + 0, row, col)); + const float ty = sigmoid(outputData.get(ch0 + 1, row, col)); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = sigmoid(outputData.get(ch0 + 4, row, col)); + if(obj < confidenceThr) continue; + + int bestC = 0; + float clsLogit = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float candidateLogit = outputData.get(ch0 + 5 + c, row, col); + if(candidateLogit > clsLogit) { + clsLogit = candidateLogit; + bestC = c; + } + } + const float conf = obj * sigmoid(clsLogit); + if(conf < confidenceThr) continue; + + // YOLOv3 decode + const float cx = (static_cast(col) + tx) * static_cast(stride); + const float cy = (static_cast(row) + ty) * static_cast(stride); + const float w_exp = std::exp(tw); + const float h_exp = std::exp(th); + const float w = w_exp * anchors[a][0]; + const float h = h_exp * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, conf, bestC, strideIdx, row, col, std::nullopt}; + + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + 
if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + // +} + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger) { + auto layerNames = getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + logger->error("Layer {} channels mismatch. 
Expected {}, got {}", layerName, expectedC, layerChannels); + return; + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + + const float tx = outputData.get(ch0 + 0, row, col); + const float ty = outputData.get(ch0 + 1, row, col); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = outputData.get(ch0 + 4, row, col); + if(obj < confidenceThr) continue; + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float candidateProb = outputData.get(ch0 + 5 + c, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + const float conf = obj * bestConf; + if(conf < confidenceThr) continue; + + // YOLOv5 decode + const float cx = ((tx * 2.0f - 0.5f) + static_cast(col)) * static_cast(stride); + const float cy = ((ty * 2.0f - 0.5f) + static_cast(row)) * static_cast(stride); + + const float w = tw * tw * 4.0f * anchors[a][0]; + const float h = th * th * 4.0f * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) continue; + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, conf, bestC, strideIdx, row, col, std::nullopt}; + + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +/* +Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v11 +*/ +void decodeTLBR(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + std::vector detectionCandidates; + detectionCandidates.reserve(defaultMaxDetectionsPerFrame); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData.getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData.data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float score = outputData.get(4, row, col); + if(score < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + float xmin = (col - outputData.get(0, row, col) + 0.5f) * stride; + float ymin = (row - outputData.get(1, row, col) + 0.5f) * stride; + float xmax = (col + outputData.get(2, row, col) + 0.5f) * stride; + float ymax = (row + outputData.get(3, row, col) + 0.5f) * stride; + + if(bestConf < confidenceThr) { + continue; + } + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{xmin, ymin, xmax, ymax, bestConf, bestC, strideIdx, row, col, std::nullopt}; + + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + if(!properties.parser.classNames->empty()) { + for(auto& candidate : keepCandidates) { + candidate.labelName = (*properties.parser.classNames)[candidate.label]; + } + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + 
keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr& logger) { + int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast(properties.parser.anchorsV2.size()); + int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1); + + auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { + // Check that the dims size is big enough + if(static_cast(tensorInfo.dims.size()) <= channelDimIndex || static_cast(tensorInfo.dims.size()) <= alternativeDimIndex) { + logger->error("Invalid tensor dims size. Skipping."); + return false; + } + + if(tensorInfo.dims[channelDimIndex] != uint32_t(channelSize)) { + // Check if the channel size would match the alternative storage order + if(tensorInfo.dims[alternativeDimIndex] == uint32_t(channelSize)) { + logger->trace("Invalid channel size for the tensor. Expected {}, got {}, switching", channelSize, tensorInfo.dims[channelDimIndex]); + tensorInfo.order = alternativeOrder; + } else { + logger->error("Invalid channel size for the tensor. Expected {}, got {}. Skipping.", channelSize, tensorInfo.dims[channelDimIndex]); + return false; + } + } + return true; + }; + + switch(tensorInfo.order) { + case dai::TensorInfo::StorageOrder::CHW: + if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; + break; + case dai::TensorInfo::StorageOrder::HWC: + if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NCHW: + if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; + break; + case dai::TensorInfo::StorageOrder::NHWC: + if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NHCW: + case dai::TensorInfo::StorageOrder::WHC: + case dai::TensorInfo::StorageOrder::WCH: + case dai::TensorInfo::StorageOrder::HCW: + case dai::TensorInfo::StorageOrder::CWH: + case dai::TensorInfo::StorageOrder::NC: + case dai::TensorInfo::StorageOrder::CN: + case dai::TensorInfo::StorageOrder::C: + case dai::TensorInfo::StorageOrder::H: + case dai::TensorInfo::StorageOrder::W: + default: + logger->error("Invalid storage order for the tensor. 
Skipping."); + return false; + } + + return true; +} + +std::vector<std::string> getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector<std::string> outputNames) { + if(outputNames.empty()) { + outputNames = nnData.getAllLayerNames(); + } + + std::vector<std::string> layerNames; + for(const auto& name : outputNames) { + // keep the layers whose name contains the search term + if(name.find(searchTerm) != std::string::npos) { + layerNames.push_back(name); + } + } + + std::sort(layerNames.begin(), layerNames.end()); + return layerNames; +} + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2) { + float width_of_overlap_area = std::fmin(box1.xmax, box2.xmax) - std::fmax(box1.xmin, box2.xmin); + float height_of_overlap_area = std::fmin(box1.ymax, box2.ymax) - std::fmax(box1.ymin, box2.ymin); + float area_of_overlap; + if(width_of_overlap_area < 0 || height_of_overlap_area < 0) + area_of_overlap = 0; + else + area_of_overlap = width_of_overlap_area * height_of_overlap_area; + float box_1_area = (box1.ymax - box1.ymin) * (box1.xmax - box1.xmin); + float box_2_area = (box2.ymax - box2.ymin) * (box2.xmax - box2.xmin); + float area_of_union = box_1_area + box_2_area - area_of_overlap; + return area_of_overlap / area_of_union; +} + +std::vector<DetectionCandidate> nonMaximumSuppression(std::vector<DetectionCandidate>& detectionCandidates, float iouThr) { + std::sort( + detectionCandidates.begin(), detectionCandidates.end(), [](const DetectionCandidate& a, const DetectionCandidate& b) { return a.score > b.score; }); + + std::vector<int> keep(detectionCandidates.size(), 1); + std::vector<size_t> keepIndices; + keepIndices.reserve(detectionCandidates.size()); + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { + if(!keep[i]) continue; + keepIndices.push_back(i); + + for(size_t j = i + 1; j < detectionCandidates.size(); ++j) { + if(!keep[j]) continue; + if(YoloIntersectionOverUnion(detectionCandidates[i], detectionCandidates[j]) >= iouThr) { + keep[j] = 0; + } + } + } + + std::vector<DetectionCandidate> keepCandidates; + keepCandidates.reserve(keepIndices.size()); + for(size_t idx : keepIndices) keepCandidates.push_back(detectionCandidates[idx]); + + return keepCandidates; +} + +void createImgDetections(const std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + unsigned int width, + unsigned int height) { + for(const auto& det : detectionCandidates) { + dai::ImgDetection detection; + dai::RotatedRect rotatedRect(dai::Rect(dai::Point2f(det.xmin, det.ymin), dai::Point2f(det.xmax, det.ymax)), 0.0f); + detection.setBoundingBox(rotatedRect.normalize(width, height)); + detection.confidence = det.score; + detection.label = det.label; + if(det.labelName) { + detection.labelName = *det.labelName; + } + outDetections.detections.push_back(std::move(detection)); + } +} + +void segmentationDecode(const dai::NNData& nnData, + std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + DetectionParserProperties properties, + std::shared_ptr& logger) { + auto inputSize = nnData.transformation->getSize(); + int inputWidth = inputSize.first; + int inputHeight = inputSize.second; + + cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255)); + + auto maskLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "masks", std::vector<std::string>{}); + if(properties.parser.strides.size() != maskLayerNames.size()) { + logger->error( + "Number of strides does not match number of mask output layers. Strides size: {}, mask output layers size: {}. 
Skipping segmentation decoding.", + properties.parser.strides.size(), + maskLayerNames.size()); + return; + } + auto protoLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "proto", std::vector<std::string>{}); + if(protoLayerNames.size() == 0) { + logger->error("Expecting proto output layer, found no layer with proto label. Skipping segmentation decoding."); + return; + } + + NNDataViewer protoValues = NNDataViewer(*nnData.getTensorInfo(protoLayerNames[0]), nnData.data, logger); + if(!protoValues.build()) { + logger->error("Failed to build NNDataViewer for proto layer {}. Skipping segmentation decoding.", protoLayerNames[0]); + return; + } + + TensorInfo protoInfo = *nnData.getTensorInfo(protoLayerNames[0]); + int protoWidth = protoInfo.getWidth(); + int protoHeight = protoInfo.getHeight(); + int protoChannels = protoInfo.getChannels(); + if(protoWidth <= 0 || protoHeight <= 0 || protoChannels <= 0) { + logger->error("Invalid proto tensor dimensions: channels {}, height {}, width {}.", protoChannels, protoHeight, protoWidth); + return; + } + int protoWidthScaleFactor = inputWidth / protoWidth; + int protoHeightScaleFactor = inputHeight / protoHeight; + + cv::Mat maskLow(protoHeight, protoWidth, CV_32F); + + dai::NNData& nnDataNonConst = const_cast<dai::NNData&>(nnData); + xt::xarray<float> protoData = nnDataNonConst.getTensor<float>(protoLayerNames[0], true); + if(protoInfo.order != dai::TensorInfo::StorageOrder::NHWC) { + logger->trace("Proto storage is not NHWC, changing order."); + nnDataNonConst.changeStorageOrder(protoData, protoInfo.order, dai::TensorInfo::StorageOrder::NHWC); + } + Eigen::MatrixXf protoMatrix = Eigen::Map<Eigen::MatrixXf>(protoData.data(), protoChannels, protoHeight * protoWidth); + + Eigen::RowVectorXf coeffs(protoChannels); + + auto maskFromCoeffs = [logger, protoHeight, protoWidth, &maskLow](const Eigen::MatrixXf& protos2d, const Eigen::RowVectorXf& coeffs) -> void { + if(protos2d.rows() != coeffs.size()) { + throw std::runtime_error("Mask coefficients size does not match proto channels."); + } + + Eigen::Map<Eigen::RowVectorXf> logits(maskLow.ptr<float>(), protoHeight * protoWidth); + logits.noalias() = coeffs * protos2d; + + // no need to do sigmoid + // logits = (1.0f / (1.0f + (-logits.array()).exp())).matrix(); + }; + + std::map<int, NNDataViewer> maskValues; + for(int strideIdx = 0; strideIdx < static_cast<int>(maskLayerNames.size()); ++strideIdx) { + maskValues.try_emplace(strideIdx, *nnData.getTensorInfo(maskLayerNames[strideIdx]), nnData.data, logger); + if(!maskValues.at(strideIdx).build()) { + logger->error("Failed to build NNDataViewer for mask layer {}. 
Skipping segmentation decoding.", maskLayerNames[strideIdx]); + return; + } + } + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { // loop over all detections + const auto& c = detectionCandidates[i]; + const int detIdx = static_cast(i); // index in outDetections list + + NNDataViewer& mask = maskValues.at(c.headIndex); + for(int ch = 0; ch < protoChannels; ++ch) { + coeffs(ch) = mask.get(ch, c.rowIndex, c.columnIndex); + } + // TODO (aljaz) perform operations on ROI only instead of the full resolution + // Eigen::MatrixXf roiMatrix = protoMatrix.block(0, y0 * protoWidth + x0, protoChannels, (y1 - y0) * (x1 - x0)); + + maskFromCoeffs(protoMatrix, coeffs); + + int x0 = std::clamp(static_cast(std::floor(c.xmin)), 0, inputWidth - 1); + int y0 = std::clamp(static_cast(std::floor(c.ymin)), 0, inputHeight - 1); + int x1 = std::clamp(static_cast(std::ceil(c.xmax)), 0, inputWidth); + int y1 = std::clamp(static_cast(std::ceil(c.ymax)), 0, inputHeight); + + if(x1 <= x0 || y1 <= y0) continue; + const cv::Rect roi(x0, y0, x1 - x0, y1 - y0); + + int protoX0 = x0 / protoWidthScaleFactor; + int protoY0 = y0 / protoHeightScaleFactor; + int protoX1 = x1 / protoWidthScaleFactor; + int protoY1 = y1 / protoHeightScaleFactor; + const cv::Rect protoROI(protoX0, protoY0, protoX1 - protoX0, protoY1 - protoY0); + + cv::Mat roiProb; + cv::resize(maskLow(protoROI), roiProb, roi.size(), 0, 0, cv::INTER_LINEAR); + + // Threshold & paint only unassigned pixels + cv::Mat roiBin; + cv::compare(roiProb, 0.0f, roiBin, cv::CMP_GT); + cv::Mat roiOut = indexMask(roi); + cv::Mat unassigned; + cv::compare(roiOut, 255, unassigned, cv::CMP_EQ); + cv::Mat paintMask; + cv::bitwise_and(roiBin, unassigned, paintMask); + + const uint8_t value = static_cast(std::min(detIdx, 254)); + roiOut.setTo(value, paintMask); + } + + outDetections.setCvSegmentationMask(indexMask); +} + +void keypointDecode(const dai::NNData& nnData, + std::vector& detectionCandidates, + dai::ImgDetections& outDetections, + DetectionParserProperties properties, + std::shared_ptr& logger) { + if(!properties.parser.nKeypoints) { + logger->warn("Number of keypoints not set in properties.parser.nKeypoints. Skipping keypoints decoding."); + return; + } + + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData.transformation->getSize(); + + auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNamesToUse); + std::vector featureMapWidths; + for(int i = 0; i < static_cast(yoloLayerNames.size()); ++i) { + auto tensorInfo = nnData.getTensorInfo(yoloLayerNames[i]); + if(!tensorInfo) { + logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]); + return; + } + featureMapWidths.push_back(tensorInfo->getWidth()); + } + + auto kptsLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "kpt_output", std::vector{}); + if(properties.parser.strides.size() != kptsLayerNames.size()) { + logger->error( + "Number of strides does not match number of keypoints output layers. Strides size: {}, keypoints output layers size: {}. 
Skipping keypoints " + "decoding.", + properties.parser.strides.size(), + kptsLayerNames.size()); + return; + } + + // TODO (aljaz) move to a function + std::map keypointValues; + for(int strideIdx = 0; strideIdx < static_cast(kptsLayerNames.size()); ++strideIdx) { + keypointValues.try_emplace(strideIdx, *nnData.getTensorInfo(kptsLayerNames[strideIdx]), nnData.data, logger); + if(!keypointValues.at(strideIdx).build()) { + logger->error("Failed to build NNDataViewer for keypoints layer {}. Skipping keypoints decoding.", kptsLayerNames[strideIdx]); + return; + } + } + + if(outDetections.detections.size() != detectionCandidates.size()) { + logger->error( + "Number of detections in ImgDetections does not match number of detection candidates. ImgDetections size: {}, detection candidates size: {}. " + "Skipping keypoints decoding.", + outDetections.detections.size(), + detectionCandidates.size()); + return; + } + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { // loop over all detections + const auto& c = detectionCandidates[i]; + int flattenedIndex = c.rowIndex * featureMapWidths[c.headIndex] + c.columnIndex; + + std::vector keypoints; + keypoints.reserve(*properties.parser.nKeypoints); + NNDataViewer keypointMask = keypointValues.at(c.headIndex); + + for(int k = 0; k < properties.parser.nKeypoints; ++k) { + int base = 3 * k; + + // keypointValues tensor storage order HWC + // H == 0 + // W == 51 == 17 * 3 (x, y, conf for each keypoint) + // C == flattened spatial dimensions of row x col of the feature map + float x = std::clamp(keypointMask.get(flattenedIndex, 0, base + 0) / inputWidth, 0.0f, 1.0f); + float y = std::clamp(keypointMask.get(flattenedIndex, 0, base + 1) / inputHeight, 0.0f, 1.0f); + float conf = 1.f / (1.f + std::exp(-(keypointMask.get(flattenedIndex, 0, base + 2)))); + + keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf}); + } + outDetections.detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges); + } +} + +} // namespace DetectionParserUtils +} // namespace utilities +} // namespace dai diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp new file mode 100644 index 000000000..593007c14 --- /dev/null +++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp @@ -0,0 +1,84 @@ +#pragma once +#include + +#include + +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "depthai/properties/DetectionParserProperties.hpp" + +namespace dai { +namespace utilities { +namespace DetectionParserUtils { + +constexpr std::size_t defaultMaxDetectionsPerFrame = 250; +struct DetectionCandidate { + float xmin, ymin, xmax, ymax, score; + int label, headIndex, rowIndex, columnIndex; + std::optional labelName; +}; +/* +Decode anchor free yolo v6r1 with sigmoid assisted center detection +*/ +void decodeR1AF(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger); + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger); + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger); + +/* +Decode anchor free top-left-bottom-right 
(TLBR) style networks, e.g., yolo v6r2, v8, v10, v11 */ +void decodeTLBR(const dai::NNData& nnData, + dai::ImgDetections& outDetections, + DetectionParserProperties& properties, + std::shared_ptr& logger); + +std::vector<std::string> getSortedDetectionLayerNames(const dai::NNData& nnData, std::string searchTerm, std::vector<std::string> outputNames); + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2); + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr& logger); + +std::vector<DetectionCandidate> nonMaximumSuppression(std::vector<DetectionCandidate>& detectionCandidates, float iouThr); + +void createImgDetections(const std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + unsigned int width, + unsigned int height); + +void segmentationDecode(const dai::NNData& nnData, + std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + DetectionParserProperties properties, + std::shared_ptr& logger); + +void keypointDecode(const dai::NNData& nnData, + std::vector<DetectionCandidate>& detectionCandidates, + dai::ImgDetections& outDetections, + DetectionParserProperties properties, + std::shared_ptr& logger); + +} // namespace DetectionParserUtils +} // namespace utilities +} // namespace dai \ No newline at end of file diff --git a/src/pipeline/utilities/NNDataViewer.hpp b/src/pipeline/utilities/NNDataViewer.hpp new file mode 100644 index 000000000..f00d23a6a --- /dev/null +++ b/src/pipeline/utilities/NNDataViewer.hpp @@ -0,0 +1,165 @@ +#pragma once +#include + +#include "depthai/common/TensorInfo.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" +#include "fp16/fp16.h" +namespace dai { +class NNDataViewer { + public: + std::shared_ptr data; + dai::TensorInfo tensor; + std::shared_ptr logger; + + // Per-axis byte strides: each (c, h, w) index is multiplied with its factor before the offsets are summed + struct FactorsBefore { + int32_t h; + int32_t w; + int32_t c; + }; + + FactorsBefore factorsBefore; + + NNDataViewer(dai::TensorInfo tensor, std::shared_ptr data, std::shared_ptr logger) + : data{data}, tensor{tensor}, logger{logger} {} + bool build() { + if(tensor.strides.size() < 2) { + logger->error("Tensor doesn't have enough strides. Number of strides: {}, expected: {}", tensor.strides.size(), 2); + return false; + } + if(tensor.strides[0] == 0 || tensor.strides[1] == 0) { + logger->error("Tensor strides should not be set to zero. Strides are {} {}", tensor.strides[0], tensor.strides[1]); + return false; + } + switch(tensor.order) { + case TensorInfo::StorageOrder::NCHW: + if(tensor.dims[0] != 1) { + logger->error("NCHW is only supported in Detection Parser if N is 1. It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + return false; + } + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.strides[2]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHWC: + if(tensor.dims[0] != 1) { + logger->error("NHWC is only supported in Detection Parser if N is 1. 
It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + return false; + } + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.strides[2]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::HCW: + factorsBefore.h = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::HWC: + factorsBefore.h = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::CHW: + factorsBefore.c = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::CWH: + factorsBefore.c = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WCH: + factorsBefore.w = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WHC: + factorsBefore.w = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHCW: + case TensorInfo::StorageOrder::NC: + case TensorInfo::StorageOrder::CN: + case TensorInfo::StorageOrder::H: + case TensorInfo::StorageOrder::W: + case TensorInfo::StorageOrder::C: + default: + logger->error("Storage order not supported in NNDataViewer"); + return false; + } + return sanity_check(); + } + + bool sanity_check() { + if(data->getSize() < (tensor.offset + (tensor.dims[0] * tensor.strides[0]))) { + logger->error( + "Underlying data does not hold enough data for the tensor to be contained.\ + Tensor size: {}, Tensor offset: {}, Data type size: {}, Data size: {} ", + tensor.dims[0] * tensor.strides[0], + tensor.offset, + tensor.getDataTypeSize(), + data->getSize()); + return false; + } + if(tensor.dims.size() < 2) { + logger->error("Number of dimensions for the input tensor is expected to be at least 2. It is {}", tensor.dims.size()); + return false; + } + return true; + }; + + inline float get(int c, int h, int w) { + // If this turns out to be slow, use a function pointer instead and point to the right getter at build time + int32_t index = tensor.offset + factorsBefore.h * h + factorsBefore.w * w + factorsBefore.c * c; +#ifdef DEPTHAI_SAFE_NN_DATA_ACCESS + logger->trace("Offset {}, fbH {}, fbW {}, fbC {}, h {}, w {}, c{}", tensor.offset, factorsBefore.h, factorsBefore.w, factorsBefore.c, h, w, c); + if(index > data->getSize()) { + logger->error("Out of bound access. 
Size is {}, index is {}", data->getSize(), index); + return 0.0; + } +#endif + + switch(tensor.dataType) { + case TensorInfo::DataType::U8F: { + uint8_t dataOut = data->getData()[index]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::I8: { + int8_t dataOut = static_cast(data->getData()[index]); + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::INT: { + int32_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int32_t)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP16: { + int16_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int16_t)]; + return (fp16_ieee_to_fp32_value(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP32: { + float dataOut = reinterpret_cast(data->getData().data())[index / sizeof(float)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP64: + default: { + return 0.0f; + } + } + } +}; +} // namespace dai
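For completeness, a self-contained exercise of the IoU and NMS helpers introduced above (standalone usage sketch; assumes the new DetectionParserUtils header is on the include path and that DetectionCandidate remains an aggregate):

    #include <optional>
    #include <vector>
    #include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp"

    int main() {
        using namespace dai::utilities::DetectionParserUtils;
        std::vector<DetectionCandidate> cands = {
            {0.f, 0.f, 10.f, 10.f, 0.9f, 0, 0, 0, 0, std::nullopt},  // higher-scoring box
            {1.f, 1.f, 11.f, 11.f, 0.8f, 0, 0, 0, 1, std::nullopt},  // overlaps the first heavily
        };
        float iou = YoloIntersectionOverUnion(cands[0], cands[1]);   // overlap 81 / union 119, about 0.68
        auto kept = nonMaximumSuppression(cands, 0.45f);             // second box suppressed (IoU >= threshold)
        return (kept.size() == 1 && iou > 0.45f) ? 0 : 1;
    }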