From f4e736b7d97399f4c402c52c335a7a0bb836e21f Mon Sep 17 00:00:00 2001 From: neuron-code-sharing-robot Date: Thu, 9 Apr 2026 19:39:50 +0000 Subject: [PATCH] Extracted contents of aws-neuronx-dkms-2.27.4.0.noarch.rpm --- dkms.conf | 2 +- neuron_cdev.c | 333 ++++++++++- neuron_cdev.h | 13 + neuron_core.c | 54 +- neuron_device.h | 10 +- neuron_dhal.c | 14 +- neuron_dhal.h | 34 +- neuron_dma.c | 1011 +++++++++++++++++++++++++++------- neuron_dma.h | 53 +- neuron_dmabuf.c | 3 +- neuron_fw_io.c | 156 +++++- neuron_fw_io.h | 161 +++++- neuron_ioctl.h | 39 +- neuron_mempool.c | 98 +++- neuron_mempool.h | 24 +- neuron_metrics.c | 141 +++-- neuron_metrics.h | 8 +- neuron_module.c | 8 +- neuron_nq.h | 42 ++ neuron_pci.c | 208 ++++++- neuron_pci.h | 7 + neuron_power.c | 2 +- neuron_reg_access.c | 2 +- neuron_reset.c | 18 +- neuron_reset.h | 5 +- neuron_ring.c | 37 +- neuron_ring.h | 112 ++++ neuron_sysfs_metrics.c | 3 +- share/neuron_driver_shared.h | 39 +- udma/udma.h | 44 +- udma/udma_m2m.c | 5 +- udma/udma_main.c | 12 +- udma/udma_regs.h | 10 +- v2/address_map.h | 3 - v2/neuron_dhal_v2.c | 358 ++---------- v2/notific.c | 3 - v2/notific.h | 41 +- v3/address_map.h | 1 - v3/neuron_dhal_v3.c | 463 +++++----------- v3/neuron_pelect.c | 208 ++++--- v3/neuron_pelect.h | 7 + v3/notific.c | 2 - v3/notific.h | 46 +- v4/address_map.h | 1 - v4/neuron_dhal_v4.c | 75 +-- 45 files changed, 2575 insertions(+), 1341 deletions(-) diff --git a/dkms.conf b/dkms.conf index 24de583..ea60afc 100644 --- a/dkms.conf +++ b/dkms.conf @@ -1,5 +1,5 @@ PACKAGE_NAME=aws-neuronx -PACKAGE_VERSION=2.26.5.0 +PACKAGE_VERSION=2.27.4.0 BUILT_MODULE_NAME[0]="neuron" MAKE[0]="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build" CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean" diff --git a/neuron_cdev.c b/neuron_cdev.c index ad4e3d7..4dfbfe7 100644 --- a/neuron_cdev.c +++ b/neuron_cdev.c @@ -1233,7 +1233,7 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, unsigned int cmd, op.buffer = buffer; op.size = size; - ret = ndma_memcpy_zerocopy(nd, nc_id, &op, 1, dev_base, qid, copy_to_mem_handle ? true : false); + ret = ndma_zerocopy_submit(nd, nc_id, &op, 1, dev_base, qid, copy_to_mem_handle ? 
true : false, 0); } return ret; @@ -1409,7 +1409,7 @@ static int ncdev_mem_buf_zerocopy64_batch(struct neuron_device *nd, void *param) } // use the zero-copy batch function for ops within a single batch - ret = ndma_memcpy_zerocopy(nd, nc_id, batch->ops_ptr, batch->num_ops, dev_base, qid, arg.is_copy_to_device); + ret = ndma_zerocopy_submit(nd, nc_id, batch->ops_ptr, batch->num_ops, dev_base, qid, arg.is_copy_to_device, arg.sequence_num); if (ret) { pr_err("batch zero-copy DMA failed on batch %d on nd%02d: %d\n", i, nd->device_index, ret); goto cleanup; @@ -1502,7 +1502,7 @@ static long ncdev_bar_read(struct neuron_device *nd, u8 bar, u64 *reg_addresses, if (data == NULL) return -ENOMEM; - ret = ndhal->ndhal_reg_access.reg_read32_array((void **)reg_addresses, data, data_count); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array((void **)reg_addresses, data, data_count, true); if (ret) { kfree(data); return ret; @@ -2933,7 +2933,7 @@ static int ncdev_h2t_dma_alloc_queues(struct neuron_device *nd, unsigned int cmd return -E2BIG; } - if (arg.copy_queue_cnt + arg.service_queue_cnt >= DMA_MAX_Q_MAX) { + if (arg.copy_queue_cnt + arg.service_queue_cnt >= ndhal->ndhal_udma.num_queues) { pr_err("nd%02d: invalid total queue count %d provided", nd->device_index, arg.copy_queue_cnt + arg.service_queue_cnt); return -E2BIG; } @@ -2964,7 +2964,7 @@ static int ncdev_h2t_dma_alloc_queues(struct neuron_device *nd, unsigned int cmd done: if (ret) { u32 combined_queue_bmap = arg.copy_queue_bmap | arg.service_queue_bmap; - for (i=0; i < DMA_MAX_Q_V4; i++) { + for (i=0; i < ndhal->ndhal_udma.num_queues; i++) { if ((1<ndhal_udma.num_queues; i++) { int lret; if ((1<ndhal_perf.perf_set_profile(nd, arg.profile); } +static int ncdev_power_profile_get(struct neuron_device *nd, void *param) +{ + struct neuron_ioctl_power_profile arg; + int ret; + + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_power_profile*) param, sizeof(arg)); + if (ret) + return ret; + + if (arg.sz != sizeof(arg)) { + return -ENXIO; + } + if (arg.ctrl != 1) { + return -ENOTSUPP; + } + + ret = ndhal->ndhal_perf.perf_get_profile(nd, &arg.profile); + if (ret) + return ret; + + return copy_to_user(param, &arg, sizeof(arg)); +} + +static int ncdev_available_perf_profiles(struct neuron_device *nd, void *param) +{ + struct neuron_ioctl_available_perf_profiles arg; + int ret; + + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_available_perf_profiles*) param, sizeof(arg)); + if (ret) + return ret; + + ret = ndhal->ndhal_perf.perf_get_supported_profiles(nd, arg.requested_feature, &arg.num_profiles, arg.bitmap); + if (ret) + return ret; + + return copy_to_user(param, &arg, sizeof(arg)); +} + + static int ncdev_throttling_notifications_set(struct neuron_device *nd, void *param) { struct neuron_ioctl_throttling_notifications arg; @@ -3053,6 +3093,57 @@ static int ncdev_get_va_placement(void *param) return ret; } +static int ncdev_get_async_h2d_dma_compl_queues(struct neuron_device *nd, void *param) +{ + int ret = 0; + u32 qid = 0; + int eng_id = 0; + struct neuron_ioctl_get_async_h2t_dma_compl_queues arg; + + ret = neuron_copy_from_user(__func__, + &arg, + (struct neuron_ioctl_get_async_h2t_dma_compl_queues *)param, + sizeof(arg)); + if (ret) { + return ret; + } + + /* TODO: start h2d kernel thread */ + + if (arg.nc_id >= ndhal->ndhal_address_map.nc_per_device) { + pr_err("nd%02d: invalid nc %u provided\n", nd->device_index, arg.nc_id); + return -EINVAL; + } + + memset(arg.compl_queue_info, 0, sizeof(arg.compl_queue_info)); + 
+ eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, arg.nc_id); + + for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + if (!(arg.qid_bitmap & (1u << qid))) { + continue; + } + + struct ndma_ring *ring = &nd->ndma_engine[eng_id].queues[qid].ring_info; + struct ndma_h2d_compl_queue *compl_queue = &ring->dma_compl_queue; + struct mem_chunk *mc = compl_queue->mc; + + if (!mc) { + pr_err("nd%02d: invalid h2d qid %u; compl queue not initialized\n", + nd->device_index, qid); + return -EINVAL; + } + + arg.compl_queue_info[qid].mmap_offset = nmmap_offset(mc); + arg.compl_queue_info[qid].mmap_size = sizeof(neuron_h2d_dma_compl_queue_t) + + ((compl_queue->capacity_mask + 1) * sizeof(neuron_h2d_dma_compl_queue_entry_t)); + } + + ret = copy_to_user(param, &arg, sizeof(arg)); + + return ret; +} + inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsigned long param) { if ((cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK) || (cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK_EXT0)) { return ncdev_crwl_nc_range_mark(filep, cmd, (void *)param); @@ -3282,8 +3373,14 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para return ncdev_h2t_dma_free_queues(nd, cmd, (void*)param); } else if (cmd == NEURON_IOCTL_POWER_PROFILE) { return ncdev_power_profile_set(nd, (void*)param); + } else if (cmd == NEURON_IOCTL_GET_PERFORMANCE_PROFILE) { + return ncdev_power_profile_get(nd, (void*)param); } else if (cmd == NEURON_IOCTL_THROTTLING_NOTIFICATIONS) { return ncdev_throttling_notifications_set(nd, (void*)param); + } else if (cmd == NEURON_IOCTL_AVAILABLE_PERF_PROFILES) { + return ncdev_available_perf_profiles(nd, (void*)param); + } else if (cmd == NEURON_IOCTL_GET_ASYNC_H2T_DMA_COMPL_QUEUES) { + return ncdev_get_async_h2d_dma_compl_queues(nd, (void*)param); } // B/W compatibility @@ -3374,8 +3471,6 @@ static int ncdev_flush(struct file *filep, fl_owner_t id) // If this proc exited in the middle of a reset, wait for the reset to be processed. nr_wait(nd, task_tgid_nr(current), true); - ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit(); - ndmar_handle_process_exit(nd, task_tgid_nr(current)); msleep(10); // TODO - confirm with HW dev, whether any delay needed after q reset. 
ncrwl_release_current_process(nd); @@ -3517,11 +3612,26 @@ static ssize_t neuron_connected_devices_show(struct device *dev, struct device_a static DEVICE_ATTR(connected_devices, S_IRUSR, neuron_connected_devices_show, NULL); +static ssize_t fw_api_version_show(struct device *dev, struct device_attribute *attr, char *buf) +{ int fw_api_version; + int minor = MINOR(dev->devt); + struct neuron_device *nd = devnodes[minor].ndev; + + fw_io_api_version_read(nd->npdev.bar0, &fw_api_version); + if (fw_api_version == 0xdeadbeef) { // the value is not readable during reset, try later + return sprintf(buf, "busy\n"); + } + return sprintf(buf, "%u\n", fw_api_version); +} + +static DEVICE_ATTR(fw_api_version, S_IRUGO, fw_api_version_show, NULL); + static struct attribute *attrs[] = { &dev_attr_reset.attr, &dev_attr_core_count.attr, &dev_attr_connected_devices.attr, - NULL, + &dev_attr_fw_api_version.attr, + NULL, }; static struct attribute_group attr_group = { @@ -3620,14 +3730,22 @@ int ncdev_delete_device_node(struct neuron_device *ndev) /* * neuron_device class sysfs nodes - * node_id_2/4 - * node_cnt_2/4 - * server_id_2/4 + * ULTRASERVER + * node_id_2/4 + * server_id_2/4 + * ultraserver_mode + * + * PDS + * node_id + * node_cnt + * reservation_id + * ultraserver_mode * */ struct ncdev_class_attr { struct class_attribute attr; + enum neuron_platform_type platform_type; u32 info; }; @@ -3649,6 +3767,24 @@ static ssize_t ncdev_class_node_id_show(struct class *class, struct class_attrib return ndhal->ndhal_npe.npe_class_node_id_show_data(buf, ca->info); } +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +static ssize_t ncdev_class_node_cnt_show(const struct class *class, const struct class_attribute *attr, char *buf) +#else +static ssize_t ncdev_class_node_cnt_show(struct class *class, struct class_attribute *attr, char *buf) +#endif +{ + //struct ncdev_class_attr *ca = container_of(attr, struct ncdev_class_attr, attr); + + // protect against ndhal initialization race + if (ndhal == NULL) { + return 0; + } + if (ndhal->ndhal_npe.npe_class_node_cnt_show_data == NULL) { + return 0; + } + return ndhal->ndhal_npe.npe_class_node_cnt_show_data(buf); +} + #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) static ssize_t ncdev_class_server_id_show(const struct class *class, const struct class_attribute *attr, char *buf) #else @@ -3667,6 +3803,23 @@ static ssize_t ncdev_class_server_id_show(struct class *class, struct class_attr return ndhal->ndhal_npe.npe_class_server_id_show_data(buf, ca->info); } +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +static ssize_t ncdev_class_reservation_id_show(const struct class *class, const struct class_attribute *attr, char *buf) +#else +static ssize_t ncdev_class_reservation_id_show(struct class *class, struct class_attribute *attr, char *buf) +#endif +{ + struct ncdev_class_attr *ca = container_of(attr, struct ncdev_class_attr, attr); + + // protect against ndhal initialization race + if (ndhal == NULL) { + return 0; + } + if (ndhal->ndhal_npe.npe_class_server_id_show_data == NULL) { + return 0; + } + return ndhal->ndhal_npe.npe_class_server_id_show_data(buf, ca->info); +} #if (!defined(RHEL_RELEASE_CODE) && 
(LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) static ssize_t ncdev_class_ultraserver_mode_show(const struct class *class, const struct class_attribute *attr, char *buf) @@ -3684,15 +3837,74 @@ static ssize_t ncdev_class_ultraserver_mode_show(struct class *class, struct cla return ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data(buf); } -#define NCDEV_CLASS_ATTR(name, f, i) \ - {__ATTR(name, S_IRUGO, f, NULL), i} +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +static ssize_t ncdev_class_hbm_7200_show(const struct class *class, const struct class_attribute *attr, char *buf) +#else +static ssize_t ncdev_class_hbm_7200_show(struct class *class, struct class_attribute *attr, char *buf) +#endif +{ + int i; + int supports_hbm_7200 = 1; + if (total_neuron_devices == 0) { + return dhal_sysfs_emit(buf, "busy\n"); + } + + for (i = 0; i < total_neuron_devices; i++) { + if (neuron_devices[i]->supports_hbm_7200 == -1) { + return dhal_sysfs_emit(buf, "busy\n"); + } + supports_hbm_7200 = supports_hbm_7200 & neuron_devices[i]->supports_hbm_7200; + } + + return dhal_sysfs_emit(buf, "%d\n", (supports_hbm_7200) ? 1 : 0); +} + +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +static ssize_t ncdev_class_cur_perf_profile_show(const struct class *class, const struct class_attribute *attr, char *buf) +#else +static ssize_t ncdev_class_cur_perf_profile_show(struct class *class, struct class_attribute *attr, char *buf) +#endif +{ + int i; + int cur_perf_profile; + if (total_neuron_devices == 0) { + return dhal_sysfs_emit(buf, "busy\n"); + } + + cur_perf_profile = neuron_devices[0]->current_perf_profile; + for (i = 1; i < total_neuron_devices; i++) { + if (neuron_devices[i]->current_perf_profile != cur_perf_profile) { + return dhal_sysfs_emit(buf, "-1\n"); + } + } + return dhal_sysfs_emit(buf, "%d\n", cur_perf_profile); +} + +#define NCDEV_CLASS_ATTR(name, f, p, i) \ + {__ATTR(name, S_IRUGO, f, NULL), p, i} static const struct ncdev_class_attr ncdev_class_attrs[] = { - NCDEV_CLASS_ATTR(node_id_2, ncdev_class_node_id_show, 2), - NCDEV_CLASS_ATTR(node_id_4, ncdev_class_node_id_show, 4), - NCDEV_CLASS_ATTR(server_id_2, ncdev_class_server_id_show, 2), - NCDEV_CLASS_ATTR(server_id_4, ncdev_class_server_id_show, 4), - NCDEV_CLASS_ATTR(ultraserver_mode, ncdev_class_ultraserver_mode_show, 0) + NCDEV_CLASS_ATTR(hbm_7200_capable, ncdev_class_hbm_7200_show, NEURON_PLATFORM_TYPE_STD, 0), + NCDEV_CLASS_ATTR(current_perf_profile, ncdev_class_cur_perf_profile_show, NEURON_PLATFORM_TYPE_STD, 0), +}; + +static const struct ncdev_class_attr ncdev_class_attrs_us[] = { + NCDEV_CLASS_ATTR(node_id_2, ncdev_class_node_id_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 2), + NCDEV_CLASS_ATTR(node_id_4, ncdev_class_node_id_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 4), + NCDEV_CLASS_ATTR(server_id_2, ncdev_class_server_id_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 2), + NCDEV_CLASS_ATTR(server_id_4, ncdev_class_server_id_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 4), + NCDEV_CLASS_ATTR(ultraserver_mode, ncdev_class_ultraserver_mode_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 0), + NCDEV_CLASS_ATTR(hbm_7200_capable, ncdev_class_hbm_7200_show, NEURON_PLATFORM_TYPE_STD, 0), + NCDEV_CLASS_ATTR(current_perf_profile, 
ncdev_class_cur_perf_profile_show, NEURON_PLATFORM_TYPE_STD, 0), +}; + +static const struct ncdev_class_attr ncdev_class_attrs_pds[] = { + NCDEV_CLASS_ATTR(node_id, ncdev_class_node_id_show, NEURON_PLATFORM_TYPE_PDS, 0), + NCDEV_CLASS_ATTR(node_cnt, ncdev_class_node_cnt_show, NEURON_PLATFORM_TYPE_PDS, 0), + NCDEV_CLASS_ATTR(reservation_id, ncdev_class_reservation_id_show, NEURON_PLATFORM_TYPE_PDS, 0), + NCDEV_CLASS_ATTR(ultraserver_mode, ncdev_class_ultraserver_mode_show, NEURON_PLATFORM_TYPE_PDS, 0), + NCDEV_CLASS_ATTR(hbm_7200_capable, ncdev_class_hbm_7200_show, NEURON_PLATFORM_TYPE_STD, 0), + NCDEV_CLASS_ATTR(current_perf_profile, ncdev_class_cur_perf_profile_show, NEURON_PLATFORM_TYPE_STD, 0), }; static const struct class_attribute class_attr_node_id = @@ -3704,6 +3916,79 @@ static const struct class_attribute class_attr_server_id = static const struct class_attribute class_attr_ultraserver_mode = __ATTR(ultraserver_mode, S_IRUGO, ncdev_class_ultraserver_mode_show, NULL); +// per platform class attributes. TODO we may eventually want to split this out into a neuron_platform.c +// +static const struct { + const struct ncdev_class_attr *class_attrs; + int class_attrs_cnt; + enum neuron_platform_type platform_type; + } ncdev_platform_class_attrs[] = { + {ncdev_class_attrs, sizeof(ncdev_class_attrs) / sizeof(*ncdev_class_attrs), NEURON_PLATFORM_TYPE_STD}, + {ncdev_class_attrs_us, sizeof(ncdev_class_attrs_us) / sizeof(*ncdev_class_attrs_us), NEURON_PLATFORM_TYPE_ULTRASERVER}, + {ncdev_class_attrs_pds, sizeof(ncdev_class_attrs_pds) / sizeof(*ncdev_class_attrs_pds), NEURON_PLATFORM_TYPE_PDS}, + {NULL, 0, NEURON_PLATFORM_TYPE_INVALID}}; + +int ncdev_class_attr_init(void) +{ + int i; + int ret; + + if (neuron_dev_class) { + const struct ncdev_class_attr *class_attrs = NULL; + int class_attrs_cnt; + + for (i = 0; i < sizeof(ncdev_platform_class_attrs) / sizeof(*ncdev_platform_class_attrs); i++) { + if (ncdev_platform_class_attrs[i].platform_type == ndhal->ndhal_arch.platform_type) { + class_attrs = ncdev_platform_class_attrs[i].class_attrs; + class_attrs_cnt = ncdev_platform_class_attrs[i].class_attrs_cnt; + } + } + + // no class attributes for this platform type + if (class_attrs == NULL) { + return 0; + } + + for (i = 0; i < class_attrs_cnt; i++) { + ret = class_create_file(neuron_dev_class, &class_attrs[i].attr); + if (ret) { + pr_err("create class/%s failed", class_attrs[i].attr.attr.name); + goto fail; + } + } + } + return 0; + +fail: + return ret; +} + +void ncdev_class_attr_cleanup(void) +{ + int i; + + if (neuron_dev_class) { + const struct ncdev_class_attr *class_attrs = NULL; + int class_attrs_cnt; + + for (i = 0; i < sizeof(ncdev_platform_class_attrs) / sizeof(*ncdev_platform_class_attrs); i++) { + if (ncdev_platform_class_attrs[i].platform_type == ndhal->ndhal_arch.platform_type) { + class_attrs = ncdev_platform_class_attrs[i].class_attrs; + class_attrs_cnt = ncdev_platform_class_attrs[i].class_attrs_cnt; + } + } + + // no class attributes for this platform type + if (class_attrs == NULL) { + return; + } + + for (i = 0; i < class_attrs_cnt; i++) { + class_remove_file(neuron_dev_class, &class_attrs[i].attr); + } + } +} + static void ncdev_cleanup(void) { int i; @@ -3713,9 +3998,6 @@ static void ncdev_cleanup(void) } if (neuron_dev_class) { - for (i = 0; i < sizeof(ncdev_class_attrs) / sizeof(*ncdev_class_attrs); i++) { - class_remove_file(neuron_dev_class, &ncdev_class_attrs[i].attr); - } class_destroy(neuron_dev_class); } @@ -3748,13 +4030,6 @@ int ncdev_module_init(void) goto fail; 
} - for (i = 0; i < sizeof(ncdev_class_attrs) / sizeof(*ncdev_class_attrs); i++) { - ret = class_create_file(neuron_dev_class, &ncdev_class_attrs[i].attr); - if (ret) { - pr_err("create class/%s failed", ncdev_class_attrs[i].attr.attr.name); - goto fail; - } - } return ret; fail: diff --git a/neuron_cdev.h b/neuron_cdev.h index f901a3e..0e24758 100644 --- a/neuron_cdev.h +++ b/neuron_cdev.h @@ -40,6 +40,19 @@ int ncdev_create_device_node(struct neuron_device *ndev); */ int ncdev_delete_device_node(struct neuron_device *ndev); +/** + * ncdev_class_attr_init() - initialize global class attributes + * + * @return int: return 0 on success, otherwise failure + */ +int ncdev_class_attr_init(void); + +/** + * ncdev_class_attr_cleanup() - cleanup glboal class attributes + * + */ +void ncdev_class_attr_cleanup(void); + /** * ncdev_module_init() - Initialize the kernel module that creates the character devices * diff --git a/neuron_core.c b/neuron_core.c index 89c3d4a..a15718e 100644 --- a/neuron_core.c +++ b/neuron_core.c @@ -49,24 +49,36 @@ DECLARE_FAULT_ATTR(neuron_fail_nc_mmap); int nc_semaphore_read(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 *result) { + int ret = 0; void *addr; if (semaphore_index >= ndhal->ndhal_address_map.semaphore_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id); + ret = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id, &addr); + if (ret) { + pr_err("failed to retrieve semaphore base"); + return ret; + } + addr += ndhal->ndhal_address_map.mmap_nc_sema_read_offset + (semaphore_index * NC_SEMAPHORE_SIZE); - return ndhal->ndhal_reg_access.reg_read32_array((void **)&addr, result, 1); + return ndhal->ndhal_fw_io.fw_io_read_csr_array((void **)&addr, result, 1, true); } int nc_semaphore_write(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value) { + int ret = 0; void *addr; if (semaphore_index >= ndhal->ndhal_address_map.semaphore_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id); + ret = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id, &addr); + if (ret) { + pr_err("failed to retrieve semaphore base"); + return ret; + } + addr += ndhal->ndhal_address_map.mmap_nc_sema_set_offset + (semaphore_index * NC_SEMAPHORE_SIZE); writel(value, addr); return 0; @@ -74,12 +86,18 @@ int nc_semaphore_write(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, int nc_semaphore_increment(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value) { + int ret = 0; void *addr; if (semaphore_index >= ndhal->ndhal_address_map.semaphore_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id); + ret = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id, &addr); + if (ret) { + pr_err("failed to retrieve semaphore base"); + return ret; + } + addr += ndhal->ndhal_address_map.mmap_nc_sema_incr_offset + (semaphore_index * NC_SEMAPHORE_SIZE); writel(value, addr); return 0; @@ -87,12 +105,18 @@ int nc_semaphore_increment(struct neuron_device *nd, u8 nc_id, u16 semaphore_ind int nc_semaphore_decrement(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value) { + int ret = 0; void *addr; if (semaphore_index >= ndhal->ndhal_address_map.semaphore_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id); + ret = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id, &addr); + if (ret) { + pr_err("failed to retrieve semaphore base"); + return ret; + } + addr += ndhal->ndhal_address_map.mmap_nc_sema_decr_offset + (semaphore_index * NC_SEMAPHORE_SIZE); writel(value, 
addr); return 0; @@ -100,23 +124,35 @@ int nc_semaphore_decrement(struct neuron_device *nd, u8 nc_id, u16 semaphore_ind int nc_event_get(struct neuron_device *nd, u8 nc_id, u16 event_index, u32 *result) { + int ret = 0; void *addr; if (event_index > ndhal->ndhal_address_map.event_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_event_addr(nd, nc_id, event_index); - return ndhal->ndhal_reg_access.reg_read32_array(&addr, result, 1); + ret = ndhal->ndhal_nc.nc_get_event_addr(nd, nc_id, event_index, &addr); + if (ret) { + pr_err("failed to retrieve event %u addr", event_index); + return ret; + } + + return ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, result, 1, true); } int nc_event_set(struct neuron_device *nd, u8 nc_id, u16 event_index, u32 value) { - u32 *addr; + int ret = 0; + void *addr; if (event_index > ndhal->ndhal_address_map.event_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_event_addr(nd, nc_id, event_index); + ret = ndhal->ndhal_nc.nc_get_event_addr(nd, nc_id, event_index, &addr); + if (ret) { + pr_err("failed to retrieve event %u addr", event_index); + return ret; + } + writel(value, addr); return 0; } diff --git a/neuron_device.h b/neuron_device.h index 93fd781..8af4f29 100644 --- a/neuron_device.h +++ b/neuron_device.h @@ -85,7 +85,7 @@ struct neuron_device { void *fw_io_ctx; - struct mempool_set mpset; + struct neuron_mempool_set mpset; // memory chunk allocated for notification queue in each neuron core. struct mem_chunk *nq_mc[MAX_NC_PER_DEVICE][MAX_NQ_SUPPORTED]; @@ -118,6 +118,14 @@ struct neuron_device { struct neuron_log_obj log_obj; // logging object struct neuron_hbm_scrub_ctx hbm_scrub_ctx; + + // volatile to prevent compiler optimizations since accessed by different threads + // Indicates whether any performance profile with 7200 Mhz HBM is supported by this device + volatile int supports_hbm_7200; + + // volatile to prevent compiler optimizations since accessed by different threads + // This is the true value per-device, instead of the global one in ndhal_perf used only for metrics + volatile int current_perf_profile; }; #endif diff --git a/neuron_dhal.c b/neuron_dhal.c index bb269c2..179cb8e 100644 --- a/neuron_dhal.c +++ b/neuron_dhal.c @@ -2,6 +2,7 @@ #include "neuron_arch.h" #include "neuron_dhal.h" +#include "neuron_cdev.h" struct neuron_dhal *ndhal = NULL; @@ -24,10 +25,9 @@ int neuron_dhal_init(unsigned int pci_device_id) { return -ENOMEM; } } else { - mutex_unlock(&ndhal_init_lock); - return 0; + mutex_unlock(&ndhal_init_lock); + return 0; } - mutex_unlock(&ndhal_init_lock); ndhal->ndhal_arch.arch = narch_get_arch(); ndhal->pci_device_id = pci_device_id; @@ -46,15 +46,23 @@ int neuron_dhal_init(unsigned int pci_device_id) { break; default: pr_err("Unknown HW architecture: %d. 
Can't init neuron_dhal.\n", ndhal->ndhal_arch.arch); + mutex_unlock(&ndhal_init_lock); return -EINVAL; } + // global class attributes get delayed initialization - need platform data from dhal + ncdev_class_attr_init(); + + mutex_unlock(&ndhal_init_lock); + return ret; } void neuron_dhal_cleanup(void) { if (ndhal) { + ncdev_class_attr_cleanup(); + if (ndhal->ndhal_ext_cleanup) { ndhal->ndhal_ext_cleanup(); } diff --git a/neuron_dhal.h b/neuron_dhal.h index ab34019..dbce141 100644 --- a/neuron_dhal.h +++ b/neuron_dhal.h @@ -30,7 +30,6 @@ struct ndhal_arch { struct ndhal_address_map { // addresses uint64_t pci_host_base; - uint64_t mmap_p_offset; uint64_t mmap_nc_event_offset; uint64_t mmap_nc_sema_read_offset; uint64_t mmap_nc_sema_set_offset; @@ -39,9 +38,6 @@ struct ndhal_address_map { uint64_t bar0_misc_ram_offset; uint64_t port_1_base; - // sizes - uint64_t mmap_nc_size; - // counts int nc_per_device; unsigned dice_per_device; @@ -56,13 +52,12 @@ struct ndhal_address_map { }; struct ndhal_reset { - uint64_t reset_poll_interval; uint64_t reset_tpb_initial_poll_delay; uint64_t initiate_max_wait_time; uint32_t retry_count; int (*nr_initiate_reset) (struct neuron_device *nd, uint32_t nc_map); int (*nr_wait_for_reset_completion) (struct neuron_device *nd); - int (*nr_post_reset_config) (struct neuron_device *nd, bool reset_successful); + int (*nr_post_reset_config) (struct neuron_device *nd, bool reset_successful, bool is_no_reset); }; struct ndhal_topsp { @@ -75,8 +70,8 @@ struct ndhal_topsp { }; struct ndhal_nc { - void *(*nc_get_semaphore_base) (struct neuron_device *nd, u8 nc_id); - void *(*nc_get_event_addr) (struct neuron_device *nd, u8 nc_id, u16 event_index); + int (*nc_get_semaphore_base) (struct neuron_device *nd, u8 nc_id, void **sem_base); + int (*nc_get_event_addr) (struct neuron_device *nd, u8 nc_id, u16 event_index, void **ev_addr); }; struct ndhal_nq { @@ -89,8 +84,7 @@ struct ndhal_mpset { u64 device_dram_effective_base_addr[MAX_DRAM_CHANNELS]; u64 device_dram_end_addr[MAX_DRAM_CHANNELS]; bool small_pool_supported; - void (*mpset_set_dram_and_mpset_info) (struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size); - int (*mpset_block_carveout_regions) (struct neuron_device *nd, struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size); + void (*mpset_set_dram_and_mpset_info) (struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size); }; struct ndhal_ndmar { @@ -100,7 +94,6 @@ struct ndhal_ndmar { bool (*nr_init_h2t_eng) ( int nc_idx, uint32_t nc_map); bool (*ndmar_is_nx_ring) (uint32_t eng_id, uint32_t q_id); int (*ndmar_quiesce_queues) (struct neuron_device *nd, u32 nc_id, u32 engine_count, u32 *queue_mask); - void (*ndmar_set_model_started) (struct neuron_device *nd, phys_addr_t pa, struct mem_chunk *mc); }; struct ndhal_fw_io { @@ -111,10 +104,6 @@ struct ndhal_fw_io { int (*fw_io_post_metric) (struct fw_io_ctx *ctx, u8 *data, u32 size); }; -struct ndhal_reg_access { - int (*reg_read32_array) (void **addr, u32 *value, u32 num_values); -}; - struct ndhal_mmap { struct neuron_dm_special_mmap_ent *dm_mmap_special; int (*mmap_get_bar4_offset) (u64 start_addr, u64 size, u64 *offset); @@ -150,14 +139,6 @@ struct ndhal_pci { int dram_bar; u64 dram_bar_size; - int (*neuron_pci_release_bar) (struct pci_dev *dev, int bar); - int (*neuron_pci_reserve_bar) (struct pci_dev *dev, int bar, const char *res_name); - int (*neuron_pci_set_npdev) (struct pci_dev *dev, - int bar, - const char *res_name, - phys_addr_t *bar_pa, - void __iomem 
**bar_ioaddr, - u64 *bar_size); int (*neuron_pci_get_device_id) (struct neuron_device *nd, struct pci_dev *dev); int (*neuron_pci_device_id_to_rid_map) (uint32_t * count, uint32_t * did_to_rid_map); }; @@ -167,12 +148,12 @@ struct ndhal_cdev { u64 *ncdev_bar0_write_blocked_addrs; void (*ncdev_compatible_version) (struct neuron_ioctl_compatible_version *arg); - void (*ncdev_quiesce_exec_on_proc_exit) (void); int (*ncdev_logical_to_physical_nc_map)(struct neuron_ioctl_nc_map *map, uint32_t max_num_entries, enum neuron_ioctl_nc_mapping_type mapping_type); void (*ncdev_get_default_tpbs_for_hbm) (u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count); }; struct ndhal_udma { + unsigned int num_queues; unsigned int num_beats; }; @@ -193,6 +174,7 @@ struct ndhal_npe { int (*npe_pod_status)( u32 *pod_state, s8 *node_id); int (*npe_pod_ctrl)( struct neuron_device *nd, u32 pod_ctrl, enum neuron_ultraserver_mode mode, u32 timeout, u32 *pod_state); ssize_t (*npe_class_node_id_show_data)(char *buf, u32 sz); + ssize_t (*npe_class_node_cnt_show_data)(char *buf); ssize_t (*npe_class_server_id_show_data)(char *buf, u32 sz); ssize_t (*npe_class_ultraserver_mode_show_data)(char *buf); u32 (*npe_neighbor_eng_ids)[2]; @@ -217,6 +199,9 @@ struct ndhal_tpb { struct ndhal_perf { int current_performance_profile; int (*perf_set_profile) (struct neuron_device *nd, uint32_t profile); + int (*perf_get_profile) (struct neuron_device *nd, uint32_t *profile); + int (*perf_get_supported_profiles) (struct neuron_device *nd, u16 feature, u8 *num_profiles, u8 out_bitmap[32]); + void (*perf_update_hbm_7200_supported) (struct neuron_device *nd); }; struct neuron_dhal { @@ -231,7 +216,6 @@ struct neuron_dhal { struct ndhal_mpset ndhal_mpset; struct ndhal_ndmar ndhal_ndmar; struct ndhal_fw_io ndhal_fw_io; - struct ndhal_reg_access ndhal_reg_access; struct ndhal_mmap ndhal_mmap; struct ndhal_sysfs_metrics ndhal_sysfs_metrics; struct ndhal_pci ndhal_pci; diff --git a/neuron_dma.c b/neuron_dma.c index 8258605..32e7d43 100644 --- a/neuron_dma.c +++ b/neuron_dma.c @@ -9,12 +9,15 @@ #include #include #include +#include +#include #include "udma/udma.h" #include "neuron_trace.h" #include "neuron_device.h" #include "neuron_dma.h" #include "neuron_mempool.h" +#include "neuron_mmap.h" #include "neuron_dhal.h" #include "neuron_pci.h" @@ -233,6 +236,10 @@ int ndma_memcpy_wait_for_completion(struct ndma_eng *eng, struct ndma_ring *ring u64 first_wait_time, wait; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time(count, async, &first_wait_time, &wait); + // Increase the wait time on virtual platforms + if (narch_is_qemu() || narch_is_emu()) { + wait = wait * 100 * 1000; + } if (is_intra_device_dma && !async) { first_wait_time = 10; // device-to-device DMA is much faster, just choose a small value independent of number of descriptors wait = wait/200; // can probably be set even lower if required @@ -271,7 +278,7 @@ int ndma_memcpy_wait_for_completion(struct ndma_eng *eng, struct ndma_ring *ring } if (i > loop) { pr_err("DMA completion timeout on nd%02d for %s q%d desc count %u\n", eng->nd->device_index, eng->udma.name, ring->qid, count); - ret = -1; + ret = -ETIMEDOUT; goto error; } @@ -876,7 +883,7 @@ int ndma_bar0_blocked_one_engine(u64 base, u64 off) q_start = base + offsetof(struct unit_regs_v4, s2m); // start of s2m block q_start += offsetof(struct udma_s2m_regs_v4, s2m_q); // start of q registers } - for (qid = 0; qid < DMA_MAX_Q_V4; qid++) { + for (qid = 0; qid < ndhal->ndhal_udma.num_queues; qid++) { u64 q_off = q_start + q_size * 
qid; int i; for (i = 0; i < sizeof(udma_blocked) / sizeof(udma_blocked[0]); i++) { @@ -891,37 +898,460 @@ int ndma_bar0_blocked_one_engine(u64 base, u64 off) /* * Zero copy impementation. - * - * - * */ +/* Context for tracking a single tensor batch operation on submit flow */ +struct ndma_h2t_zcdma_op_context { + void *host_addr; + dma_addr_t dev_addr; + u64 offset; + u64 pin_size; + u64 remaining; +}; + +/* DMA context state */ +enum ndma_zcdma_state { + NDMA_INVALID = 0, + NDMA_UNPINNED, + NDMA_PINNED_UNSUBMITTED, + NDMA_SUBMITTED, + NDMA_COMPLETED, // not in dma context queue anymore +}; + +/* DMA context */ struct ndma_h2t_zcdma_context { - struct ndma_eng *eng; // engine - struct ndma_ring *ring; // - void *host_addr; // host address - dma_addr_t dev_addr; // device address - u64 size; // size for this transfer - bool direction; // direction. true = to device - bool last; // last transfer for the entire request. - u64 start_time; // start time for this transfer - int nr_pages; // number of pages for this transfer - int nr_desc; // number of descriptors which is equal to pending transfers -1 - void *completion_ptr; // completion buffer pointer (host memory buffer we poll on for completions) - struct page **page_list; // page structures tracking our pinned pages + struct ndma_eng *eng; // engine + struct ndma_ring *ring; // ring + + // Submission-related + void *host_addr; // host address + dma_addr_t dev_addr; // device address + u64 size; // size for this transfer + bool direction; // direction. true = to-device/write/copy-in + bool last; // last transfer for the entire request. + u64 start_time; // start time for this transfer + int nr_pages; // number of pages for this transfer + int nr_desc; // number of descriptors which is equal to pending transfers -1 + struct page **page_list; // page structures tracking our pinned pages; + // managed by page_list_pool in ctx queue + enum ndma_zcdma_state state; // state of this transfer + + // Completion-related + void *completion_ptr; // completion buffer pointer; + // host memory buffer which driver polls on for completions; + // managed by completion_pool in ctx queue + u64 sequence_num; // async sequence number; 0 for sync transfers + + // Async-only + struct mm_struct *mm; // mm that owns the user buffers }; +static void ndma_zc_release_ctx(struct ndma_h2t_zcdma_context *ctx, u64 *nr_pinned_pages) +{ + // do not free or set completion_ptr null. it is managed by completion_pool in ctx queue + // do not free or set page_list null. 
it is managed by page_list_pool in ctx queue + + if (ctx->state >= NDMA_PINNED_UNSUBMITTED) { + if (ctx->direction) { + unpin_user_pages(ctx->page_list, ctx->nr_pages); + } else { + unpin_user_pages_dirty_lock(ctx->page_list, ctx->nr_pages, true); + } + *nr_pinned_pages -= ctx->nr_pages; + } + ctx->nr_pages = 0; + + if (ctx->mm) { + mmput(ctx->mm); + ctx->mm = NULL; + } + + ctx->state = NDMA_INVALID; + ctx->sequence_num = 0; +} + +/* H2D DMA Completion Queue (CQ) */ +#define NDMA_H2D_COMPL_QUEUE_CAPACITY 1024 +int ndma_h2d_compl_queue_init(struct neuron_device *nd, struct ndma_h2d_compl_queue *compl_queue) +{ + int ret = 0; + size_t queue_size = 0; + struct mem_chunk *mc = NULL; + neuron_h2d_dma_compl_queue_t *compl_queue_shared = NULL; + + queue_size = sizeof(neuron_h2d_dma_compl_queue_t) + (NDMA_H2D_COMPL_QUEUE_CAPACITY * sizeof(neuron_h2d_dma_compl_queue_entry_t)); + ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, queue_size, 0, + MEM_LOC_HOST, 0, 0, 0, + NEURON_MEMALLOC_TYPE_NCDEV_HOST, &mc); + if (ret) { + pr_err("failed to allocate h2d dma completion queue mc: %d\n", ret); + return ret; + } + ret = nmch_handle_alloc(nd, mc, &mc->mc_handle); + if (ret) { + pr_err("failed to allocate mc handle for h2d dma completion queue: %d\n", ret); + mc_free(&mc); + return ret; + } + memset(mc->va, 0, queue_size); + + compl_queue_shared = (neuron_h2d_dma_compl_queue_t *)mc->va; + compl_queue_shared->capacity = NDMA_H2D_COMPL_QUEUE_CAPACITY; + compl_queue_shared->head = 0; + compl_queue_shared->tail = 0; -#define NDMA_ZC_PAGES_PER_XFER 64 // number of pages in each zero copy dma transfer. This is somewhat, but not + compl_queue->mc = mc; + compl_queue->compl_queue_shared = compl_queue_shared; + compl_queue->capacity_mask = NDMA_H2D_COMPL_QUEUE_CAPACITY - 1; + compl_queue->tail = 0; + + return 0; +} + +void ndma_h2d_compl_queue_destroy(struct ndma_h2d_compl_queue *compl_queue) +{ + if (compl_queue->mc) { + mc_free(&compl_queue->mc); + } + compl_queue->mc = NULL; + compl_queue->compl_queue_shared = NULL; + compl_queue->capacity_mask = 0; + compl_queue->tail = 0; +} + +static void ndma_h2d_compl_queue_put(struct ndma_h2d_compl_queue *compl_queue, + u64 sequence_num, + s64 compl_ret, + void *context) +{ + u32 head = 0; + u32 tail = 0; + neuron_h2d_dma_compl_queue_t *compl_queue_shared = compl_queue->compl_queue_shared; + neuron_h2d_dma_compl_queue_entry_t *entry = NULL; + + head = smp_load_acquire(&compl_queue_shared->head); + tail = compl_queue->tail; + + while ((tail - head) >= (compl_queue->capacity_mask + 1)) { + pr_warn_once("h2d dma completion queue full; blocking until space is available\n"); + msleep(1); + head = smp_load_acquire(&compl_queue_shared->head); + tail = compl_queue->tail; + } + + entry = &compl_queue_shared->entries[tail & compl_queue->capacity_mask]; + + /* Write completion result to tail */ + entry->compl_ret = compl_ret; + entry->context = context; + entry->sequence_num = sequence_num; + + /* Move tail */ + compl_queue->tail = tail + 1; + smp_store_release(&compl_queue_shared->tail, compl_queue->tail); +} + +#define NDMA_ZC_PAGES_PER_XFER 64 // number of pages in each zero copy dma transfer. This is somewhat, but not // totally arbitrary. We don't want to pin a lot of pages. We just want to // pin enough where (approximately): // dma time > (pin time + setup time + completion update + initial poll wait) // That's the simple explanation. It's a tad more complicated in trading off smaller // transfers where even if that equation doesn't hold, the overlap can be beneficial. 
// Right now the sweet spot looks to be ~ 64 pages. More tuning is required. - // #define NDMA_ZC_MIN_PAGES_PER_XFER 64 +/* Hysteresis thresholds for descriptor wait checks in submission flow. */ +#define NDMA_ZC_DESC_WAIT_THRESHOLD_LO (NDMA_ZC_PAGES_PER_XFER + 1) +#define NDMA_ZC_DESC_WAIT_THRESHOLD_HI (NDMA_ZC_DESC_WAIT_THRESHOLD_LO * 8) + +/* DMA ctx queue constants */ +#define NDMA_CTX_QUEUE_DEFAULT_CAPACITY 1024 +#define NDMA_CTX_QUEUE_MAX_PINNED_PAGES 524288 + +/* Skip tombstone ctxs */ +static void ndma_ctx_queue_advance_to_valid(struct ndma_ctx_queue *queue, u32 *idx, u32 stop) +{ + while (*idx != stop && queue->entries[*idx].state == NDMA_INVALID) { + *idx = (*idx + 1) & queue->capacity_mask; + } +} + +/* Check empty or full */ +static bool ndma_ctx_queue_is_empty(const struct ndma_ctx_queue *queue) +{ + return queue->head == queue->tail; +} + +static bool ndma_ctx_queue_is_full(const struct ndma_ctx_queue *queue) +{ + return queue->head == ((queue->tail + 1) & queue->capacity_mask); +} + +static bool ndma_ctx_queue_submitted_empty(const struct ndma_ctx_queue *queue) +{ + return queue->head == queue->first_pinned_unsubmitted; +} + +static bool ndma_ctx_queue_pinned_unsubmitted_empty(const struct ndma_ctx_queue *queue) +{ + return queue->first_pinned_unsubmitted == queue->first_unpinned; +} + +static bool ndma_ctx_queue_unpinned_empty(const struct ndma_ctx_queue *queue) +{ + return queue->first_unpinned == queue->tail; +} + +/* Increment to next index */ +static void ndma_ctx_queue_inc_first_pinned_unsubmitted(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_pinned_unsubmitted_empty(queue)) { + return; + } + queue->first_pinned_unsubmitted = (queue->first_pinned_unsubmitted + 1) & queue->capacity_mask; + ndma_ctx_queue_advance_to_valid(queue, &queue->first_pinned_unsubmitted, queue->first_unpinned); +} + +static void ndma_ctx_queue_inc_first_unpinned(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_unpinned_empty(queue)) { + return; + } + queue->first_unpinned = (queue->first_unpinned + 1) & queue->capacity_mask; + ndma_ctx_queue_advance_to_valid(queue, &queue->first_unpinned, queue->tail); +} + +static void ndma_ctx_queue_inc_tail(struct ndma_ctx_queue *queue) +{ + u32 old_tail = queue->tail; + u32 new_tail = (old_tail + 1) & queue->capacity_mask; + + // Assume the ctx at old tail is already filled by caller + // Tail advance may also initialize/advance the pinned+unsubmitted and unpinned pointers + struct ndma_h2t_zcdma_context *ctx = &queue->entries[old_tail]; + if (ctx->state == NDMA_PINNED_UNSUBMITTED) { + if (ndma_ctx_queue_pinned_unsubmitted_empty(queue)) { + // The first pinned+unsubmitted pointer appears at old_tail + queue->first_pinned_unsubmitted = old_tail; + } + if (ndma_ctx_queue_unpinned_empty(queue)) { + // No unpinned elements yet; start after the new tail + queue->first_unpinned = new_tail; + } + } else if (ctx->state == NDMA_UNPINNED) { + if (ndma_ctx_queue_unpinned_empty(queue)) { + // The first unpinned pointer appears at old_tail + queue->first_unpinned = old_tail; + } + } + + // Move tail forward after updating the two pointers + queue->tail = new_tail; +} + +/* Peek */ +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_peek_tail(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_is_full(queue)) { + return NULL; + } + return &queue->entries[queue->tail]; +} + +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_peek_pinned_unsubmitted(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_pinned_unsubmitted_empty(queue)) { + return NULL; + } 
+ return &queue->entries[queue->first_pinned_unsubmitted]; +} + +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_peek_first_unpinned(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_unpinned_empty(queue)) { + return NULL; + } + return &queue->entries[queue->first_unpinned]; +} + +/* Pop */ +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_pop_head(struct ndma_ctx_queue *queue) +{ + u32 old_head; + struct ndma_h2t_zcdma_context *ctx = NULL; + + if (ndma_ctx_queue_is_empty(queue)) { + return NULL; + } + + old_head = queue->head; + ctx = &queue->entries[old_head]; + queue->head = (queue->head + 1) & queue->capacity_mask; + ndma_ctx_queue_advance_to_valid(queue, &queue->head, queue->tail); + + if (ndma_ctx_queue_is_empty(queue)) { + queue->first_pinned_unsubmitted = queue->tail; + queue->first_unpinned = queue->tail; + } else { + if (old_head == queue->first_pinned_unsubmitted) { + ndma_ctx_queue_inc_first_pinned_unsubmitted(queue); + } + if (old_head == queue->first_unpinned) { + ndma_ctx_queue_inc_first_unpinned(queue); + } + } + + return ctx; +} + +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_pop_submitted(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_submitted_empty(queue)) { + return NULL; + } + + return ndma_ctx_queue_pop_head(queue); +} + +/* Failure-path helper. + * Given a sequence number of a async request, wait for any matching submitted ctxs, then reset all matching ctxs. + * This prevents further remote pinning and submitting on a failed async request. + * Mostly used in failure and cleanup paths, so don't stop on failed DMAs. + */ +static void ndma_ctx_queue_drain_sequence(struct ndma_ctx_queue *queue, u64 sequence_num) +{ + u32 idx; + + for (idx = queue->head; idx != queue->tail; idx = (idx + 1) & queue->capacity_mask) { + struct ndma_h2t_zcdma_context *ctx = &queue->entries[idx]; + + if (ctx->sequence_num == sequence_num) { + // wait for already submitted DMAs to complete. + if (ctx->state == NDMA_SUBMITTED) { + ndma_memcpy_wait_for_completion(ctx->eng, ctx->ring, ctx->nr_desc + 1, ctx->completion_ptr, false, false); + } + + // release pinned pages and mm, and set state to invalid (tombstone). + ndma_zc_release_ctx(ctx, &queue->nr_pinned_pages); + } + } + + // After draining, advance the pointers to skip the invalidated ctxs. + ndma_ctx_queue_advance_to_valid(queue, &queue->first_unpinned, queue->tail); + ndma_ctx_queue_advance_to_valid(queue, &queue->first_pinned_unsubmitted, queue->first_unpinned); + ndma_ctx_queue_advance_to_valid(queue, &queue->head, queue->tail); +} + +/* Failure-path helper. + * Wait for submitted contexts from head up to (but not including) first_pinned_unsubmitted. + * Unpin from head up to (but not including) first_unpinned. + * Mostly used in failure and cleanup paths, so don't stop on failed DMAs. 
+ */ +static void ndma_ctx_queue_drain(struct ndma_eng *eng, + struct ndma_ring *ring, + struct ndma_ctx_queue *queue) +{ + while (!ndma_ctx_queue_is_empty(queue)) { + struct ndma_h2t_zcdma_context *ctx = ndma_ctx_queue_pop_head(queue); + + if (ctx->state == NDMA_SUBMITTED) { + ndma_memcpy_wait_for_completion(eng, ring, ctx->nr_desc + 1, ctx->completion_ptr, false, false); + } + + ndma_zc_release_ctx(ctx, &queue->nr_pinned_pages); + } +} + +/* Init and destroy queue */ +int ndma_ctx_queue_init(struct ndma_ctx_queue *queue) +{ + int i; + + if (!queue) { + pr_err("ctx queue pointer cannot be NULL\n"); + return -EINVAL; + } + + memset(queue, 0, sizeof(*queue)); + + u32 capacity = NDMA_CTX_QUEUE_DEFAULT_CAPACITY; + if (!is_power_of_2(capacity)) { + pr_err("ctx queue capacity must be power of two\n"); + return -EINVAL; + } + queue->capacity_mask = capacity - 1; + queue->head = 0; + queue->tail = 0; + queue->first_pinned_unsubmitted = 0; + queue->first_unpinned = 0; + + queue->entries = kvcalloc(capacity, sizeof(*queue->entries), GFP_KERNEL); + if (!queue->entries) { + pr_err("failed to allocate ctx queue entries\n"); + return -ENOMEM; + } + + // allocate completion ptrs in one contiguous array at once, + // and let queue->entries[i].completion_ptr point to each completion buffer + queue->completion_pool = kcalloc(capacity, DMA_COMPLETION_MARKER_SIZE * 2, GFP_KERNEL); + if (!queue->completion_pool) { + pr_err("failed to allocate ctx queue completion pool\n"); + goto err; + } + + // allocate page_list arrays in one contiguous pool, and let each entry point to its slice + queue->page_list_pool = kcalloc(capacity * NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); + if (!queue->page_list_pool) { + pr_err("failed to allocate ctx queue page_list pool\n"); + goto err; + } + + for (i = 0; i < capacity; i++) { + queue->entries[i].completion_ptr = + (u8 *)queue->completion_pool + i * DMA_COMPLETION_MARKER_SIZE * 2; + queue->entries[i].page_list = + (struct page **)queue->page_list_pool + i * NDMA_ZC_PAGES_PER_XFER; + } + + return 0; + +err: + if (queue->completion_pool) { + kfree(queue->completion_pool); + queue->completion_pool = NULL; + } + if (queue->page_list_pool) { + kfree(queue->page_list_pool); + queue->page_list_pool = NULL; + } + if (queue->entries) { + kvfree(queue->entries); + queue->entries = NULL; + } + return -ENOMEM; +} + +void ndma_ctx_queue_free(struct ndma_eng *eng, struct ndma_ring *ring, struct ndma_ctx_queue *queue) +{ + if (!queue) { + return; + } + if (queue->entries) { + ndma_ctx_queue_drain(eng, ring, queue); + kvfree(queue->entries); + queue->entries = NULL; + } + if (queue->completion_pool) { + kfree(queue->completion_pool); + queue->completion_pool = NULL; + } + if (queue->page_list_pool) { + kfree(queue->page_list_pool); + queue->page_list_pool = NULL; + } + memset(queue, 0, sizeof(*queue)); +} + /** ndma_calc_zc_pin_size() * * determine how many pages to pin per step for zercopy dma pipelining. @@ -960,7 +1390,7 @@ bool ndma_zerocopy_supported(void) * Think about using some permanent location in HBM as source for completion descriptor update. Like * why are we reading across the PCIe bus to fetch completion data. 
*/ -static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) +static int ndma_build_n_issue_zc_descs(struct ndma_h2t_zcdma_context * dma_ctx) { int ret; unsigned long offset = (unsigned long)(dma_ctx->host_addr) & (PAGE_SIZE-1); @@ -970,7 +1400,7 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) int i = 0; u64 chunk_size; int pending_transfers = 0; - int barrier_type; + int barrier_type; while (i < dma_ctx->nr_pages) { dma_addr_t src_addr; @@ -1035,7 +1465,7 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) pending_transfers++; } } - + dma_ctx->nr_desc = pending_transfers; if (narch_get_arch() != NEURON_ARCH_V2) @@ -1050,202 +1480,387 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) pending_transfers++; ret = udma_m2m_copy_start(&dma_ctx->eng->udma, dma_ctx->ring->qid, pending_transfers, pending_transfers); - if (ret) { pr_info("copy start failed %d\n", ret); } + dma_ctx->state = NDMA_SUBMITTED; error: return ret; } -/** - * ndma_zerocopy_wait_for_completion() - * - * - * - */ -static int ndma_zerocopy_wait_for_completion( struct neuron_device *nd, u32 nc_id, struct ndma_eng *eng, struct ndma_ring *ring, - struct ndma_h2t_zcdma_context * dma_ctx, struct ndma_h2t_zcdma_context * ndma_ctx) +/* Return the number of descriptors available (TX-only; TX/RX counts match) */ +static u32 ndma_zc_descs_available(struct ndma_eng *eng, u32 qid) { - int ret; + struct udma_q *txq; + u32 tx_desc_available; - ret = ndma_memcpy_wait_for_completion(eng, ring, dma_ctx->nr_desc+1, dma_ctx->completion_ptr, true, false); - //atomic_sub(dma_ctx->nr_desc+1, &dma_ctx->ring->h2t_outstanding_desc); + udma_q_handle_get(&eng->udma, qid, UDMA_TX, &txq); - if (ret == 0) { - if (dma_ctx->direction) - unpin_user_pages(dma_ctx->page_list, dma_ctx->nr_pages); - else - unpin_user_pages_dirty_lock(dma_ctx->page_list, dma_ctx->nr_pages, true); - return ret; - } + tx_desc_available = udma_available_get(txq); - // If we are exiting here, we've failed so unpin pages associated with the DMA. If the next DMA - // context is valid, do an obligatory wait for the DMA operation so we don't splat data on someone - // else's memory just in case the physical pages are reassigned after unpinning. - // - unpin_user_pages(dma_ctx->page_list, dma_ctx->nr_pages); + /* TX/RX descriptor availability is kept in lock-step. */ + return tx_desc_available; +} - // blindly wait - if (ndma_ctx != NULL) { - ndma_memcpy_wait_for_completion(eng, ring, ndma_ctx->nr_desc+1, ndma_ctx->completion_ptr, false, false); - unpin_user_pages(ndma_ctx->page_list, ndma_ctx->nr_pages); +/* Estimate if a zero-copy DMA context fits in the available descriptors. 
*/ +static bool _ndma_zc_descs_available(struct ndma_eng *eng, u32 qid, u32 threshold) +{ + u32 max_descs_required = threshold + 1; /* +1 for completion descriptor */ + + return ndma_zc_descs_available(eng, qid) >= max_descs_required; +} + +/* Whether we should wait for some completions before submitting more in the next iteration */ +static bool ndma_zc_should_wait(struct ndma_eng *eng, + struct ndma_ring *ring, + struct ndma_ctx_queue *ctx_queue, + u32 *desc_threshold) +{ + bool pinned_at_max; + bool desc_ring_full; + bool ctx_queue_full; + + pinned_at_max = ctx_queue->nr_pinned_pages >= NDMA_CTX_QUEUE_MAX_PINNED_PAGES; + ctx_queue_full = ndma_ctx_queue_is_full(ctx_queue); + desc_ring_full = !_ndma_zc_descs_available(eng, ring->qid, *desc_threshold); + + if (pinned_at_max || desc_ring_full || ctx_queue_full) { + *desc_threshold = NDMA_ZC_DESC_WAIT_THRESHOLD_HI; + return true; } - - return ret; + + return false; } -int ndma_memcpy_zerocopy(struct neuron_device *nd, - u32 nc_id, - const nrt_tensor_batch_op_t *ops, - u32 num_ops, - dma_addr_t dev_base, - int qid, - bool direction) +static int ndma_zerocopy_pin_pages(int nd_id, + u32 nc_id, + struct ndma_ctx_queue *ctx_queue, + struct ndma_h2t_zcdma_context *dma_ctx, + bool use_remote_pin) { - int ret = 0; - const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); - struct ndma_eng *eng = &nd->ndma_engine[eng_id]; - struct ndma_queue *queue = &eng->queues[qid]; - struct ndma_ring *ring = &queue->ring_info; - struct ndma_h2t_zcdma_context dma_ctx_tbl[2] = {0}; - struct ndma_h2t_zcdma_context *pdma_ctx = NULL; - int next_dma_idx = 0; - int i = 0; - bool locked = false; - - // sanity check ring is owned by nc_id - if (!ndmar_h2t_ring_is_owner(ring, nc_id)) { - pr_err("nd%02d: attempting to use qid %d that was not assigned to nc %d\n", nd->device_index, qid, nc_id); - return -ENOENT; - } - - // initialize the static fields in the dma contexts that are the same for every operation - for (i=0;i< 2;i++) { - dma_ctx_tbl[i].eng = eng; - dma_ctx_tbl[i].ring = ring; - dma_ctx_tbl[i].direction = direction; - dma_ctx_tbl[i].page_list = kcalloc( NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); - dma_ctx_tbl[i].completion_ptr = kmalloc(DMA_COMPLETION_MARKER_SIZE * 2, GFP_KERNEL); - - if ((dma_ctx_tbl[i].page_list == NULL) || (dma_ctx_tbl[i].completion_ptr == NULL)) { - pr_err("could not allocate memory for dma contexts on nd %d\n", nd->device_index); - ret = -ENOMEM; - goto fail; - } - } - pdma_ctx = NULL; - - mutex_lock(&ring->h2t_ring_lock); - locked = true; - - // Process all operations with pipelining - for (i = 0; i < num_ops; i++) { - const nrt_tensor_batch_op_t *op = &ops[i]; - u64 remaining = op->size; - void *host_addr = op->buffer; - dma_addr_t dev_addr = dev_base + op->offset; - u64 offset = (unsigned long)host_addr & (PAGE_SIZE - 1); - u64 pin_size = ndma_calc_zc_pin_size(op->size + offset); // pin size is in page units, so include the page offset in size calc - - while (remaining) { - struct ndma_h2t_zcdma_context *dma_ctx = &dma_ctx_tbl[next_dma_idx]; - dma_ctx->start_time = get_jiffies_64(); - dma_ctx->host_addr = host_addr; - dma_ctx->dev_addr = dev_addr; - dma_ctx->size = pin_size - offset; // first chunk might not be aligned on the page boundary, all subsequent chunk will be aligned - // and the offset will be 0 - dma_ctx->last = (dma_ctx->size == remaining && i == num_ops - 1); - dma_ctx->nr_pages = DIV_ROUND_UP(pin_size, PAGE_SIZE); - if (dma_ctx->nr_pages > NDMA_ZC_PAGES_PER_XFER) { - pr_err_once("page count 
too large: %u\n", dma_ctx->nr_pages); - } - - //__GFP_SKIP_ZERO - int nr_pinned = pin_user_pages_fast((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, - direction ? 0 : FOLL_WRITE, dma_ctx->page_list); - if (nr_pinned != dma_ctx->nr_pages) { - // if failed pin_fast because of page fault, do the regular pinning - if (nr_pinned > 0) { - unpin_user_pages( dma_ctx->page_list, nr_pinned); - } + int nr_pinned = 0; + if (use_remote_pin) { + if (!dma_ctx->mm) { + pr_err("remote pin requested without mm context\n"); + return -EINVAL; + } #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 6))) - nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list); + nr_pinned = pin_user_pages_remote(dma_ctx->mm, + (unsigned long)dma_ctx->host_addr & PAGE_MASK, + dma_ctx->nr_pages, + dma_ctx->direction ? 0 : FOLL_WRITE, + dma_ctx->page_list, + NULL); #else - nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list, NULL); + nr_pinned = pin_user_pages_remote(dma_ctx->mm, + (unsigned long)dma_ctx->host_addr & PAGE_MASK, + dma_ctx->nr_pages, + dma_ctx->direction ? 0 : FOLL_WRITE, + dma_ctx->page_list, + NULL, + NULL); #endif - if (nr_pinned != dma_ctx->nr_pages) { - ret = -ENOMEM; // could use -EBUSY instead - pr_err("could not pin host pages for zero copy dma on nd %d: nr_pinned %d\n", nd->device_index, nr_pinned); - - if (nr_pinned > 0) { - unpin_user_pages( dma_ctx->page_list, nr_pinned); - } - // cleanup: wait for prev dma to complete (which also unpins pages) - if (pdma_ctx != NULL) { - ndma_zerocopy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); - } - goto fail; - } - } - - // TODO need to have this for other architectures - // for (i=0; i < dma_ctx->nr_pages; i++) { - // struct device - // dma_ctx->addr[i] = dma_map_page( nd->pdev->dev, dma_ctx_page_list[i], 0, PAGE_SIZE, DMA_TO_DEVICE/DMA_FROM_DEVICE); - // ret = dma_mapping_error(dev->dev, dma_ctx->addr[i]); - // if (ret) { } - // } - // flush_cache_range(vma, - - ret = ndma_build_n_issue_zc_descs(dma_ctx); - if (ret) { - unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); - // cleanup: wait for prev dma to complete (which also unpins pages) - if (pdma_ctx != NULL) { - ndma_zerocopy_wait_for_completion(nd, nc_id, eng, ring, pdma_ctx, NULL); - } - goto fail; - } - - if (pdma_ctx != NULL) { - ret = ndma_zerocopy_wait_for_completion(nd, nc_id, eng, ring, pdma_ctx, dma_ctx); - if (ret) { - goto fail; - } - } - - pdma_ctx = dma_ctx; - next_dma_idx = (next_dma_idx+1) % 2; - - remaining -= dma_ctx->size; - host_addr += dma_ctx->size; - dev_addr += dma_ctx->size; - pin_size = (remaining < pin_size) ? remaining : pin_size; - offset = 0; - } - } - - - // Wait for the last chunk - if (pdma_ctx) { - ret = ndma_zerocopy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); - } + mmput(dma_ctx->mm); + dma_ctx->mm = NULL; + } else { + nr_pinned = pin_user_pages_fast((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, + dma_ctx->direction ? 
0 : FOLL_WRITE, dma_ctx->page_list); -fail: - // release resources - for (i = 0; i < 2; i++) { - if (dma_ctx_tbl[i].page_list != NULL) - kfree(dma_ctx_tbl[i].page_list); - if (dma_ctx_tbl[i].completion_ptr != NULL) { - kfree(dma_ctx_tbl[i].completion_ptr); - } - } - if (locked) { - mutex_unlock(&ring->h2t_ring_lock); - } - - return ret; + if (nr_pinned != dma_ctx->nr_pages) { + // if failed pin_fast because of page fault, do the regular pinning + if (nr_pinned > 0) { + unpin_user_pages(dma_ctx->page_list, nr_pinned); + } + +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 6))) + nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, dma_ctx->direction ? 0 : FOLL_WRITE, dma_ctx->page_list); +#else + nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, dma_ctx->direction ? 0 : FOLL_WRITE, dma_ctx->page_list, NULL); +#endif + } + } + + if (nr_pinned != dma_ctx->nr_pages) { + int err = (nr_pinned < 0) ? nr_pinned : -ENOMEM; + + pr_err("could not pin host pages for zero copy dma on nd %d: nr_pinned %d\n", nd_id, nr_pinned); + + if (nr_pinned > 0) { + unpin_user_pages(dma_ctx->page_list, nr_pinned); + } + + return err; + } + + ctx_queue->nr_pinned_pages += dma_ctx->nr_pages; + dma_ctx->state = NDMA_PINNED_UNSUBMITTED; + + return 0; +} + +int ndma_zerocopy_submit(struct neuron_device *nd, + u32 nc_id, + const nrt_tensor_batch_op_t *ops, + u32 num_ops, + dma_addr_t dev_base, + int qid, + bool direction, + u64 sequence_num) +{ + int ret = 0; + int i = 0; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); + struct ndma_eng *eng = &nd->ndma_engine[eng_id]; + struct ndma_queue *queue = &eng->queues[qid]; + struct ndma_ring *ring = &queue->ring_info; + struct ndma_ctx_queue *ctx_queue = &ring->dma_ctx_queue; + struct ndma_h2t_zcdma_context *cur_ctx = NULL; + struct ndma_h2t_zcdma_op_context op_ctx; + bool async = (sequence_num != 0); + + /* Verify ring ownership. */ + if (!ndmar_h2t_ring_is_owner(ring, nc_id)) { + pr_err("nd%02d: attempting to use qid %d that was not assigned to nc %d\n", + nd->device_index, qid, nc_id); + return -ENOENT; + } + + mutex_lock(&ring->h2t_ring_lock); + + for (i = 0; i < num_ops; i++) { + const nrt_tensor_batch_op_t *op = &ops[i]; + op_ctx.host_addr = op->buffer; + op_ctx.dev_addr = dev_base + op->offset; + op_ctx.offset = (unsigned long)op_ctx.host_addr & (PAGE_SIZE - 1); + op_ctx.remaining = op->size; + /* pin_size is in page units; include the page offset. */ + op_ctx.pin_size = ndma_calc_zc_pin_size(op_ctx.remaining + op_ctx.offset); + + while (op_ctx.remaining) { + int nr_pages; + bool can_pin; + bool ctx_queue_full; + + /* Step 1: submit any pinned contexts that have available descriptors. */ + while (true) { + struct ndma_h2t_zcdma_context *unsubmitted_ctx = ndma_ctx_queue_peek_pinned_unsubmitted(ctx_queue); + + if (!unsubmitted_ctx || !_ndma_zc_descs_available(eng, ring->qid, unsubmitted_ctx->nr_pages)) { + break; + } + + ret = ndma_build_n_issue_zc_descs(unsubmitted_ctx); + if (ret) { + pr_err("failed to build and issue zero-copy descs\n"); + goto done; + } + + ndma_ctx_queue_inc_first_pinned_unsubmitted(ctx_queue); + } + + /* Step 2: set up the current ctx if there is room and pinned-page budget. 
*/ + nr_pages = DIV_ROUND_UP(op_ctx.pin_size, PAGE_SIZE); + can_pin = (ctx_queue->nr_pinned_pages + nr_pages <= NDMA_CTX_QUEUE_MAX_PINNED_PAGES); + ctx_queue_full = ndma_ctx_queue_is_full(ctx_queue); + + if (async && ctx_queue_full) { + pr_err("ctx queue full. failed to submit async ctx\n"); + ret = -EBUSY; + goto done; + } + + if ((can_pin || async) && !ctx_queue_full) { + cur_ctx = ndma_ctx_queue_peek_tail(ctx_queue); + cur_ctx->eng = eng; + cur_ctx->ring = ring; + cur_ctx->host_addr = op_ctx.host_addr; + cur_ctx->dev_addr = op_ctx.dev_addr; + // First chunk may be unaligned; later chunks are page-aligned with offset=0. + cur_ctx->size = op_ctx.pin_size - op_ctx.offset; + cur_ctx->direction = direction; + cur_ctx->last = (cur_ctx->size == op_ctx.remaining && i == num_ops - 1); + cur_ctx->nr_pages = nr_pages; + cur_ctx->state = NDMA_UNPINNED; + cur_ctx->nr_desc = 0; // Set by ndma_build_n_issue_zc_descs(). + cur_ctx->mm = NULL; + cur_ctx->sequence_num = sequence_num; + + /* Pin now if possible; otherwise capture mm for remote pinning (async only). */ + if (can_pin) { + ret = ndma_zerocopy_pin_pages(nd->device_index, nc_id, ctx_queue, cur_ctx, false); + if (ret) { + pr_err("failed to pin pages for zero copy dma on nd %d\n", nd->device_index); + goto done; + } + } else if (async) { + struct mm_struct *mm = current->mm; + mmget(mm); + cur_ctx->mm = mm; + } + + /* Advance the queue tail. + * May also initialize/advance the pinned+unsubmitted and unpinned pointers. + */ + ndma_ctx_queue_inc_tail(ctx_queue); + + /* Update loop variables for the next chunk. */ + op_ctx.remaining -= cur_ctx->size; + op_ctx.host_addr += cur_ctx->size; + op_ctx.dev_addr += cur_ctx->size; + op_ctx.pin_size = (op_ctx.remaining < op_ctx.pin_size) ? op_ctx.remaining : op_ctx.pin_size; + op_ctx.offset = 0; + cur_ctx = NULL; + } + + /* Step 3 (sync): wait for submitted transfers to complete from the head. */ + if (!async) { + u32 desc_threshold = NDMA_ZC_DESC_WAIT_THRESHOLD_LO; + + while (ndma_zc_should_wait(eng, ring, ctx_queue, &desc_threshold)) { + struct ndma_h2t_zcdma_context *submitted_ctx = ndma_ctx_queue_pop_submitted(ctx_queue); + + ret = ndma_memcpy_wait_for_completion(eng, ring, submitted_ctx->nr_desc + 1, + submitted_ctx->completion_ptr, + true, false); + ndma_zc_release_ctx(submitted_ctx, &ctx_queue->nr_pinned_pages); + if (ret) { + pr_err("failed to wait for completion of zero copy dma\n"); + goto done; + } + } + } + } + } + + if (!async) { + /* Step 4 (sync): submit remaining pinned ctxs, then drain all submitted ctxs. 
*/ + while (true) { + struct ndma_h2t_zcdma_context *ctx_to_submit = ndma_ctx_queue_peek_pinned_unsubmitted(ctx_queue); + + if (ctx_to_submit && _ndma_zc_descs_available(eng, ring->qid, ctx_to_submit->nr_pages)) { + ret = ndma_build_n_issue_zc_descs(ctx_to_submit); + if (ret) { + pr_err("failed to build and issue zero-copy descs\n"); + goto done; + } + ndma_ctx_queue_inc_first_pinned_unsubmitted(ctx_queue); + } + + struct ndma_h2t_zcdma_context *ctx_to_wait = ndma_ctx_queue_pop_submitted(ctx_queue); + if (ctx_to_wait) { + ret = ndma_memcpy_wait_for_completion(eng, ring, ctx_to_wait->nr_desc + 1, + ctx_to_wait->completion_ptr, + true, false); + ndma_zc_release_ctx(ctx_to_wait, &ctx_queue->nr_pinned_pages); + if (ret) { + pr_err("failed to wait for completion of zero copy dma\n"); + goto done; + } + } + + if (!ctx_to_submit && !ctx_to_wait) { + break; + } + } + } + +done: + if (ret) { + ndma_ctx_queue_drain(eng, ring, ctx_queue); + } + mutex_unlock(&ring->h2t_ring_lock); + return ret; +} + +/* The completion flow for completion, remote pinning, and submission. Async IO only */ +static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd, + struct ndma_eng *eng, + struct ndma_ring *ring, + bool *did_work) +{ + int ret = 0; + int err = 0; + struct ndma_ctx_queue *ctx_queue = NULL; + u32 desc_threshold = NDMA_ZC_DESC_WAIT_THRESHOLD_LO; + + if (!ring || !did_work) { + return -EINVAL; + } + *did_work = false; + + ctx_queue = &ring->dma_ctx_queue; + + mutex_lock(&ring->h2t_ring_lock); + + /* 1) Wait for at least one submitted context to complete */ + while (true) { + if (ndma_ctx_queue_submitted_empty(ctx_queue)) { + break; + } + if (*did_work && !ndma_zc_should_wait(eng, ring, ctx_queue, &desc_threshold)) { + break; + } + struct ndma_h2t_zcdma_context *submitted_ctx = ndma_ctx_queue_pop_submitted(ctx_queue); + + ret = ndma_memcpy_wait_for_completion(eng, ring, submitted_ctx->nr_desc + 1, submitted_ctx->completion_ptr, true, false); + if (ret) { + err = ret; + pr_err("async h2d dma completion failed for seq num %llu: %d\n", submitted_ctx->sequence_num, ret); + ndma_h2d_compl_queue_put(&ring->dma_compl_queue, submitted_ctx->sequence_num, ret, NULL); + ndma_ctx_queue_drain_sequence(ctx_queue, submitted_ctx->sequence_num); + } else if (submitted_ctx->last) { + ndma_h2d_compl_queue_put(&ring->dma_compl_queue, submitted_ctx->sequence_num, 0, NULL); + } + + ndma_zc_release_ctx(submitted_ctx, &ctx_queue->nr_pinned_pages); + + *did_work = true; + } + + /* 2) Submit pinned but unsubmitted contexts */ + while (true) { + struct ndma_h2t_zcdma_context *pinned_unsubmitted_ctx = ndma_ctx_queue_peek_pinned_unsubmitted(ctx_queue); + + if (!pinned_unsubmitted_ctx || !_ndma_zc_descs_available(eng, ring->qid, pinned_unsubmitted_ctx->nr_pages)) { + break; + } + + ret = ndma_build_n_issue_zc_descs(pinned_unsubmitted_ctx); + if (ret) { + err = ret; + pr_err("async h2d dma submission failed for seq num %llu: %d\n", pinned_unsubmitted_ctx->sequence_num, ret); + ndma_h2d_compl_queue_put(&ring->dma_compl_queue, pinned_unsubmitted_ctx->sequence_num, ret, NULL); + ndma_ctx_queue_drain_sequence(ctx_queue, pinned_unsubmitted_ctx->sequence_num); + } else { + ndma_ctx_queue_inc_first_pinned_unsubmitted(ctx_queue); + } + + *did_work = true; + } + + /* 3) Remote pin unpinned contexts */ + while (true) { + struct ndma_h2t_zcdma_context *unpinned_ctx = ndma_ctx_queue_peek_first_unpinned(ctx_queue); + + if (!unpinned_ctx || ctx_queue->nr_pinned_pages + unpinned_ctx->nr_pages > NDMA_CTX_QUEUE_MAX_PINNED_PAGES) { + break; 
+ } + + ret = ndma_zerocopy_pin_pages(nd->device_index, ring->h2t_nc_id, ctx_queue, unpinned_ctx, true); + if (ret) { + err = ret; + pr_err("async h2d dma remote pinning failed for seq num %llu: %d\n", unpinned_ctx->sequence_num, ret); + ndma_h2d_compl_queue_put(&ring->dma_compl_queue, unpinned_ctx->sequence_num, ret, NULL); + ndma_ctx_queue_drain_sequence(ctx_queue, unpinned_ctx->sequence_num); + } else { + ndma_ctx_queue_inc_first_unpinned(ctx_queue); + } + + *did_work = true; + } + + mutex_unlock(&ring->h2t_ring_lock); + return err; } diff --git a/neuron_dma.h b/neuron_dma.h index 012bef4..eeea14f 100644 --- a/neuron_dma.h +++ b/neuron_dma.h @@ -6,6 +6,8 @@ #ifndef NEURON_DMA_H #define NEURON_DMA_H +#include + #include "udma/udma.h" #include "neuron_mempool.h" @@ -200,7 +202,7 @@ dma_addr_t ndma_mc_to_pa(struct mem_chunk *mc); bool ndma_zerocopy_supported(void); /** - * ndma_memcpy_zerocopy - Perform a pipelined zero-copy DMA transfer. + * ndma_zerocopy_submit() - Perform a pipelined zero-copy DMA transfer. * @nd: Neuron device whose DMA engine is used. * @nc_id: Neuron core identifier owning the queue. * @ops: Array of host buffer descriptors. @@ -208,6 +210,7 @@ bool ndma_zerocopy_supported(void); * @dev_base: Base device physical address for the transfer. * @qid: Queue identifier to submit descriptors on. * @direction: true for host-to-device, false for device-to-host. + * @sequence_num: sequence number under async submission; 0 for sync. * * DMA data between a user space virtual address range and a contiguous location in device memory. * In order to do this, we need to know the physical pages are associated with @@ -222,26 +225,25 @@ bool ndma_zerocopy_supported(void); * We use pin_user_pages_fast() to reduce pinning overhead because we know the process can't go * away while we are down here doing our thing in the kernel within a single IOCTL call. * - * We ping pong back and forth between two dma contexts. So while dma for context A is in progress, - * we are pinning pages and starting dmas for context B. - * - * Algorithm goes like this: - * initial a pair of dma contexts - * prev dma ctx = null + * ## For sync mode ## + * We enqueue DMA contexts into a fixed-size queue and drive submission from that queue. + * The loop: * lock() - * while still more data remaining - * current dma ctx = next available context - * init current dma context - * calc size of the transfer for this dma context. We want to transfer up to page boundaries - * calc number of pages that need to be pinned for this dma - * pin host pages in memory - * generate descriptors for - * if prev dma ctx != NULL, wait for the prev dma to complete - * update host address, device address and ammount remaining - * wait for the last dma ctx to complete + * while data remains + * submit any pinned-but-unsubmitted ctxs when descriptors are available + * create a new ctx when queue space and pin budget allow + * pin pages immediately for the ctx + * advance queue tail and update host/device pointers + * wait on submitted ctxs from the head as needed to relieve pressure + * submit remaining pinned ctxs + * drain submitted ctxs from the head * unlock() - * free resources * + * ## For async mode ## + * We keep submitting dma contexts until we hit a threshold of pinned pages. + * Once we hit the threshold, we stop pinning pages and set the mm_struct for remote pinning later. + * TODO: liulily to add more detail once the complete async path is implemented. + * * Notes: * unpinning responsibilities. 
Up until a dma is successfully launched, this routine is responsible for unpinning * host memory. After that ndma_zerocopy_wait_for_completion() owns responsibility for unpinning pages. @@ -251,12 +253,13 @@ bool ndma_zerocopy_supported(void); * process context. * */ -int ndma_memcpy_zerocopy(struct neuron_device *nd, - u32 nc_id, - const nrt_tensor_batch_op_t *ops, - u32 num_ops, - dma_addr_t dev_base, - int qid, - bool direction); +int ndma_zerocopy_submit(struct neuron_device *nd, + u32 nc_id, + const nrt_tensor_batch_op_t *ops, + u32 num_ops, + dma_addr_t dev_base, + int qid, + bool direction, + u64 sequence_num); #endif diff --git a/neuron_dmabuf.c b/neuron_dmabuf.c index 6a510e1..2f53a07 100644 --- a/neuron_dmabuf.c +++ b/neuron_dmabuf.c @@ -28,7 +28,8 @@ #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 13, 0) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 13, 0)) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(10, 0))) MODULE_IMPORT_NS("DMA_BUF"); #else MODULE_IMPORT_NS(DMA_BUF); diff --git a/neuron_fw_io.c b/neuron_fw_io.c index 4222309..dbf9133 100644 --- a/neuron_fw_io.c +++ b/neuron_fw_io.c @@ -133,17 +133,71 @@ int fw_io_api_version_read(void * bar0, u32 *version) return ret; } -int fw_io_server_info_read(void *bar0, u32 *server_info) +int fw_io_server_info_read(void *bar0, int *server_id, int * rack_id) { int ret; + uint32_t server_info; void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_SERVER_RACK_ID_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, server_info, 1, true); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, &server_info, 1, true); if (ret) { pr_err("failed to get server info from the device, ret = %d\n", ret); + return -EIO; } - return ret; + if (server_id != NULL) { + *server_id = _REG_SERVERINFO_SVALID(server_info) ? _REG_SERVERINFO_SERVER(server_info) : -1; + } + if (rack_id != NULL) { + *rack_id = _REG_SERVERINFO_RVALID(server_info) ? 
_REG_SERVERINFO_RACK(server_info) : -1; + } + return 0; +} + +int fw_io_reservation_id_read(void *bar0, uint64_t *reservation_id) +{ + int ret; + uint32_t reservation_id_lo; + uint32_t reservation_id_hi; + void *addr; + + addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESERVATION_ID_LO; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, &reservation_id_lo, 1, true); + if (ret) { + pr_err("failed to get the lower 32 bits of the reservation id from the device\n"); + return -EIO; + } + addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESERVATION_ID_HI; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, &reservation_id_hi, 1, true); + if (ret) { + pr_err("failed to get the upper 32 bits of the reservation id from the device\n"); + return -EIO; + } + + *reservation_id = ((uint64_t)reservation_id_hi << 32) | reservation_id_lo; + return 0; +} + +int fw_io_instance_partition_sz_read(void *bar0, int *instance_sz, int *partition_sz) +{ + int ret; + uint32_t instance_partition_sz; + + void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_INSTANCE_PARTITION_SZ_OFFSET; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, &instance_partition_sz, 1, true); + if (ret) { + pr_err("failed to get instance/partition size info from the device, ret = %d\n", ret); + return -EIO; + } + + if (instance_sz != NULL) { + *instance_sz = _REG_INSTPARTSZ_INST(instance_partition_sz); + } + if (partition_sz != NULL) { + *partition_sz = _REG_INSTPARTSZ_VAL(instance_partition_sz) ? _REG_INSTPARTSZ_PART(instance_partition_sz) : -1; + } + + return 0; } int fw_io_device_id_read(void *bar0, u32 *device_id) @@ -229,12 +283,16 @@ static void dx_crc32c_add(const u8 *data, size_t len, u32 *csum) } } +// Note: fw_io_cmd_timeout_tbl is only used in fw_io_execute_request_new(). +// The timeouts only apply to cmd 3-5 as cmd 1-2 are still using the legacy framework. +// In the future, when cmd 1-2 are switched to the new framework, the timeout will likely need to +// be bumped to 10s as firmware side requests can take anywhere from 100ms to 7s to complete.
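+// The entries below appear to be expressed in microseconds (e.g. 1000 * 1000 * 90 == 90 seconds for FW_IO_CMD_SET_POWER_PROFILE).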
static const u32 fw_io_cmd_timeout_tbl[FW_IO_CMD_MAX] = { 0, // cmd 0 (1000 * 1000 * 1), // cmd 1 (FW_IO_CMD_READ) (1000 * 1000 * 1), // cmd 2 (FW_IO_CMD_POST_TO_CW) - (1000 * 1000 * 60), // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE) - (1000 * 1000 * 1), // cmd 4 (FW_IO_CMD_GET_DATA) + (1000 * 1000 * 90), // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE) + (1000 * 1000 * 10), // cmd 4 (FW_IO_CMD_GET_DATA) (1000 * 1000 * 60), // cmd 5 (FW_IO_CMD_SET_FEATURE) }; @@ -409,7 +467,7 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re } // Read response header - union fw_io_response_hdr resp_header; + union fw_io_response_hdr_new resp_header; reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET, &resp_header.reg.dw0); if (resp_header.hdr.sequence_number != ctx->next_seq_num) { @@ -423,9 +481,15 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re if (data_size > 0 && resp != NULL) { u32 copy_size = min(resp_size, data_size); u32 *resp_data = (u32*)resp; - for (j = 0; j < (copy_size + 3) / 4; j++) { + for (j = 0; j < copy_size / 4; j++) { reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET + j*4, &resp_data[j]); } + if (copy_size % 4) { + u32 remaining_resp = 0; + int idx = copy_size/4; + reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET + idx*4, &remaining_resp); + memcpy(&resp_data[idx], &remaining_resp, copy_size % 4); + } } ret = 0; break; @@ -433,8 +497,8 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re ctx->fw_io_err_count++; pr_err(KERN_ERR "seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, resp_header.hdr.error_code); + ret = -1; if (resp_header.hdr.error_code == FW_IO_UNKNOWN_COMMAND) { - ret = -1; break; } } @@ -551,22 +615,23 @@ int fw_io_read_csr_array_readless(void **ptrs, u32 *values, u32 num_csrs) return -1; } -void fw_io_initiate_reset(void __iomem *bar0, bool device_reset, u32 tpb_reset_map) +void fw_io_initiate_reset(void __iomem *bar0, bool device_reset, u32 tpb_reset_map_lo, u32 tpb_reset_map_hi) { - u32 reset_type; - void *address; - if (device_reset) { - reset_type = FW_IO_RESET_TYPE_DEVICE; - } else { - reset_type = FW_IO_RESET_TYPE_TPB; - address = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESET_TPB_MAP_OFFSET; - reg_write32((u32 *)address, tpb_reset_map); - mb(); - } - address = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESET_OFFSET; - reg_write32((u32 *)address, reset_type); - mb(); - fw_io_trigger(bar0); + void __iomem *misc_ram_addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset; + u32 reset_type; + + if (device_reset) { + reset_type = FW_IO_RESET_TYPE_DEVICE; + } else { + reset_type = FW_IO_RESET_TYPE_TPB; + reg_write32(misc_ram_addr + FW_IO_REG_RESET_TPB_MAP_LO_OFFSET, tpb_reset_map_lo); + reg_write32(misc_ram_addr + FW_IO_REG_RESET_TPB_MAP_HI_OFFSET, tpb_reset_map_hi); + mb(); + } + + reg_write32(misc_ram_addr + FW_IO_REG_RESET_OFFSET, reset_type); + mb(); + fw_io_trigger(bar0); } bool fw_io_is_reset_initiated(void __iomem *bar0) @@ -762,6 +827,27 @@ int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile) return fw_io_execute_request_new(ctx, FW_IO_CMD_SET_POWER_PROFILE, (u8 *)&data, sizeof(data), NULL, 0); } +int fw_io_get_performance_profile(struct fw_io_ctx *ctx, uint32_t *profile) +{ + struct fw_io_get_data_request req = {0}; + struct fw_io_get_perfprofile_response 
resp = {0}; + int ret; + if (!ctx || !profile) { + return -EINVAL; + } + + req.type = 1; + + ret = fw_io_execute_request_new(ctx, FW_IO_CMD_GET_DATA, (u8 *)&req, sizeof(req), (u8 *)&resp, sizeof(resp)); + if (ret == 0) { + *profile = (uint32_t)resp.profile; + } else { + pr_err("failed to get profile, ret = %d\n", ret); + } + + return ret; +} + int fw_io_enable_throttling_notifications(struct fw_io_ctx *ctx, bool enable) { /* * Note: @@ -784,4 +870,26 @@ int fw_io_enable_throttling_notifications(struct fw_io_ctx *ctx, bool enable) } return fw_io_execute_request_new(ctx, FW_IO_CMD_SET_FEATURE, &features, sizeof(features), NULL, 0); -} \ No newline at end of file +} + +int fw_io_get_available_profiles(struct fw_io_ctx *ctx, u16 feature, u8 *num_profiles, u8 bitmap[32]) +{ + struct fw_io_get_available_profiles_request req; + struct fw_io_get_available_profiles_response response; + int ret; + if (!ctx) { + return -EINVAL; + } + + req.type = 2; + req.operation = feature; + + ret = fw_io_execute_request_new(ctx, FW_IO_CMD_GET_DATA, (u8*)&req, sizeof(req), (u8*)&response, sizeof(response)); + if (ret) { + return ret; + } + + *num_profiles = response.num_profiles; + memcpy(bitmap, response.profiles_bitmap, sizeof(response.profiles_bitmap)); + return 0; +} diff --git a/neuron_fw_io.h b/neuron_fw_io.h index 3faf65c..83a5709 100644 --- a/neuron_fw_io.h +++ b/neuron_fw_io.h @@ -20,14 +20,33 @@ union fw_io_request_hdr { } reg; }; +// Note: Firmware updated to include crc32 field in response header, but +// to maintain backward compatibility, keeping original response header +// struct and adding crc32 field to new header. + +// Response header for legacy protocol +// Used by fw_io_execute_request() for legacy commands union fw_io_response_hdr { struct { u8 sequence_number; // request sequence number u8 error_code; // 0 means request was successfully completed u16 size; // response size in bytes including this header } hdr; + u32 dw0; // bytes 0-3: sequence_number, error_code, size +}; + +// Response header for new protocol +// Used by fw_io_execute_request_new() for new commands +union fw_io_response_hdr_new { + struct { + u8 sequence_number; // request sequence number + u8 error_code; // 0 means request was successfully completed + u16 size; // response size in bytes including this header + u32 crc32; + } hdr; struct { - u32 dw0; + u32 dw0; // bytes 0-3: sequence_number, error_code, size + u32 dw1; // bytes 4-7: crc32 } reg; }; @@ -41,6 +60,11 @@ struct fw_io_response { u8 data[]; }; +struct fw_io_response_new { + union fw_io_response_hdr_new response_hdr; + u8 data[]; +}; + union fw_io_req_perfprofile_data { struct { uint32_t reserved; @@ -53,8 +77,30 @@ union fw_io_req_perfprofile_data { }; struct fw_io_get_data_request { - uint8_t type; // fw_io_data_request_type - uint8_t reserved[3]; // reserved for future use/alignment + uint8_t type; +}; + +struct fw_io_get_perfprofile_response { + uint8_t reserved[4]; + uint8_t profile; + uint8_t voltage_margin; + uint8_t frequency; + uint8_t ocw; +}; + +struct fw_io_get_available_profiles_request { + uint8_t type; // must be 2 + uint16_t operation; +} __packed; + +struct fw_io_get_available_profiles_response { + uint8_t num_profiles; + uint8_t profiles_bitmap[32]; +}; + +enum fw_io_get_available_profiles_feature { + FW_IO_AVAILABLE_PERF_PROFILES_ALL = 0, + FW_IO_AVAILABLE_PERF_PROFILES_HBM_7200 = 5 }; // Feature bitmap for FW_IO_CMD_SET_FEATURE @@ -86,7 +132,7 @@ enum { // Bitmap of PIR reset types to be written to FW_IO_REG_RESET_OFFSET enum { 
FW_IO_RESET_TYPE_DEVICE = 1, - FW_IO_RESET_TYPE_TPB = 2 // Requires FW_IO_REG_RESET_TPB_MAP_OFFSET to be populated with a tpb map prior to use + FW_IO_RESET_TYPE_TPB = 2 // Requires FW_IO_REG_RESET_TPB_MAP_LO_OFFSET to be populated with a tpb map prior to use }; // offsets in MISC RAM for FWIO @@ -99,6 +145,11 @@ enum { // - The value of this register is used to determine the offset of other registers. FW_IO_REG_API_VERSION_OFFSET = 0x00, + + // MISC RAM instance/partition size info + // (0:5) instance size, 16:30 partition size, 31 partition size valid + FW_IO_REG_INSTANCE_PARTITION_SZ_OFFSET = 0x30, + // MISC RAM slots for serial number for V2 // - The lower 32 bits and the upper 32 bits together represent the 64-bit serial number. FW_IO_REG_SERIAL_NUMBER_LO_OFFSET = 0x38, // 14 * 4 bytes @@ -125,6 +176,10 @@ enum { FW_IO_REG_POWER_UTIL_D1_OFFSET = 0x58, // 22 * 4 bytes FW_IO_REG_HBM_REPAIR_STATE_OFFSET = 0x64, // 25 * 4 bytes + // + + FW_IO_REG_RESERVATION_ID_HI = 0x80, // 32 * 4 bytes + FW_IO_REG_RESERVATION_ID_LO = 0x84, // 33 * 4 bytes FW_IO_REG_RUNTIME_RESERVED0 = 0xC0, // 0xC0 to 0xF0 @@ -138,7 +193,8 @@ enum { FW_IO_REG_POD_SERNUM_LO = 0x198, FW_IO_REG_RUNTIME_RESERVED1 = 0x1a0, // 0x1a0 to 1d0 - FW_IO_REG_RESET_TPB_MAP_OFFSET = 0x1d8, + FW_IO_REG_RESET_TPB_MAP_HI_OFFSET = 0x1d4, + FW_IO_REG_RESET_TPB_MAP_LO_OFFSET = 0x1d8, FW_IO_REG_RESET_OFFSET = 0x1ec, FW_IO_REG_REQUEST_BASE_ADDR_LOW_OFFSET = 0x1f4, FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET = 0x1f0, @@ -147,7 +203,47 @@ enum { FW_IO_REG_TRIGGER_INT_NOSEC_OFFSET = 0x800, FW_IO_REG_ACK_OFFSET = 0xf0, }; - + +// Instance/partition register field decode +// +#define _REG_INSTPARTSZ_INSTBITS 6 +#define _REG_INSTPARTSZ_INSTSHIFT 0 +#define _REG_INSTPARTSZ_INSTMASK ((1 << _REG_INSTPARTSZ_INSTBITS)-1) +#define _REG_INSTPARTSZ_INST(inst) (((inst) >> _REG_INSTPARTSZ_INSTSHIFT) & _REG_INSTPARTSZ_INSTMASK) + +#define _REG_INSTPARTSZ_PARTBITS 15 +#define _REG_INSTPARTSZ_PARTSHIFT 16 +#define _REG_INSTPARTSZ_PARTMASK ((1 << _REG_INSTPARTSZ_PARTBITS)-1) +#define _REG_INSTPARTSZ_PART(part) (((part) >> _REG_INSTPARTSZ_PARTSHIFT) & _REG_INSTPARTSZ_PARTMASK) + +#define _REG_INSTPARTSZ_VALBITS 1 +#define _REG_INSTPARTSZ_VALSHIFT 31 +#define _REG_INSTPARTSZ_VALMASK ((1 << _REG_INSTPARTSZ_VALBITS)-1) +#define _REG_INSTPARTSZ_VAL(val) (((val) >> _REG_INSTPARTSZ_VALSHIFT) & _REG_INSTPARTSZ_VALMASK) + +// server info register field decode +// +#define _REG_SERVERINFO_SERVERBITS 15 +#define _REG_SERVERINFO_SERVERSHIFT 0 +#define _REG_SERVERINFO_SERVERMASK ((1 << _REG_SERVERINFO_SERVERBITS)-1) +#define _REG_SERVERINFO_SERVER(serv) (((serv) >> _REG_SERVERINFO_SERVERSHIFT) & _REG_SERVERINFO_SERVERMASK) + +#define _REG_SERVERINFO_SVALIDBITS 1 +#define _REG_SERVERINFO_SVALIDSHIFT 15 +#define _REG_SERVERINFO_SVALIDMASK ((1 << _REG_SERVERINFO_SVALIDBITS)-1) +#define _REG_SERVERINFO_SVALID(sval) (((sval) >> _REG_SERVERINFO_SVALIDSHIFT) & _REG_SERVERINFO_SVALIDMASK) + +#define _REG_SERVERINFO_RACKBITS 15 +#define _REG_SERVERINFO_RACKSHIFT 16 +#define _REG_SERVERINFO_RACKMASK ((1 << _REG_SERVERINFO_RACKBITS)-1) +#define _REG_SERVERINFO_RACK(rack) (((rack) >> _REG_SERVERINFO_RACKSHIFT) & _REG_SERVERINFO_RACKMASK) + +#define _REG_SERVERINFO_RVALIDBITS 1 +#define _REG_SERVERINFO_RVALIDSHIFT 31 +#define _REG_SERVERINFO_RVALIDMASK ((1 << _REG_SERVERINFO_RVALIDBITS)-1) +#define _REG_SERVERINFO_RVALID(rval) (((rval) >> _REG_SERVERINFO_RVALIDSHIFT) & _REG_SERVERINFO_RVALIDMASK) + +// #define FW_IO_REG_METRIC_BUF_SZ 128 struct fw_io_ctx { @@ -287,11 +383,15 @@ int 
fw_io_post_metric_new(struct fw_io_ctx *ctx, u8 *data, u32 size); * * @bar0: Device's BAR0 base address * @device_reset: True if we are doing a device-level reset - * @tpb_reset_map: If device_reset is false (tpb reset), bitmap of blocks to reset - * [1:0] NC mask - * [13:8] TopSp mask + * @tpb_reset_map_lo: If device_reset is false (tpb reset), bitmap of blocks to reset (bits 0-31) + * [7:0] TPB mask + * [15:8] SDMA group mask + * [23:16] TOP_SP mask + * [31:24] CC_TOP mask + * @tpb_reset_map_hi: + * [3:0] Top-Level DMA group mask */ -void fw_io_initiate_reset(void __iomem *bar0, bool device_reset, u32 tpb_reset_map); +void fw_io_initiate_reset(void __iomem *bar0, bool device_reset, u32 tpb_reset_map_lo, u32 tpb_reset_map_hi); /** * fw_io_is_reset_initiated() - Check if local reset is initiated or not. @@ -319,10 +419,29 @@ int fw_io_read_counters(struct fw_io_ctx *ctx, uint64_t addr_in[], uint32_t val_ /** * fw_io_server_info_read() - Read server info * @param bar - from bar - * @param server_info - server info containing rack & server ids + * @param server_id - server id or -1 if invalid + * @param rack_id - rack id or -1 if invalid + * @return 0 on success. + */ +int fw_io_server_info_read(void *bar0, int *server_id, int * rack_id); + + +/** + * fw_io_reservation_id_read() - Read reservation id + * @param bar - from bar + * @param reservation_id - server reservation id + * @return 0 on success. + */ +int fw_io_reservation_id_read(void *bar0, uint64_t *reservation_id); + +/** + * fw_io_instance_partition_sz_read() - instance/partition sizes + * @param bar - from bar + * @param instance_sz - instance size. -1 if invalid + * @param partition_sz - partition size. -1 if invalid * @return 0 on success. */ -int fw_io_server_info_read(void *bar0, u32 *server_info); +int fw_io_instance_partition_sz_read(void *bar0, int *instance_sz, int *partition_sz); /** * fw_io_device_id_read() - Read device id @@ -433,6 +552,14 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re */ int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile); +/** + * fw_io_get_performance_profile() - Get current performance profile + * @param ctx: FWIO context + * @param profile: Pointer to store the current profile value + * @return 0 on success, negative on failure + */ +int fw_io_get_performance_profile(struct fw_io_ctx *ctx, uint32_t *profile); + /** * fw_io_enable_throttling_notifications() - Enable throttling notifications * @param ctx: FWIO context @@ -440,4 +567,14 @@ int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile); * @return 0 on success, negative on failure */ int fw_io_enable_throttling_notifications(struct fw_io_ctx *ctx, bool enable); + +/** + * fw_io_get_available_profiles() - Get available profiles + * @param ctx: FWIO context + * @param feature: Profiles with a particular feature (0 for all profiles supported by instance) + * @param num_profiles: Number of valid profiles in response + * @param bitmap: Bitmap of supported profiles in response + */ +int fw_io_get_available_profiles(struct fw_io_ctx *ctx, u16 feature, u8 *num_profiles, u8 bitmap[32]); + #endif diff --git a/neuron_ioctl.h b/neuron_ioctl.h index bcc9c6e..b20170b 100644 --- a/neuron_ioctl.h +++ b/neuron_ioctl.h @@ -637,13 +637,27 @@ struct neuron_ioctl_get_va_placement { __s32 device_index; // [out] Neuron device index (negative if VA does not represent Neuron memory) __s32 hbm_index; // [out] HBM index }; +struct neuron_ioctl_available_perf_profiles { + __u16 requested_feature; // [in] 
- 0 means all available profiles (no feature filter) + __u8 num_profiles; // [out] + __u8 bitmap[32]; // [out] firmware limited to 256 profiles +}; + +struct neuron_ioctl_get_async_h2t_dma_compl_queues { + __u32 nc_id; /* [in] neuron core id */ + __u32 qid_bitmap; /* [in] bitmap of dma queues requested */ + struct { + __u64 mmap_offset; /* [out] mmap offset of each completion queue */ + __u32 mmap_size; /* [out] mmap size of queue + metadata */ + } compl_queue_info[16]; +}; + #define NEURON_IOCTL_BASE 'N' /* Deprecated reset related IOCTLs. Now it would always return success. */ #define NEURON_IOCTL_DEVICE_RESET _IO(NEURON_IOCTL_BASE, 1) #define NEURON_IOCTL_DEVICE_READY _IOR(NEURON_IOCTL_BASE, 2, __u8) -#define NEURON_IOCTL_DEVICE_RESET_STATUS _IOR(NEURON_IOCTL_BASE, 106, __u8) /** Returns devices information and connection topology. */ #define NEURON_IOCTL_DEVICE_INFO _IOR(NEURON_IOCTL_BASE, 3, struct neuron_ioctl_device_info *) @@ -665,9 +679,6 @@ struct neuron_ioctl_get_va_placement { /** Allocated memory and return a memory_handle. */ #define NEURON_IOCTL_MEM_ALLOC _IOR(NEURON_IOCTL_BASE, 21, struct neuron_ioctl_mem_alloc *) -#define NEURON_IOCTL_MEM_ALLOC_V2 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2 *) // V2 here refers to neuron 2.x, not arch type -#define NEURON_IOCTL_MEM_ALLOC_V2MT _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type) // just V2 with additional field mem_type -#define NEURON_IOCTL_MEM_ALLOC_V2MT64 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type64) // V2 + mem_type + pad /** Free given memory_handle. */ #define NEURON_IOCTL_MEM_FREE _IOR(NEURON_IOCTL_BASE, 22, struct neuron_ioctl_mem_free *) @@ -704,8 +715,6 @@ struct neuron_ioctl_get_va_placement { /** Initializes given DMA queue */ #define NEURON_IOCTL_DMA_QUEUE_INIT _IOR(NEURON_IOCTL_BASE, 33, struct neuron_ioctl_dma_queue_init *) -#define NEURON_IOCTL_DMA_QUEUE_INIT_BATCH _IOR(NEURON_IOCTL_BASE, 133, struct neuron_ioctl_dma_queue_init_batch) - /** Releases given DMA queue */ #define NEURON_IOCTL_DMA_QUEUE_RELEASE _IOR(NEURON_IOCTL_BASE, 34, struct neuron_ioctl_dma_queue_release *) /** Starts DMA transfer of given number of descriptors */ @@ -772,6 +781,11 @@ struct neuron_ioctl_get_va_placement { /** Returns pci device information - only for devices opened by the calling proceess (deprecated, don't use) */ #define NEURON_IOCTL_DEVICE_BDF _IOR(NEURON_IOCTL_BASE, 101, struct neuron_ioctl_device_bdf *) +/** Allocated memory and return a memory_handle. */ +#define NEURON_IOCTL_MEM_ALLOC_V2 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2 *) // V2 here refers to neuron 2.x, not arch type +#define NEURON_IOCTL_MEM_ALLOC_V2MT _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type) // just V2 with additional field mem_type +#define NEURON_IOCTL_MEM_ALLOC_V2MT64 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type64) // V2 + mem_type + pad + /** Resets the requested NC (-1 for full device) */ #define NEURON_IOCTL_NC_RESET _IOR(NEURON_IOCTL_BASE, 103, struct neuron_ioctl_device_reset *) @@ -782,6 +796,9 @@ struct neuron_ioctl_get_va_placement { #define NEURON_IOCTL_PROGRAM_ENGINE_NC _IOWR(NEURON_IOCTL_BASE, 105, struct neuron_ioctl_program_engine_nc *) #define NEURON_IOCTL_PROGRAM_ENGINE_NC64 _IOWR(NEURON_IOCTL_BASE, 105, struct neuron_ioctl_program_engine_nc64) +/* Deprecated reset related IOCTLs. Now it would always return success. 
*/ +#define NEURON_IOCTL_DEVICE_RESET_STATUS _IOR(NEURON_IOCTL_BASE, 106, __u8) + /** Returns pci device information for any Neuron devices (not just these opened by the calling process */ #define NEURON_IOCTL_DEVICE_BDF_EXT _IOR(NEURON_IOCTL_BASE, 106, struct neuron_ioctl_device_bdf_ext *) @@ -847,7 +864,13 @@ struct neuron_ioctl_get_va_placement { #define NEURON_IOCTL_GET_VA_PLACEMENT _IOW(NEURON_IOCTL_BASE, 131, struct neuron_ioctl_get_va_placement) -// Note: 133 is taken by NEURON_IOCTL_DMA_QUEUE_INIT_BATCH -#define NEURON_IOCTL_MAX 132 +#define NEURON_IOCTL_GET_PERFORMANCE_PROFILE _IOWR(NEURON_IOCTL_BASE, 132, struct neuron_ioctl_power_profile) + +/** Batch DMA initialization given DMA queue */ +#define NEURON_IOCTL_DMA_QUEUE_INIT_BATCH _IOR(NEURON_IOCTL_BASE, 133, struct neuron_ioctl_dma_queue_init_batch) + +#define NEURON_IOCTL_AVAILABLE_PERF_PROFILES _IOWR(NEURON_IOCTL_BASE, 134, struct neuron_ioctl_available_perf_profiles) + +#define NEURON_IOCTL_GET_ASYNC_H2T_DMA_COMPL_QUEUES _IOWR(NEURON_IOCTL_BASE, 135, struct neuron_ioctl_get_async_h2t_dma_compl_queues) #endif diff --git a/neuron_mempool.c b/neuron_mempool.c index 20e4436..b5b2d2f 100644 --- a/neuron_mempool.c +++ b/neuron_mempool.c @@ -133,7 +133,7 @@ static void mc_remove_node(struct rb_root *root, struct mem_chunk *mc) * Return: 0 if pool is created, a negative error code otherwise. */ -static int mp_init_device_mem(struct mempool *mp, struct mempool_set *mpset, +static int mp_init_device_mem(struct neuron_mempool *mp, struct neuron_mempool_set *mpset, u64 start_addr, size_t pool_size, u32 dram_channel, u32 dram_region) { int ret; @@ -205,7 +205,7 @@ static int mp_init_device_mem(struct mempool *mp, struct mempool_set *mpset, * Frees all backing pages allocated for reserved host_mem pool. * Does opposite work of mp_init_hrm_pool */ -static void mp_destroy_hrm_pool(struct mempool *mp) +static void mp_destroy_hrm_pool(struct neuron_mempool *mp) { int i = 0; if (mp->page_va_array == NULL) @@ -232,13 +232,13 @@ static void mp_destroy_hrm_pool(struct mempool *mp) * Any page allocation failure is ignored. * * @mp: pointer to mempool that needs to be initialized - * @mpset: pointer to parent mempool_set + * @mpset: pointer to parent neuron_mempool_set * @page_size: backing host memory's page size * @page_count: Max number of pages to allocate * * Return: 0 if pool is created, a negative error code otherwise. */ -static int mp_init_hrm_pool(struct mempool *mp, struct mempool_set *mpset, +static int mp_init_hrm_pool(struct neuron_mempool *mp, struct neuron_mempool_set *mpset, u32 page_size, u32 page_count) { int ret; @@ -299,7 +299,7 @@ static int mp_init_hrm_pool(struct mempool *mp, struct mempool_set *mpset, /** * Frees all the chunks associated with the mempool and releases the mempool. 
*/ -static void mp_destroy_gen_pool(struct mempool *mp) +static void mp_destroy_gen_pool(struct neuron_mempool *mp) { BUG_ON(mp == NULL); if (!mp->initialized) @@ -315,6 +315,58 @@ static void mp_destroy_gen_pool(struct mempool *mp) } } +// Upper 16MB is used internally by the firmware, don't use it in the allocation pool +#define MEMPOOL_CARVEOUT_SIZE 0x1000000 // 16MB +/** + * mpset_block_carveout_regions() + * + * @param nd: neuron device + * @param mpset: pointer to mpset + * @param device_dram_addr: DRAM Channel addresses + * @param device_dram_size: DRAM Channel sizes + * @return int: 0 on success, o/w on failure + */ +static int mpset_block_carveout_regions(struct neuron_device *nd, struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +{ + int ret; + u64 region_sz; + int channel = 0, region = 0; + + /* + * Block carve out regions: Upper 16 MB is used internally by firmware + * + * Ideally we would carve out by simply changing the start address of the chunk; + * however, that breaks aligned allocation in 4.x kernel versions (fixed in 5.x). + * Fix here: + * commit 52fbf1134d479234d7e64ba9dcbaea23405f229e + * Author: Alexey Skidanov + * Date: Thu Jan 3 15:26:44 2019 -0800 + * + * lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk + */ + for (channel = 0; channel < mpset->num_channels; channel++) { + region_sz = device_dram_size[channel] / mpset->mp_device_num_regions; + for (region = 0; region < mpset->mp_device_num_regions; region++) { + const dma_addr_t start_addr = device_dram_addr[channel] + (region * region_sz); + struct mem_chunk *mc = NULL; + u32 nc_id = channel; + ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, MEMPOOL_CARVEOUT_SIZE, 0, MEM_LOC_DEVICE, channel, region, nc_id, NEURON_MEMALLOC_TYPE_NCDEV_DEVICE, &mc); + if (ret) { + pr_err("failed to allocate hbm carveout region: ret=%d\n", ret); + return -ENOMEM; + } + if (mc->pa != start_addr) { + pr_err("carve out mc not offset 0!"); + mc_free(&mc); + return -EINVAL; + } + } + ndhal->ndhal_mpset.device_dram_effective_base_addr[channel] = device_dram_addr[channel] + MEMPOOL_CARVEOUT_SIZE; + } + + return 0; +} + /** * mpset_init_device_pools() - Prepare device mp in given mpset. * @@ -323,7 +375,7 @@ static void mp_destroy_gen_pool(struct mempool *mp) * * Return: 0 if initialization succeeds, a negative error code otherwise. */ -static int mpset_init_device_pools(struct mempool_set *mpset, struct neuron_device *nd) +static int mpset_init_device_pools(struct neuron_mempool_set *mpset, struct neuron_device *nd) { int ret; int channel = 0, region = 0; @@ -345,7 +397,7 @@ static int mpset_init_device_pools(struct mempool_set *mpset, struct neuron_devi } } - ret = ndhal->ndhal_mpset.mpset_block_carveout_regions(nd, mpset, device_dram_addr, device_dram_size); + ret = mpset_block_carveout_regions(nd, mpset, device_dram_addr, device_dram_size); if (ret) { goto fail; } @@ -358,7 +410,7 @@ static int mpset_init_device_pools(struct mempool_set *mpset, struct neuron_devi mp_destroy_gen_pool(&mpset->mp_device[channel][region]); } } - memset(mpset, 0, sizeof(struct mempool_set)); + memset(mpset, 0, sizeof(struct neuron_mempool_set)); return ret; } @@ -383,7 +435,7 @@ static int mpset_print_lifespan_list(const char *name, struct list_head *head) /** Verifies all MC allocated from the mpset is freed. 
*/ -static void mpset_verify_all_mc_freed(struct mempool_set *mpset) +static void mpset_verify_all_mc_freed(struct neuron_mempool_set *mpset) { int i, count; count = mpset_print_lifespan_list("LOCAL", &mpset->mc_lifespan_local_head); @@ -396,7 +448,7 @@ static void mpset_verify_all_mc_freed(struct mempool_set *mpset) BUG_ON(count != 0); } -int mpset_constructor(struct mempool_set *mpset, void *pdev, struct neuron_device *nd) +int mpset_constructor(struct neuron_mempool_set *mpset, void *pdev, struct neuron_device *nd) { int host_page_index; u64 host_allocated_size = 0; @@ -442,9 +494,9 @@ int mpset_constructor(struct mempool_set *mpset, void *pdev, struct neuron_devic } static void mpset_free_lifespan_list(struct list_head *head, struct list_head *new_head); -static struct list_head * mpset_get_lifespan_head(struct mempool_set *mpset, enum mc_lifespan lifespan); +static struct list_head * mpset_get_lifespan_head(struct neuron_mempool_set *mpset, enum mc_lifespan lifespan); -void mpset_destructor(struct mempool_set *mpset) +void mpset_destructor(struct neuron_mempool_set *mpset) { int i, channel, region; struct list_head *head; @@ -477,7 +529,7 @@ void mpset_destructor(struct mempool_set *mpset) mutex_unlock(&mpset->lock); } -struct mem_chunk *mpset_search_mc(struct mempool_set *mp, phys_addr_t pa) +struct mem_chunk *mpset_search_mc(struct neuron_mempool_set *mp, phys_addr_t pa) { struct rb_node *node = mp->root.rb_node; /* top of the tree */ @@ -495,7 +547,7 @@ struct mem_chunk *mpset_search_mc(struct mempool_set *mp, phys_addr_t pa) return NULL; } -static inline struct list_head * mpset_get_lifespan_head(struct mempool_set *mpset, enum mc_lifespan lifespan) +static inline struct list_head * mpset_get_lifespan_head(struct neuron_mempool_set *mpset, enum mc_lifespan lifespan) { struct list_head *head = NULL; if (lifespan == MC_LIFESPAN_LOCAL) { @@ -514,7 +566,7 @@ static inline struct list_head * mpset_get_lifespan_head(struct mempool_set *mps static void mc_add_to_lifespan_list(struct mem_chunk *mc) { - struct mempool_set *mpset = mc->mpset; + struct neuron_mempool_set *mpset = mc->mpset; struct list_head *head; head = mpset_get_lifespan_head(mpset, mc->lifespan); list_add(&mc->lifespan_list, head); @@ -554,7 +606,7 @@ static void mpset_free_lifespan_list(struct list_head *head, struct list_head *n } } -void mpset_free_expired_mc(struct mempool_set *mpset, enum mc_lifespan lifespan) +void mpset_free_expired_mc(struct neuron_mempool_set *mpset, enum mc_lifespan lifespan) { struct list_head *head, *next_head; head = mpset_get_lifespan_head(mpset, lifespan); @@ -562,7 +614,7 @@ void mpset_free_expired_mc(struct mempool_set *mpset, enum mc_lifespan lifespan) mpset_free_lifespan_list(head, next_head); } -static inline u64 get_offset_for_scratchpad_alloc(const struct mempool *mp, u64 alloc_size) +static inline u64 get_offset_for_scratchpad_alloc(const struct neuron_mempool *mp, u64 alloc_size) { /* Contiguous scratchpad grows backwards from the end of the main genpool @@ -581,8 +633,8 @@ static int mc_alloc_internal(struct neuron_device *nd, enum mc_lifespan lifespan struct mem_chunk **result) { struct mem_chunk *mc; - struct mempool *mp = NULL; - struct mempool_set *mpset = &nd->mpset; + struct neuron_mempool *mp = NULL; + struct neuron_mempool_set *mpset = &nd->mpset; struct gen_pool *pool = NULL; struct gen_pool *alt_pool = NULL; int ret = 0; @@ -799,7 +851,7 @@ int mc_alloc_align(struct neuron_device *nd, enum mc_lifespan lifespan, u64 size void mc_inc_refcount(struct mem_chunk *mc) { - 
struct mempool_set *mpset = mc->mpset; + struct neuron_mempool_set *mpset = mc->mpset; mutex_lock(&mpset->lock); mc->ref_count++; mutex_unlock(&mpset->lock); @@ -807,7 +859,7 @@ void mc_inc_refcount(struct mem_chunk *mc) void mc_free(struct mem_chunk **mcp) { - struct mempool_set *mpset; + struct neuron_mempool_set *mpset; struct mem_chunk *mc = *mcp; BUG_ON(mc == NULL); @@ -845,7 +897,7 @@ void mc_free(struct mem_chunk **mcp) mpset->host_mem_size -= mc->size; nsysfsmetric_dec_counter(mpset->nd, NON_NDS_METRIC, NON_NDS_COUNTER_HOST_MEM, mc->nc_id, mc->size, false); } else if (mc->mem_location == MEM_LOC_DEVICE) { - struct mempool *mp; + struct neuron_mempool *mp; mp = &mpset->mp_device[mc->dram_channel][mc->dram_region]; gen_pool_free(mc->gen_pool, (u64)mc->va, mc->size); mp->allocated_size -= mc->size; @@ -878,7 +930,7 @@ void mc_free(struct mem_chunk **mcp) int mc_dump_all_chunks(struct neuron_device *nd, u32 channel, u32 num_entries_in, struct neuron_ioctl_mem_chunk_info *data, u32 *num_entries_out) { - struct mempool_set *mpset = &nd->mpset; + struct neuron_mempool_set *mpset = &nd->mpset; u32 cnt = 0; struct rb_node *node; diff --git a/neuron_mempool.h b/neuron_mempool.h index 2fcdf10..61cdd5f 100644 --- a/neuron_mempool.h +++ b/neuron_mempool.h @@ -9,7 +9,7 @@ * 2. mempool/mp - Is a pool of memory backed either device DRAM or host DRAM. * For device memory it uses gen_pool allocator to allocate memory. * For host memory it directly uses kmalloc(). - * 3. mempool_set/mpset - Is collection for mp for given neuron device. + * 3. neuron_mempool_set/mpset - Is collection for mp for given neuron device. */ #ifndef NEURON_MEMPOOL_H @@ -39,11 +39,11 @@ enum mem_location { * Device is memory is split in to chunks and allocated. * Uses genpool allocator in the backend. */ -struct mempool { +struct neuron_mempool { char name[32]; // friendly name bool initialized; // True if initialized. - struct mempool_set *mpset; // parent mpset + struct neuron_mempool_set *mpset; // parent mpset enum mem_location mem_location; // location of the memory u32 dram_channel; // DRAM channel valid only if location is device @@ -75,16 +75,16 @@ struct mempool { // Number for MPs for host allocation #define MP_HOST_RESERVE_MEMORY_POOL_COUNT 4 -struct mempool_set { +struct neuron_mempool_set { struct mutex lock; struct neuron_device *nd; // backponter to neuron_device u32 mp_device_num_regions; // number of regions in the device pool u32 num_channels; // number of regions in the device pool - struct mempool mp_device[MAX_DRAM_CHANNELS][MAX_DDR_REGIONS]; // device memory pools + struct neuron_mempool mp_device[MAX_DRAM_CHANNELS][MAX_DDR_REGIONS]; // device memory pools - struct mempool mp_hrm[MP_HOST_RESERVE_MEMORY_POOL_COUNT]; // host reserve memory pools + struct neuron_mempool mp_hrm[MP_HOST_RESERVE_MEMORY_POOL_COUNT]; // host reserve memory pools // linked list head to store mem_chunk of different lifespan struct list_head mc_lifespan_local_head; @@ -127,8 +127,8 @@ struct mem_chunk { u64 size; // chunk size - struct mempool *mp; // backpointer to mp - struct mempool_set *mpset; // back pointer to mpset + struct neuron_mempool *mp; // backpointer to mp + struct neuron_mempool_set *mpset; // back pointer to mpset struct gen_pool *gen_pool; // pointer to genpool u32 dram_channel; // DRAM channel @@ -160,14 +160,14 @@ struct mem_chunk { * * Return: 0 if initialization succeeds, a negative error code otherwise. 
*/ -int mpset_constructor(struct mempool_set *mpset, void *pdev, struct neuron_device *nd); +int mpset_constructor(struct neuron_mempool_set *mpset, void *pdev, struct neuron_device *nd); /** * mpset_destructor() - Free all mp in the set. * * @mpset: Pointer to mpset which need to be destroyed. */ -void mpset_destructor(struct mempool_set *mpset); +void mpset_destructor(struct neuron_mempool_set *mpset); /** mpset_search_mc() - Find memory chunk which maps given physical address * @@ -176,7 +176,7 @@ void mpset_destructor(struct mempool_set *mpset); * * Return: mem chunk that has pa on success, NULL on failure */ -struct mem_chunk *mpset_search_mc(struct mempool_set *mp, phys_addr_t pa); +struct mem_chunk *mpset_search_mc(struct neuron_mempool_set *mp, phys_addr_t pa); /** * mc_alloc_align() - Allocate a memory chunk of size from given mpset, with alignment @@ -210,7 +210,7 @@ void mc_free(struct mem_chunk **mcp); * @mpset: Pointer to mpset * @lifespan: Lifespan list to use */ -void mpset_free_expired_mc(struct mempool_set *mpset, enum mc_lifespan lifespan); +void mpset_free_expired_mc(struct neuron_mempool_set *mpset, enum mc_lifespan lifespan); /** * mc_inc_refcount() - Increases reference count of the given mc. diff --git a/neuron_metrics.c b/neuron_metrics.c index b849262..65185fb 100644 --- a/neuron_metrics.c +++ b/neuron_metrics.c @@ -33,6 +33,7 @@ MODULE_PARM_DESC(nmetric_log_posts, "1: send metrics to CW, 2: send metrics to t static int nmetric_counters_buf_size = sizeof(u64) * NMETRIC_COUNTER_COUNT; static int nmetric_versions_buf_size = sizeof(struct nmetric_versions) * NMETRIC_VERSION_COUNT; static int nmetric_constants_buf_size = sizeof(char) * NMETRIC_CONSTANTS_COUNT * (NEURON_METRICS_VERSION_STRING_MAX_LEN + 1); +static int nmetric_ecc_err_buf_size = sizeof(u64) * NMETRIC_ECC_ERR_COUNT; static char nmetric_constant_metrics[NMETRIC_CONSTANTS_COUNT][NEURON_METRICS_VERSION_STRING_MAX_LEN + 1]; static const char nmetric_instance_id_path[] = "/sys/devices/virtual/dmi/id/board_asset_tag"; @@ -67,6 +68,9 @@ enum nmetric_cw_id { // Ultraserver mode configured on device (only for ULTRASERVER/PDS platforms), values defined in neuron_ultraserver_mode enum NMETRIC_CW_ID_ULTRASERVER_MODE = 58, + // Workload ID based off hashed neff id + NMETRIC_CW_ID_AGG_NEFF_ID = 80, + // Platform Utilization Metrics // Percentage of time that the neuron device was executing NEFFs in a given interval, aggregated across NCs // For example, a ND with full utilization of one core with the other idle, will be reported as 50% @@ -202,7 +206,6 @@ static const nmetric_def_t nmetric_defs[] = { NMETRIC_COUNTER_DEF(18, POST_TIME_TICK_0, NMETRIC_CW_ID_NERR_OOB, NDS_NC_COUNTER_OOB), NMETRIC_COUNTER_DEF(19, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_COLLECTIVES, NDS_EXT_NC_COUNTER_HW_ERR_COLLECTIVES), - NMETRIC_COUNTER_DEF(20, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_HBM_UE, NDS_EXT_NC_COUNTER_HW_ERR_HBM_UE), NMETRIC_COUNTER_DEF(21, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_NC_UE, NDS_EXT_NC_COUNTER_HW_ERR_NC_UE), NMETRIC_COUNTER_DEF(22, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_DMA_ABORT, NDS_EXT_NC_COUNTER_HW_ERR_DMA_ABORT), @@ -212,15 +215,19 @@ static const nmetric_def_t nmetric_defs[] = { NMETRIC_COUNTER_DEF(25, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_SW_EVENT_ERROR, NDS_EXT_NC_COUNTER_ERR_SW_EVENT_ERROR), NMETRIC_COUNTER_DEF(26, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_SW_PSUM_COLLISION, NDS_EXT_NC_COUNTER_ERR_SW_PSUM_COLLISION), NMETRIC_COUNTER_DEF(27, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_SW_SEQUENCER_FATAL, 
NDS_EXT_NC_COUNTER_ERR_SW_SEQUENCER_FATAL), - NMETRIC_COUNTER_DEF(28, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_REPAIRABLE_HBM_UE, NDS_EXT_NC_COUNTER_HW_ERR_REPAIRABLE_HBM_UE), NMETRIC_UTILIZATION_DEF(29, POST_TIME_ALWAYS, NMETRIC_CW_ID_NC_UTILIZATION, NDS_NC_COUNTER_TIME_IN_USE), + // ECC Error Count Metrics + NMETRIC_DRIVER_ECC_ERR_DEF(0, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_HBM_UE), + NMETRIC_DRIVER_ECC_ERR_DEF(1, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_REPAIRABLE_HBM_UE), + // bitmap metrics NMETRIC_BITMAP_DEF(0, POST_TIME_TICK_1, NMETRIC_CW_ID_FEATURE_BITMAP, NDS_ND_COUNTER_FEATURE_BITMAP), NMETRIC_BITMAP_DEF(0, POST_TIME_TICK_1, NMETRIC_CW_ID_UNUSED, NDS_ND_COUNTER_DYNAMIC_SYSFS_METRIC_BITMAP), // const uint64 metrics NMETRIC_CONSTANT_U64(0, POST_TIME_TICK_1, NMETRIC_CW_ID_DEVICE_CLUSTER_ID, NDS_ND_COUNTER_DEVICE_CLUSTER_ID, NMETRIC_CONST_U64_FLAG_SKIP_ZERO), + NMETRIC_CONSTANT_U64(1, POST_TIME_TICK_1, NMETRIC_CW_ID_AGG_NEFF_ID, NDS_ND_COUNTER_AGG_NEFF_ID, NMETRIC_CONST_U64_FLAG_SKIP_ZERO), // driver metrics. not in datastore NMETRIC_DRIVER_DEF(NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS, POST_TIME_TICK_1, NMETRIC_CW_ID_MAX_DEVICE_RESET_TIME_MS), @@ -649,10 +656,10 @@ static inline int nmetric_post_feature_bitmap(const nmetric_def_t *metric, struc return metric_size; } -static int nmetric_post_u64(const nmetric_def_t *metric, u64 metric_value, struct nmetric_cw_metric *dest, int available_size) +static int nmetric_post_u64_fmt(const nmetric_def_t *metric, const char *format, u64 metric_value, struct nmetric_cw_metric *dest, int available_size) { // check if there is enough space in buffer - int expected_len = snprintf(NULL, 0, "%llu", metric_value); + int expected_len = snprintf(NULL, 0, format, metric_value); int metric_size = sizeof(struct nmetric_cw_metric) + expected_len; if (available_size < metric_size) { return 0; @@ -661,12 +668,12 @@ static int nmetric_post_u64(const nmetric_def_t *metric, u64 metric_value, struc // save metrics to buffer dest->id = metric->cw_id; dest->len = expected_len; - snprintf(dest->data, expected_len + 1, "%llu", metric_value); // post the as decimal not hex, as cw reads it in decimal format + snprintf(dest->data, expected_len + 1, format, metric_value); return metric_size; } -static inline int nmetric_post_constant_u64(const nmetric_def_t *metric, struct nmetric_cw_metric *dest, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, int available_size) +static inline int nmetric_post_constant_u64_fmt(const nmetric_def_t *metric, const char *format, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, struct nmetric_cw_metric *dest, int available_size) { // we have a choice of taking the metric value from previous // NDS or current NDS. 
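The formatting helper above sizes its output with a two-pass snprintf() before writing into the posting buffer. A minimal userspace sketch of that idiom follows; it is illustrative only and not part of the patch, and the value and buffer names are made up:

#include <stdio.h>

int main(void)
{
	unsigned long long value = 0x1234abcdULL;
	char buf[32];
	/* Pass 1: a NULL buffer makes snprintf() return the formatted length without writing anything. */
	int expected_len = snprintf(NULL, 0, "%llx", value);
	/* Pass 2: write the value, reserving one extra byte for the terminating NUL. */
	snprintf(buf, expected_len + 1, "%llx", value);
	printf("len=%d str=%s\n", expected_len, buf); /* prints: len=8 str=1234abcd */
	return 0;
}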
@@ -687,7 +694,17 @@ static inline int nmetric_post_constant_u64(const nmetric_def_t *metric, struct return 0; } - return nmetric_post_u64(metric, metric_value, dest, available_size); + return nmetric_post_u64_fmt(metric, format, metric_value, dest, available_size); +} + +static inline int nmetric_post_decimal_constant_u64(const nmetric_def_t *metric, struct nmetric_cw_metric *dest, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, int available_size) +{ + return nmetric_post_constant_u64_fmt(metric, "%llu", const_u64_metrics, freed_const_u64_metrics, dest, available_size); +} + +static inline int nmetric_post_hex_constant_u64(const nmetric_def_t *metric, struct nmetric_cw_metric *dest, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, int available_size) +{ + return nmetric_post_constant_u64_fmt(metric, "%llx", const_u64_metrics, freed_const_u64_metrics, dest, available_size); } // TODO: This function is a quick workaround to post and reset the driver metrics: @@ -721,7 +738,7 @@ static inline int nmetric_post_and_reset_driver_metrics(const nmetric_def_t *dri metric_value = total_time / total_count; } - return nmetric_post_u64(driver_final_metric, metric_value, dest, available_size); + return nmetric_post_u64_fmt(driver_final_metric, "%llu", metric_value, dest, available_size); } static inline int nmetric_post_driver_userver_metrics(const nmetric_def_t *metric, struct nmetric_cw_metric *dest, int available_size) @@ -754,7 +771,51 @@ static inline int nmetric_post_driver_userver_metrics(const nmetric_def_t *metri metric_value = mode; } - return nmetric_post_u64(metric, metric_value, dest, available_size); + return nmetric_post_u64_fmt(metric, "%llu", metric_value, dest, available_size); +} + +/** + * Function for updating the ECC memory error counts in the driver. Uses the same parsing logic for the ECC miscram registers as the sysfs + * module to ensure data consistency. + * + * @param metric Current metric to be posted + * @param dest The destination buffer to write the TVL metric data into + * @param available_size The remaining size in the dest buffer + * + * @return Size of the metric posting when appended to the buffer + */ +static inline int nmetric_post_driver_ecc_metrics(struct neuron_device *nd, const nmetric_def_t *metric, + struct nmetric_cw_metric *dest, int available_size) +{ + uint32_t metric_value = 0; + + // Read the current value of the hbm_err_count registers in miscram using the same function as sysfs for consistency + switch (metric->cw_id) { + case NMETRIC_CW_ID_NERR_HW_ERR_HBM_UE: + ndhal->ndhal_sysfs_metrics.nsysfsmetric_get_hbm_error_count(nd, false, &metric_value); + break; + case NMETRIC_CW_ID_NERR_HW_ERR_REPAIRABLE_HBM_UE: + ndhal->ndhal_sysfs_metrics.nsysfsmetric_get_hbm_error_count(nd, true, &metric_value); + break; + default: + pr_err_once("Unrecognized ECC Metric ID %d. Skipping parsing metric", metric->cw_id); + return 0; + break; + } + + // Subtract out previous errors during this session e.g. we get HBM UEs but do not degrade the node. Prevents double counting errors. + // In the case we detect an underflow, record the metric as 0 and set ecc_prev to the current register value. This is mostly to combat + // the case where Pacific has a bug in register writing, or resets the chip underneath us. 
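+	// For example, if ecc_prev is 3 and the register now reads 5, a delta of 2 is posted and ecc_prev advances to 5; if the register later reads back 1 (underflow), 0 is posted and ecc_prev is reset to 1.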
+ if (nd->metrics.neuron_aggregation.ecc_prev[metric->index] <= metric_value) { + metric_value -= nd->metrics.neuron_aggregation.ecc_prev[metric->index]; + nd->metrics.neuron_aggregation.ecc_prev[metric->index] += metric_value; + } else { + pr_warn_once("Integer underflow detected when parsing HBM UE metrics. Adjusting stats to avoid an overcount."); + nd->metrics.neuron_aggregation.ecc_prev[metric->index] = metric_value; + metric_value = 0; + } + + return nmetric_post_u64_fmt(metric, "%llu", metric_value, dest, available_size); } /** @@ -793,33 +854,40 @@ static void nmetric_post_metrics(struct neuron_device *nd, u64 *curr_metrics, u6 } dest = (struct nmetric_cw_metric *)&nd->metrics.posting_buffer[data_size]; switch(curr_metric->type) { - case NMETRIC_TYPE_CONSTANT: - data_size += nmetric_post_constant(curr_metric, dest, available_size); - break; - case NMETRIC_TYPE_VERSION: - data_size += nmetric_post_version(versions, curr_metric, dest, available_size); - break; - case NMETRIC_TYPE_UTILIZATION: - data_size += nmetric_post_utilization(nd, curr_metrics, prev_metrics, freed_metrics, - curr_metric, dest, available_size); - break; - case NMETRIC_TYPE_COUNTER: - case NMETRIC_TYPE_FW_IO_ERR: - data_size += nmetric_post_counter(curr_metrics, prev_metrics, freed_metrics, - curr_metric, dest, available_size); - break; - case NMETRIC_TYPE_BITMAP: - data_size += nmetric_post_feature_bitmap(curr_metric, dest, curr_feature_bitmap, freed_feature_bitmap, available_size); - break; - case NMETRIC_TYPE_CONSTANT_U64: - data_size += nmetric_post_constant_u64(curr_metric, dest, const_u64_metrics, freed_const_u64_metrics, available_size); - break; - case NMETRIC_TYPE_DRIVER_RESET: - data_size += nmetric_post_and_reset_driver_metrics(curr_metric, dest, &nd->metrics.driver_metrics, available_size); - break; - case NMETRIC_TYPE_DRIVER_USERVER: - data_size += nmetric_post_driver_userver_metrics(curr_metric, dest, available_size); - break; + case NMETRIC_TYPE_CONSTANT: + data_size += nmetric_post_constant(curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_VERSION: + data_size += nmetric_post_version(versions, curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_UTILIZATION: + data_size += nmetric_post_utilization(nd, curr_metrics, prev_metrics, freed_metrics, + curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_COUNTER: + case NMETRIC_TYPE_FW_IO_ERR: + data_size += nmetric_post_counter(curr_metrics, prev_metrics, freed_metrics, + curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_BITMAP: + data_size += nmetric_post_feature_bitmap(curr_metric, dest, curr_feature_bitmap, freed_feature_bitmap, available_size); + break; + case NMETRIC_TYPE_CONSTANT_U64: + if (curr_metric->cw_id == NMETRIC_CW_ID_AGG_NEFF_ID) { + data_size += nmetric_post_hex_constant_u64(curr_metric, dest, const_u64_metrics, freed_const_u64_metrics, available_size); + } else { + data_size += nmetric_post_decimal_constant_u64(curr_metric, dest, const_u64_metrics, freed_const_u64_metrics, available_size); + } + break; + case NMETRIC_TYPE_DRIVER_RESET: + data_size += nmetric_post_and_reset_driver_metrics(curr_metric, dest, &nd->metrics.driver_metrics, available_size); + break; + case NMETRIC_TYPE_DRIVER_USERVER: + data_size += nmetric_post_driver_userver_metrics(curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_ECC_ERR_COUNTER: + data_size += nmetric_post_driver_ecc_metrics(nd, curr_metric, dest, available_size); + break; } } @@ -996,6 +1064,7 @@ static int nmetric_thread_fn(void *arg) 
memset(nd->metrics.neuron_aggregation.prev, 0, nmetric_counters_buf_size); memset(nd->metrics.neuron_aggregation.curr, 0, nmetric_counters_buf_size); memset(nd->metrics.neuron_aggregation.freed, 0, nmetric_counters_buf_size); + memset(nd->metrics.neuron_aggregation.ecc_prev, 0, nmetric_ecc_err_buf_size); memset(component_versions, 0, nmetric_versions_buf_size); curr_feature_bitmap = 0; freed_feature_bitmap = 0; diff --git a/neuron_metrics.h b/neuron_metrics.h index 60fbb61..59585ad 100644 --- a/neuron_metrics.h +++ b/neuron_metrics.h @@ -29,6 +29,7 @@ #define NMETRIC_TYPE_DRIVER_RESET 0x6 #define NMETRIC_TYPE_DRIVER_USERVER 0x7 #define NMETRIC_TYPE_UTILIZATION 0x8 +#define NMETRIC_TYPE_ECC_ERR_COUNTER 0x9 #define NMETRIC_FLAG_VERS_ALLOW_TYPE (1) @@ -81,7 +82,10 @@ struct nmetric_driver_metrics { #define NMETRIC_BITMAP_COUNT 1 // Number of metrics of type NMETRIC_CONSTANT_U64 -#define NMETRIC_CONSTANT_U64_COUNT 1 +#define NMETRIC_CONSTANT_U64_COUNT 2 + +// Number of metrics of type NMETRIC_TYPE_ECC_ERR_COUNTER +#define NMETRIC_ECC_ERR_COUNT 3 typedef struct { u8 index; // metric specific index @@ -104,6 +108,7 @@ typedef struct { #define NMETRIC_CONSTANT_U64(idx, tick, cw_id, ds_id, flags) NMETRIC_DEF(idx, NMETRIC_TYPE_CONSTANT_U64, 1, tick, cw_id, ds_id, flags) #define NMETRIC_DRIVER_DEF(idx, tick, cw_id) NMETRIC_DEF(idx, NMETRIC_TYPE_DRIVER_RESET, 1, tick, cw_id, 0xFF, 0) #define NMETRIC_DRIVER_USERVER_DEF(idx, tick, cw_id) NMETRIC_DEF(idx, NMETRIC_TYPE_DRIVER_USERVER, 1, tick, cw_id, 0xFF, 0) +#define NMETRIC_DRIVER_ECC_ERR_DEF(idx, tick, cw_id) NMETRIC_DEF(idx, NMETRIC_TYPE_ECC_ERR_COUNTER, 1, tick, cw_id, 0xFF, 0) struct nmetric_versions { u32 version_usage_count[NEURON_METRICS_VERSION_MAX_CAPACITY]; @@ -126,6 +131,7 @@ struct nmetric_aggregation_thread { u64 curr[NMETRIC_COUNTER_COUNT]; // metrics for the current session so far u64 prev[NMETRIC_COUNTER_COUNT]; // recorded metrics from the last post u64 freed[NMETRIC_COUNTER_COUNT]; // cache holding metrics that were freed before the posting period was reached + u64 ecc_prev[NMETRIC_ECC_ERR_COUNT]; // ECC error counts up to the current post }; struct neuron_metrics { diff --git a/neuron_module.c b/neuron_module.c index 0b257b3..56713ba 100644 --- a/neuron_module.c +++ b/neuron_module.c @@ -18,13 +18,13 @@ #include "neuron_cdev.h" #include "neuron_pci.h" -MODULE_DESCRIPTION("Neuron Driver, built from SHA: 6670442319042643165ab7986e5184496ea4407c"); +MODULE_DESCRIPTION("Neuron Driver, built from SHA: 1c7ed9bd14936635773b5a01777882804ee8ea6e"); MODULE_LICENSE("GPL"); -MODULE_VERSION("2.26.5.0"); +MODULE_VERSION("2.27.4.0"); MODULE_ALIAS("pci:v00001d0fd00007064sv*sd*bc*sc*i*"); -const char driver_version[] = "2.26.5.0"; -const char driver_revision[] = "6670442319042643165ab7986e5184496ea4407c"; +const char driver_version[] = "2.27.4.0"; +const char driver_revision[] = "1c7ed9bd14936635773b5a01777882804ee8ea6e"; #ifdef CONFIG_FAULT_INJECTION diff --git a/neuron_nq.h b/neuron_nq.h index 6bfd7e3..e956a1a 100644 --- a/neuron_nq.h +++ b/neuron_nq.h @@ -10,6 +10,48 @@ #include #include "neuron_device.h" +#define NOTIFIC_NQ_SIZE 0x28 // total size of the NQ register space +#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START 0x100 +#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) + +#define NOTIFIC_NQ_BASE_ADDR_LO_RESET_VALUE 0x00000000 + +static inline void notific_write_nq_base_addr_lo(void __iomem *base, size_t index, + uint32_t value) +{ + const size_t offset = 
NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index); + + reg_write32(base + offset, value); +} + +#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START 0x104 +#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) + +#define NOTIFIC_NQ_BASE_ADDR_HI_RESET_VALUE 0x00000000 + +static inline void notific_write_nq_base_addr_hi(void __iomem *base, size_t index, + uint32_t value) +{ + const size_t offset = NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index); + + reg_write32(base + offset, value); +} + +#define NOTIFIC_NQ_F_SIZE_OFFSET_START 0x108 +#define NOTIFIC_NQ_F_SIZE_OFFSET(index) (NOTIFIC_NQ_F_SIZE_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) + +#define NOTIFIC_F_SIZE_RESET_VALUE 0x00000000 + +static inline void notific_write_nq_f_size(void __iomem *base, size_t index, + uint32_t value) +{ + const size_t offset = NOTIFIC_NQ_F_SIZE_OFFSET(index); + + reg_write32(base + offset, value); +} + +#define NOTIFIC_NQ_HEAD_OFFSET 0x10c + /** * nnq_init() - Initialize notification queue for NeuronCore * diff --git a/neuron_pci.c b/neuron_pci.c index dbb1b14..f385b3d 100644 --- a/neuron_pci.c +++ b/neuron_pci.c @@ -57,6 +57,8 @@ static atomic_t device_count = ATOMIC_INIT(0); struct neuron_device *neuron_devices[MAX_NEURON_DEVICE_COUNT] = { 0 }; int total_neuron_devices = 0; +extern unsigned int nmetric_log_posts; + extern void ndmar_preinit(struct neuron_device *nd); struct neuron_device *neuron_pci_get_device(u8 device_index) @@ -65,6 +67,45 @@ struct neuron_device *neuron_pci_get_device(u8 device_index) return neuron_devices[device_index]; } +static atomic_t dup_rid_cnt = ATOMIC_INIT(0); // count of duplicate routing IDs encountered +int neuron_pci_handle_dup_routing_id(void) { + int ret = -ENODEV; + int dup_cnt; + char cmd[256]; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); +#else + dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; +#endif + + // If this is the first dup encountered, unload the driver + if ((dup_cnt == 0) && dup_helper_enable) { + pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); + + int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); + if (n >= sizeof(cmd)) { + pr_err("unable to schedule driver unload: cmd buffer len exceeded\n"); + return -EINVAL; + } + char *argv[] = { "/bin/sh", + "-c", + cmd, + NULL}; + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL}; + + ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); + if (ret) { + pr_err("unable to schedule driver unload. 
Error: %d\n", ret); + } + } + + return ret; +} + static int neuron_pci_device_init(struct neuron_device *nd) { int i, ret; @@ -102,7 +143,7 @@ static int neuron_pci_device_init(struct neuron_device *nd) goto fail_mch; // Initialize the device mpset - memset(&nd->mpset, 0, sizeof(struct mempool_set)); + memset(&nd->mpset, 0, sizeof(struct neuron_mempool_set)); // Initialize the host portion in mpset ret = mpset_constructor(&nd->mpset, &(nd->pdev->dev), nd); @@ -113,6 +154,8 @@ static int neuron_pci_device_init(struct neuron_device *nd) for (i = 0; i < MAX_NC_PER_DEVICE; i++) mutex_init(&nd->crwl[i].lock); + nd->supports_hbm_7200 = -1; + ret = ncdev_create_device_node(nd); if (ret) { pci_info(nd->pdev, "create device node failed\n"); @@ -132,7 +175,6 @@ static int neuron_pci_device_init(struct neuron_device *nd) fail_mch: if (nd->fw_io_ctx) fw_io_destroy((struct fw_io_ctx *)nd->fw_io_ctx); - nd->fw_io_ctx = NULL; return ret; } @@ -185,12 +227,134 @@ static void neuron_pci_set_device_architecture(struct neuron_device *nd) narch_init(arch, revision); } +static bool is_valid_bar(int bar) { + return ((bar == ndhal->ndhal_pci.apb_bar) || (bar == ndhal->ndhal_pci.axi_bar) || (bar == ndhal->ndhal_pci.dram_bar)); +} + +/** + * neuron_pci_reserve_bar() - Mark the PCI region associated with PCI BAR as being reserved + * + * @param dev: PCI device whose resources are to be reserved + * @param bar: BAR to be reserved + * @param res_name: Name to be associated with resource. + * @return int: Returns 0 on success, otherwise failure + */ +static int neuron_pci_reserve_bar(struct pci_dev *dev, int bar, const char *res_name) { + int ret; + + if (!is_valid_bar(bar)) { + pci_info(dev, "invalid BAR%d\n", bar); + goto err; + } + if (bar == BAR_UNUSED) { + return 0; + } + + ret = pci_request_region(dev, bar, res_name); + if (ret) { + pci_info(dev, "BAR %d: can't reserve %s\n", bar, res_name); + goto err; + } + + return 0; + +err: + // allow failure to map on dram bar, as some setups may not support it + if (bar == ndhal->ndhal_pci.dram_bar) { + return 0; + } else { + return -ENODEV; + } +} + + /** + * neuron_pci_set_npdev() - set BAR's physical addr, io addr, and size of neuron_pci_device + * + * @param dev: PCI device that owns the BAR + * @param bar: BAR number + * @param res_name: Name associated with resource + * @param bar_pa: start physical address of BAR + * @param bar_ioaddr: __iomem address to device BAR + * @param bar_size: size of BAR + * @return int: Returns 0 on success, otherwise failure + */ +static int neuron_pci_set_npdev(struct pci_dev *dev, + int bar, + const char *res_name, + phys_addr_t *bar_pa, + void __iomem **bar_ioaddr, + u64 *bar_size) { + if (!is_valid_bar(bar)) { + pci_info(dev, "invalid BAR%d\n", bar); + return -ENODEV; + } + if (bar == BAR_UNUSED) { + return 0; + } + + if (pci_resource_len(dev, bar) == 0) { + pci_info(dev, "BAR%d len is 0\n", bar); + goto err; + } + + *bar_pa = pci_resource_start(dev, bar); + if (!(*bar_pa)) { + pci_info(dev, "Can't get start address of BAR%d %s\n", bar, res_name); + goto err; + } + *bar_size = pci_resource_len(dev, bar); + + if (bar == ndhal->ndhal_pci.dram_bar) { + ndhal->ndhal_pci.dram_bar_size = *bar_size; + } + + if (bar == ndhal->ndhal_pci.dram_bar && wc_enable) { + *bar_ioaddr = pci_iomap_wc(dev, bar, pci_resource_len(dev, bar)); + } + else { + *bar_ioaddr = pci_iomap(dev, bar, pci_resource_len(dev, bar)); + } + + return 0; + +err: + // allow failure to map on dram bar, as some setups may not support it + if (bar == ndhal->ndhal_pci.dram_bar) 
{ + *bar_pa = 0; + *bar_size = 0; + *bar_ioaddr = NULL; + return 0; + } else { + return -ENODEV; + } +} + +/** + * neuron_pci_release_bar() - Release a PCI BAR + * + * @param dev: PCI device whose resources were previously reserved by pci_request_region() + * @param bar: BAR to be released + * + */ +static int neuron_pci_release_bar(struct pci_dev *dev, int bar) { + if (!is_valid_bar(bar)) { + pci_info(dev, "invalid BAR%d\n", bar); + return -ENODEV; + } + if (bar == BAR_UNUSED) { + return 0; + } + + pci_release_region(dev, bar); + return 0; +} + static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) { int ret = 0; struct neuron_device *nd; - nd = kzalloc(sizeof(struct neuron_device), GFP_KERNEL); + nd = kvzalloc(sizeof(struct neuron_device), GFP_KERNEL); if (nd == NULL) { pci_info(dev, "Can't allocate memory for neuron_device\n"); goto fail_alloc_nd_mem; @@ -223,31 +387,31 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) } // map apb bar - ret = ndhal->ndhal_pci.neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.apb_bar, "APB"); + ret = neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.apb_bar, "APB"); if (ret) { goto fail_bar0_map; } - ret = ndhal->ndhal_pci.neuron_pci_set_npdev(dev, ndhal->ndhal_pci.apb_bar, "APB", &nd->npdev.bar0_pa, &nd->npdev.bar0, &nd->npdev.bar0_size); + ret = neuron_pci_set_npdev(dev, ndhal->ndhal_pci.apb_bar, "APB", &nd->npdev.bar0_pa, &nd->npdev.bar0, &nd->npdev.bar0_size); if (ret) { goto fail_bar0_resource; } // map bar2 - ret = ndhal->ndhal_pci.neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.axi_bar, "AXI"); + ret = neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.axi_bar, "AXI"); if (ret) { goto fail_bar2_map; } - ret = ndhal->ndhal_pci.neuron_pci_set_npdev(dev, ndhal->ndhal_pci.axi_bar, "AXI", &nd->npdev.bar2_pa, &nd->npdev.bar2, &nd->npdev.bar2_size); + ret = neuron_pci_set_npdev(dev, ndhal->ndhal_pci.axi_bar, "AXI", &nd->npdev.bar2_pa, &nd->npdev.bar2, &nd->npdev.bar2_size); if (ret) { goto fail_bar2_resource; } // map bar4 - ret = ndhal->ndhal_pci.neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.dram_bar, "BAR4"); + ret = neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.dram_bar, "BAR4"); if (ret) { goto fail_bar4_map; } - ret = ndhal->ndhal_pci.neuron_pci_set_npdev(dev, ndhal->ndhal_pci.dram_bar, "BAR4", &nd->npdev.bar4_pa, &nd->npdev.bar4, &nd->npdev.bar4_size); + ret = neuron_pci_set_npdev(dev, ndhal->ndhal_pci.dram_bar, "BAR4", &nd->npdev.bar4_pa, &nd->npdev.bar4, &nd->npdev.bar4_size); if (ret) { goto fail_bar4_resource; } @@ -290,10 +454,12 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) if (ret) goto fail_memset_mc; - // initialize metric aggregation and posting - ret = nmetric_init(nd); - if (ret) - goto fail_nmetric_resource; + if (nmetric_log_posts != 0) { + // initialize metric aggregation and posting + ret = nmetric_init(nd); + if (ret) + goto fail_nmetric_resource; + } mutex_init(&nd->memset_lock); @@ -310,19 +476,19 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) fail_nds_resource: neuron_ds_destroy(&nd->datastore); fail_bar4_resource: - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.dram_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.dram_bar); fail_bar4_map: fail_bar2_resource: - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.axi_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.axi_bar); fail_bar2_map: fail_bar0_resource: - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.apb_bar); 
+ neuron_pci_release_bar(dev, ndhal->ndhal_pci.apb_bar); fail_bar0_map: pci_disable_device(dev); fail_dhal_init: fail_enable: neuron_log_destroy( nd); - kfree(nd); + kvfree(nd); fail_alloc_nd_mem: pci_set_drvdata(dev, NULL); return ret; @@ -342,11 +508,11 @@ static void neuron_pci_remove(struct pci_dev *dev) ndhal->ndhal_ext_cleanup(); - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.apb_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.apb_bar); - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.axi_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.axi_bar); - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.dram_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.dram_bar); pci_disable_device(dev); @@ -364,7 +530,7 @@ static void neuron_pci_remove(struct pci_dev *dev) neuron_log_destroy(nd); - kfree(nd); + kvfree(nd); } static struct pci_driver neuron_pci_driver = { diff --git a/neuron_pci.h b/neuron_pci.h index de95f48..afe7afe 100644 --- a/neuron_pci.h +++ b/neuron_pci.h @@ -25,6 +25,13 @@ extern int wc_enable; */ struct neuron_device *neuron_pci_get_device(u8 device_index); +/** + * neuron_pci_handle_dup_routing_id() - Handle the case where multiple devices share the same routing id + * + * Return: 0 if successful, a negative error code otherwise. + */ +int neuron_pci_handle_dup_routing_id(void); + /** * neuron_pci_module_init() - Initialize Neuron PCI driver. * diff --git a/neuron_power.c b/neuron_power.c index 4fc0fce..58887d5 100644 --- a/neuron_power.c +++ b/neuron_power.c @@ -203,7 +203,7 @@ static void npower_calculate_stats(struct neuron_power_samples *current_samples, min_power_to_log = current_samples->max_power_bips; } if (power_enabled_in_fw) { - pr_info("Not enough data to aggregate stats. Have %u data points, min of %u max of %u, total of %llu.", + pr_debug("Not enough data to aggregate stats. 
Have %u data points, min of %u max of %u, total of %llu.", current_samples->num_data_points, min_power_to_log, current_samples->max_power_bips, current_samples->total_power_util_bips); diff --git a/neuron_reg_access.c b/neuron_reg_access.c index 7f6c98b..eceac3e 100644 --- a/neuron_reg_access.c +++ b/neuron_reg_access.c @@ -7,7 +7,7 @@ inline int reg_read32(const u32 __iomem *addr, u32 *value) { - return ndhal->ndhal_reg_access.reg_read32_array((void **)&addr, value, 1); + return ndhal->ndhal_fw_io.fw_io_read_csr_array((void **)&addr, value, 1, true); } inline void reg_write32(u32 __iomem *addr, u32 value) diff --git a/neuron_reset.c b/neuron_reset.c index 1794ba4..ff7b3a6 100644 --- a/neuron_reset.c +++ b/neuron_reset.c @@ -25,9 +25,13 @@ int no_reset = 0; module_param(no_reset, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); MODULE_PARM_DESC(no_reset, "Dont reset device"); +int reset_top_dma = 0; +module_param(reset_top_dma, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(reset_top_dma, "Reset top-level DMAs during TPB reset"); + #define NR_DEVICE_RESET_RETRY_INTERVAL 30000 // millisecond #define NR_TPB_RESET_RETRY_INTERVAL 10000 // millisecond - +#define NR_RESET_POLL_INTERVAL 100 // millisecond /** * ITER_COAL_REQS - iterate over coalesced reset requests @@ -61,7 +65,7 @@ int nr_msleep_stoppable(struct neuron_device *nd, uint32_t msec) static int nr_call_post_reset_config(struct neuron_device *nd, uint32_t nc_map, bool reset_succeeded) { if (nc_map == NEURON_NC_MAP_DEVICE) { - return ndhal->ndhal_reset.nr_post_reset_config(nd, reset_succeeded); + return ndhal->ndhal_reset.nr_post_reset_config(nd, reset_succeeded, no_reset); } return 0; } @@ -94,7 +98,7 @@ static int nr_reset_thread_fn(void *arg) if (first_request->request_id != NEURON_RESET_REQUEST_ALL && first_request->next != NULL) { ITER_COAL_REQS(request_iter, first_request, last_request, { coal_cnt++; - nc_map |= request_iter->nc_map; + nc_map |= request_iter->nc_map; }) } else { last_request = first_request; @@ -374,7 +378,7 @@ bool nr_op_in_reset_wnd(uint64_t op_start_time, struct neuron_device *nd) return false; } -int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t tpb_reset_map) +int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t tpb_reset_map_lo, uint32_t tpb_reset_map_hi) { bool is_device_reset; uint32_t reset_retry_interval; @@ -391,7 +395,7 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t start_time = ktime_get(); /* Send reset request to firmware */ - fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map); + fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map_lo, tpb_reset_map_hi); next_reset_retry_time = ktime_add_ms(start_time, reset_retry_interval); do { @@ -399,7 +403,7 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t * After reset initiation, firmware becomes unresponsive until * the device completes the reset. Wait before next polling cycle. */ - if (nr_msleep_stoppable(nd, ndhal->ndhal_reset.reset_poll_interval)) { + if (nr_msleep_stoppable(nd, NR_RESET_POLL_INTERVAL)) { return -EINTR; } @@ -421,7 +425,7 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t * If timed out, retry the reset. * This handles cases where the initial/previous reset was missed. 
*/ - fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map); + fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map_lo, tpb_reset_map_hi); next_reset_retry_time = ktime_add_ms(cur_time, reset_retry_interval); } diff --git a/neuron_reset.h b/neuron_reset.h index 12a6ebc..afac5a5 100644 --- a/neuron_reset.h +++ b/neuron_reset.h @@ -113,12 +113,13 @@ bool nr_op_in_reset_wnd(uint64_t op_start_time, struct neuron_device *nd); * * @nd: Neuron device structure * @nc_map: Neural Core map that specifies reset scope (device vs TPB level) - * @tpb_reset_map: Bitmap of TPBs to reset + * @tpb_reset_map_lo: Bitmap of TPBs/SDMA/TopSp/CC_TOP to reset (bits 0-31) + * @tpb_reset_map_hi: Bitmap of top-level H2D DMAs to reset (bits 0-3) * * @return: 0 on success, -1 on failure or interruption * */ -int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t tpb_reset_map); +int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t tpb_reset_map_lo, uint32_t tpb_reset_map_hi); /** * nr_msleep_stoppable() - Sleep until msec or reset thread is stopped diff --git a/neuron_ring.c b/neuron_ring.c index 0c8420c..280e961 100644 --- a/neuron_ring.c +++ b/neuron_ring.c @@ -31,7 +31,7 @@ module_param(dma_teardown_on_exit, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); MODULE_PARM_DESC(dma_teardown_on_exit, "Reset the DMA state on user process exit"); // forward -static void ndmar_h2t_ring_free(struct ndma_ring *ring); +static void ndmar_h2t_ring_free(struct ndma_eng *eng, struct ndma_ring *ring); struct ndma_eng *ndmar_acquire_engine(struct neuron_device *nd, u32 eng_id) { @@ -150,7 +150,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_ if (eng == NULL) return -EINVAL; - if (qid >= DMA_MAX_Q_V4) { + if (qid >= ndhal->ndhal_udma.num_queues) { ret = -EINVAL; goto done; } @@ -220,7 +220,7 @@ void ndmar_handle_process_exit(struct neuron_device *nd, pid_t pid) struct mem_chunk *mc = nd->ndma_q_dummy_mc; const int desc_count = NDMA_QUEUE_DUMMY_RING_DESC_COUNT; for (eng_id = 0; eng_id < ndhal->ndhal_address_map.seng_dma_eng_per_nd; eng_id++) { - for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + for (qid = 0; qid < ndhal->ndhal_udma.num_queues; qid++) { struct ndma_eng *eng = ndmar_acquire_engine_nl(nd, eng_id); struct ndma_queue *queue; struct ndma_ring *ring; @@ -393,12 +393,26 @@ static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id, int qid) ring->h2t_completion.ptr = h2t_completion_mc->va; ring->h2t_completion.addr = virt_to_phys(ring->h2t_completion.ptr) | ndhal->ndhal_address_map.pci_host_base; + ret = ndma_h2d_compl_queue_init(nd, &ring->dma_compl_queue); + if (ret) { + pr_err("can't initialize h2d dma completion queue\n"); + goto error; + } + + ret = ndma_ctx_queue_init(&ring->dma_ctx_queue); + if (ret) { + pr_err("can't initialize dma context queue\n"); + goto error_ctx_queue; + } + mutex_init(&ring->h2t_ring_lock); ndmar_release_engine(eng); return 0; +error_ctx_queue: + ndma_h2d_compl_queue_destroy(&ring->dma_compl_queue); error: ring->h2t_nc_id = -1; ring->tx_mc = NULL; @@ -469,7 +483,7 @@ int ndmar_h2t_ring_request(struct neuron_device *nd, int nc_id, bool h2t, int *r if (eng == NULL) return -EINVAL; - for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + for (qid = 0; qid < ndhal->ndhal_udma.num_queues; qid++) { if (ndhal->ndhal_ndmar.ndmar_is_h2t_def_q(nd, eng_id, qid)) continue; queue = ndmar_get_queue(eng, qid); @@ -491,7 +505,7 @@ int ndmar_h2t_ring_request(struct neuron_device *nd, int nc_id, bool 
h2t, int *r } ret = ndmar_h2t_ring_init(eng, qid); if (ret) { - ndmar_h2t_ring_free(ring); + ndmar_h2t_ring_free(eng, ring); pr_err("nd%d:nc%d H2T ring init for qid:%d failed - %d\n", nd->device_index, nc_id, qid, ret); ring->h2t_allocated = false; goto done; @@ -519,7 +533,7 @@ int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid) struct ndma_queue *queue; struct ndma_ring *ring; - if (qid >= DMA_MAX_Q_MAX) { + if (qid >= ndhal->ndhal_udma.num_queues) { return -EINVAL; } @@ -542,7 +556,7 @@ int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid) } if (ndmar_h2t_ring_is_h2t(ring)) { - ndmar_h2t_ring_free(ring); + ndmar_h2t_ring_free(eng, ring); } else { ndmar_h2t_ring_state_clr(ring); queue->owner = 0; @@ -768,7 +782,7 @@ int ndmar_init(struct neuron_device *nd) return ndmar_init_ncs(nd, -1); } -static void ndmar_h2t_ring_free(struct ndma_ring *ring) +static void ndmar_h2t_ring_free(struct ndma_eng *eng, struct ndma_ring *ring) { if (ring->tx_mc) { mc_free(&ring->tx_mc); @@ -790,6 +804,9 @@ static void ndmar_h2t_ring_free(struct ndma_ring *ring) ring->h2t_completion_mc = NULL; } + ndma_ctx_queue_free(eng, ring, &ring->dma_ctx_queue); + ndma_h2d_compl_queue_destroy(&ring->dma_compl_queue); + ndmar_h2t_ring_state_clr(ring); } @@ -810,13 +827,13 @@ static void ndmar_h2t_ring_free_all(struct neuron_device *nd, int nc_idx) return; } - for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + for (qid = 0; qid < ndhal->ndhal_udma.num_queues; qid++) { queue = ndmar_get_queue(eng, qid); ring = ndmar_get_ring(queue); if (ndmar_h2t_ring_is_allocated(ring) && ring->h2t_nc_id == nc_idx) { if (ndmar_h2t_ring_is_h2t(ring)) { // h2t queue free all resources - ndmar_h2t_ring_free(ring); + ndmar_h2t_ring_free(eng, ring); } else { // service queue only clear state ndmar_h2t_ring_state_clr(ring); diff --git a/neuron_ring.h b/neuron_ring.h index f031be7..c9d3462 100644 --- a/neuron_ring.h +++ b/neuron_ring.h @@ -7,6 +7,7 @@ #define NEURON_RING_H #include "udma/udma.h" +#include "share/neuron_driver_shared.h" #define DMA_H2T_DESC_COUNT 4096 #define NUM_DMA_ENG_PER_DEVICE 132 // for v2 2 nc with each 16, @@ -19,6 +20,115 @@ extern int nc_per_dev_param; struct neuron_device; struct neuron_dma_eng_state; struct neuron_dma_queue_state; +struct ndma_eng; +struct ndma_ring; + +/* + * H2D DMA Completion Queue (CQ) + * ----------------------------- + * A fixed-size circular buffer shared between driver and runtime, consisting of + * NDMA_H2D_COMPL_QUEUE_CAPACITY Completion Queue Entries (CQEs). + * + * Driver Lib mmaps both the CQ and its metadata (head/tail and capacity) to + * runtime. The driver writes a completion result at tail CQE, while the runtime + * consumes from head CQE and clears it. Each CQE contains a sequence number, + * a completion result, and an opaque context pointer. + * + * Correctness: CQ is SPMC (one driver kthread writes per ND; multiple runtime + * threads read). Runtime must lock; driver is lock-free. Both sides require + * smp_wmb()/smp_rmb() barriers. + * + * Async IO only. + * + * Also see neuron_h2d_dma_compl_queue_t and + * neuron_h2d_dma_compl_queue_entry_t in neuron_driver_shared.h. + * + */ +struct ndma_h2d_compl_queue { + uint32_t capacity_mask; // Capacity mask of the CQ (capacity - 1). + uint32_t tail; // Free-running index of the next free CQE to be written by driver. + // Internally maintained by driver. + struct mem_chunk *mc; // Memchunk for the CQ. 
+ neuron_h2d_dma_compl_queue_t *compl_queue_shared; // the CQ structure mmapped to and shared with user space. +}; + +int ndma_h2d_compl_queue_init(struct neuron_device *nd, struct ndma_h2d_compl_queue *compl_queue); +void ndma_h2d_compl_queue_destroy(struct ndma_h2d_compl_queue *compl_queue); + +/* + * H2D DMA Context Queue + * --------------------- + * A fixed-size circular buffer storing pointers to `ndma_h2t_zcdma_context`. + * The queue tracks DMA context lifecycles (submitted, pinned, or unpinned) + * and maintains four logical indices: + * + * 1. head + * - Index of the first *valid* (non-empty) entry in the queue. + * - The context at `head` may be in any ndma_zcdma_state except + * NDMA_COMPLETED. + * + * 2. tail + * - Index of the next free slot where a new context pointer will be + * inserted. + * + * 3. first_pinned_unsubmitted + * - Index of the earliest context that is pinned but not yet submitted. + * - If no such context exists, this is set to first_unpinned. + * + * 4. first_unpinned + * - Index of the earliest context that is unpinned. + * - If no such context exists, this is set to tail. + * + * Queue state conditions: + * - Empty: head == tail + * - Full: head == ((tail + 1) & (capacity - 1)) + * - Current size: (tail - head + capacity) & (capacity - 1) + * + * Example (capacity = 10): + * + * index: 0 1 2 3 4 5 6 7 8 9 + * entry: [ ] [S] [S] [S] [P] [P] [U] [U] [ ] [ ] + * ^ ^ ^ ^ + * | | | | + * H FPU FU T + * + * Legend: + * H = head + * T = tail + * FPU = first_pinned_unsubmitted + * FU = first_unpinned + * + * [C] = completed + * [S] = submitted + * [P] = pinned but unsubmitted + * [U] = unpinned + * [ ] = empty slot + * + * @entries: Array of DMA context entries in the queue + * @completion_pool: Pre-allocated pool of completion_ptr buffers + * @page_list_pool: Pre-allocated pool of page_list arrays + * @capacity_mask: Maximum number of entries the queue can hold, minus one + * (capacity = capacity_mask + 1) + * @head: Index pointing to the head of the queue + * @tail: Index pointing to the tail of the queue + * @first_pinned_unsubmitted: Index of the first pinned but not yet submitted entry + * @first_unpinned: Index of the first unpinned entry in the queue + * @nr_pinned_pages: Total count of pinned memory pages belonging to the queue + */ +struct ndma_ctx_queue { + struct ndma_h2t_zcdma_context *entries; + void *completion_pool; + void *page_list_pool; + u32 capacity_mask; + u32 head; + u32 tail; + u32 first_pinned_unsubmitted; + u32 first_unpinned; + u64 nr_pinned_pages; +}; + +int ndma_ctx_queue_init(struct ndma_ctx_queue *queue); +void ndma_ctx_queue_free(struct ndma_eng *eng, struct ndma_ring *ring, struct ndma_ctx_queue *queue); /* * dma context for both sync and async DMA operations @@ -49,6 +159,8 @@ struct ndma_ring { struct udma_ring_ptr h2t_completion; // TODO why are we using udma_ring_ptr... 
struct mem_chunk *h2t_completion_mc; struct ndma_h2t_dma_context h2t_dma_ctx[NEURON_DMA_H2T_CTX_HANDLE_CNT]; + struct ndma_ctx_queue dma_ctx_queue; + struct ndma_h2d_compl_queue dma_compl_queue; u32 h2t_nc_id; bool h2t_allocated; // ring can be allocated for standard use or h2t u32 qid; diff --git a/neuron_sysfs_metrics.c b/neuron_sysfs_metrics.c index 2fd72b3..fd71ae0 100644 --- a/neuron_sysfs_metrics.c +++ b/neuron_sysfs_metrics.c @@ -389,7 +389,7 @@ static ssize_t nsysfsmetric_show_nrt_other_metrics(struct nsysfsmetric_metrics * char buffer[256]; int ret = ndhal->ndhal_tpb.pe_format_activity_stats(nd, attr->nc_id, buffer, sizeof(buffer)); if (ret) { - pr_err("sysfs failed to read pe_array activity counters, error = %d\n", ret); + pr_err_ratelimited("sysfs failed to read pe_array activity counters, error = %d\n", ret); } len = nsysfsmetric_sysfs_emit(buf, "%s", buffer); } else { @@ -942,7 +942,6 @@ int nsysfsmetric_register(struct neuron_device *nd, struct kobject *neuron_devic } // neuron{0, 1, ...}/stats/power - pr_info("Installing neuron power sysfs node\n"); struct nsysfsmetric_node *power_node = nsysfsmetric_init_and_add_one_node(metrics, stats_node, "power", false, -1, power_utilization_attrs_info_tbl_cnt, diff --git a/share/neuron_driver_shared.h b/share/neuron_driver_shared.h index b1f716c..030c19f 100644 --- a/share/neuron_driver_shared.h +++ b/share/neuron_driver_shared.h @@ -210,9 +210,25 @@ typedef struct neuron_memcpy_batch { void *context; // [in] TBD. opaque context pointer passed back in completion queue } neuron_memcpy_batch_t; +/* H2D Completion Queue Entry (CQE) */ +typedef struct neuron_h2d_dma_compl_queue_entry { + __u64 sequence_num; // Sequence number for the submitted IO request from runtime (0 means empty slot). + __s64 compl_ret; // Completion status for the request (0 success; negative errno on failure; positive to be used for future). + void *context; // Opaque context pointer copied from submission and represents a pointer to xu_error_list_t in runtime. +} neuron_h2d_dma_compl_queue_entry_t; + +/* H2D DMA Completion Queue (CQ) */ +typedef struct neuron_h2d_dma_compl_queue { + __u32 capacity; // Capacity of the completion queue (number of CQEs). + __u32 head; // Free-running index of the next CQE to be consumed by runtime. + __u32 tail; // Free-running index of the next free CQE to be written by driver. + // CQEs are laid out immediately after the header in the same mmap region. + neuron_h2d_dma_compl_queue_entry_t entries[]; // offset to the CQE array of the completion queue. +} neuron_h2d_dma_compl_queue_t; + /* * Memory allocation categories for sysfs counters -*/ + */ typedef enum { NEURON_MEMALLOC_TYPE_UNKNOWN_HOST, // only for old runtimes, do not use elsewhere NEURON_MEMALLOC_TYPE_CODE_HOST, @@ -244,16 +260,18 @@ typedef enum { /* * NDS stats * Note: - * To add a new counter type inside the enum, - * 1. you need to manually decrease NDS_ND_COUNTER_RESERVED or NDS_NC_COUNTER_RESERVED by 1 - * 2. you need to update NDS_ND_COUNTER_COUNT or NDS_NC_COUNTER_COUNT - * To prevent compatability issues, you need to always append the new counter type to the end of the enum + * To add a new counter type inside the enum, you need to manually + * decrease NDS_ND_COUNTER_RESERVED or NDS_EXT_NC_COUNTER_ADDED_RESERVED by 1. 
+ * + * To prevent compatibility issues, you need to always append the new counter type + * to the end of the enum, before NDS_ND_COUNTER_LAST or NDS_EXT_NC_COUNTER_LAST */ -#define NDS_ND_COUNTER_RESERVED 18 +#define NDS_ND_COUNTER_RESERVED 17 // Device counter types enum { - NDS_ND_COUNTER_RUNTIME_VERSION, + NDS_ND_COUNTER_START = 0, + NDS_ND_COUNTER_RUNTIME_VERSION = NDS_ND_COUNTER_START, NDS_ND_COUNTER_FRAMEWORK_VERSION, NDS_ND_COUNTER_FAL_VERSION, NDS_ND_COUNTER_FEATURE_BITMAP, @@ -270,8 +288,10 @@ enum { NDS_ND_COUNTER_DYNAMIC_SYSFS_METRIC_BITMAP, NDS_ND_COUNTER_DEVICE_CLUSTER_ID, + NDS_ND_COUNTER_AGG_NEFF_ID, + NDS_ND_COUNTER_LAST, - NDS_ND_COUNTER_COUNT = NDS_ND_COUNTER_DEVICE_CLUSTER_ID + NDS_ND_COUNTER_RESERVED + 1 + NDS_ND_COUNTER_COUNT = NDS_ND_COUNTER_LAST + NDS_ND_COUNTER_RESERVED }; #define NDS_NC_COUNTER_RESERVED 0 @@ -329,8 +349,9 @@ enum { NDS_NC_COUNTER_MAC_COUNT, NDS_NC_COUNTER_OOB, + NDS_NC_COUNTER_LAST, - NDS_NC_COUNTER_COUNT = NDS_NC_COUNTER_OOB + NDS_NC_COUNTER_RESERVED + 1 + NDS_NC_COUNTER_COUNT = NDS_NC_COUNTER_LAST + NDS_NC_COUNTER_RESERVED }; #define NDS_MAX_NEURONCORE_COUNT (4) diff --git a/udma/udma.h b/udma/udma.h index 484b82e..51400b7 100644 --- a/udma/udma.h +++ b/udma/udma.h @@ -212,6 +212,28 @@ enum { */ int udma_init(struct udma *udma, struct udma_params *udma_params); +/** + * udma_set_defaults() - set default configuration of one DMA engine + * + * @udma: udma structure to be initialized + * + * Return: 0 on success, a negative error code otherwise. + */ +int udma_set_defaults(struct udma *udma); + +/** + * udma_cache_defaults() - Cache frequently used CSR values. + * + * CSR reads are very slow and only one application (neuron) is using the DMA. + * So instead of reading CSRs, use the hardware reset value (from the datasheet) as + * the default value. + * + * @udma: udma structure + * + * Return: 0 on success, a negative error code otherwise. + */ +int udma_cache_defaults(struct udma *udma); + /** * udma_q_init() - Initialize the udma queue. * @@ -227,6 +249,15 @@ int udma_init(struct udma *udma, struct udma_params *udma_params); */ int udma_q_init(struct udma *udma, u32 qid, struct udma_q_params *q_params); +/** + * udma_q_enable() - Enables a udma queue + * + * @udma_q: udma queue data structure + * @enable: flag to enable/disable + * + */ +void udma_q_enable(struct udma_q *udma_q, int enable); + /** * udma_q_pause() - Pauses a udma queue * @@ -290,7 +321,7 @@ void udma_m2m_mask_ring_id_error(struct udma *udma, void __iomem *intc_base); * @udma: udma data structure * @state: new state to set * -* Return: 0 on success, a negative error code otherwise. + * Return: 0 on success, a negative error code otherwise. */ int udma_state_set(struct udma *udma, enum udma_state state); @@ -303,6 +334,17 @@ int udma_state_set(struct udma *udma, enum udma_state state); */ enum udma_state udma_state_get(struct udma *udma, enum udma_type type); + +/** + * udma_set_max_descs_and_prefetch() - set the maximum number of descriptors per DMA packet + * + * @udma: udma handle + * @max_descs: max desc per packet + * + * Return: 0 on success, a negative error code otherwise. + */ +int udma_set_max_descs_and_prefetch(struct udma *udma, u8 max_descs); + /** * udma_available_get() - Get number of descriptors that can be submitted to the udma. 
* diff --git a/udma/udma_m2m.c b/udma/udma_m2m.c index 37372d5..c79fd9a 100644 --- a/udma/udma_m2m.c +++ b/udma/udma_m2m.c @@ -9,6 +9,7 @@ #include "udma.h" #include "../neuron_arch.h" +#include "../neuron_dhal.h" /* Note on terminology: * for historical reasons the code uses both m2s/s2m and Tx/Rx terminology @@ -99,7 +100,7 @@ static void sdma_m2s_set_write_barrier(uint32_t *meta_ctrl) } /* set maximum number descriptors per one DMA packet */ -static int udma_set_max_descs_and_prefetch(struct udma *udma, u8 max_descs) +int udma_set_max_descs_and_prefetch(struct udma *udma, u8 max_descs) { // Due to DGE bug on V3 (https://tiny.amazon.com/tfw2hept) // Min burst must equal Max burst, which is 8 @@ -467,7 +468,7 @@ void udma_m2m_set_axi_error_abort(struct udma *udma) // step 2: program axi error control for (i = 0; i < 6; i++) { - for (q = 0; q < DMA_MAX_Q_MAX; q++) { + for (q = 0; q < ndhal->ndhal_udma.num_queues; q++) { reg_write32(&gen_regs->axi_error_control[i].table_addr, (q << 3) | 0x7); reg_write32(&gen_regs->axi_error_control[i].table_data, 0x10); } diff --git a/udma/udma_main.c b/udma/udma_main.c index 7390c6a..4147289 100644 --- a/udma/udma_main.c +++ b/udma/udma_main.c @@ -64,7 +64,7 @@ static int udma_m2s_packet_size_cfg_set(struct udma *udma, struct udma_m2s_pkt_l #define UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY (1 << 16) /* set default configuration of one DMA engine */ -static int udma_set_defaults(struct udma *udma) +int udma_set_defaults(struct udma *udma) { int ret = 0; struct udma_gen_ex_regs __iomem *gen_ex_regs; @@ -116,7 +116,7 @@ static int udma_set_defaults(struct udma *udma) /* Set addr_hi selectors */ gen_ex_regs = (struct udma_gen_ex_regs __iomem *)udma->gen_ex_regs; - for (i = 0; i < DMA_MAX_Q_V4; i++) + for (i = 0; i < ndhal->ndhal_udma.num_queues; i++) reg_write32(&gen_ex_regs->vmpr_v4[i].tx_sel, 0xffffffff); /* Set M2S data read master configuration */ @@ -128,7 +128,7 @@ static int udma_set_defaults(struct udma *udma) /* Set addr_hi selectors */ gen_ex_regs = (struct udma_gen_ex_regs __iomem *)udma->gen_ex_regs; - for (i = 0; i < DMA_MAX_Q_V4; i++) { + for (i = 0; i < ndhal->ndhal_udma.num_queues; i++) { reg_write32(&gen_ex_regs->vmpr_v4[i].rx_sel[0], 0xffffffff); reg_write32(&gen_ex_regs->vmpr_v4[i].rx_sel[1], 0xffffffff); reg_write32(&gen_ex_regs->vmpr_v4[i].rx_sel[2], 0xffffffff); @@ -181,10 +181,10 @@ static int udma_set_defaults(struct udma *udma) * So instead of reading CSR use hardware reset value(from datasheet) as * default value. 
*/ -static int udma_cache_defaults(struct udma *udma) +int udma_cache_defaults(struct udma *udma) { int i; - for (i = 0; i < DMA_MAX_Q_V4; i++) { + for (i = 0; i < ndhal->ndhal_udma.num_queues; i++) { struct udma_q *q = &udma->udma_q_m2s[i]; q->cfg = M2S_CFG_RESET_VALUE; q->rlimit_mask = M2S_RATE_LIMIT_RESET_VALUE; @@ -259,7 +259,7 @@ static int udma_q_set_pointers(struct udma_q *udma_q) /** enable/disable udma queue */ -static void udma_q_enable(struct udma_q *udma_q, int enable) +void udma_q_enable(struct udma_q *udma_q, int enable) { u32 reg; diff --git a/udma/udma_regs.h b/udma/udma_regs.h index f20ac3f..a24d589 100644 --- a/udma/udma_regs.h +++ b/udma/udma_regs.h @@ -92,7 +92,9 @@ struct udma_m2s_feature { struct udma_m2s_q { /* [0x0] M2S descriptor prefetch configuration */ u32 desc_pref_cfg; - u32 reserved0[7]; + /* [0x4] M2S descriptor prefetch configuration 2 */ + u32 desc_pref_cfg2; + u32 reserved0[6]; /* [0x20] M2S descriptor ring configuration */ u32 cfg; /* [0x24] M2S descriptor ring status and information */ @@ -345,7 +347,11 @@ struct udma_s2m_comp { }; struct udma_s2m_q { - u32 reserved0[8]; + /* [0x0] S2M descriptor prefetch configuration */ + u32 desc_pref_cfg; + /* [0x4] S2M descriptor prefetch configuration 2 */ + u32 desc_pref_cfg2; + u32 reserved0[6]; /* [0x20] S2M Descriptor ring configuration */ u32 cfg; /* [0x24] S2M Descriptor ring status and information */ diff --git a/v2/address_map.h b/v2/address_map.h index b0309a2..5607b43 100644 --- a/v2/address_map.h +++ b/v2/address_map.h @@ -17,7 +17,6 @@ #define V2_PCIE_ALL_RT_MASK 0x01f00000000000ull // relative to nc -#define V2_MMAP_P_OFFSET 0x00000000000000ull #define V2_MMAP_NC_EVENT_OFFSET 0x00000002700000ull #define V2_MMAP_NC_SEMA_READ_OFFSET V2_MMAP_NC_EVENT_OFFSET + 0x00000000001000ull #define V2_MMAP_NC_SEMA_SET_OFFSET V2_MMAP_NC_EVENT_OFFSET + 0x00000000001400ull @@ -35,8 +34,6 @@ // relative to V2 address space #define V2_APB_MISC_RAM_OFFSET 0x000ffff0fa0000ull -#define V2_MMAP_NC_SIZE 0x00000004000000ull - // Number of dice per chip #define V2_NUM_DIE_PER_DEVICE 1 diff --git a/v2/neuron_dhal_v2.c b/v2/neuron_dhal_v2.c index d196952..5fe4e61 100644 --- a/v2/neuron_dhal_v2.c +++ b/v2/neuron_dhal_v2.c @@ -26,7 +26,6 @@ extern int dev_nc_map; #define NR_RESET_RETRY_SLEEP_MS 100 #define V2_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS (1000 * 120) -#define V2_NR_RESET_POLL_INTERVAL 100 struct neuron_dm_special_mmap_ent dm_mmap_special_v2[] = { DM_SPECIAL_MM_ENT( NEURON_DM_BLOCK_TPB, 0, NEURON_DM_RESOURCE_SEMAPHORE, V2_MMAP_TPB_OFFSET, V2_PCIE_BAR0_TPB_0_OFFSET, V2_MMAP_TPB_SIZE, V2_MMAP_NC_EVENT_OFFSET, V2_MMAP_NC_SEMA_SIZE, 0), @@ -149,7 +148,7 @@ static int nr_initiate_reset_v2(struct neuron_device *nd, uint32_t nc_map) uint32_t tpb_reset_map = 0; nr_get_tpb_reset_map(nc_map, &tpb_reset_map); - int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map); + int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map, 0); if (ret) { return ret; } @@ -168,7 +167,7 @@ static int nr_initiate_reset_v2_qemu(struct neuron_device *nd, uint32_t nc_map) uint32_t tpb_reset_map = 0; nr_get_tpb_reset_map(nc_map, &tpb_reset_map); - int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map); + int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map, 0); if (ret) { return ret; } @@ -240,8 +239,9 @@ static int nr_wait_for_reset_completion_v2_emu(struct neuron_device *nd) * * @param nd - Neuron device which will be reset by the thread. 
*/ -static int nr_post_reset_config_v2(struct neuron_device *nd, bool reset_successful) +static int nr_post_reset_config_v2(struct neuron_device *nd, bool reset_successful, bool is_no_reset) { + nd->supports_hbm_7200 = 0; return 0; } @@ -367,29 +367,35 @@ static void ts_nq_destroy_one_v2(struct neuron_device *nd, u8 ts_id) /* Neuron Core Functions */ /** * nc_get_semaphore_base() - get semaphore base address - * + * * @param nd - neuron device * @param nc_id - neuron core index - * @return void* - semaphore base address + * @param sem_base - resulting semaphore base address + * + * Return: 0 on success, a negative error code otherwise. */ -static void *nc_get_semaphore_base_v2(struct neuron_device *nd, u8 nc_id) +static int nc_get_semaphore_base_v2(struct neuron_device *nd, u8 nc_id, void **sem_base) { - return nd->npdev.bar0 + V2_PCIE_BAR0_TPB_0_OFFSET + (V2_PCIE_BAR0_TPB_0_SIZE * nc_id); + (*sem_base) = nd->npdev.bar0 + V2_PCIE_BAR0_TPB_0_OFFSET + (V2_PCIE_BAR0_TPB_0_SIZE * nc_id); + return 0; } /** * nc_get_event_addr() - get event address - * + * * @param nd - neuron device * @param nc_id - neuron core index * @param event_index - event index - * @return void* - event address + * @param ev_addr - resulting event address + * + * Return: 0 on success, a negative error code otherwise. */ -static void *nc_get_event_addr_v2(struct neuron_device *nd, u8 nc_id, u16 event_index) +static int nc_get_event_addr_v2(struct neuron_device *nd, u8 nc_id, u16 event_index, void **ev_addr) { void *base = nd->npdev.bar0 + V2_PCIE_BAR0_TPB_0_OFFSET + (V2_PCIE_BAR0_TPB_0_SIZE * nc_id) + ndhal->ndhal_address_map.mmap_nc_event_offset; - return (base + (event_index * NC_EVENT_SIZE)); + (*ev_addr) = (base + (event_index * NC_EVENT_SIZE)); + return 0; } @@ -447,7 +453,7 @@ static void nnq_set_hwaddr_v2(struct neuron_device *nd, u8 nc_id, u8 index, u32 * @param device_dram_addr: DRAM Channel 0 and 1's addresses * @param device_dram_size: DRAM Channel 0 and 1's sizes */ -static void mpset_set_dram_and_mpset_info_v2(struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +static void mpset_set_dram_and_mpset_info_v2(struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) { mpset->num_channels = V2_MAX_DRAM_CHANNELS; mpset->mp_device_num_regions = 1; @@ -459,60 +465,6 @@ static void mpset_set_dram_and_mpset_info_v2(struct mempool_set *mpset, u64 *dev ndhal->ndhal_mpset.device_dram_end_addr[1] = device_dram_addr[1] + device_dram_size[1]; } -// Upper 16MB is used internally by the firmware, don't use it in the allocation pool -#define MEMPOOL_CARVEOUT_SIZE 0x1000000 // 16MB -/** - * mpset_block_carveout_regions() - * - in v2, block carve out regions: Upper 16 MB is used internally by firmware - * - * @param nd: neuron device - * @param mpset: pointer to mpset - * @param device_dram_addr: DRAM Channel 0's and 1's addresses - * @param device_dram_size: DRAM Channel 0's and 1's sizes - * @param region_sz: region size - * @return int: 0 on success, o/w on failure - */ -static int mpset_block_carveout_regions_v2(struct neuron_device *nd, struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) -{ - int ret; - u64 region_sz; - int channel = 0, region = 0; - - /* - * Block carve out regions: Upper 16 MB is used internally by firmware for trainuim - * - * Ideally we would carve out by simply changing the start address of the chunk; - * however, that breaks aligned allocation in 4.x kernel versions (fixed in 5.x). 
- * Fix here: - * commit 52fbf1134d479234d7e64ba9dcbaea23405f229e - * Author: Alexey Skidanov - * Date: Thu Jan 3 15:26:44 2019 -0800 - * - * lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk - */ - for (channel = 0; channel < mpset->num_channels; channel++) { - region_sz = device_dram_size[channel] / mpset->mp_device_num_regions; - for (region = 0; region < mpset->mp_device_num_regions; region++) { - const dma_addr_t start_addr = device_dram_addr[channel] + (region * region_sz); - struct mem_chunk *mc = NULL; - u32 nc_id = channel; - ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, MEMPOOL_CARVEOUT_SIZE, 0, MEM_LOC_DEVICE, channel, region, nc_id, NEURON_MEMALLOC_TYPE_NCDEV_DEVICE, &mc); - if (ret) { - pr_err("failed to allocate hbm carveout region: ret=%d\n", ret); - return -ENOMEM; - } - if (mc->pa != start_addr) { - pr_err("carve out mc not offset 0!"); - mc_free(&mc); - return -EINVAL; - } - } - ndhal->ndhal_mpset.device_dram_effective_base_addr[channel] = device_dram_addr[channel] + MEMPOOL_CARVEOUT_SIZE; - } - - return 0; -} - /* DMA Ring Functions */ /** @@ -619,26 +571,6 @@ static int ndmar_quiesce_queues_v2(struct neuron_device *nd, u32 nc_id, u32 engi return 0; } -/** ndmar_set_model_started() - * - * Checks to see if the pa belongs to PE IRAM FIFO offset. If so, then these - * descs are used to load the iram. The mem chunk is going to have all the descriptors - * to load the instructions in iram. So go through all the dma queues and check if this mem chunk is - * in that queue. Once we have the queue we set that queue to have descs - * for iram. The actual copy start of the queue would come when model is started and at that time - * set the state of model start for this nc. - * - * @nd: Neuron device which contains the DMA engine - * @pa: pa to check - * @mc: mem chunk that has descs - * - * Return: None - */ -static void ndmar_set_model_started_v2(struct neuron_device *nd, phys_addr_t pa, struct mem_chunk *mc) -{ - return; -} - /* FWIO Functions */ const int trn1_32xl_neigbor_ids[16][4] = { @@ -766,6 +698,11 @@ static int fw_io_read_csr_array_v2(void **ptrs, u32 *values, u32 num_csrs, bool if (num_csrs > FW_IO_MAX_READLESS_READ_REGISTER_COUNT) return -EINVAL; + // Force virtual platforms onto the direct path + if (narch_is_qemu() || narch_is_emu()) { + fw_io_read_csr_array_direct(ptrs, values, num_csrs, operational); + } + return fw_io_read_csr_array_direct(ptrs, values, num_csrs, operational); } @@ -803,37 +740,6 @@ static int fw_io_post_metric_v2(struct fw_io_ctx *ctx, u8 *data, u32 size) } -/* Register Access (read and write) Functions */ -/** - * reg_read32_array() - read an array of 32bit registers. - * - * @addr: register address. - * @value: read value would be stored here. - * @num_values: num values to read - * - * Return: 0 if read succeeds, a negative error code otherwise. 
- */ -inline int reg_read32_array_v2(void **addr, u32 *value, u32 num_values) -{ - int ret; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(addr, value, num_values, true); - if (ret != 0) { - pr_err("register read failure while reading %p\n", addr[0]); - dump_stack(); - } - return ret; -} - -inline int reg_read32_array_v2_qemu_emu(void **addr, u32 *value, u32 num_values) -{ - int i; - for (i = 0; i < num_values; i++) { - value[i] = readl(addr[i]); - } - return 0; -} - - /* Memory Map Functions */ /** * mmap_get_bar4_offset() - calculate the offset of BAR4 @@ -933,146 +839,6 @@ static int nsysfsmetric_add_tensor_engine_node_v2(struct nsysfsmetric_metrics *m /* PCI Functions */ -/** - * neuron_pci_release_bar() - Release a PCI BAR - * - * @param dev: PCI device whose resources were previously reserved by pci_request_region() - * @param bar: BAR to be reserved - * - * for V2, this function is dummy - */ -static int neuron_pci_release_bar_v2(struct pci_dev *dev, int bar) -{ - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - pci_release_region(dev, bar); - return 0; -} - -/** - * neuron_pci_reserve_bar() - Mark the PCI region associated with PCI BAR as being reserved - * - * @param dev: PCI device whose resources are to be reserved - * @param bar: BAR to be reserved - * @param res_name: Name to be associated with resource. - * @return int: Returns 0 on success, otherwise failure - */ -static int neuron_pci_reserve_bar_v2(struct pci_dev *dev, int bar, const char *res_name) -{ - int ret; - - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - ret = pci_request_region(dev, bar, res_name); - if (ret) { - pci_info(dev, "BAR %d: can't reserve %s\n", bar, res_name); - return -ENODEV; - } - - return 0; -} - - /** - * neuron_pci_set_npdev() - set BAR's physical addr, io addr, and size of neuron_pci_device - * - * @param dev: PCI device that owns the BAR - * @param bar: BAR number - * @param res_name: Name associated with resource - * @param bar_pa: start physical address of BAR - * @param bar_ioaddr: __iomem address to device BAR - * @param bar_size: size of BAR - * @return int: Returns 0 on success, otherwise failure - */ -static int neuron_pci_set_npdev_v2(struct pci_dev *dev, - int bar, - const char *res_name, - phys_addr_t *bar_pa, - void __iomem **bar_ioaddr, - u64 *bar_size) -{ - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - if (pci_resource_len(dev, bar) == 0) { - pci_info(dev, "BAR%d len is 0\n", bar); - return -ENODEV; - } - - *bar_pa = pci_resource_start(dev, bar); - if (!(*bar_pa)) { - pci_info(dev, "Can't get start address of BAR%d %s\n", bar, res_name); - return -ENODEV; - } - *bar_size = pci_resource_len(dev, bar); - - if (bar == ndhal->ndhal_pci.dram_bar) { - ndhal->ndhal_pci.dram_bar_size = *bar_size; - } - - if (bar == ndhal->ndhal_pci.dram_bar && wc_enable) - *bar_ioaddr = pci_iomap_wc(dev, bar, pci_resource_len(dev, bar)); - else - *bar_ioaddr = pci_iomap(dev, bar, pci_resource_len(dev, bar)); - - return 0; -} - -extern int dup_helper_enable; -static atomic_t dup_rid_cnt = 
ATOMIC_INIT(0); // count of duplicate routing IDs encountered -static int neuron_pci_handle_dup_routing_id(void) -{ - int ret = -ENODEV; - int dup_cnt; - char cmd[256]; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); -#else - dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; -#endif - - // If this is the first dup encounted, unload the driver - if ((dup_cnt == 0) && dup_helper_enable) { - pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); - - int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); - if (n > sizeof(cmd)) { - pr_err("unable to schedule driver unload cmd buffer len exceeded\n"); - return -EINVAL; - } - char *argv[] = { "/bin/sh", - "-c", - cmd, - NULL}; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL}; - - ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); - if (ret) - pr_err("unable to schedule driver unload. Error: %d\n", ret); - } - - return ret; -} // for V2 rename Neuron devices for better customer experience. // see internal documentation: TRN1-Discovery @@ -1202,31 +968,6 @@ static void ncdev_compatible_version_v2(struct neuron_ioctl_compatible_version * arg->max = V2_RT_MAX_COMPATIBLE_VERSION; } -/** - * ncdev_quiesce_exec_on_proc_exit() - * - * Note: - * When a process is killed, the driver resets DMA but there is no - * way to soft reset neuron cores. This causes problem if the - * process was executing serial TPB or switching activation tables, - * which result in abrubtly stopping DMA engines hence engines are - * are blocked on semaphores. This results in next model - * load failure or inference timeout. - * - * Proper way is clearing out semaphore, events after resetting - * DMA engines. However, it is a lot of code change, hence - * adding a sleep for 1 second when process exits, which allows - * the NeuronCore to continue to execute for a second. Since - * no new inference can be submitted during this time, NeuronCore - * state would be cleared out. 
- * - */ -static void ncdev_quiesce_exec_on_proc_exit_v2(void) -{ - // for V2, the 1 second DMA queisce delay in flush was eliminated to improve nrt_init performance - return; -} - static void ncdev_get_default_tpbs_for_hbm_v2(u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count) { tpbs[0] = hbm_index; @@ -1257,18 +998,6 @@ static void ndma_get_wait_for_completion_time_v2(u32 count, bool async, u64 *fir *following_wait_time *= 100; } -static void ndma_get_wait_for_completion_time_v2_qemu(u32 count, bool async, u64 *first_wait_time, u64 *following_wait_time) -{ - ndma_get_wait_for_completion_time_v2(count, async, first_wait_time, following_wait_time); - *following_wait_time *= 10 * 1000; -} - -static void ndma_get_wait_for_completion_time_v2_emu(u32 count, bool async, u64 *first_wait_time, u64 *following_wait_time) -{ - ndma_get_wait_for_completion_time_v2(count, async, first_wait_time, following_wait_time); - *following_wait_time *= 100 * 1000; -} - /** * ndma_validate_pa() - check the validity of the desc physical addresses * west side: PCIEX4_1_BASE: 0x00c00000000000 host: PCIEX8_0_BASE: 0x00400000000000 @@ -1537,6 +1266,27 @@ static int perf_set_profile_v2(struct neuron_device *nd, uint32_t profile) return 0; } +static int perf_get_profile_v2(struct neuron_device *nd, uint32_t *profile) +{ + // NOP implementation for v2 - return default profile value 0 + if (!profile) { + return -EINVAL; + } + *profile = 0; + return 0; +} + +static int perf_get_supported_profiles_v2(struct neuron_device *nd, u16 feature, u8 *num_profiles, u8 out_bitmap[32]) +{ + *num_profiles = 0; + return 0; +} + +static void perf_update_hbm_7200_supported_v2(struct neuron_device *nd) { + nd->supports_hbm_7200 = 0; + return; +} + /** * npe_class_node_id_show_data() - return sysfs class node_id * @@ -1633,7 +1383,6 @@ int ndhal_register_funcs_v2(void) { } ndhal->ndhal_address_map.pci_host_base = V2_PCIE_A0_BASE; - ndhal->ndhal_address_map.mmap_p_offset = V2_MMAP_P_OFFSET; ndhal->ndhal_address_map.mmap_nc_event_offset = V2_MMAP_NC_EVENT_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_read_offset = V2_MMAP_NC_SEMA_READ_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_set_offset = V2_MMAP_NC_SEMA_SET_OFFSET; @@ -1641,7 +1390,6 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.mmap_nc_sema_decr_offset = V2_MMAP_NC_SEMA_DECR_OFFSET; ndhal->ndhal_address_map.bar0_misc_ram_offset = V2_MMAP_BAR0_APB_MISC_RAM_OFFSET; ndhal->ndhal_address_map.port_1_base = 0ull; - ndhal->ndhal_address_map.mmap_nc_size = V2_MMAP_NC_SIZE; ndhal->ndhal_address_map.nc_per_device = V2_NC_PER_DEVICE; ndhal->ndhal_address_map.dev_nc_map = (1 << V2_NC_PER_DEVICE) - 1; ndhal->ndhal_address_map.dice_per_device = V2_NUM_DIE_PER_DEVICE; @@ -1650,7 +1398,6 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.ts_per_device = V2_TS_PER_DEVICE; ndhal->ndhal_address_map.dma_eng_per_nc = V2_DMA_ENG_PER_NC; ndhal->ndhal_address_map.dram_channels = V2_MAX_DRAM_CHANNELS; - ndhal->ndhal_reset.reset_poll_interval = V2_NR_RESET_POLL_INTERVAL; ndhal->ndhal_reset.initiate_max_wait_time = V2_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS; ndhal->ndhal_reset.retry_count = NR_RESET_RETRY_COUNT; ndhal->ndhal_reset.nr_post_reset_config = nr_post_reset_config_v2; @@ -1665,14 +1412,12 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_mpset.mp_min_alloc_size = (mempool_min_alloc_size < 1024) ? 
1024 : mempool_min_alloc_size; // v2 has a bigger mem size and gen pool create fails if < 1024 ndhal->ndhal_mpset.small_pool_supported = true; ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v2; - ndhal->ndhal_mpset.mpset_block_carveout_regions = mpset_block_carveout_regions_v2; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v2; ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v2; ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v2; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v2; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v2; ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v2; - ndhal->ndhal_ndmar.ndmar_set_model_started = ndmar_set_model_started_v2; ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v2; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v2; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v2; @@ -1687,17 +1432,14 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_sysfs_metrics.nsysfsmetric_add_tensor_engine_node = nsysfsmetric_add_tensor_engine_node_v2; ndhal->ndhal_pci.axi_bar = BAR_UNUSED; ndhal->ndhal_pci.dram_bar = 4; - ndhal->ndhal_pci.neuron_pci_release_bar = neuron_pci_release_bar_v2; - ndhal->ndhal_pci.neuron_pci_reserve_bar = neuron_pci_reserve_bar_v2; - ndhal->ndhal_pci.neuron_pci_set_npdev = neuron_pci_set_npdev_v2; ndhal->ndhal_pci.neuron_pci_get_device_id = neuron_pci_get_device_id_v2; ndhal->ndhal_pci.neuron_pci_device_id_to_rid_map = neuron_pci_device_id_to_rid_map_v2; ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v2; ndhal->ndhal_cdev.ncdev_bar0_write_blocked_addrs = ncdev_bar0_write_blocked_addrs_v2; ndhal->ndhal_cdev.ncdev_compatible_version = ncdev_compatible_version_v2; - ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit = ncdev_quiesce_exec_on_proc_exit_v2; ndhal->ndhal_cdev.ncdev_logical_to_physical_nc_map = NULL; ndhal->ndhal_cdev.ncdev_get_default_tpbs_for_hbm = ncdev_get_default_tpbs_for_hbm_v2; + ndhal->ndhal_udma.num_queues = DMA_MAX_Q_V4; ndhal->ndhal_udma.num_beats = 1024; // >= UDMA_REV_ID_4 ndhal->ndhal_ndma.ndma_retry_memcpy = true; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v2; @@ -1714,6 +1456,9 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_npe.npe_class_server_id_show_data = npe_class_server_id_show_data_v2; ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data = npe_class_ultraserver_mode_show_data_v2; ndhal->ndhal_perf.perf_set_profile = perf_set_profile_v2; + ndhal->ndhal_perf.perf_get_profile = perf_get_profile_v2; + ndhal->ndhal_perf.perf_get_supported_profiles = perf_get_supported_profiles_v2; + ndhal->ndhal_perf.perf_update_hbm_7200_supported = perf_update_hbm_7200_supported_v2; ndhal->ndhal_tpb.pe_xbus_count = 5; ndhal->ndhal_tpb.pe_row_grp_count = 4; ndhal->ndhal_tpb.pe_col_grp_count = 4; @@ -1730,9 +1475,7 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v2_qemu; ndhal->ndhal_address_map.seng_dma_eng_per_nd = V2_NC_PER_DEVICE * V2_DMA_ENG_PER_NC; ndhal->ndhal_address_map.h2d_dma_eng_per_nd = V2_NUM_H2D_DMA_PER_DEVICE; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v2_qemu_emu; ndhal->ndhal_pci.apb_bar = 2; - ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v2_qemu; } else if (narch_is_emu()) { ndhal->ndhal_reset.retry_count *= 1000; // wait longer on the emulator 
ndhal->ndhal_reset.nr_initiate_reset = nr_initiate_reset_v2_emu; @@ -1741,15 +1484,12 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.h2d_dma_eng_per_nd = nc_per_dev_param; ndhal->ndhal_address_map.nc_per_device = nc_per_dev_param; ndhal->ndhal_address_map.dev_nc_map = dev_nc_map; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v2_qemu_emu; ndhal->ndhal_pci.apb_bar = 0; - ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v2_emu; } else { ndhal->ndhal_reset.nr_initiate_reset = nr_initiate_reset_v2; ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v2; ndhal->ndhal_address_map.seng_dma_eng_per_nd = V2_NC_PER_DEVICE * V2_DMA_ENG_PER_NC; ndhal->ndhal_address_map.h2d_dma_eng_per_nd = V2_NUM_H2D_DMA_PER_DEVICE; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v2; ndhal->ndhal_pci.apb_bar = 0; } diff --git a/v2/notific.c b/v2/notific.c index 9cdd9a7..ed5ea6d 100644 --- a/v2/notific.c +++ b/v2/notific.c @@ -9,9 +9,6 @@ #include "notific.h" -#define NOTIFIC_NQ_SIZE 0x28 // total size of the NQ register space -#define NOTIFIC_NQ_HEAD_OFFSET 0x10c - static u64 seng_sdma_base[V2_MMAP_TPB_COUNT][V2_NUM_DMA_ENGINES_PER_TPB] = { { V2_APB_SENG_0_SDMA_0_BASE, V2_APB_SENG_0_SDMA_1_BASE, V2_APB_SENG_0_SDMA_2_BASE, V2_APB_SENG_0_SDMA_3_BASE, V2_APB_SENG_0_SDMA_4_BASE, V2_APB_SENG_0_SDMA_5_BASE, diff --git a/v2/notific.h b/v2/notific.h index fa62b45..f25ccf8 100644 --- a/v2/notific.h +++ b/v2/notific.h @@ -19,6 +19,7 @@ */ #include "address_map.h" +#include "../neuron_nq.h" #include "../neuron_reg_access.h" /** Returns NOTIFIC relative offset for given the DMA engine for given NC. @@ -49,43 +50,3 @@ static inline u64 notific_get_relative_offset_topsp_v2(int ts_idx) int notific_decode_nq_head_reg_access_v2(u64 offset, u8 *nc_id, u32 *nq_type, u8 *instance, bool *is_top_sp); - -#define NOTIFIC_NQ_SIZE 0x28 -#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START 0x100 -#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_NQ_BASE_ADDR_LO_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_base_addr_lo(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index); - - reg_write32(base + offset, value); -} - -#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START 0x104 -#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_NQ_BASE_ADDR_HI_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_base_addr_hi(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index); - - reg_write32(base + offset, value); -} - -#define NOTIFIC_NQ_F_SIZE_OFFSET_START 0x108 -#define NOTIFIC_NQ_F_SIZE_OFFSET(index) (NOTIFIC_NQ_F_SIZE_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_F_SIZE_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_f_size(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_F_SIZE_OFFSET(index); - - reg_write32(base + offset, value); -} diff --git a/v3/address_map.h b/v3/address_map.h index 32c751f..d1c9ac1 100644 --- a/v3/address_map.h +++ b/v3/address_map.h @@ -15,7 +15,6 @@ #define V3_PCIE_B0_3_BASE 0x1c000000000000ull // relative to nc -#define V3_MMAP_P_OFFSET 0x0000000d0000000ull #define V3_MMAP_NC_EVENT_OFFSET 0x00000002700000ull #define V3_MMAP_NC_SEMA_READ_OFFSET 
V3_MMAP_NC_EVENT_OFFSET + 0x00000000001000ull #define V3_MMAP_NC_SEMA_SET_OFFSET V3_MMAP_NC_EVENT_OFFSET + 0x00000000001400ull diff --git a/v3/neuron_dhal_v3.c b/v3/neuron_dhal_v3.c index 52e2d11..40e683e 100644 --- a/v3/neuron_dhal_v3.c +++ b/v3/neuron_dhal_v3.c @@ -25,10 +25,10 @@ #include "neuron_pelect.h" extern int dev_nc_map; +extern int reset_top_dma; #define NR_RESET_RETRY_SLEEP_MS 100 #define V3_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS (1000 * 480) -#define V3_NR_RESET_POLL_INTERVAL 100 int force_userver = 0; module_param(force_userver , int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); @@ -280,18 +280,29 @@ static enum neuron_platform_type ndhal_platform_type_v3(void) uint8_t cc_top_bv = (reset_unit_index_bv >> 24) & 0xFU; // Note: 4b here instead of 8b * */ -static void nr_get_tpb_reset_map(uint32_t nc_map, uint32_t *tpb_reset_map) +static void nr_get_tpb_reset_map(uint32_t nc_map, uint32_t *tpb_reset_map_lo, uint32_t *tpb_reset_map_hi) { int i; + uint32_t seng_mask; // Build the tpb reset map if we are not performing a device reset if (nc_map != NEURON_NC_MAP_DEVICE) { for (i = 0; i < MAX_NC_PER_DEVICE; i++) { if ((1 << i) & nc_map) { // Add this tpb to the reset map - *tpb_reset_map |= (1 << i); - *tpb_reset_map |= (1 << (i+8)); // SDMA group for this core - *tpb_reset_map |= (1 << (i+16)); // TOP SP group for this core + *tpb_reset_map_lo |= (1 << i); + *tpb_reset_map_lo |= (1 << (i+8)); // SDMA group for this core + *tpb_reset_map_lo |= (1 << (i+16)); // TOP SP group for this core + } + } + + // Reset top DMA only if both NCs in SENG are being reset + if (reset_top_dma) { + for (i = 0; i < V3_SENG_PER_DEVICE; i++) { + seng_mask = ((1 << V3_NC_PER_SENG) - 1) << (i * V3_NC_PER_SENG); + if ((nc_map & seng_mask) == seng_mask) { + *tpb_reset_map_hi |= (1 << i); + } } } } @@ -304,29 +315,32 @@ static void nr_get_tpb_reset_map(uint32_t nc_map, uint32_t *tpb_reset_map) */ static int nr_initiate_reset_v3(struct neuron_device *nd, uint32_t nc_map) { + uint32_t tpb_reset_map_lo = 0, tpb_reset_map_hi = 0; + int ret; + if (no_reset) return 0; - uint32_t tpb_reset_map = 0; - nr_get_tpb_reset_map(nc_map, &tpb_reset_map); + nr_get_tpb_reset_map(nc_map, &tpb_reset_map_lo, &tpb_reset_map_hi); - int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map); - if (ret) { + ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map_lo, tpb_reset_map_hi); + if (ret) return ret; - } return 0; } static int nr_initiate_reset_v3_qemu(struct neuron_device *nd, uint32_t nc_map) { + uint32_t tpb_reset_map_lo = 0, tpb_reset_map_hi = 0; + volatile void *addr; + if (no_reset) return 0; - uint32_t tpb_reset_map = 0; - nr_get_tpb_reset_map(nc_map, &tpb_reset_map); - volatile void *addr = nd->npdev.bar0 + V3_PCIE_BAR0_APB_IO_0_OFFSET + V3_APB_IO_0_USER_SE_0_RESERVED2_RELBASE + 0x10; - writel(tpb_reset_map, (volatile uint32_t *)addr); + nr_get_tpb_reset_map(nc_map, &tpb_reset_map_lo, &tpb_reset_map_hi); + addr = nd->npdev.bar0 + V3_PCIE_BAR0_APB_IO_0_OFFSET + V3_APB_IO_0_USER_SE_0_RESERVED2_RELBASE + 0x10; + writel(tpb_reset_map_lo, (volatile uint32_t *)addr); return 0; } @@ -396,8 +410,16 @@ static int nr_wait_for_reset_completion_v3_emu(struct neuron_device *nd) * @param nd - Neuron device which will be reset by the thread. 
* @param reset_successful - device reset was successful */ -static int nr_post_reset_config_v3(struct neuron_device *nd, bool reset_successful) +static int nr_post_reset_config_v3(struct neuron_device *nd, bool reset_successful, bool is_no_reset) { + if (reset_successful && !is_no_reset) { + if (nd->supports_hbm_7200 == -1) { + ndhal->ndhal_perf.perf_update_hbm_7200_supported(nd); + } + } else { + nd->supports_hbm_7200 = 0; + } + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return 0; } @@ -532,11 +554,14 @@ static void ts_nq_destroy_one_v3(struct neuron_device *nd, u8 ts_id) * * @param nd - neuron device * @param nc_id - neuron core index - * @return void* - semaphore base address + * @param sem_base - resulting semaphore base address + * + * Return: 0 on success, a negative error code otherwise. */ -static void *nc_get_semaphore_base_v3(struct neuron_device *nd, u8 nc_id) +static int nc_get_semaphore_base_v3(struct neuron_device *nd, u8 nc_id, void **sem_base) { - return nd->npdev.bar0 + V3_PCIE_BAR0_TPB_0_OFFSET + (V3_PCIE_BAR0_TPB_DIST * nc_id); + (*sem_base) = nd->npdev.bar0 + V3_PCIE_BAR0_TPB_0_OFFSET + (V3_PCIE_BAR0_TPB_DIST * nc_id); + return 0; } /** @@ -545,12 +570,15 @@ static void *nc_get_semaphore_base_v3(struct neuron_device *nd, u8 nc_id) * @param nd - neuron device * @param nc_id - neuron core index * @param event_index - event index - * @return void* - event address + * @param ev_addr - resulting event address + * + * Return: 0 on success, a negative error code otherwise. */ -static void *nc_get_event_addr_v3(struct neuron_device *nd, u8 nc_id, u16 event_index) +static int nc_get_event_addr_v3(struct neuron_device *nd, u8 nc_id, u16 event_index, void **ev_addr) { void * base = nd->npdev.bar0 + V3_PCIE_BAR0_TPB_0_OFFSET + (V3_PCIE_BAR0_TPB_DIST * nc_id) + ndhal->ndhal_address_map.mmap_nc_event_offset; - return (base + (event_index * NC_EVENT_SIZE)); + (*ev_addr) = (base + (event_index * NC_EVENT_SIZE)); + return 0; } @@ -608,7 +636,7 @@ static void nnq_set_hwaddr_v3(struct neuron_device *nd, u8 nc_id, u8 index, u32 * @param device_dram_addr: DRAM Channel 0 and 1's addresses * @param device_dram_size: DRAM Channel 0 and 1's sizes */ -static void mpset_set_dram_and_mpset_info_v3(struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +static void mpset_set_dram_and_mpset_info_v3(struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) { mpset->num_channels = V3_MAX_DRAM_CHANNELS; mpset->mp_device_num_regions = 1; @@ -648,60 +676,6 @@ static void mpset_set_dram_and_mpset_info_v3(struct mempool_set *mpset, u64 *dev } } -// Upper 16MB is used internally by the firmware, don't use it in the allocation pool -#define MEMPOOL_CARVEOUT_SIZE 0x1000000 // 16MB -/** - * mpset_block_carveout_regions() - * - in v3, block carve out regions: Upper 16 MB is used internally by firmware - * - * @param nd: neuron device - * @param mpset: pointer to mpset - * @param device_dram_addr: DRAM Channel 0's and 1's addresses - * @param device_dram_size: DRAM Channel 0's and 1's sizes - * @param region_sz: region size - * @return int: 0 on success, o/w on failure - */ -static int mpset_block_carveout_regions_v3(struct neuron_device *nd, struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) -{ - int ret; - u64 region_sz; - int channel = 0, region = 0; - - /* - * Block carve out regions: Upper 16 MB is used internally by firmware for trainuim2 - * - * Ideally we would carve out by simply changing the start address of the 
chunk; - * however, that breaks aligned allocation in 4.x kernel versions (fixed in 5.x). - * Fix here: - * commit 52fbf1134d479234d7e64ba9dcbaea23405f229e - * Author: Alexey Skidanov - * Date: Thu Jan 3 15:26:44 2019 -0800 - * - * lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk - */ - for (channel = 0; channel < mpset->num_channels; channel++) { - region_sz = device_dram_size[channel] / mpset->mp_device_num_regions; - for (region = 0; region < mpset->mp_device_num_regions; region++) { - const dma_addr_t start_addr = device_dram_addr[channel] + (region * region_sz); - struct mem_chunk *mc = NULL; - u32 nc_id = channel; - ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, MEMPOOL_CARVEOUT_SIZE, 0, MEM_LOC_DEVICE, channel, region, nc_id, NEURON_MEMALLOC_TYPE_NCDEV_DEVICE, &mc); - if (ret) { - pr_err("failed to allocate hbm carveout region: ret=%d\n", ret); - return -ENOMEM; - } - if (mc->pa != start_addr) { - pr_err("carve out mc not offset 0!"); - mc_free(&mc); - return -EINVAL; - } - } - ndhal->ndhal_mpset.device_dram_effective_base_addr[channel] = device_dram_addr[channel] + MEMPOOL_CARVEOUT_SIZE; - } - - return 0; -} - /* DMA Ring Functions */ /** @@ -833,26 +807,6 @@ static int ndmar_quiesce_queues_v3(struct neuron_device *nd, u32 nc_id, u32 engi return 0; } -/** ndmar_set_model_started() - * - * Checks to see if the pa belongs to PE IRAM FIFO offset. If so, then these - * descs are used to load the iram. The mem chunk is going to have all the descriptors - * to load the instructions in iram. So go through all the dma queues and check if this mem chunk is - * in that queue. Once we have the queue we set that queue to have descs - * for iram. The actual copy start of the queue would come when model is started and at that time - * set the state of model start for this nc. - * - * @nd: Neuron device which contains the DMA engine - * @pa: pa to check - * @mc: mem chunk that has descs - * - * Return: None - */ -static void ndmar_set_model_started_v3(struct neuron_device *nd, phys_addr_t pa, struct mem_chunk *mc) -{ - return; -} - /* FWIO Functions */ @@ -958,6 +912,11 @@ static int fw_io_read_csr_array_v3(void **ptrs, u32 *values, u32 num_csrs, bool if (num_csrs > FW_IO_MAX_READLESS_READ_REGISTER_COUNT) return -EINVAL; + // Force virtual platforms onto the direct path + if (narch_is_qemu() || narch_is_emu()) { + fw_io_read_csr_array_direct(ptrs, values, num_csrs, operational); + } + return fw_io_read_csr_array_direct(ptrs, values, num_csrs, operational); } @@ -995,37 +954,6 @@ static int fw_io_post_metric_v3(struct fw_io_ctx *ctx, u8 *data, u32 size) } -/* Register Access (read and write) Functions */ -/** - * reg_read32_array() - read an array of 32bit registers. - * - * @addr: register address. - * @value: read value would be stored here. - * @num_values: num values to read - * - * Return: 0 if read succeeds, a negative error code otherwise. 
- */ -inline int reg_read32_array_v3(void **addr, u32 *value, u32 num_values) -{ - int ret; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(addr, value, num_values, true); - if (ret != 0) { - pr_err("register read failure while reading %p\n", addr[0]); - dump_stack(); - } - return ret; -} - -inline int reg_read32_array_v3_qemu_emu(void **addr, u32 *value, u32 num_values) -{ - int i; - for (i = 0; i < num_values; i++) { - value[i] = readl(addr[i]); - } - return 0; -} - - /* Memory Map Functions */ /** * mmap_get_bar4_offset() - calculate the offset of BAR4 @@ -1167,158 +1095,6 @@ static int nsysfsmetric_add_tensor_engine_node_v3(struct nsysfsmetric_metrics *m /* PCI Functions */ -/** - * neuron_pci_release_bar() - Release a PCI BAR - * - * @param dev: PCI device whose resources were previously reserved by pci_request_region() - * @param bar: BAR to be reserved - * - * for V3, this function is dummy - */ -static int neuron_pci_release_bar_v3(struct pci_dev *dev, int bar) -{ - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - pci_release_region(dev, bar); - return 0; -} - -/** - * neuron_pci_reserve_bar() - Mark the PCI region associated with PCI BAR as being reserved - * - * @param dev: PCI device whose resources are to be reserved - * @param bar: BAR to be reserved - * @param res_name: Name to be associated with resource. - * @return int: Returns 0 on success, otherwise failure - */ -static int neuron_pci_reserve_bar_v3(struct pci_dev *dev, int bar, const char *res_name) -{ - int ret; - - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - goto err; - } - if (bar == BAR_UNUSED) { - return 0; - } - - ret = pci_request_region(dev, bar, res_name); - if (ret) { - pci_info(dev, "BAR %d: can't reserve %s\n", bar, res_name); - goto err; - } - - return 0; - -err: - //return -ENODEV; Until we can map BAR4 on cmdk - return (bar == 4)? 
0:-ENODEV; - -} - - /** - * neuron_pci_set_npdev() - set BAR's physical addr, io addr, and size of neuron_pci_device - * - * @param dev: PCI device that owns the BAR - * @param bar: BAR number - * @param res_name: Name associated with resource - * @param bar_pa: start physical address of BAR - * @param bar_ioaddr: __iomem address to device BAR - * @param bar_size: size of BAR - * @return int: Returns 0 on success, otherwise failure - */ -static int neuron_pci_set_npdev_v3(struct pci_dev *dev, - int bar, - const char *res_name, - phys_addr_t *bar_pa, - void __iomem **bar_ioaddr, - u64 *bar_size) -{ - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - if (pci_resource_len(dev, bar) == 0) { - pci_info(dev, "BAR%d len is 0\n", bar); - goto err; - } - - *bar_pa = pci_resource_start(dev, bar); - if (!(*bar_pa)) { - pci_info(dev, "Can't get start address of BAR%d %s\n", bar, res_name); - goto err; - } - *bar_size = pci_resource_len(dev, bar); - - if (bar == ndhal->ndhal_pci.dram_bar) { - ndhal->ndhal_pci.dram_bar_size = *bar_size; - } - - if (bar == ndhal->ndhal_pci.dram_bar && wc_enable) - *bar_ioaddr = pci_iomap_wc(dev, bar, pci_resource_len(dev, bar)); - else - *bar_ioaddr = pci_iomap(dev, bar, pci_resource_len(dev, bar)); - - return 0; - -err: - //return -ENODEV; Until we can map BAR4 on cmdk - *bar_pa = 0; - *bar_size = 0; - *bar_ioaddr = NULL; - return 0; -} - -extern int dup_helper_enable; -static atomic_t dup_rid_cnt = ATOMIC_INIT(0); // count of duplicate routing IDs encountered -static int neuron_pci_handle_dup_routing_id(void) -{ - int ret = -ENODEV; - int dup_cnt; - char cmd[256]; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); -#else - dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; -#endif - - // If this is the first dup encounted, unload the driver - if ((dup_cnt == 0) && dup_helper_enable) { - pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); - - int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); - if (n > sizeof(cmd)) { - pr_err("unable to schedule driver unload cmd buffer len exceeded\n"); - return -EINVAL; - } - char *argv[] = { "/bin/sh", - "-c", - cmd, - NULL}; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL}; - - ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); - if (ret) - pr_err("unable to schedule driver unload. Error: %d\n", ret); - } - - return ret; -} // for V3 rename Neuron devices for better customer experience. 
// see internal documentation: TRN2-Discovery @@ -1382,18 +1158,14 @@ static int neuron_pci_get_device_id_v3(struct neuron_device *nd, struct pci_dev } if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { - u32 server_info = 0; - bool server_id_valid = 0; - u32 server_id = 0; - ret = fw_io_server_info_read(nd->npdev.bar0, &server_info); + int server_id; + + ret = fw_io_server_info_read(nd->npdev.bar0, &server_id, NULL); if (ret) { return -ENODEV; } - server_id_valid = (server_info >> 15) & 0x1; // TODO PDS we probably need const shift value or macro - if (server_id_valid) { - server_id = server_info & 0x7fff; // TODO PDS we probably need constant mask for this - } else { + if (server_id == -1) { pr_err("Could not retrieve valid server id, ret = %d\n", ret); return -ENODEV; } @@ -1499,31 +1271,6 @@ static void ncdev_compatible_version_v3(struct neuron_ioctl_compatible_version * arg->max = V3_RT_MAX_COMPATIBLE_VERSION; } -/** - * ncdev_quiesce_exec_on_proc_exit() - * - * Note: - * When a process is killed, the driver resets DMA but there is no - * way to soft reset neuron cores. This causes problem if the - * process was executing serial TPB or switching activation tables, - * which result in abrubtly stopping DMA engines hence engines are - * are blocked on semaphores. This results in next model - * load failure or inference timeout. - * - * Proper way is clearing out semaphore, events after resetting - * DMA engines. However, it is a lot of code change, hence - * adding a sleep for 1 second when process exits, which allows - * the NeuronCore to continue to execute for a second. Since - * no new inference can be submitted during this time, NeuronCore - * state would be cleared out. - * - */ -static void ncdev_quiesce_exec_on_proc_exit_v3(void) -{ - // for V3, the 1 second DMA queisce delay in flush was eliminated to improve nrt_init performance - return; -} - static void ncdev_get_default_tpbs_for_hbm_v3(u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count) { tpbs[0] = hbm_index * 2; @@ -1555,18 +1302,6 @@ static void ndma_get_wait_for_completion_time_v3(u32 count, bool async, u64 *fir *following_wait_time *= 100; } -static void ndma_get_wait_for_completion_time_v3_qemu(u32 count, bool async, u64 *first_wait_time, u64 *following_wait_time) -{ - ndma_get_wait_for_completion_time_v3(count, async, first_wait_time, following_wait_time); - *following_wait_time *= 10 * 1000; -} - -static void ndma_get_wait_for_completion_time_v3_emu(u32 count, bool async, u64 *first_wait_time, u64 *following_wait_time) -{ - ndma_get_wait_for_completion_time_v3(count, async, first_wait_time, following_wait_time); - *following_wait_time *= 100 * 1000; -} - /** * ndma_validate_pa() - check the validity of the desc physical addresses * west side: PCIEX4_1_BASE: 0x00c00000000000 host: PCIEX8_0_BASE: 0x00400000000000 @@ -1962,11 +1697,58 @@ static int perf_set_profile_v3(struct neuron_device *nd, uint32_t profile) ret = fw_io_set_power_profile(nd->fw_io_ctx, profile); if (ret == 0) { ndhal->ndhal_perf.current_performance_profile = profile; + nd->current_perf_profile = profile; nmetric_set_performance_profile(nd, profile); + } else { + uint32_t cur_profile; + int retval = ndhal->ndhal_perf.perf_get_profile(nd, &cur_profile); + if (retval == 0) { + nd->current_perf_profile = cur_profile; + } else { + nd->current_perf_profile = 0; + } } return ret; } +static int perf_get_profile_v3(struct neuron_device *nd, uint32_t *profile) +{ + int ret; + if (!profile) { + return -EINVAL; + } + ret = 
fw_io_get_performance_profile(nd->fw_io_ctx, profile); + return ret; +} + +static int perf_get_supported_profiles_v3(struct neuron_device *nd, u16 feature, u8 *num_profiles, u8 out_bitmap[32]) +{ + return fw_io_get_available_profiles(nd->fw_io_ctx, feature, num_profiles, out_bitmap); +} + +static void perf_update_hbm_7200_supported_v3(struct neuron_device *nd) +{ + struct fw_io_get_available_profiles_response tmp; + int i; + int supports_hbm_7200 = 0; + int ret = fw_io_get_available_profiles(nd->fw_io_ctx, FW_IO_AVAILABLE_PERF_PROFILES_HBM_7200, &tmp.num_profiles, tmp.profiles_bitmap); + if (ret) { + nd->supports_hbm_7200 = 0; + return; + } + + for (i = 0; i < tmp.num_profiles; i++) { + int arr_idx = i / 8; + int bit_idx = i % 8; + if (tmp.profiles_bitmap[arr_idx] & (1 << bit_idx)) { + supports_hbm_7200 = 1; + break; + } + } + + nd->supports_hbm_7200 = supports_hbm_7200; +} + /** * npe_class_node_id_show_data() - return sysfs class node_id * @@ -1982,6 +1764,20 @@ static ssize_t npe_class_node_id_show_data_v3(char *buf, u32 sz) return npe_class_node_id_show_data(buf, sz); } +/** + * npe_class_node_cnt_show_data() - return sysfs class node_cnt + * + * @buf - sysfs buffer + * + */ +static ssize_t npe_class_node_cnt_show_data_v3(char *buf) +{ + if (ndhal->ndhal_arch.platform_type != NEURON_PLATFORM_TYPE_PDS) { + return dhal_sysfs_emit(buf, "-1\n"); + } + return npe_class_node_cnt_show_data(buf); +} + /** * npe_class_server_id_show_data() - return sysfs class node_id * @@ -2079,7 +1875,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_arch.platform_type = ndhal_platform_type_v3(); ndhal->ndhal_address_map.pci_host_base = V3_PCIE_A0_BASE; - ndhal->ndhal_address_map.mmap_p_offset = V3_MMAP_P_OFFSET; ndhal->ndhal_address_map.mmap_nc_event_offset = V3_MMAP_NC_EVENT_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_read_offset = V3_MMAP_NC_SEMA_READ_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_set_offset = V3_MMAP_NC_SEMA_SET_OFFSET; @@ -2087,7 +1882,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.mmap_nc_sema_decr_offset = V3_MMAP_NC_SEMA_DECR_OFFSET; ndhal->ndhal_address_map.bar0_misc_ram_offset = V3_MMAP_BAR0_APB_IO_0_MISC_RAM_OFFSET; ndhal->ndhal_address_map.port_1_base = 0ull; - ndhal->ndhal_address_map.mmap_nc_size = V3_MMAP_NC_SIZE; ndhal->ndhal_address_map.nc_per_device = V3_NC_PER_DEVICE; ndhal->ndhal_address_map.dev_nc_map = (1 << V3_NC_PER_DEVICE) - 1; ndhal->ndhal_address_map.dice_per_device = V3_NUM_DIE_PER_DEVICE; @@ -2096,7 +1890,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.ts_per_device = V3_TS_PER_DEVICE; ndhal->ndhal_address_map.dma_eng_per_nc = V3_DMA_ENG_PER_NC; ndhal->ndhal_address_map.dram_channels = V3_MAX_DRAM_CHANNELS; - ndhal->ndhal_reset.reset_poll_interval = V3_NR_RESET_POLL_INTERVAL; ndhal->ndhal_reset.initiate_max_wait_time = V3_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS; ndhal->ndhal_reset.retry_count = NR_RESET_RETRY_COUNT; ndhal->ndhal_reset.nr_post_reset_config = nr_post_reset_config_v3; @@ -2111,14 +1904,12 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_mpset.mp_min_alloc_size = (mempool_min_alloc_size < 1024) ? 
1024 : mempool_min_alloc_size; ndhal->ndhal_mpset.small_pool_supported = true; ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v3; - ndhal->ndhal_mpset.mpset_block_carveout_regions = mpset_block_carveout_regions_v3; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v3; ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v3; ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v3; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v3; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v3; ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v3; - ndhal->ndhal_ndmar.ndmar_set_model_started = ndmar_set_model_started_v3; ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v3; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v3; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v3; @@ -2134,17 +1925,14 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_pci.axi_bar = BAR_UNUSED; ndhal->ndhal_pci.apb_bar = 0; ndhal->ndhal_pci.dram_bar = 4; - ndhal->ndhal_pci.neuron_pci_release_bar = neuron_pci_release_bar_v3; - ndhal->ndhal_pci.neuron_pci_reserve_bar = neuron_pci_reserve_bar_v3; - ndhal->ndhal_pci.neuron_pci_set_npdev = neuron_pci_set_npdev_v3; ndhal->ndhal_pci.neuron_pci_get_device_id = neuron_pci_get_device_id_v3; ndhal->ndhal_pci.neuron_pci_device_id_to_rid_map = neuron_pci_device_id_to_rid_map_v3; ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v3; ndhal->ndhal_cdev.ncdev_bar0_write_blocked_addrs = ncdev_bar0_write_blocked_addrs_v3; ndhal->ndhal_cdev.ncdev_compatible_version = ncdev_compatible_version_v3; - ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit = ncdev_quiesce_exec_on_proc_exit_v3; ndhal->ndhal_cdev.ncdev_logical_to_physical_nc_map = ncdev_logical_to_physical_nc_map_v3; ndhal->ndhal_cdev.ncdev_get_default_tpbs_for_hbm = ncdev_get_default_tpbs_for_hbm_v3; + ndhal->ndhal_udma.num_queues = DMA_MAX_Q_V4; ndhal->ndhal_udma.num_beats = 2296; // allow up to 288 outstanding writes ndhal->ndhal_ndma.ndma_retry_memcpy = false; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3; @@ -2158,10 +1946,14 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_npe.npe_pod_status = npe_pod_status_v3; ndhal->ndhal_npe.npe_pod_ctrl = npe_pod_ctrl_v3; ndhal->ndhal_npe.npe_class_node_id_show_data = npe_class_node_id_show_data_v3; + ndhal->ndhal_npe.npe_class_node_cnt_show_data = npe_class_node_cnt_show_data_v3; ndhal->ndhal_npe.npe_class_server_id_show_data = npe_class_server_id_show_data_v3; ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data = npe_class_ultraserver_mode_show_data_v3; ndhal->ndhal_npe.npe_neighbor_eng_ids = npe_neighbor_eng_ids_v3; ndhal->ndhal_perf.perf_set_profile = perf_set_profile_v3; + ndhal->ndhal_perf.perf_get_profile = perf_get_profile_v3; + ndhal->ndhal_perf.perf_get_supported_profiles = perf_get_supported_profiles_v3; + ndhal->ndhal_perf.perf_update_hbm_7200_supported = perf_update_hbm_7200_supported_v3; ndhal->ndhal_tpb.pe_xbus_count = 9; ndhal->ndhal_tpb.pe_row_grp_count = 4; ndhal->ndhal_tpb.pe_col_grp_count = 4; @@ -2181,8 +1973,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v3_qemu; ndhal->ndhal_address_map.seng_dma_eng_per_nd = V3_NC_PER_DEVICE * V3_DMA_ENG_PER_NC; ndhal->ndhal_address_map.h2d_dma_eng_per_nd = V3_NUM_H2D_DMA_PER_DEVICE; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3_qemu_emu; - 
ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3_qemu; ndhal->ndhal_address_map.dice_per_device = 1; // Disable metrics on qemu @@ -2195,8 +1985,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.h2d_dma_eng_per_nd = nc_per_dev_param; ndhal->ndhal_address_map.nc_per_device = nc_per_dev_param; ndhal->ndhal_address_map.dev_nc_map = dev_nc_map; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3_qemu_emu; - ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3_emu; ndhal->ndhal_address_map.dice_per_device = 1; // Disable metrics on emulation @@ -2206,7 +1994,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v3; ndhal->ndhal_address_map.seng_dma_eng_per_nd = V3_NC_PER_DEVICE * V3_DMA_ENG_PER_NC; ndhal->ndhal_address_map.h2d_dma_eng_per_nd = V3_NUM_H2D_DMA_PER_DEVICE; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3; } if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { diff --git a/v3/neuron_pelect.c b/v3/neuron_pelect.c index c00a579..26249c9 100644 --- a/v3/neuron_pelect.c +++ b/v3/neuron_pelect.c @@ -157,13 +157,9 @@ #include "../neuron_crwl.h" #include "neuron_pelect.h" -int userver_pds_node_cnt = 2; -module_param(userver_pds_node_cnt, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); -MODULE_PARM_DESC(userver_pds_node_cnt, "pds ultraserver node count"); - -int userver_pds_server_id = 0x0001; -module_param(userver_pds_server_id, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); -MODULE_PARM_DESC(userver_pds_server_id, "pds ultraserver id"); +int pds_reservation_id = 0x0001; +module_param(pds_reservation_id, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(pds_reservation_id, "pds reservation id"); /* Enable ultraserver auto election (4 node configuration) by default */ @@ -293,7 +289,7 @@ typedef struct pod_neighbor_io { struct mem_chunk *data_mc; } pod_neighbor_io_t; -static void npe_pds_spoof(void); +static void npe_pds_config_init(void); static bool npe_pod_ctl_is_set(int value) { @@ -1216,10 +1212,10 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) goto done; } - // spoof PDS topology/election data + // initialize PDS configuration (topology/election) data // if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { - npe_pds_spoof(); + npe_pds_config_init(); goto done; } @@ -1272,6 +1268,11 @@ static int npe_get_modal_node_id(enum neuron_ultraserver_mode mode) { int node_id = ndhal_pelect_data.node_id; + // PDS doesn't change node_id based on mode because node ids are location based vs. 
election based + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + return node_id; + } + switch (mode) { case NEURON_ULTRASERVER_MODE_UNSET: break; @@ -1426,6 +1427,31 @@ static bool npe_mode_is_supported(enum neuron_ultraserver_mode mode) return false; } +static enum neuron_ultraserver_mode npe_node_cnt_to_mode(int node_cnt) +{ + enum neuron_ultraserver_mode mode = NEURON_ULTRASERVER_MODE_UNSET; + + switch (node_cnt) { + case 0: + case 1: + mode = NEURON_ULTRASERVER_MODE_X1; + break; + case 2: + if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2H)) { + mode = NEURON_ULTRASERVER_MODE_X2H; + } else if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2V)) { + mode = NEURON_ULTRASERVER_MODE_X2V; + } + break; + case 4: + mode = NEURON_ULTRASERVER_MODE_X4; + break; + default: + break; + } + return mode; +} + /** * npe_get_pod_id() * @@ -1727,22 +1753,18 @@ static void npe_stop_thread(void) ssize_t npe_class_node_id_show_data(char *buf, u32 sz) { int node_id; - enum neuron_ultraserver_mode mode = NEURON_ULTRASERVER_MODE_X1; + enum neuron_ultraserver_mode mode; if (npe_pod_state_busy()) { return dhal_sysfs_emit(buf, "busy\n"); } - if (sz == 4) { - mode = NEURON_ULTRASERVER_MODE_X4; - } else if (sz == 2) { - if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2H)) { - mode = NEURON_ULTRASERVER_MODE_X2H; - } else if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2V)) { - mode = NEURON_ULTRASERVER_MODE_X2V; - } + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + mode = npe_node_cnt_to_mode(ndhal_pelect_data.node_cnt); + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { + mode = npe_node_cnt_to_mode(sz); } else { - pr_err("Unexpected class entry: node_id_%d", sz); + pr_err("unexpected platform type %d", ndhal->ndhal_arch.platform_type); return dhal_sysfs_emit(buf, "invalid\n"); } @@ -1750,6 +1772,23 @@ ssize_t npe_class_node_id_show_data(char *buf, u32 sz) return dhal_sysfs_emit(buf, "%d\n", node_id); } +ssize_t npe_class_node_cnt_show_data(char *buf) +{ + int node_cnt = -1; // node_cnt is currently only returned for PDS + + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + node_cnt = ndhal_pelect_data.node_cnt; + } + + return dhal_sysfs_emit(buf, "%d\n", node_cnt); +} + +/** + * npe_class_server_id_show_data() + * + * return server id data for sysfs class node. PD and PDS + * have different server id data retrieval methodologies. 
+ */ ssize_t npe_class_server_id_show_data(char *buf, u32 sz) { u64 pod_serial_number; @@ -1759,18 +1798,12 @@ ssize_t npe_class_server_id_show_data(char *buf, u32 sz) return dhal_sysfs_emit(buf, "0000000000000000\n"); } - if (sz == 4) { - mode = NEURON_ULTRASERVER_MODE_X4; - } else if (sz == 2) { - if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2H)) { - mode = NEURON_ULTRASERVER_MODE_X2H; - } else if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2V)) { - mode = NEURON_ULTRASERVER_MODE_X2V; - } - } else { - pr_err("Unexpected class entry: server_id_%d", sz); - return dhal_sysfs_emit(buf, "invalid\n"); + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + mode = npe_node_cnt_to_mode(ndhal_pelect_data.node_cnt); + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { + mode = npe_node_cnt_to_mode(sz); } + pod_serial_number = npe_get_modal_serial_number(mode); return dhal_sysfs_emit(buf, "%016llx\n", pod_serial_number); @@ -1811,40 +1844,34 @@ struct { uint64_t d0_serial_number; // serial number of a particular device 0 on a particular server uint64_t server_num; // server unique id of the associated server uint32_t node_id; // (rack id<<1 | server id) + uint32_t node_cnt; // ultra-server nodes size. } npe_pds_tmp_mapping_tbl[] = { - {0x644b8499cd7bf298ull, 0x0000004005590701ull, 0}, - {0x001e8649a094af56ull, 0x0000004005590701ull, 1}, - {0x4b63b0678ae2a930ull, 0x0000004005590701ull, 2}, - {0x7242db0306415ed7ull, 0x0000004005590701ull, 3}, - {0x7e3a518befdf7a57ull, 0x0000004005590689ull, 0}, - {0x3c604484897a4f1aull, 0x0000004005590689ull, 1}, - {0xacfba8515bb626a6ull, 0x0000004005590689ull, 2}, - {0x48c2b73699e97cadull, 0x0000004005590689ull, 3}, - {0xa952ff53b45fc298ull, 0x0000004005590680ull, 0}, - {0x5961a8d75d827fc0ull, 0x0000004005590680ull, 1}, - {0x714cf1792facf83bull, 0x0000004005590680ull, 2}, - {0x9b3187e1756c8a7full, 0x0000004005590680ull, 3}, - {0x85059f2db248f3dfull, 0x0000004005590682ull, 0}, - {0xf4d2ef81ad1b1264ull, 0x0000004005590682ull, 1}, - {0x3d3ea5a61b768cbdull, 0x0000004005590682ull, 2}, - {0x85752a544054033aull, 0x0000004005590682ull, 3} + {0xf15812e4f3dc642cull, 0x0000004005590728ull, 0, 4}, //TRN3PDS US16 Node-1 Label swapped. + {0x51f84556b473ea1cull, 0x0000004005590728ull, 1, 4}, //TRN3PDS US16 Node-0 Label swapped. + {0xdd5da7090e13c984ull, 0x0000004005590728ull, 3, 4}, //TRN3PDS US16 Flipped the server_id(0) in rack1 + {0x2c82d5db4d1c3969ull, 0x0000004005590728ull, 2, 4}, //TRN3PDS US16 Flipped the server_id(1) in rack1 }; -/* npe_pds_spoof(void) +/* npe_pds_config_init(void) * - * temp spoof of PDS platform data + * Initialize pds configuration data. 
Configuration data consists of: + * - reservation_id - unique id identifying all the instances belonging to this PDS reservation + * - node_id - node id for this node in the PDS server + * - node_cnt - count of nodes in the PDS server * */ -static void npe_pds_spoof(void) +static void npe_pds_config_init(void) { static bool initialized = false; - int ret; + int ret = 0; int i; struct neuron_device *nd; uint64_t serial_number; + int instance_sz; + int partition_sz; + int server_id; + int rack_id; - pr_info("spoofing pds data"); - if (initialized) { return; } @@ -1857,42 +1884,87 @@ static void npe_pds_spoof(void) return; } + // TODO remove temp mapping table logic + // ret = fw_io_serial_number_read(nd->npdev.bar0, &serial_number); if (ret) { pr_err("nd%02d: local serial number read failed", nd->device_index); return; } + // check temporary mapping table for PDS server data + // for (i = 0; i < sizeof(npe_pds_tmp_mapping_tbl) / sizeof(*npe_pds_tmp_mapping_tbl); i++) { if (serial_number == npe_pds_tmp_mapping_tbl[i].d0_serial_number) { - ndhal_pelect_data.node_cnt = 4; ndhal_pelect_data.node_id = npe_pds_tmp_mapping_tbl[i].node_id; ndhal_pelect_data.pod_serial_num = npe_pds_tmp_mapping_tbl[i].server_num; + ndhal_pelect_data.node_cnt = npe_pds_tmp_mapping_tbl[i].node_cnt; goto done; } } - // otherwise, we use temporary parameter overrides + // get PDS platform data (instance and partition size) to determine node cnt // - ndhal_pelect_data.node_cnt = userver_pds_node_cnt; + ret = fw_io_instance_partition_sz_read(nd->npdev.bar0, &instance_sz, &partition_sz); + if (ret) { + goto done; + } - if (ndhal_pelect_data.node_cnt == 0) { - ndhal_pelect_data.node_id = -1; - } else if (ndhal_pelect_data.node_cnt == 2) { - // node_cnt of 2 uses V-links - ndhal_pelect_data.lr_mask = 0x1; - ndhal_pelect_data.node_id = ndhal->ndhal_arch.server_id; - } else if (ndhal_pelect_data.node_cnt == 4) { - // TODO PDS add in rack id - ndhal_pelect_data.node_id = ndhal->ndhal_arch.server_id; + if ((partition_sz == -1) || (instance_sz <= 0)) { + pr_warn("PDS partition/instance size data is invalid (%d/%d), defaulting to 4 node PDS configuration", partition_sz, instance_sz); + ndhal_pelect_data.node_cnt = 4; } else { - ndhal_pelect_data.node_cnt = 0; - pr_err("invalid PDS node count of %d", ndhal_pelect_data.node_cnt); + ndhal_pelect_data.node_cnt = partition_sz / instance_sz; + } + + if (ndhal_pelect_data.node_cnt == 2) { + // node_cnt of 2 uses V-links (for mode selection) + ndhal_pelect_data.lr_mask = 0x1; + } + + // get PDS reservation id + // + ret = fw_io_reservation_id_read(nd->npdev.bar0, &ndhal_pelect_data.pod_serial_num); + if (ret) { + goto done; + } + + if (ndhal_pelect_data.pod_serial_num == 0) { + pr_warn("PDS server reservation id invalid (%llu), defaulting to 'pds_reservation_id' parameter value: %u", ndhal_pelect_data.pod_serial_num, pds_reservation_id); + ndhal_pelect_data.pod_serial_num = pds_reservation_id; + } + + // get PDS server id and rack id + // + ret = fw_io_server_info_read(nd->npdev.bar0, &server_id, &rack_id); + if (ret || (server_id == -1) || (rack_id == -1)) { + pr_warn("Unable to retrieve PDS server server/rack ids, making best effort guess"); + server_id = (server_id == -1) ? 0 : server_id; + rack_id = (rack_id == -1) ? 0 : rack_id; } - ndhal_pelect_data.pod_serial_num = userver_pds_server_id; + // map server/rack to node id. 
TODO covert to mapping function + // + switch ((rack_id << 1) | server_id) { + case 0: + ndhal_pelect_data.node_id = 0; + break; + case 1: + ndhal_pelect_data.node_id = 1; + break; + case 2: + ndhal_pelect_data.node_id = 3; + break; + case 3: + ndhal_pelect_data.node_id = 2; + break; + default: + ndhal_pelect_data.node_id = -1; + break; + } done: + // TODO - correctly report topology discovery/election failure once interfaces become more mature ndhal_pelect_data.pod_state_internal = NEURON_NPE_POD_ST_ELECTION_SUCCESS; initialized = true; diff --git a/v3/neuron_pelect.h b/v3/neuron_pelect.h index e0c9136..2e9f4a2 100644 --- a/v3/neuron_pelect.h +++ b/v3/neuron_pelect.h @@ -92,6 +92,13 @@ int npe_pod_ctrl(struct neuron_device *nd, u32 ctrl, enum neuron_ultraserver_mod */ ssize_t npe_class_node_id_show_data(char *buf, u32 sz); +/** + * npe_class_node_cnt_show_data() - return sysfs class node_cnt + * + * @buf: sysfs buffer + */ +ssize_t npe_class_node_cnt_show_data(char *buf); + /** * npe_class_server_id_show_data() - return sysfs class server_id * diff --git a/v3/notific.c b/v3/notific.c index ad66411..3c1e82a 100644 --- a/v3/notific.c +++ b/v3/notific.c @@ -9,8 +9,6 @@ #include "notific.h" -#define NOTIFIC_NQ_HEAD_OFFSET 0x10c - static u64 get_sdma_misc_base(int nc_id, int eng_id) { int seng_id = nc_id / V3_NC_PER_SENG; diff --git a/v3/notific.h b/v3/notific.h index be32e0b..ca16d99 100644 --- a/v3/notific.h +++ b/v3/notific.h @@ -19,6 +19,7 @@ */ #include "address_map.h" +#include "../neuron_nq.h" #include "../neuron_reg_access.h" /** Returns NOTIFIC relative offset for given the DMA engine for given NC. @@ -62,48 +63,3 @@ static inline u64 notific_get_relative_offset_topsp_v3(int ts_idx) int notific_decode_nq_head_reg_access_v3(u64 offset, u8 *nc_id, u32 *nq_type, u8 *instance, bool *is_top_sp); - - -/* - * COMMON with V2 need to move to shared include at some point - * - */ -#define NOTIFIC_NQ_SIZE 0x28 // total size of the NQ register space -#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START 0x100 -#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_NQ_BASE_ADDR_LO_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_base_addr_lo(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index); - - reg_write32(base + offset, value); -} - -#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START 0x104 -#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_NQ_BASE_ADDR_HI_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_base_addr_hi(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index); - - reg_write32(base + offset, value); -} - -#define NOTIFIC_NQ_F_SIZE_OFFSET_START 0x108 -#define NOTIFIC_NQ_F_SIZE_OFFSET(index) (NOTIFIC_NQ_F_SIZE_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_F_SIZE_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_f_size(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_F_SIZE_OFFSET(index); - - reg_write32(base + offset, value); -} diff --git a/v4/address_map.h b/v4/address_map.h index 4391557..401925e 100644 --- a/v4/address_map.h +++ b/v4/address_map.h @@ -15,7 +15,6 @@ #define V4_PCIE_B0_3_BASE 0x1c000000000000ull // relative to nc -#define V4_MMAP_P_OFFSET 0x0000000d0000000ull #define V4_MMAP_NC_EVENT_OFFSET 
0x00000002700000ull #define V4_MMAP_NC_SEMA_READ_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001000ull #define V4_MMAP_NC_SEMA_SET_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001400ull diff --git a/v4/neuron_dhal_v4.c b/v4/neuron_dhal_v4.c index 9ed4d3c..72b87c2 100644 --- a/v4/neuron_dhal_v4.c +++ b/v4/neuron_dhal_v4.c @@ -198,7 +198,7 @@ static bool ndhal_instance_type_3xl(void) * @param device_dram_addr: DRAM Channel 0 and 1's addresses * @param device_dram_size: DRAM Channel 0 and 1's sizes */ -static void mpset_set_dram_and_mpset_info_v4(struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +static void mpset_set_dram_and_mpset_info_v4(struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) { mpset->num_channels = V4_MAX_DRAM_CHANNELS; mpset->mp_device_num_regions = 1; @@ -265,45 +265,8 @@ static int mmap_get_bar4_offset_v4(u64 start_addr, u64 size, u64 *offset) return 0; } -extern int dup_helper_enable; -static atomic_t dup_rid_cnt = ATOMIC_INIT(0); // count of duplicate routing IDs encountered -static int neuron_pci_handle_dup_routing_id(void) -{ - int ret = -ENODEV; - int dup_cnt; - char cmd[256]; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); -#else - dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; -#endif - - // If this is the first dup encounted, unload the driver - if ((dup_cnt == 0) && dup_helper_enable) { - pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); - - int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); - if (n > sizeof(cmd)) { - pr_err("unable to schedule driver unload cmd buffer len exceeded\n"); - return -EINVAL; - } - char *argv[] = { "/bin/sh", - "-c", - cmd, - NULL}; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL}; - - ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); - if (ret) - pr_err("unable to schedule driver unload. Error: %d\n", ret); - } - return ret; -} +/* PCI Functions */ // for V4 rename Neuron devices for better customer experience. 
// see internal documentation: TRN2-Discovery @@ -373,18 +336,14 @@ static int neuron_pci_get_device_id_v4(struct neuron_device *nd, struct pci_dev } if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { - u32 server_info = 0; - bool server_id_valid = 0; - u32 server_id = 0; - ret = fw_io_server_info_read(nd->npdev.bar0, &server_info); + int server_id; + + ret = fw_io_server_info_read(nd->npdev.bar0, &server_id, NULL); if (ret) { return -ENODEV; } - server_id_valid = (server_info >> 15) & 0x1; // TODO we probably need const shift value or macro - if (server_id_valid) { - server_id = server_info & 0x7fff; // TODO we probably need constant mask for this - } else { + if (server_id == -1) { pr_err("Could not retrieve valid server id, ret = %d\n", ret); return -ENODEV; } @@ -415,21 +374,21 @@ static int neuron_pci_get_device_id_v4(struct neuron_device *nd, struct pci_dev #define NC_MAPPING_MAX_CORE_COUNT_V4 128 static const struct neuron_ioctl_nc_map_entry nc_mapping_v0_seng_swap_pds[] = { { .device_id = 0, .device_nc_idx = 4 }, { .device_id = 0, .device_nc_idx = 5 }, { .device_id = 0, .device_nc_idx = 6 }, { .device_id = 0, .device_nc_idx = 7 }, { .device_id = 0, .device_nc_idx = 2 }, { .device_id = 0, .device_nc_idx = 3 }, { .device_id = 0, .device_nc_idx = 0 }, { .device_id = 0, .device_nc_idx = 1 }, // ND0 - { .device_id = 1, .device_nc_idx = 2 }, { .device_id = 1, .device_nc_idx = 3 }, { .device_id = 1, .device_nc_idx = 0 }, { .device_id = 1, .device_nc_idx = 1 }, { .device_id = 1, .device_nc_idx = 4 }, { .device_id = 1, .device_nc_idx = 5 }, { .device_id = 1, .device_nc_idx = 6 }, { .device_id = 1, .device_nc_idx = 7 }, // ND1 + { .device_id = 1, .device_nc_idx = 6 }, { .device_id = 1, .device_nc_idx = 7 }, { .device_id = 1, .device_nc_idx = 4 }, { .device_id = 1, .device_nc_idx = 5 }, { .device_id = 1, .device_nc_idx = 0 }, { .device_id = 1, .device_nc_idx = 1 }, { .device_id = 1, .device_nc_idx = 2 }, { .device_id = 1, .device_nc_idx = 3 }, // ND1 { .device_id = 2, .device_nc_idx = 4 }, { .device_id = 2, .device_nc_idx = 5 }, { .device_id = 2, .device_nc_idx = 6 }, { .device_id = 2, .device_nc_idx = 7 }, { .device_id = 2, .device_nc_idx = 2 }, { .device_id = 2, .device_nc_idx = 3 }, { .device_id = 2, .device_nc_idx = 0 }, { .device_id = 2, .device_nc_idx = 1 }, // ND2 - { .device_id = 3, .device_nc_idx = 2 }, { .device_id = 3, .device_nc_idx = 3 }, { .device_id = 3, .device_nc_idx = 0 }, { .device_id = 3, .device_nc_idx = 1 }, { .device_id = 3, .device_nc_idx = 4 }, { .device_id = 3, .device_nc_idx = 5 }, { .device_id = 3, .device_nc_idx = 6 }, { .device_id = 3, .device_nc_idx = 7 }, // ND3 + { .device_id = 3, .device_nc_idx = 6 }, { .device_id = 3, .device_nc_idx = 7 }, { .device_id = 3, .device_nc_idx = 4 }, { .device_id = 3, .device_nc_idx = 5 }, { .device_id = 3, .device_nc_idx = 0 }, { .device_id = 3, .device_nc_idx = 1 }, { .device_id = 3, .device_nc_idx = 2 }, { .device_id = 3, .device_nc_idx = 3 }, // ND3 { .device_id = 4, .device_nc_idx = 4 }, { .device_id = 4, .device_nc_idx = 5 }, { .device_id = 4, .device_nc_idx = 6 }, { .device_id = 4, .device_nc_idx = 7 }, { .device_id = 4, .device_nc_idx = 2 }, { .device_id = 4, .device_nc_idx = 3 }, { .device_id = 4, .device_nc_idx = 0 }, { .device_id = 4, .device_nc_idx = 1 }, // ND4 - { .device_id = 5, .device_nc_idx = 2 }, { .device_id = 5, .device_nc_idx = 3 }, { .device_id = 5, .device_nc_idx = 0 }, { .device_id = 5, .device_nc_idx = 1 }, { .device_id = 5, .device_nc_idx = 4 }, { .device_id = 5, .device_nc_idx = 5 }, { 
.device_id = 5, .device_nc_idx = 6 }, { .device_id = 5, .device_nc_idx = 7 }, // ND5 + { .device_id = 5, .device_nc_idx = 6 }, { .device_id = 5, .device_nc_idx = 7 }, { .device_id = 5, .device_nc_idx = 4 }, { .device_id = 5, .device_nc_idx = 5 }, { .device_id = 5, .device_nc_idx = 0 }, { .device_id = 5, .device_nc_idx = 1 }, { .device_id = 5, .device_nc_idx = 2 }, { .device_id = 5, .device_nc_idx = 3 }, // ND5 { .device_id = 6, .device_nc_idx = 4 }, { .device_id = 6, .device_nc_idx = 5 }, { .device_id = 6, .device_nc_idx = 6 }, { .device_id = 6, .device_nc_idx = 7 }, { .device_id = 6, .device_nc_idx = 2 }, { .device_id = 6, .device_nc_idx = 3 }, { .device_id = 6, .device_nc_idx = 0 }, { .device_id = 6, .device_nc_idx = 1 }, // ND6 - { .device_id = 7, .device_nc_idx = 2 }, { .device_id = 7, .device_nc_idx = 3 }, { .device_id = 7, .device_nc_idx = 0 }, { .device_id = 7, .device_nc_idx = 1 }, { .device_id = 7, .device_nc_idx = 4 }, { .device_id = 7, .device_nc_idx = 5 }, { .device_id = 7, .device_nc_idx = 6 }, { .device_id = 7, .device_nc_idx = 7 }, // ND7 + { .device_id = 7, .device_nc_idx = 6 }, { .device_id = 7, .device_nc_idx = 7 }, { .device_id = 7, .device_nc_idx = 4 }, { .device_id = 7, .device_nc_idx = 5 }, { .device_id = 7, .device_nc_idx = 0 }, { .device_id = 7, .device_nc_idx = 1 }, { .device_id = 7, .device_nc_idx = 2 }, { .device_id = 7, .device_nc_idx = 3 }, // ND7 { .device_id = 8, .device_nc_idx = 4 }, { .device_id = 8, .device_nc_idx = 5 }, { .device_id = 8, .device_nc_idx = 6 }, { .device_id = 8, .device_nc_idx = 7 }, { .device_id = 8, .device_nc_idx = 2 }, { .device_id = 8, .device_nc_idx = 3 }, { .device_id = 8, .device_nc_idx = 0 }, { .device_id = 8, .device_nc_idx = 1 }, // ND8 - { .device_id = 9, .device_nc_idx = 2 }, { .device_id = 9, .device_nc_idx = 3 }, { .device_id = 9, .device_nc_idx = 0 }, { .device_id = 9, .device_nc_idx = 1 }, { .device_id = 9, .device_nc_idx = 4 }, { .device_id = 9, .device_nc_idx = 5 }, { .device_id = 9, .device_nc_idx = 6 }, { .device_id = 9, .device_nc_idx = 7 }, // ND9 + { .device_id = 9, .device_nc_idx = 6 }, { .device_id = 9, .device_nc_idx = 7 }, { .device_id = 9, .device_nc_idx = 4 }, { .device_id = 9, .device_nc_idx = 5 }, { .device_id = 9, .device_nc_idx = 0 }, { .device_id = 9, .device_nc_idx = 1 }, { .device_id = 9, .device_nc_idx = 2 }, { .device_id = 9, .device_nc_idx = 3 }, // ND9 { .device_id = 10, .device_nc_idx = 4 }, { .device_id = 10, .device_nc_idx = 5 }, { .device_id = 10, .device_nc_idx = 6 }, { .device_id = 10, .device_nc_idx = 7 }, { .device_id = 10, .device_nc_idx = 2 }, { .device_id = 10, .device_nc_idx = 3 }, { .device_id = 10, .device_nc_idx = 0 }, { .device_id = 10, .device_nc_idx = 1 }, // ND10 - { .device_id = 11, .device_nc_idx = 2 }, { .device_id = 11, .device_nc_idx = 3 }, { .device_id = 11, .device_nc_idx = 0 }, { .device_id = 11, .device_nc_idx = 1 }, { .device_id = 11, .device_nc_idx = 4 }, { .device_id = 11, .device_nc_idx = 5 }, { .device_id = 11, .device_nc_idx = 6 }, { .device_id = 11, .device_nc_idx = 7 }, // ND11 + { .device_id = 11, .device_nc_idx = 6 }, { .device_id = 11, .device_nc_idx = 7 }, { .device_id = 11, .device_nc_idx = 4 }, { .device_id = 11, .device_nc_idx = 5 }, { .device_id = 11, .device_nc_idx = 0 }, { .device_id = 11, .device_nc_idx = 1 }, { .device_id = 11, .device_nc_idx = 2 }, { .device_id = 11, .device_nc_idx = 3 }, // ND11 { .device_id = 12, .device_nc_idx = 4 }, { .device_id = 12, .device_nc_idx = 5 }, { .device_id = 12, .device_nc_idx = 6 }, { .device_id = 12, .device_nc_idx = 
7 }, { .device_id = 12, .device_nc_idx = 2 }, { .device_id = 12, .device_nc_idx = 3 }, { .device_id = 12, .device_nc_idx = 0 }, { .device_id = 12, .device_nc_idx = 1 }, // ND12 - { .device_id = 13, .device_nc_idx = 2 }, { .device_id = 13, .device_nc_idx = 3 }, { .device_id = 13, .device_nc_idx = 0 }, { .device_id = 13, .device_nc_idx = 1 }, { .device_id = 13, .device_nc_idx = 4 }, { .device_id = 13, .device_nc_idx = 5 }, { .device_id = 13, .device_nc_idx = 6 }, { .device_id = 13, .device_nc_idx = 7 }, // ND13 + { .device_id = 13, .device_nc_idx = 6 }, { .device_id = 13, .device_nc_idx = 7 }, { .device_id = 13, .device_nc_idx = 4 }, { .device_id = 13, .device_nc_idx = 5 }, { .device_id = 13, .device_nc_idx = 0 }, { .device_id = 13, .device_nc_idx = 1 }, { .device_id = 13, .device_nc_idx = 2 }, { .device_id = 13, .device_nc_idx = 3 }, // ND13 { .device_id = 14, .device_nc_idx = 4 }, { .device_id = 14, .device_nc_idx = 5 }, { .device_id = 14, .device_nc_idx = 6 }, { .device_id = 14, .device_nc_idx = 7 }, { .device_id = 14, .device_nc_idx = 2 }, { .device_id = 14, .device_nc_idx = 3 }, { .device_id = 14, .device_nc_idx = 0 }, { .device_id = 14, .device_nc_idx = 1 }, // ND14 - { .device_id = 15, .device_nc_idx = 2 }, { .device_id = 15, .device_nc_idx = 3 }, { .device_id = 15, .device_nc_idx = 0 }, { .device_id = 15, .device_nc_idx = 1 }, { .device_id = 15, .device_nc_idx = 4 }, { .device_id = 15, .device_nc_idx = 5 }, { .device_id = 15, .device_nc_idx = 6 }, { .device_id = 15, .device_nc_idx = 7 }, // ND15 + { .device_id = 15, .device_nc_idx = 6 }, { .device_id = 15, .device_nc_idx = 7 }, { .device_id = 15, .device_nc_idx = 4 }, { .device_id = 15, .device_nc_idx = 5 }, { .device_id = 15, .device_nc_idx = 0 }, { .device_id = 15, .device_nc_idx = 1 }, { .device_id = 15, .device_nc_idx = 2 }, { .device_id = 15, .device_nc_idx = 3 }, // ND15 }; #define NC_MAPPING_V0_SENG_SWAP_SIZE (sizeof(nc_mapping_v0_seng_swap_pds) / sizeof(nc_mapping_v0_seng_swap_pds[0])) @@ -457,6 +416,11 @@ static int ncdev_logical_to_physical_nc_map_v4(struct neuron_ioctl_nc_map *map, return 0; } +static void perf_update_hbm_7200_supported_v4(struct neuron_device *nd) { + nd->supports_hbm_7200 = 0; + return; +} + /** * ndhal_register_funcs_v4() - initialize the dhal for v4 chips * @@ -478,6 +442,7 @@ int ndhal_register_funcs_v4(void) { ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v4; ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v4; ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v4; + ndhal->ndhal_perf.perf_update_hbm_7200_supported = perf_update_hbm_7200_supported_v4; if (narch_is_emu()) { // Temporarily disable resets on emulation until support is ready